// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2012 Alexander Block. All rights reserved.
 */

#include <linux/bsearch.h>
#include <linux/file.h>
#include <linux/sort.h>
#include <linux/mount.h>
#include <linux/xattr.h>
#include <linux/posix_acl_xattr.h>
#include <linux/radix-tree.h>
#include <linux/vmalloc.h>
#include <linux/string.h>
#include <linux/compat.h>
#include <linux/crc32c.h>
#include <linux/fsverity.h>

#include "btrfs_inode.h"
#include "transaction.h"
#include "compression.h"
#include "print-tree.h"
#include "accessors.h"
#include "file-item.h"
#include "lru_cache.h"
/*
 * Maximum number of references an extent can have in order for us to attempt to
 * issue clone operations instead of write operations. This currently exists to
 * avoid hitting limitations of the backreference walking code (taking a lot of
 * time and using too much memory for extents with a large number of references).
 */
#define SEND_MAX_EXTENT_REFS	1024
/*
 * A fs_path is a helper to dynamically build path names with unknown size.
 * It reallocates the internal buffer on demand.
 * It allows fast adding of path elements on the right side (normal path) and
 * fast adding to the left side (reversed path). A reversed path can also be
 * unreversed if needed.
 */
struct fs_path {
	union {
		struct {
			char *start;
			char *end;

			char *buf;
			unsigned short buf_len:15;
			unsigned short reversed:1;
			char inline_buf[];
		};
		/*
		 * Average path length does not exceed 200 bytes, we'll have
		 * better packing in the slab and higher chance to satisfy
		 * an allocation later during send.
		 */
		char pad[256];
	};
};
#define FS_PATH_INLINE_SIZE \
	(sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf))
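
/*
 * Example: on x86_64 the three pointers plus the two bitfields packed into one
 * unsigned short occupy 26 bytes, so inline_buf gets 256 - 26 = 230 bytes of
 * inline storage before fs_path_ensure_buf() has to fall back to a heap
 * allocation.
 */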
/* reused for each extent */
struct clone_root {
	struct btrfs_root *root;
	u64 ino;
	u64 offset;
	u64 num_bytes;
	bool found_ref;
};

#define SEND_MAX_NAME_CACHE_SIZE			256
/*
 * Limit the root_ids array of struct backref_cache_entry to 17 elements.
 * This makes the size of a cache entry to be exactly 192 bytes on x86_64, which
 * can be satisfied from the kmalloc-192 slab, without wasting any space.
 * The most common case is to have a single root for cloning, which corresponds
 * to the send root. Having the user specify more than 16 clone roots is not
 * common, and in such rare cases we simply don't use caching if the number of
 * cloning roots that lead down to a leaf is more than 17.
 */
#define SEND_MAX_BACKREF_CACHE_ROOTS			17
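
/*
 * Size check for the 192 bytes quoted above: the embedded struct
 * btrfs_lru_cache_entry is 48 bytes (see the dir created cache comment further
 * down), the root_ids array adds 17 * 8 = 136 bytes and num_roots plus padding
 * rounds the total from 188 up to 192, the kmalloc-192 slab size.
 */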
/*
 * Max number of entries in the cache.
 * With SEND_MAX_BACKREF_CACHE_ROOTS as 17, the size in bytes, excluding
 * maple tree's internal nodes, is 24K.
 */
#define SEND_MAX_BACKREF_CACHE_SIZE			128
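
/*
 * With 128 entries of 192 bytes each the cache payload is capped at
 * 128 * 192 = 24576 bytes, i.e. the 24K mentioned above.
 */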
/*
 * A backref cache entry maps a leaf to a list of IDs of roots from which the
 * leaf is accessible and we can use for clone operations.
 * With SEND_MAX_BACKREF_CACHE_ROOTS as 17, each cache entry is 192 bytes (on
 * x86_64).
 */
struct backref_cache_entry {
	struct btrfs_lru_cache_entry entry;
	u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS];
	/* Number of valid elements in the root_ids array. */
	int num_roots;
};

/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */
static_assert(offsetof(struct backref_cache_entry, entry) == 0);
/*
 * Max number of entries in the cache that stores directories that were already
 * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses
 * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but
 * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64).
 */
#define SEND_MAX_DIR_CREATED_CACHE_SIZE			64

/*
 * Max number of entries in the cache that stores directories with a pending
 * utimes update. The cache uses raw struct btrfs_lru_cache_entry entries, so it
 * uses at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes,
 * but the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64).
 */
#define SEND_MAX_DIR_UTIMES_CACHE_SIZE			64
struct send_ctx {
	struct file *send_filp;

	/*
	 * Whether BTRFS_SEND_A_DATA attribute was already added to current
	 * command (since protocol v2, data must be the last attribute).
	 */
	bool put_data;
	struct page **send_buf_pages;
	u64 flags;	/* 'flags' member of btrfs_ioctl_send_args is u64 */
	/* Protocol version compatibility requested */
	u32 proto;

	struct btrfs_root *send_root;
	struct btrfs_root *parent_root;
	struct clone_root *clone_roots;

	/* current state of the compare_tree call */
	struct btrfs_path *left_path;
	struct btrfs_path *right_path;
	struct btrfs_key *cmp_key;

	/*
	 * Keep track of the generation of the last transaction that was used
	 * for relocating a block group. This is periodically checked in order
	 * to detect if a relocation happened since the last check, so that we
	 * don't operate on stale extent buffers for nodes (level >= 1) or on
	 * stale disk_bytenr values of file extent items.
	 */
	u64 last_reloc_trans;

	/*
	 * infos of the currently processed inode. In case of deleted inodes,
	 * these are the values from the deleted inode.
	 */
	u64 cur_inode_last_extent;
	u64 cur_inode_next_write_offset;
	bool cur_inode_new_gen;
	bool cur_inode_deleted;
	bool ignore_cur_inode;
	bool cur_inode_needs_verity;
	void *verity_descriptor;

	struct list_head new_refs;
	struct list_head deleted_refs;

	struct btrfs_lru_cache name_cache;

	/*
	 * The inode we are currently processing. It's not NULL only when we
	 * need to issue write commands for data extents from this inode.
	 */
	struct inode *cur_inode;
	struct file_ra_state ra;
	u64 page_cache_clear_start;
	bool clean_page_cache;

	/*
	 * We process inodes by their increasing order, so if before an
	 * incremental send we reverse the parent/child relationship of
	 * directories such that a directory with a lower inode number was
	 * the parent of a directory with a higher inode number, and the one
	 * becoming the new parent got renamed too, we can't rename/move the
	 * directory with lower inode number when we finish processing it - we
	 * must process the directory with higher inode number first, then
	 * rename/move it and then rename/move the directory with lower inode
	 * number. Example follows.
	 *
	 * Tree state when the first send was performed:
	 *
	 * Tree state when the second (incremental) send is performed:
	 *
	 * The sequence of steps that led to the second state was:
	 *
	 * mv /a/b/c/d /a/b/c2/d2
	 * mv /a/b/c /a/b/c2/d2/cc
	 *
	 * "c" has lower inode number, but we can't move it (2nd mv operation)
	 * before we move "d", which has higher inode number.
	 *
	 * So we just memorize which move/rename operations must be performed
	 * later when their respective parent is processed and moved/renamed.
	 */

	/* Indexed by parent directory inode number. */
	struct rb_root pending_dir_moves;

	/*
	 * Reverse index, indexed by the inode number of a directory that
	 * is waiting for the move/rename of its immediate parent before its
	 * own move/rename can be performed.
	 */
	struct rb_root waiting_dir_moves;

	/*
	 * A directory that is going to be rm'ed might have a child directory
	 * which is in the pending directory moves index above. In this case,
	 * the directory can only be removed after the move/rename of its child
	 * is performed. Example:
	 *
	 * Sequence of steps that lead to the send snapshot:
	 * rm -f /a/b/c/foo.txt
	 * mv /a/b/c/x /a/b/YY
	 *
	 * When the child is processed, its move/rename is delayed until its
	 * parent is processed (as explained above), but all other operations
	 * like update utimes, chown, chgrp, etc, are performed and the paths
	 * that it uses for those operations must use the orphanized name of
	 * its parent (the directory we're going to rm later), so we need to
	 * memorize that name.
	 *
	 * Indexed by the inode number of the directory to be deleted.
	 */
	struct rb_root orphan_dirs;

	struct rb_root rbtree_new_refs;
	struct rb_root rbtree_deleted_refs;

	struct btrfs_lru_cache backref_cache;
	u64 backref_cache_last_reloc_trans;

	struct btrfs_lru_cache dir_created_cache;
	struct btrfs_lru_cache dir_utimes_cache;
};
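
/*
 * How the three rb trees above cooperate: pending_dir_moves queues a
 * move/rename that must wait for its parent directory to be processed,
 * waiting_dir_moves is the reverse index used to answer "is this directory
 * still waiting to be moved?" while building paths, and orphan_dirs remembers
 * directories whose rmdir has to be delayed until a pending child move is done.
 */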
struct pending_dir_move {
	struct rb_node node;
	struct list_head list;
	u64 parent_ino;
	u64 ino;
	u64 gen;
	struct list_head update_refs;
};

struct waiting_dir_move {
	struct rb_node node;
	u64 ino;
	/*
	 * There might be some directory that could not be removed because it
	 * was waiting for this directory inode to be moved first. Therefore
	 * after this directory is moved, we can try to rmdir the ino rmdir_ino.
	 */
	u64 rmdir_ino;
	u64 rmdir_gen;
	bool orphanized;
};

struct orphan_dir_info {
	struct rb_node node;
	u64 ino;
	u64 gen;
	u64 last_dir_index_offset;
	u64 dir_high_seq_ino;
};

struct name_cache_entry {
	/*
	 * The key in the entry is an inode number, and the generation matches
	 * the inode's generation.
	 */
	struct btrfs_lru_cache_entry entry;
	int need_later_update;
	/* Name length without NUL terminator. */
	int name_len;
	/* Not NUL terminated. */
	char name[] __counted_by(name_len) __nonstring;
};

/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */
static_assert(offsetof(struct name_cache_entry, entry) == 0);
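
/*
 * Keeping the btrfs_lru_cache_entry as the first member means a pointer to the
 * embedded entry is also a pointer to the containing object, so the generic
 * lru cache code can store, look up and free these entries without knowing
 * their concrete type; the static_asserts above guard that layout assumption.
 */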
#define ADVANCE_ONLY_NEXT					-1

enum btrfs_compare_tree_result {
	BTRFS_COMPARE_TREE_NEW,
	BTRFS_COMPARE_TREE_DELETED,
	BTRFS_COMPARE_TREE_CHANGED,
	BTRFS_COMPARE_TREE_SAME,
};
static void inconsistent_snapshot_error(struct send_ctx *sctx,
					enum btrfs_compare_tree_result result,
					const char *what)
{
	const char *result_string;

	switch (result) {
	case BTRFS_COMPARE_TREE_NEW:
		result_string = "new";
		break;
	case BTRFS_COMPARE_TREE_DELETED:
		result_string = "deleted";
		break;
	case BTRFS_COMPARE_TREE_CHANGED:
		result_string = "updated";
		break;
	case BTRFS_COMPARE_TREE_SAME:
		ASSERT(0);
		result_string = "unchanged";
		break;
	default:
		ASSERT(0);
		result_string = "unexpected";
	}

	btrfs_err(sctx->send_root->fs_info,
		  "Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu",
		  result_string, what, sctx->cmp_key->objectid,
		  btrfs_root_id(sctx->send_root),
		  (sctx->parent_root ? btrfs_root_id(sctx->parent_root) : 0));
}
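
/*
 * Example of the version gating done below: a protocol v2 command such as
 * BTRFS_SEND_C_ENCODED_WRITE has a command number greater than
 * BTRFS_SEND_C_MAX_V1, so proto_cmd_ok() rejects it when the send was started
 * with protocol version 1.
 */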
static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd)
{
	switch (sctx->proto) {
	case 1: return cmd <= BTRFS_SEND_C_MAX_V1;
	case 2: return cmd <= BTRFS_SEND_C_MAX_V2;
	case 3: return cmd <= BTRFS_SEND_C_MAX_V3;
	default: return false;
	}
}
static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);

static struct waiting_dir_move *
get_waiting_dir_move(struct send_ctx *sctx, u64 ino);

static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen);
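
/*
 * Holes only need to be sent explicitly for incremental sends of regular
 * files that already existed with the same generation in the parent snapshot;
 * new, regenerated or deleted inodes get their data written from scratch
 * anyway, which is exactly the condition the helper below checks.
 */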
static int need_send_hole(struct send_ctx *sctx)
{
	return (sctx->parent_root && !sctx->cur_inode_new &&
		!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted &&
		S_ISREG(sctx->cur_inode_mode));
}
static void fs_path_reset(struct fs_path *p)
{
	if (p->reversed) {
		p->start = p->buf + p->buf_len - 1;
		p->end = p->start;
		*p->start = 0;
	} else {
		p->start = p->buf;
		p->end = p->start;
		*p->start = 0;
	}
}
static struct fs_path *fs_path_alloc(void)
{
	struct fs_path *p;

	p = kmalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return NULL;
	p->reversed = 0;
	p->buf = p->inline_buf;
	p->buf_len = FS_PATH_INLINE_SIZE;
	fs_path_reset(p);
	return p;
}

static struct fs_path *fs_path_alloc_reversed(void)
{
	struct fs_path *p;

	p = fs_path_alloc();
	if (!p)
		return NULL;
	p->reversed = 1;
	fs_path_reset(p);
	return p;
}

static void fs_path_free(struct fs_path *p)
{
	if (!p)
		return;
	if (p->buf != p->inline_buf)
		kfree(p->buf);
	kfree(p);
}

static int fs_path_len(struct fs_path *p)
{
	return p->end - p->start;
}
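
/*
 * The subtle case in fs_path_ensure_buf() below is a reversed path: the bytes
 * already built live at the tail of the old buffer, so after growing the
 * allocation they must be moved to the tail of the new buffer and start/end
 * recomputed against the new buf_len.
 */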
static int fs_path_ensure_buf(struct fs_path *p, int len)
{
	char *tmp_buf;
	int path_len;
	int old_buf_len;

	len++;

	if (p->buf_len >= len)
		return 0;

	if (len > PATH_MAX) {
		WARN_ON(1);
		return -ENOMEM;
	}

	path_len = p->end - p->start;
	old_buf_len = p->buf_len;

	/*
	 * Allocate to the next largest kmalloc bucket size, to let
	 * the fast path happen most of the time.
	 */
	len = kmalloc_size_roundup(len);
	/*
	 * First time the inline_buf does not suffice
	 */
	if (p->buf == p->inline_buf) {
		tmp_buf = kmalloc(len, GFP_KERNEL);
		if (tmp_buf)
			memcpy(tmp_buf, p->buf, old_buf_len);
	} else {
		tmp_buf = krealloc(p->buf, len, GFP_KERNEL);
	}
	if (!tmp_buf)
		return -ENOMEM;
	p->buf = tmp_buf;
	p->buf_len = len;

	if (p->reversed) {
		tmp_buf = p->buf + old_buf_len - path_len - 1;
		p->end = p->buf + p->buf_len - 1;
		p->start = p->end - path_len;
		memmove(p->start, tmp_buf, path_len + 1);
	} else {
		p->start = p->buf;
		p->end = p->start + path_len;
	}
	return 0;
}
static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
				   char **prepared)
{
	int ret;
	int new_len;

	new_len = p->end - p->start + name_len;
	if (p->start != p->end)
		new_len++;
	ret = fs_path_ensure_buf(p, new_len);
	if (ret < 0)
		goto out;

	if (p->reversed) {
		if (p->start != p->end)
			*--p->start = '/';
		p->start -= name_len;
		*prepared = p->start;
	} else {
		if (p->start != p->end)
			*p->end++ = '/';
		*prepared = p->end;
		p->end += name_len;
		*p->end = 0;
	}

out:
	return ret;
}
static int fs_path_add(struct fs_path *p, const char *name, int name_len)
{
	int ret;
	char *prepared;

	ret = fs_path_prepare_for_add(p, name_len, &prepared);
	if (ret < 0)
		goto out;
	memcpy(prepared, name, name_len);

out:
	return ret;
}
static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
{
	int ret;
	char *prepared;

	ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
	if (ret < 0)
		goto out;
	memcpy(prepared, p2->start, p2->end - p2->start);

out:
	return ret;
}
static int fs_path_add_from_extent_buffer(struct fs_path *p,
					   struct extent_buffer *eb,
					   unsigned long off, int len)
{
	int ret;
	char *prepared;

	ret = fs_path_prepare_for_add(p, len, &prepared);
	if (ret < 0)
		goto out;

	read_extent_buffer(eb, prepared, off, len);

out:
	return ret;
}
static int fs_path_copy(struct fs_path *p, struct fs_path *from)
{
	p->reversed = from->reversed;
	fs_path_reset(p);

	return fs_path_add_path(p, from);
}
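
/*
 * Reversed paths are built leaf-to-root: get_cur_path() repeatedly prepends
 * the next parent component (e.g. "c", then "b", then "a" ends up as "a/b/c"
 * at the tail of the buffer) and only then calls fs_path_unreverse() below to
 * slide the finished string to the front of the buffer.
 */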
static void fs_path_unreverse(struct fs_path *p)
{
	char *tmp;
	int len;

	if (!p->reversed)
		return;

	tmp = p->start;
	len = p->end - p->start;
	p->start = p->buf;
	p->end = p->start + len;
	memmove(p->start, tmp, len + 1);
	p->reversed = 0;
}
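
/*
 * Send only reads from the commit roots of the involved read-only trees,
 * under fs_info->commit_root_sem, so the paths allocated below can search the
 * commit root and skip extent buffer locking entirely.
 */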
static struct btrfs_path *alloc_path_for_send(void)
{
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return NULL;
	path->search_commit_root = 1;
	path->skip_locking = 1;
	path->need_commit_sem = 1;
	return path;
}
static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
{
	int ret;
	u32 pos = 0;

	while (pos < len) {
		ret = kernel_write(filp, buf + pos, len - pos, off);
		if (ret < 0)
			return ret;
		if (ret == 0)
			return -EIO;
		pos += ret;
	}

	return 0;
}
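
/*
 * Each attribute is appended to sctx->send_buf as a little endian TLV: a u16
 * attribute type, a u16 payload length and then the raw payload bytes.
 * send_cmd() later prefixes the accumulated attributes with the command
 * header and a crc32c of the whole command.
 */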
static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
{
	struct btrfs_tlv_header *hdr;
	int total_len = sizeof(*hdr) + len;
	int left = sctx->send_max_size - sctx->send_size;

	if (WARN_ON_ONCE(sctx->put_data))
		return -EINVAL;

	if (unlikely(left < total_len))
		return -EOVERFLOW;

	hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size);
	put_unaligned_le16(attr, &hdr->tlv_type);
	put_unaligned_le16(len, &hdr->tlv_len);
	memcpy(hdr + 1, data, len);
	sctx->send_size += total_len;

	return 0;
}
#define TLV_PUT_DEFINE_INT(bits) \
	static int tlv_put_u##bits(struct send_ctx *sctx,		\
				   u##bits attr, u##bits value)		\
	{								\
		__le##bits __tmp = cpu_to_le##bits(value);		\
		return tlv_put(sctx, attr, &__tmp, sizeof(__tmp));	\
	}

TLV_PUT_DEFINE_INT(8)
TLV_PUT_DEFINE_INT(32)
TLV_PUT_DEFINE_INT(64)
static int tlv_put_string(struct send_ctx *sctx, u16 attr,
			  const char *str, int len)
{
	if (len == -1)
		len = strlen(str);
	return tlv_put(sctx, attr, str, len);
}
static int tlv_put_uuid(struct send_ctx *sctx, u16 attr,
			const u8 *uuid)
{
	return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE);
}
static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
				  struct extent_buffer *eb,
				  struct btrfs_timespec *ts)
{
	struct btrfs_timespec bts;

	read_extent_buffer(eb, &bts, (unsigned long)ts, sizeof(bts));
	return tlv_put(sctx, attr, &bts, sizeof(bts));
}
#define TLV_PUT(sctx, attrtype, data, attrlen) \
	do { \
		ret = tlv_put(sctx, attrtype, data, attrlen); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)

#define TLV_PUT_INT(sctx, attrtype, bits, value) \
	do { \
		ret = tlv_put_u##bits(sctx, attrtype, value); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)

#define TLV_PUT_U8(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 8, data)
#define TLV_PUT_U16(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 16, data)
#define TLV_PUT_U32(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 32, data)
#define TLV_PUT_U64(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 64, data)
#define TLV_PUT_STRING(sctx, attrtype, str, len) \
	do { \
		ret = tlv_put_string(sctx, attrtype, str, len); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
#define TLV_PUT_PATH(sctx, attrtype, p) \
	do { \
		ret = tlv_put_string(sctx, attrtype, p->start, \
				     p->end - p->start); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
#define TLV_PUT_UUID(sctx, attrtype, uuid) \
	do { \
		ret = tlv_put_uuid(sctx, attrtype, uuid); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
#define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \
	do { \
		ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
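
/*
 * Stream layout recap: a single btrfs_stream_header (magic + version) is
 * written first by send_header(), then each command consists of a
 * btrfs_cmd_header (length, command id, crc32c) followed by its TLV encoded
 * attributes - begin_cmd() reserves room for the header, the TLV_PUT* macros
 * append attributes and send_cmd() fills in the length and checksum before
 * writing the buffer out.
 */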
760 static int send_header(struct send_ctx
*sctx
)
762 struct btrfs_stream_header hdr
;
764 strcpy(hdr
.magic
, BTRFS_SEND_STREAM_MAGIC
);
765 hdr
.version
= cpu_to_le32(sctx
->proto
);
766 return write_buf(sctx
->send_filp
, &hdr
, sizeof(hdr
),
771 * For each command/item we want to send to userspace, we call this function.
773 static int begin_cmd(struct send_ctx
*sctx
, int cmd
)
775 struct btrfs_cmd_header
*hdr
;
777 if (WARN_ON(!sctx
->send_buf
))
780 if (unlikely(sctx
->send_size
!= 0)) {
781 btrfs_err(sctx
->send_root
->fs_info
,
782 "send: command header buffer not empty cmd %d offset %llu",
783 cmd
, sctx
->send_off
);
787 sctx
->send_size
+= sizeof(*hdr
);
788 hdr
= (struct btrfs_cmd_header
*)sctx
->send_buf
;
789 put_unaligned_le16(cmd
, &hdr
->cmd
);
794 static int send_cmd(struct send_ctx
*sctx
)
797 struct btrfs_cmd_header
*hdr
;
800 hdr
= (struct btrfs_cmd_header
*)sctx
->send_buf
;
801 put_unaligned_le32(sctx
->send_size
- sizeof(*hdr
), &hdr
->len
);
802 put_unaligned_le32(0, &hdr
->crc
);
804 crc
= crc32c(0, (unsigned char *)sctx
->send_buf
, sctx
->send_size
);
805 put_unaligned_le32(crc
, &hdr
->crc
);
807 ret
= write_buf(sctx
->send_filp
, sctx
->send_buf
, sctx
->send_size
,
811 sctx
->put_data
= false;
817 * Sends a move instruction to user space
819 static int send_rename(struct send_ctx
*sctx
,
820 struct fs_path
*from
, struct fs_path
*to
)
822 struct btrfs_fs_info
*fs_info
= sctx
->send_root
->fs_info
;
825 btrfs_debug(fs_info
, "send_rename %s -> %s", from
->start
, to
->start
);
827 ret
= begin_cmd(sctx
, BTRFS_SEND_C_RENAME
);
831 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH
, from
);
832 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH_TO
, to
);
834 ret
= send_cmd(sctx
);
842 * Sends a link instruction to user space
844 static int send_link(struct send_ctx
*sctx
,
845 struct fs_path
*path
, struct fs_path
*lnk
)
847 struct btrfs_fs_info
*fs_info
= sctx
->send_root
->fs_info
;
850 btrfs_debug(fs_info
, "send_link %s -> %s", path
->start
, lnk
->start
);
852 ret
= begin_cmd(sctx
, BTRFS_SEND_C_LINK
);
856 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH
, path
);
857 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH_LINK
, lnk
);
859 ret
= send_cmd(sctx
);
867 * Sends an unlink instruction to user space
869 static int send_unlink(struct send_ctx
*sctx
, struct fs_path
*path
)
871 struct btrfs_fs_info
*fs_info
= sctx
->send_root
->fs_info
;
874 btrfs_debug(fs_info
, "send_unlink %s", path
->start
);
876 ret
= begin_cmd(sctx
, BTRFS_SEND_C_UNLINK
);
880 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH
, path
);
882 ret
= send_cmd(sctx
);
890 * Sends a rmdir instruction to user space
892 static int send_rmdir(struct send_ctx
*sctx
, struct fs_path
*path
)
894 struct btrfs_fs_info
*fs_info
= sctx
->send_root
->fs_info
;
897 btrfs_debug(fs_info
, "send_rmdir %s", path
->start
);
899 ret
= begin_cmd(sctx
, BTRFS_SEND_C_RMDIR
);
903 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH
, path
);
905 ret
= send_cmd(sctx
);
912 struct btrfs_inode_info
{
924 * Helper function to retrieve some fields from an inode item.
926 static int get_inode_info(struct btrfs_root
*root
, u64 ino
,
927 struct btrfs_inode_info
*info
)
930 struct btrfs_path
*path
;
931 struct btrfs_inode_item
*ii
;
932 struct btrfs_key key
;
934 path
= alloc_path_for_send();
939 key
.type
= BTRFS_INODE_ITEM_KEY
;
941 ret
= btrfs_search_slot(NULL
, root
, &key
, path
, 0, 0);
951 ii
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
952 struct btrfs_inode_item
);
953 info
->size
= btrfs_inode_size(path
->nodes
[0], ii
);
954 info
->gen
= btrfs_inode_generation(path
->nodes
[0], ii
);
955 info
->mode
= btrfs_inode_mode(path
->nodes
[0], ii
);
956 info
->uid
= btrfs_inode_uid(path
->nodes
[0], ii
);
957 info
->gid
= btrfs_inode_gid(path
->nodes
[0], ii
);
958 info
->rdev
= btrfs_inode_rdev(path
->nodes
[0], ii
);
959 info
->nlink
= btrfs_inode_nlink(path
->nodes
[0], ii
);
961 * Transfer the unchanged u64 value of btrfs_inode_item::flags, that's
962 * otherwise logically split to 32/32 parts.
964 info
->fileattr
= btrfs_inode_flags(path
->nodes
[0], ii
);
967 btrfs_free_path(path
);
971 static int get_inode_gen(struct btrfs_root
*root
, u64 ino
, u64
*gen
)
974 struct btrfs_inode_info info
= { 0 };
978 ret
= get_inode_info(root
, ino
, &info
);
983 typedef int (*iterate_inode_ref_t
)(u64 dir
, struct fs_path
*p
, void *ctx
);
986 * Helper function to iterate the entries in ONE btrfs_inode_ref or
987 * btrfs_inode_extref.
988 * The iterate callback may return a non-zero value to stop iteration. This can
989 * be a negative value for error codes or 1 to simply stop it.
991 * path must point to the INODE_REF or INODE_EXTREF when called.
993 static int iterate_inode_ref(struct btrfs_root
*root
, struct btrfs_path
*path
,
994 struct btrfs_key
*found_key
, int resolve
,
995 iterate_inode_ref_t iterate
, void *ctx
)
997 struct extent_buffer
*eb
= path
->nodes
[0];
998 struct btrfs_inode_ref
*iref
;
999 struct btrfs_inode_extref
*extref
;
1000 struct btrfs_path
*tmp_path
;
1004 int slot
= path
->slots
[0];
1009 unsigned long name_off
;
1010 unsigned long elem_size
;
1013 p
= fs_path_alloc_reversed();
1017 tmp_path
= alloc_path_for_send();
1024 if (found_key
->type
== BTRFS_INODE_REF_KEY
) {
1025 ptr
= (unsigned long)btrfs_item_ptr(eb
, slot
,
1026 struct btrfs_inode_ref
);
1027 total
= btrfs_item_size(eb
, slot
);
1028 elem_size
= sizeof(*iref
);
1030 ptr
= btrfs_item_ptr_offset(eb
, slot
);
1031 total
= btrfs_item_size(eb
, slot
);
1032 elem_size
= sizeof(*extref
);
1035 while (cur
< total
) {
1038 if (found_key
->type
== BTRFS_INODE_REF_KEY
) {
1039 iref
= (struct btrfs_inode_ref
*)(ptr
+ cur
);
1040 name_len
= btrfs_inode_ref_name_len(eb
, iref
);
1041 name_off
= (unsigned long)(iref
+ 1);
1042 dir
= found_key
->offset
;
1044 extref
= (struct btrfs_inode_extref
*)(ptr
+ cur
);
1045 name_len
= btrfs_inode_extref_name_len(eb
, extref
);
1046 name_off
= (unsigned long)&extref
->name
;
1047 dir
= btrfs_inode_extref_parent(eb
, extref
);
1051 start
= btrfs_ref_to_path(root
, tmp_path
, name_len
,
1053 p
->buf
, p
->buf_len
);
1054 if (IS_ERR(start
)) {
1055 ret
= PTR_ERR(start
);
1058 if (start
< p
->buf
) {
1059 /* overflow, try again with larger buffer */
1060 ret
= fs_path_ensure_buf(p
,
1061 p
->buf_len
+ p
->buf
- start
);
1064 start
= btrfs_ref_to_path(root
, tmp_path
,
1067 p
->buf
, p
->buf_len
);
1068 if (IS_ERR(start
)) {
1069 ret
= PTR_ERR(start
);
1072 if (unlikely(start
< p
->buf
)) {
1073 btrfs_err(root
->fs_info
,
1074 "send: path ref buffer underflow for key (%llu %u %llu)",
1075 found_key
->objectid
,
1084 ret
= fs_path_add_from_extent_buffer(p
, eb
, name_off
,
1090 cur
+= elem_size
+ name_len
;
1091 ret
= iterate(dir
, p
, ctx
);
1097 btrfs_free_path(tmp_path
);
1102 typedef int (*iterate_dir_item_t
)(int num
, struct btrfs_key
*di_key
,
1103 const char *name
, int name_len
,
1104 const char *data
, int data_len
,
1108 * Helper function to iterate the entries in ONE btrfs_dir_item.
1109 * The iterate callback may return a non-zero value to stop iteration. This can
1110 * be a negative value for error codes or 1 to simply stop it.
1112 * path must point to the dir item when called.
1114 static int iterate_dir_item(struct btrfs_root
*root
, struct btrfs_path
*path
,
1115 iterate_dir_item_t iterate
, void *ctx
)
1118 struct extent_buffer
*eb
;
1119 struct btrfs_dir_item
*di
;
1120 struct btrfs_key di_key
;
1132 * Start with a small buffer (1 page). If later we end up needing more
1133 * space, which can happen for xattrs on a fs with a leaf size greater
1134 * than the page size, attempt to increase the buffer. Typically xattr
1138 buf
= kmalloc(buf_len
, GFP_KERNEL
);
1144 eb
= path
->nodes
[0];
1145 slot
= path
->slots
[0];
1146 di
= btrfs_item_ptr(eb
, slot
, struct btrfs_dir_item
);
1149 total
= btrfs_item_size(eb
, slot
);
1152 while (cur
< total
) {
1153 name_len
= btrfs_dir_name_len(eb
, di
);
1154 data_len
= btrfs_dir_data_len(eb
, di
);
1155 btrfs_dir_item_key_to_cpu(eb
, di
, &di_key
);
1157 if (btrfs_dir_ftype(eb
, di
) == BTRFS_FT_XATTR
) {
1158 if (name_len
> XATTR_NAME_MAX
) {
1159 ret
= -ENAMETOOLONG
;
1162 if (name_len
+ data_len
>
1163 BTRFS_MAX_XATTR_SIZE(root
->fs_info
)) {
1171 if (name_len
+ data_len
> PATH_MAX
) {
1172 ret
= -ENAMETOOLONG
;
1177 if (name_len
+ data_len
> buf_len
) {
1178 buf_len
= name_len
+ data_len
;
1179 if (is_vmalloc_addr(buf
)) {
1183 char *tmp
= krealloc(buf
, buf_len
,
1184 GFP_KERNEL
| __GFP_NOWARN
);
1191 buf
= kvmalloc(buf_len
, GFP_KERNEL
);
1199 read_extent_buffer(eb
, buf
, (unsigned long)(di
+ 1),
1200 name_len
+ data_len
);
1202 len
= sizeof(*di
) + name_len
+ data_len
;
1203 di
= (struct btrfs_dir_item
*)((char *)di
+ len
);
1206 ret
= iterate(num
, &di_key
, buf
, name_len
, buf
+ name_len
,
1223 static int __copy_first_ref(u64 dir
, struct fs_path
*p
, void *ctx
)
1226 struct fs_path
*pt
= ctx
;
1228 ret
= fs_path_copy(pt
, p
);
1232 /* we want the first only */
1237 * Retrieve the first path of an inode. If an inode has more than one
1238 * ref/hardlink, this is ignored.
1240 static int get_inode_path(struct btrfs_root
*root
,
1241 u64 ino
, struct fs_path
*path
)
1244 struct btrfs_key key
, found_key
;
1245 struct btrfs_path
*p
;
1247 p
= alloc_path_for_send();
1251 fs_path_reset(path
);
1254 key
.type
= BTRFS_INODE_REF_KEY
;
1257 ret
= btrfs_search_slot_for_read(root
, &key
, p
, 1, 0);
1264 btrfs_item_key_to_cpu(p
->nodes
[0], &found_key
, p
->slots
[0]);
1265 if (found_key
.objectid
!= ino
||
1266 (found_key
.type
!= BTRFS_INODE_REF_KEY
&&
1267 found_key
.type
!= BTRFS_INODE_EXTREF_KEY
)) {
1272 ret
= iterate_inode_ref(root
, p
, &found_key
, 1,
1273 __copy_first_ref
, path
);
1283 struct backref_ctx
{
1284 struct send_ctx
*sctx
;
1286 /* number of total found references */
1290 * used for clones found in send_root. clones found behind cur_objectid
1291 * and cur_offset are not considered as allowed clones.
1296 /* may be truncated in case it's the last extent in a file */
1299 /* The bytenr the file extent item we are processing refers to. */
1301 /* The owner (root id) of the data backref for the current extent. */
1303 /* The offset of the data backref for the current extent. */
1307 static int __clone_root_cmp_bsearch(const void *key
, const void *elt
)
1309 u64 root
= (u64
)(uintptr_t)key
;
1310 const struct clone_root
*cr
= elt
;
1312 if (root
< btrfs_root_id(cr
->root
))
1314 if (root
> btrfs_root_id(cr
->root
))
1319 static int __clone_root_cmp_sort(const void *e1
, const void *e2
)
1321 const struct clone_root
*cr1
= e1
;
1322 const struct clone_root
*cr2
= e2
;
1324 if (btrfs_root_id(cr1
->root
) < btrfs_root_id(cr2
->root
))
1326 if (btrfs_root_id(cr1
->root
) > btrfs_root_id(cr2
->root
))
1332 * Called for every backref that is found for the current extent.
1333 * Results are collected in sctx->clone_roots->ino/offset.
1335 static int iterate_backrefs(u64 ino
, u64 offset
, u64 num_bytes
, u64 root_id
,
1338 struct backref_ctx
*bctx
= ctx_
;
1339 struct clone_root
*clone_root
;
1341 /* First check if the root is in the list of accepted clone sources */
1342 clone_root
= bsearch((void *)(uintptr_t)root_id
, bctx
->sctx
->clone_roots
,
1343 bctx
->sctx
->clone_roots_cnt
,
1344 sizeof(struct clone_root
),
1345 __clone_root_cmp_bsearch
);
1349 /* This is our own reference, bail out as we can't clone from it. */
1350 if (clone_root
->root
== bctx
->sctx
->send_root
&&
1351 ino
== bctx
->cur_objectid
&&
1352 offset
== bctx
->cur_offset
)
1356 * Make sure we don't consider clones from send_root that are
1357 * behind the current inode/offset.
1359 if (clone_root
->root
== bctx
->sctx
->send_root
) {
1361 * If the source inode was not yet processed we can't issue a
1362 * clone operation, as the source extent does not exist yet at
1363 * the destination of the stream.
1365 if (ino
> bctx
->cur_objectid
)
1368 * We clone from the inode currently being sent as long as the
1369 * source extent is already processed, otherwise we could try
1370 * to clone from an extent that does not exist yet at the
1371 * destination of the stream.
1373 if (ino
== bctx
->cur_objectid
&&
1374 offset
+ bctx
->extent_len
>
1375 bctx
->sctx
->cur_inode_next_write_offset
)
1380 clone_root
->found_ref
= true;
1383 * If the given backref refers to a file extent item with a larger
1384 * number of bytes than what we found before, use the new one so that
1385 * we clone more optimally and end up doing less writes and getting
1386 * less exclusive, non-shared extents at the destination.
1388 if (num_bytes
> clone_root
->num_bytes
) {
1389 clone_root
->ino
= ino
;
1390 clone_root
->offset
= offset
;
1391 clone_root
->num_bytes
= num_bytes
;
1394 * Found a perfect candidate, so there's no need to continue
1397 if (num_bytes
>= bctx
->extent_len
)
1398 return BTRFS_ITERATE_EXTENT_INODES_STOP
;
1404 static bool lookup_backref_cache(u64 leaf_bytenr
, void *ctx
,
1405 const u64
**root_ids_ret
, int *root_count_ret
)
1407 struct backref_ctx
*bctx
= ctx
;
1408 struct send_ctx
*sctx
= bctx
->sctx
;
1409 struct btrfs_fs_info
*fs_info
= sctx
->send_root
->fs_info
;
1410 const u64 key
= leaf_bytenr
>> fs_info
->sectorsize_bits
;
1411 struct btrfs_lru_cache_entry
*raw_entry
;
1412 struct backref_cache_entry
*entry
;
1414 if (sctx
->backref_cache
.size
== 0)
1418 * If relocation happened since we first filled the cache, then we must
1419 * empty the cache and can not use it, because even though we operate on
1420 * read-only roots, their leaves and nodes may have been reallocated and
1421 * now be used for different nodes/leaves of the same tree or some other
1424 * We are called from iterate_extent_inodes() while either holding a
1425 * transaction handle or holding fs_info->commit_root_sem, so no need
1426 * to take any lock here.
1428 if (fs_info
->last_reloc_trans
> sctx
->backref_cache_last_reloc_trans
) {
1429 btrfs_lru_cache_clear(&sctx
->backref_cache
);
1433 raw_entry
= btrfs_lru_cache_lookup(&sctx
->backref_cache
, key
, 0);
1437 entry
= container_of(raw_entry
, struct backref_cache_entry
, entry
);
1438 *root_ids_ret
= entry
->root_ids
;
1439 *root_count_ret
= entry
->num_roots
;
1444 static void store_backref_cache(u64 leaf_bytenr
, const struct ulist
*root_ids
,
1447 struct backref_ctx
*bctx
= ctx
;
1448 struct send_ctx
*sctx
= bctx
->sctx
;
1449 struct btrfs_fs_info
*fs_info
= sctx
->send_root
->fs_info
;
1450 struct backref_cache_entry
*new_entry
;
1451 struct ulist_iterator uiter
;
1452 struct ulist_node
*node
;
1456 * We're called while holding a transaction handle or while holding
1457 * fs_info->commit_root_sem (at iterate_extent_inodes()), so must do a
1460 new_entry
= kmalloc(sizeof(struct backref_cache_entry
), GFP_NOFS
);
1461 /* No worries, cache is optional. */
1465 new_entry
->entry
.key
= leaf_bytenr
>> fs_info
->sectorsize_bits
;
1466 new_entry
->entry
.gen
= 0;
1467 new_entry
->num_roots
= 0;
1468 ULIST_ITER_INIT(&uiter
);
1469 while ((node
= ulist_next(root_ids
, &uiter
)) != NULL
) {
1470 const u64 root_id
= node
->val
;
1471 struct clone_root
*root
;
1473 root
= bsearch((void *)(uintptr_t)root_id
, sctx
->clone_roots
,
1474 sctx
->clone_roots_cnt
, sizeof(struct clone_root
),
1475 __clone_root_cmp_bsearch
);
1479 /* Too many roots, just exit, no worries as caching is optional. */
1480 if (new_entry
->num_roots
>= SEND_MAX_BACKREF_CACHE_ROOTS
) {
1485 new_entry
->root_ids
[new_entry
->num_roots
] = root_id
;
1486 new_entry
->num_roots
++;
1490 * We may have not added any roots to the new cache entry, which means
1491 * none of the roots is part of the list of roots from which we are
1492 * allowed to clone. Cache the new entry as it's still useful to avoid
1493 * backref walking to determine which roots have a path to the leaf.
1495 * Also use GFP_NOFS because we're called while holding a transaction
1496 * handle or while holding fs_info->commit_root_sem.
1498 ret
= btrfs_lru_cache_store(&sctx
->backref_cache
, &new_entry
->entry
,
1500 ASSERT(ret
== 0 || ret
== -ENOMEM
);
1502 /* Caching is optional, no worries. */
1508 * We are called from iterate_extent_inodes() while either holding a
1509 * transaction handle or holding fs_info->commit_root_sem, so no need
1510 * to take any lock here.
1512 if (sctx
->backref_cache
.size
== 1)
1513 sctx
->backref_cache_last_reloc_trans
= fs_info
->last_reloc_trans
;
1516 static int check_extent_item(u64 bytenr
, const struct btrfs_extent_item
*ei
,
1517 const struct extent_buffer
*leaf
, void *ctx
)
1519 const u64 refs
= btrfs_extent_refs(leaf
, ei
);
1520 const struct backref_ctx
*bctx
= ctx
;
1521 const struct send_ctx
*sctx
= bctx
->sctx
;
1523 if (bytenr
== bctx
->bytenr
) {
1524 const u64 flags
= btrfs_extent_flags(leaf
, ei
);
1526 if (WARN_ON(flags
& BTRFS_EXTENT_FLAG_TREE_BLOCK
))
1530 * If we have only one reference and only the send root as a
1531 * clone source - meaning no clone roots were given in the
1532 * struct btrfs_ioctl_send_args passed to the send ioctl - then
1533 * it's our reference and there's no point in doing backref
1534 * walking which is expensive, so exit early.
1536 if (refs
== 1 && sctx
->clone_roots_cnt
== 1)
1541 * Backreference walking (iterate_extent_inodes() below) is currently
1542 * too expensive when an extent has a large number of references, both
1543 * in time spent and used memory. So for now just fallback to write
1544 * operations instead of clone operations when an extent has more than
1545 * a certain amount of references.
1547 if (refs
> SEND_MAX_EXTENT_REFS
)
1553 static bool skip_self_data_ref(u64 root
, u64 ino
, u64 offset
, void *ctx
)
1555 const struct backref_ctx
*bctx
= ctx
;
1557 if (ino
== bctx
->cur_objectid
&&
1558 root
== bctx
->backref_owner
&&
1559 offset
== bctx
->backref_offset
)
1566 * Given an inode, offset and extent item, it finds a good clone for a clone
1567 * instruction. Returns -ENOENT when none could be found. The function makes
1568 * sure that the returned clone is usable at the point where sending is at the
1569 * moment. This means, that no clones are accepted which lie behind the current
1572 * path must point to the extent item when called.
1574 static int find_extent_clone(struct send_ctx
*sctx
,
1575 struct btrfs_path
*path
,
1576 u64 ino
, u64 data_offset
,
1578 struct clone_root
**found
)
1580 struct btrfs_fs_info
*fs_info
= sctx
->send_root
->fs_info
;
1586 struct btrfs_file_extent_item
*fi
;
1587 struct extent_buffer
*eb
= path
->nodes
[0];
1588 struct backref_ctx backref_ctx
= { 0 };
1589 struct btrfs_backref_walk_ctx backref_walk_ctx
= { 0 };
1590 struct clone_root
*cur_clone_root
;
1595 * With fallocate we can get prealloc extents beyond the inode's i_size,
1596 * so we don't do anything here because clone operations can not clone
1597 * to a range beyond i_size without increasing the i_size of the
1598 * destination inode.
1600 if (data_offset
>= ino_size
)
1603 fi
= btrfs_item_ptr(eb
, path
->slots
[0], struct btrfs_file_extent_item
);
1604 extent_type
= btrfs_file_extent_type(eb
, fi
);
1605 if (extent_type
== BTRFS_FILE_EXTENT_INLINE
)
1608 disk_byte
= btrfs_file_extent_disk_bytenr(eb
, fi
);
1612 compressed
= btrfs_file_extent_compression(eb
, fi
);
1613 num_bytes
= btrfs_file_extent_num_bytes(eb
, fi
);
1614 logical
= disk_byte
+ btrfs_file_extent_offset(eb
, fi
);
1617 * Setup the clone roots.
1619 for (i
= 0; i
< sctx
->clone_roots_cnt
; i
++) {
1620 cur_clone_root
= sctx
->clone_roots
+ i
;
1621 cur_clone_root
->ino
= (u64
)-1;
1622 cur_clone_root
->offset
= 0;
1623 cur_clone_root
->num_bytes
= 0;
1624 cur_clone_root
->found_ref
= false;
1627 backref_ctx
.sctx
= sctx
;
1628 backref_ctx
.cur_objectid
= ino
;
1629 backref_ctx
.cur_offset
= data_offset
;
1630 backref_ctx
.bytenr
= disk_byte
;
1632 * Use the header owner and not the send root's id, because in case of a
1633 * snapshot we can have shared subtrees.
1635 backref_ctx
.backref_owner
= btrfs_header_owner(eb
);
1636 backref_ctx
.backref_offset
= data_offset
- btrfs_file_extent_offset(eb
, fi
);
1639 * The last extent of a file may be too large due to page alignment.
1640 * We need to adjust extent_len in this case so that the checks in
1641 * iterate_backrefs() work.
1643 if (data_offset
+ num_bytes
>= ino_size
)
1644 backref_ctx
.extent_len
= ino_size
- data_offset
;
1646 backref_ctx
.extent_len
= num_bytes
;
1649 * Now collect all backrefs.
1651 backref_walk_ctx
.bytenr
= disk_byte
;
1652 if (compressed
== BTRFS_COMPRESS_NONE
)
1653 backref_walk_ctx
.extent_item_pos
= btrfs_file_extent_offset(eb
, fi
);
1654 backref_walk_ctx
.fs_info
= fs_info
;
1655 backref_walk_ctx
.cache_lookup
= lookup_backref_cache
;
1656 backref_walk_ctx
.cache_store
= store_backref_cache
;
1657 backref_walk_ctx
.indirect_ref_iterator
= iterate_backrefs
;
1658 backref_walk_ctx
.check_extent_item
= check_extent_item
;
1659 backref_walk_ctx
.user_ctx
= &backref_ctx
;
1662 * If have a single clone root, then it's the send root and we can tell
1663 * the backref walking code to skip our own backref and not resolve it,
1664 * since we can not use it for cloning - the source and destination
1665 * ranges can't overlap and in case the leaf is shared through a subtree
1666 * due to snapshots, we can't use those other roots since they are not
1667 * in the list of clone roots.
1669 if (sctx
->clone_roots_cnt
== 1)
1670 backref_walk_ctx
.skip_data_ref
= skip_self_data_ref
;
1672 ret
= iterate_extent_inodes(&backref_walk_ctx
, true, iterate_backrefs
,
1677 down_read(&fs_info
->commit_root_sem
);
1678 if (fs_info
->last_reloc_trans
> sctx
->last_reloc_trans
) {
1680 * A transaction commit for a transaction in which block group
1681 * relocation was done just happened.
1682 * The disk_bytenr of the file extent item we processed is
1683 * possibly stale, referring to the extent's location before
1684 * relocation. So act as if we haven't found any clone sources
1685 * and fallback to write commands, which will read the correct
1686 * data from the new extent location. Otherwise we will fail
1687 * below because we haven't found our own back reference or we
1688 * could be getting incorrect sources in case the old extent
1689 * was already reallocated after the relocation.
1691 up_read(&fs_info
->commit_root_sem
);
1694 up_read(&fs_info
->commit_root_sem
);
1696 btrfs_debug(fs_info
,
1697 "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu",
1698 data_offset
, ino
, num_bytes
, logical
);
1700 if (!backref_ctx
.found
) {
1701 btrfs_debug(fs_info
, "no clones found");
1705 cur_clone_root
= NULL
;
1706 for (i
= 0; i
< sctx
->clone_roots_cnt
; i
++) {
1707 struct clone_root
*clone_root
= &sctx
->clone_roots
[i
];
1709 if (!clone_root
->found_ref
)
1713 * Choose the root from which we can clone more bytes, to
1714 * minimize write operations and therefore have more extent
1715 * sharing at the destination (the same as in the source).
1717 if (!cur_clone_root
||
1718 clone_root
->num_bytes
> cur_clone_root
->num_bytes
) {
1719 cur_clone_root
= clone_root
;
1722 * We found an optimal clone candidate (any inode from
1723 * any root is fine), so we're done.
1725 if (clone_root
->num_bytes
>= backref_ctx
.extent_len
)
1730 if (cur_clone_root
) {
1731 *found
= cur_clone_root
;
1740 static int read_symlink(struct btrfs_root
*root
,
1742 struct fs_path
*dest
)
1745 struct btrfs_path
*path
;
1746 struct btrfs_key key
;
1747 struct btrfs_file_extent_item
*ei
;
1753 path
= alloc_path_for_send();
1758 key
.type
= BTRFS_EXTENT_DATA_KEY
;
1760 ret
= btrfs_search_slot(NULL
, root
, &key
, path
, 0, 0);
1765 * An empty symlink inode. Can happen in rare error paths when
1766 * creating a symlink (transaction committed before the inode
1767 * eviction handler removed the symlink inode items and a crash
1768 * happened in between or the subvol was snapshotted in between).
1769 * Print an informative message to dmesg/syslog so that the user
1770 * can delete the symlink.
1772 btrfs_err(root
->fs_info
,
1773 "Found empty symlink inode %llu at root %llu",
1774 ino
, btrfs_root_id(root
));
1779 ei
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
1780 struct btrfs_file_extent_item
);
1781 type
= btrfs_file_extent_type(path
->nodes
[0], ei
);
1782 if (unlikely(type
!= BTRFS_FILE_EXTENT_INLINE
)) {
1784 btrfs_crit(root
->fs_info
,
1785 "send: found symlink extent that is not inline, ino %llu root %llu extent type %d",
1786 ino
, btrfs_root_id(root
), type
);
1789 compression
= btrfs_file_extent_compression(path
->nodes
[0], ei
);
1790 if (unlikely(compression
!= BTRFS_COMPRESS_NONE
)) {
1792 btrfs_crit(root
->fs_info
,
1793 "send: found symlink extent with compression, ino %llu root %llu compression type %d",
1794 ino
, btrfs_root_id(root
), compression
);
1798 off
= btrfs_file_extent_inline_start(ei
);
1799 len
= btrfs_file_extent_ram_bytes(path
->nodes
[0], ei
);
1801 ret
= fs_path_add_from_extent_buffer(dest
, path
->nodes
[0], off
, len
);
1804 btrfs_free_path(path
);
1809 * Helper function to generate a file name that is unique in the root of
1810 * send_root and parent_root. This is used to generate names for orphan inodes.
1812 static int gen_unique_name(struct send_ctx
*sctx
,
1814 struct fs_path
*dest
)
1817 struct btrfs_path
*path
;
1818 struct btrfs_dir_item
*di
;
1823 path
= alloc_path_for_send();
1828 struct fscrypt_str tmp_name
;
1830 len
= snprintf(tmp
, sizeof(tmp
), "o%llu-%llu-%llu",
1832 ASSERT(len
< sizeof(tmp
));
1833 tmp_name
.name
= tmp
;
1834 tmp_name
.len
= strlen(tmp
);
1836 di
= btrfs_lookup_dir_item(NULL
, sctx
->send_root
,
1837 path
, BTRFS_FIRST_FREE_OBJECTID
,
1839 btrfs_release_path(path
);
1845 /* not unique, try again */
1850 if (!sctx
->parent_root
) {
1856 di
= btrfs_lookup_dir_item(NULL
, sctx
->parent_root
,
1857 path
, BTRFS_FIRST_FREE_OBJECTID
,
1859 btrfs_release_path(path
);
1865 /* not unique, try again */
1873 ret
= fs_path_add(dest
, tmp
, strlen(tmp
));
1876 btrfs_free_path(path
);
1881 inode_state_no_change
,
1882 inode_state_will_create
,
1883 inode_state_did_create
,
1884 inode_state_will_delete
,
1885 inode_state_did_delete
,
1888 static int get_cur_inode_state(struct send_ctx
*sctx
, u64 ino
, u64 gen
,
1889 u64
*send_gen
, u64
*parent_gen
)
1896 struct btrfs_inode_info info
;
1898 ret
= get_inode_info(sctx
->send_root
, ino
, &info
);
1899 if (ret
< 0 && ret
!= -ENOENT
)
1901 left_ret
= (info
.nlink
== 0) ? -ENOENT
: ret
;
1902 left_gen
= info
.gen
;
1904 *send_gen
= ((left_ret
== -ENOENT
) ? 0 : info
.gen
);
1906 if (!sctx
->parent_root
) {
1907 right_ret
= -ENOENT
;
1909 ret
= get_inode_info(sctx
->parent_root
, ino
, &info
);
1910 if (ret
< 0 && ret
!= -ENOENT
)
1912 right_ret
= (info
.nlink
== 0) ? -ENOENT
: ret
;
1913 right_gen
= info
.gen
;
1915 *parent_gen
= ((right_ret
== -ENOENT
) ? 0 : info
.gen
);
1918 if (!left_ret
&& !right_ret
) {
1919 if (left_gen
== gen
&& right_gen
== gen
) {
1920 ret
= inode_state_no_change
;
1921 } else if (left_gen
== gen
) {
1922 if (ino
< sctx
->send_progress
)
1923 ret
= inode_state_did_create
;
1925 ret
= inode_state_will_create
;
1926 } else if (right_gen
== gen
) {
1927 if (ino
< sctx
->send_progress
)
1928 ret
= inode_state_did_delete
;
1930 ret
= inode_state_will_delete
;
1934 } else if (!left_ret
) {
1935 if (left_gen
== gen
) {
1936 if (ino
< sctx
->send_progress
)
1937 ret
= inode_state_did_create
;
1939 ret
= inode_state_will_create
;
1943 } else if (!right_ret
) {
1944 if (right_gen
== gen
) {
1945 if (ino
< sctx
->send_progress
)
1946 ret
= inode_state_did_delete
;
1948 ret
= inode_state_will_delete
;
1960 static int is_inode_existent(struct send_ctx
*sctx
, u64 ino
, u64 gen
,
1961 u64
*send_gen
, u64
*parent_gen
)
1965 if (ino
== BTRFS_FIRST_FREE_OBJECTID
)
1968 ret
= get_cur_inode_state(sctx
, ino
, gen
, send_gen
, parent_gen
);
1972 if (ret
== inode_state_no_change
||
1973 ret
== inode_state_did_create
||
1974 ret
== inode_state_will_delete
)
1984 * Helper function to lookup a dir item in a dir.
1986 static int lookup_dir_item_inode(struct btrfs_root
*root
,
1987 u64 dir
, const char *name
, int name_len
,
1991 struct btrfs_dir_item
*di
;
1992 struct btrfs_key key
;
1993 struct btrfs_path
*path
;
1994 struct fscrypt_str name_str
= FSTR_INIT((char *)name
, name_len
);
1996 path
= alloc_path_for_send();
2000 di
= btrfs_lookup_dir_item(NULL
, root
, path
, dir
, &name_str
, 0);
2001 if (IS_ERR_OR_NULL(di
)) {
2002 ret
= di
? PTR_ERR(di
) : -ENOENT
;
2005 btrfs_dir_item_key_to_cpu(path
->nodes
[0], di
, &key
);
2006 if (key
.type
== BTRFS_ROOT_ITEM_KEY
) {
2010 *found_inode
= key
.objectid
;
2013 btrfs_free_path(path
);
2018 * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
2019 * generation of the parent dir and the name of the dir entry.
2021 static int get_first_ref(struct btrfs_root
*root
, u64 ino
,
2022 u64
*dir
, u64
*dir_gen
, struct fs_path
*name
)
2025 struct btrfs_key key
;
2026 struct btrfs_key found_key
;
2027 struct btrfs_path
*path
;
2031 path
= alloc_path_for_send();
2036 key
.type
= BTRFS_INODE_REF_KEY
;
2039 ret
= btrfs_search_slot_for_read(root
, &key
, path
, 1, 0);
2043 btrfs_item_key_to_cpu(path
->nodes
[0], &found_key
,
2045 if (ret
|| found_key
.objectid
!= ino
||
2046 (found_key
.type
!= BTRFS_INODE_REF_KEY
&&
2047 found_key
.type
!= BTRFS_INODE_EXTREF_KEY
)) {
2052 if (found_key
.type
== BTRFS_INODE_REF_KEY
) {
2053 struct btrfs_inode_ref
*iref
;
2054 iref
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
2055 struct btrfs_inode_ref
);
2056 len
= btrfs_inode_ref_name_len(path
->nodes
[0], iref
);
2057 ret
= fs_path_add_from_extent_buffer(name
, path
->nodes
[0],
2058 (unsigned long)(iref
+ 1),
2060 parent_dir
= found_key
.offset
;
2062 struct btrfs_inode_extref
*extref
;
2063 extref
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
2064 struct btrfs_inode_extref
);
2065 len
= btrfs_inode_extref_name_len(path
->nodes
[0], extref
);
2066 ret
= fs_path_add_from_extent_buffer(name
, path
->nodes
[0],
2067 (unsigned long)&extref
->name
, len
);
2068 parent_dir
= btrfs_inode_extref_parent(path
->nodes
[0], extref
);
2072 btrfs_release_path(path
);
2075 ret
= get_inode_gen(root
, parent_dir
, dir_gen
);
2083 btrfs_free_path(path
);
2087 static int is_first_ref(struct btrfs_root
*root
,
2089 const char *name
, int name_len
)
2092 struct fs_path
*tmp_name
;
2095 tmp_name
= fs_path_alloc();
2099 ret
= get_first_ref(root
, ino
, &tmp_dir
, NULL
, tmp_name
);
2103 if (dir
!= tmp_dir
|| name_len
!= fs_path_len(tmp_name
)) {
2108 ret
= !memcmp(tmp_name
->start
, name
, name_len
);
2111 fs_path_free(tmp_name
);
2116 * Used by process_recorded_refs to determine if a new ref would overwrite an
2117 * already existing ref. In case it detects an overwrite, it returns the
2118 * inode/gen in who_ino/who_gen.
2119 * When an overwrite is detected, process_recorded_refs does proper orphanizing
2120 * to make sure later references to the overwritten inode are possible.
2121 * Orphanizing is however only required for the first ref of an inode.
2122 * process_recorded_refs does an additional is_first_ref check to see if
2123 * orphanizing is really required.
2125 static int will_overwrite_ref(struct send_ctx
*sctx
, u64 dir
, u64 dir_gen
,
2126 const char *name
, int name_len
,
2127 u64
*who_ino
, u64
*who_gen
, u64
*who_mode
)
2130 u64 parent_root_dir_gen
;
2131 u64 other_inode
= 0;
2132 struct btrfs_inode_info info
;
2134 if (!sctx
->parent_root
)
2137 ret
= is_inode_existent(sctx
, dir
, dir_gen
, NULL
, &parent_root_dir_gen
);
2142 * If we have a parent root we need to verify that the parent dir was
2143 * not deleted and then re-created, if it was then we have no overwrite
2144 * and we can just unlink this entry.
2146 * @parent_root_dir_gen was set to 0 if the inode does not exist in the
2149 if (sctx
->parent_root
&& dir
!= BTRFS_FIRST_FREE_OBJECTID
&&
2150 parent_root_dir_gen
!= dir_gen
)
2153 ret
= lookup_dir_item_inode(sctx
->parent_root
, dir
, name
, name_len
,
2161 * Check if the overwritten ref was already processed. If yes, the ref
2162 * was already unlinked/moved, so we can safely assume that we will not
2163 * overwrite anything at this point in time.
2165 if (other_inode
> sctx
->send_progress
||
2166 is_waiting_for_move(sctx
, other_inode
)) {
2167 ret
= get_inode_info(sctx
->parent_root
, other_inode
, &info
);
2171 *who_ino
= other_inode
;
2172 *who_gen
= info
.gen
;
2173 *who_mode
= info
.mode
;
2181 * Checks if the ref was overwritten by an already processed inode. This is
2182 * used by __get_cur_name_and_parent to find out if the ref was orphanized and
2183 * thus the orphan name needs be used.
2184 * process_recorded_refs also uses it to avoid unlinking of refs that were
2187 static int did_overwrite_ref(struct send_ctx
*sctx
,
2188 u64 dir
, u64 dir_gen
,
2189 u64 ino
, u64 ino_gen
,
2190 const char *name
, int name_len
)
2195 u64 send_root_dir_gen
;
2197 if (!sctx
->parent_root
)
2200 ret
= is_inode_existent(sctx
, dir
, dir_gen
, &send_root_dir_gen
, NULL
);
2205 * @send_root_dir_gen was set to 0 if the inode does not exist in the
2208 if (dir
!= BTRFS_FIRST_FREE_OBJECTID
&& send_root_dir_gen
!= dir_gen
)
2211 /* check if the ref was overwritten by another ref */
2212 ret
= lookup_dir_item_inode(sctx
->send_root
, dir
, name
, name_len
,
2214 if (ret
== -ENOENT
) {
2215 /* was never and will never be overwritten */
2217 } else if (ret
< 0) {
2221 if (ow_inode
== ino
) {
2222 ret
= get_inode_gen(sctx
->send_root
, ow_inode
, &ow_gen
);
2226 /* It's the same inode, so no overwrite happened. */
2227 if (ow_gen
== ino_gen
)
2232 * We know that it is or will be overwritten. Check this now.
2233 * The current inode being processed might have been the one that caused
2234 * inode 'ino' to be orphanized, therefore check if ow_inode matches
2235 * the current inode being processed.
2237 if (ow_inode
< sctx
->send_progress
)
2240 if (ino
!= sctx
->cur_ino
&& ow_inode
== sctx
->cur_ino
) {
2242 ret
= get_inode_gen(sctx
->send_root
, ow_inode
, &ow_gen
);
2246 if (ow_gen
== sctx
->cur_inode_gen
)
2254 * Same as did_overwrite_ref, but also checks if it is the first ref of an inode
2255 * that got overwritten. This is used by process_recorded_refs to determine
2256 * if it has to use the path as returned by get_cur_path or the orphan name.
2258 static int did_overwrite_first_ref(struct send_ctx
*sctx
, u64 ino
, u64 gen
)
2261 struct fs_path
*name
= NULL
;
2265 if (!sctx
->parent_root
)
2268 name
= fs_path_alloc();
2272 ret
= get_first_ref(sctx
->parent_root
, ino
, &dir
, &dir_gen
, name
);
2276 ret
= did_overwrite_ref(sctx
, dir
, dir_gen
, ino
, gen
,
2277 name
->start
, fs_path_len(name
));
2284 static inline struct name_cache_entry
*name_cache_search(struct send_ctx
*sctx
,
2287 struct btrfs_lru_cache_entry
*entry
;
2289 entry
= btrfs_lru_cache_lookup(&sctx
->name_cache
, ino
, gen
);
2293 return container_of(entry
, struct name_cache_entry
, entry
);
2297 * Used by get_cur_path for each ref up to the root.
2298 * Returns 0 if it succeeded.
2299 * Returns 1 if the inode is not existent or got overwritten. In that case, the
2300 * name is an orphan name. This instructs get_cur_path to stop iterating. If 1
2301 * is returned, parent_ino/parent_gen are not guaranteed to be valid.
2302 * Returns <0 in case of error.
2304 static int __get_cur_name_and_parent(struct send_ctx
*sctx
,
2308 struct fs_path
*dest
)
2312 struct name_cache_entry
*nce
;
2315 * First check if we already did a call to this function with the same
2316 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes
2317 * return the cached result.
2319 nce
= name_cache_search(sctx
, ino
, gen
);
2321 if (ino
< sctx
->send_progress
&& nce
->need_later_update
) {
2322 btrfs_lru_cache_remove(&sctx
->name_cache
, &nce
->entry
);
2325 *parent_ino
= nce
->parent_ino
;
2326 *parent_gen
= nce
->parent_gen
;
2327 ret
= fs_path_add(dest
, nce
->name
, nce
->name_len
);
2336 * If the inode is not existent yet, add the orphan name and return 1.
2337 * This should only happen for the parent dir that we determine in
2338 * record_new_ref_if_needed().
2340 ret
= is_inode_existent(sctx
, ino
, gen
, NULL
, NULL
);
2345 ret
= gen_unique_name(sctx
, ino
, gen
, dest
);
2353 * Depending on whether the inode was already processed or not, use
2354 * send_root or parent_root for ref lookup.
2356 if (ino
< sctx
->send_progress
)
2357 ret
= get_first_ref(sctx
->send_root
, ino
,
2358 parent_ino
, parent_gen
, dest
);
2360 ret
= get_first_ref(sctx
->parent_root
, ino
,
2361 parent_ino
, parent_gen
, dest
);
2366 * Check if the ref was overwritten by an inode's ref that was processed
2367 * earlier. If yes, treat as orphan and return 1.
2369 ret
= did_overwrite_ref(sctx
, *parent_ino
, *parent_gen
, ino
, gen
,
2370 dest
->start
, dest
->end
- dest
->start
);
2374 fs_path_reset(dest
);
2375 ret
= gen_unique_name(sctx
, ino
, gen
, dest
);
2383 * Store the result of the lookup in the name cache.
2385 nce
= kmalloc(sizeof(*nce
) + fs_path_len(dest
), GFP_KERNEL
);
2391 nce
->entry
.key
= ino
;
2392 nce
->entry
.gen
= gen
;
2393 nce
->parent_ino
= *parent_ino
;
2394 nce
->parent_gen
= *parent_gen
;
2395 nce
->name_len
= fs_path_len(dest
);
2397 memcpy(nce
->name
, dest
->start
, nce
->name_len
);
2399 if (ino
< sctx
->send_progress
)
2400 nce
->need_later_update
= 0;
2402 nce
->need_later_update
= 1;
2404 nce_ret
= btrfs_lru_cache_store(&sctx
->name_cache
, &nce
->entry
, GFP_KERNEL
);
2415 * Magic happens here. This function returns the first ref to an inode as it
2416 * would look like while receiving the stream at this point in time.
2417 * We walk the path up to the root. For every inode in between, we check if it
2418 * was already processed/sent. If yes, we continue with the parent as found
2419 * in send_root. If not, we continue with the parent as found in parent_root.
2420 * If we encounter an inode that was deleted at this point in time, we use the
2421 * inodes "orphan" name instead of the real name and stop. Same with new inodes
2422 * that were not created yet and overwritten inodes/refs.
2424 * When do we have orphan inodes:
2425 * 1. When an inode is freshly created and thus no valid refs are available yet
2426 * 2. When a directory lost all its refs (deleted) but still has dir items
2427 * inside which were not processed yet (pending for move/delete). If anyone
2428 * tried to get the path to the dir items, it would get a path inside that
2430 * 3. When an inode is moved around or gets new links, it may overwrite the ref
2431 * of an unprocessed inode. If in that case the first ref would be
2432 * overwritten, the overwritten inode gets "orphanized". Later when we
2433 * process this overwritten inode, it is restored at a new place by moving
2436 * sctx->send_progress tells this function at which point in time receiving
2439 static int get_cur_path(struct send_ctx
*sctx
, u64 ino
, u64 gen
,
2440 struct fs_path
*dest
)
2443 struct fs_path
*name
= NULL
;
2444 u64 parent_inode
= 0;
2448 name
= fs_path_alloc();
2455 fs_path_reset(dest
);
2457 while (!stop
&& ino
!= BTRFS_FIRST_FREE_OBJECTID
) {
2458 struct waiting_dir_move
*wdm
;
2460 fs_path_reset(name
);
2462 if (is_waiting_for_rm(sctx
, ino
, gen
)) {
2463 ret
= gen_unique_name(sctx
, ino
, gen
, name
);
2466 ret
= fs_path_add_path(dest
, name
);
2470 wdm
= get_waiting_dir_move(sctx
, ino
);
2471 if (wdm
&& wdm
->orphanized
) {
2472 ret
= gen_unique_name(sctx
, ino
, gen
, name
);
2475 ret
= get_first_ref(sctx
->parent_root
, ino
,
2476 &parent_inode
, &parent_gen
, name
);
2478 ret
= __get_cur_name_and_parent(sctx
, ino
, gen
,
2488 ret
= fs_path_add_path(dest
, name
);
2499 fs_path_unreverse(dest
);
2504 * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace
2506 static int send_subvol_begin(struct send_ctx
*sctx
)
2509 struct btrfs_root
*send_root
= sctx
->send_root
;
2510 struct btrfs_root
*parent_root
= sctx
->parent_root
;
2511 struct btrfs_path
*path
;
2512 struct btrfs_key key
;
2513 struct btrfs_root_ref
*ref
;
2514 struct extent_buffer
*leaf
;
2518 path
= btrfs_alloc_path();
2522 name
= kmalloc(BTRFS_PATH_NAME_MAX
, GFP_KERNEL
);
2524 btrfs_free_path(path
);
2528 key
.objectid
= btrfs_root_id(send_root
);
2529 key
.type
= BTRFS_ROOT_BACKREF_KEY
;
2532 ret
= btrfs_search_slot_for_read(send_root
->fs_info
->tree_root
,
2541 leaf
= path
->nodes
[0];
2542 btrfs_item_key_to_cpu(leaf
, &key
, path
->slots
[0]);
2543 if (key
.type
!= BTRFS_ROOT_BACKREF_KEY
||
2544 key
.objectid
!= btrfs_root_id(send_root
)) {
2548 ref
= btrfs_item_ptr(leaf
, path
->slots
[0], struct btrfs_root_ref
);
2549 namelen
= btrfs_root_ref_name_len(leaf
, ref
);
2550 read_extent_buffer(leaf
, name
, (unsigned long)(ref
+ 1), namelen
);
2551 btrfs_release_path(path
);
2554 ret
= begin_cmd(sctx
, BTRFS_SEND_C_SNAPSHOT
);
2558 ret
= begin_cmd(sctx
, BTRFS_SEND_C_SUBVOL
);
2563 TLV_PUT_STRING(sctx
, BTRFS_SEND_A_PATH
, name
, namelen
);
2565 if (!btrfs_is_empty_uuid(sctx
->send_root
->root_item
.received_uuid
))
2566 TLV_PUT_UUID(sctx
, BTRFS_SEND_A_UUID
,
2567 sctx
->send_root
->root_item
.received_uuid
);
2569 TLV_PUT_UUID(sctx
, BTRFS_SEND_A_UUID
,
2570 sctx
->send_root
->root_item
.uuid
);
2572 TLV_PUT_U64(sctx
, BTRFS_SEND_A_CTRANSID
,
2573 btrfs_root_ctransid(&sctx
->send_root
->root_item
));
2575 if (!btrfs_is_empty_uuid(parent_root
->root_item
.received_uuid
))
2576 TLV_PUT_UUID(sctx
, BTRFS_SEND_A_CLONE_UUID
,
2577 parent_root
->root_item
.received_uuid
);
2579 TLV_PUT_UUID(sctx
, BTRFS_SEND_A_CLONE_UUID
,
2580 parent_root
->root_item
.uuid
);
2581 TLV_PUT_U64(sctx
, BTRFS_SEND_A_CLONE_CTRANSID
,
2582 btrfs_root_ctransid(&sctx
->parent_root
->root_item
));
2585 ret
= send_cmd(sctx
);
2589 btrfs_free_path(path
);
2594 static int send_truncate(struct send_ctx
*sctx
, u64 ino
, u64 gen
, u64 size
)
2596 struct btrfs_fs_info
*fs_info
= sctx
->send_root
->fs_info
;
2600 btrfs_debug(fs_info
, "send_truncate %llu size=%llu", ino
, size
);
2602 p
= fs_path_alloc();
2606 ret
= begin_cmd(sctx
, BTRFS_SEND_C_TRUNCATE
);
2610 ret
= get_cur_path(sctx
, ino
, gen
, p
);
2613 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH
, p
);
2614 TLV_PUT_U64(sctx
, BTRFS_SEND_A_SIZE
, size
);
2616 ret
= send_cmd(sctx
);
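/*
 * All command senders in this file follow the same framing as send_truncate()
 * above; a minimal sketch of the sequence (attributes vary per command, error
 * handling elided):
 *
 *	ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE);	// start the command header
 *	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);	// attributes are TLV encoded
 *	TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size);
 *	ret = send_cmd(sctx);				// fill in length/crc and write it out
 *
 * The TLV_PUT_* macros branch to a local tlv_put_failure label on error, which
 * is why each sender declares one.
 */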
2624 static int send_chmod(struct send_ctx
*sctx
, u64 ino
, u64 gen
, u64 mode
)
2626 struct btrfs_fs_info
*fs_info
= sctx
->send_root
->fs_info
;
2630 btrfs_debug(fs_info
, "send_chmod %llu mode=%llu", ino
, mode
);
2632 p
= fs_path_alloc();
2636 ret
= begin_cmd(sctx
, BTRFS_SEND_C_CHMOD
);
2640 ret
= get_cur_path(sctx
, ino
, gen
, p
);
2643 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH
, p
);
2644 TLV_PUT_U64(sctx
, BTRFS_SEND_A_MODE
, mode
& 07777);
2646 ret
= send_cmd(sctx
);
2654 static int send_fileattr(struct send_ctx
*sctx
, u64 ino
, u64 gen
, u64 fileattr
)
2656 struct btrfs_fs_info
*fs_info
= sctx
->send_root
->fs_info
;
2660 if (sctx
->proto
< 2)
2663 btrfs_debug(fs_info
, "send_fileattr %llu fileattr=%llu", ino
, fileattr
);
2665 p
= fs_path_alloc();
2669 ret
= begin_cmd(sctx
, BTRFS_SEND_C_FILEATTR
);
2673 ret
= get_cur_path(sctx
, ino
, gen
, p
);
2676 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH
, p
);
2677 TLV_PUT_U64(sctx
, BTRFS_SEND_A_FILEATTR
, fileattr
);
2679 ret
= send_cmd(sctx
);
2687 static int send_chown(struct send_ctx
*sctx
, u64 ino
, u64 gen
, u64 uid
, u64 gid
)
2689 struct btrfs_fs_info
*fs_info
= sctx
->send_root
->fs_info
;
2693 btrfs_debug(fs_info
, "send_chown %llu uid=%llu, gid=%llu",
2696 p
= fs_path_alloc();
2700 ret
= begin_cmd(sctx
, BTRFS_SEND_C_CHOWN
);
2704 ret
= get_cur_path(sctx
, ino
, gen
, p
);
2707 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH
, p
);
2708 TLV_PUT_U64(sctx
, BTRFS_SEND_A_UID
, uid
);
2709 TLV_PUT_U64(sctx
, BTRFS_SEND_A_GID
, gid
);
2711 ret
= send_cmd(sctx
);
2719 static int send_utimes(struct send_ctx
*sctx
, u64 ino
, u64 gen
)
2721 struct btrfs_fs_info
*fs_info
= sctx
->send_root
->fs_info
;
2723 struct fs_path
*p
= NULL
;
2724 struct btrfs_inode_item
*ii
;
2725 struct btrfs_path
*path
= NULL
;
2726 struct extent_buffer
*eb
;
2727 struct btrfs_key key
;
2730 btrfs_debug(fs_info
, "send_utimes %llu", ino
);
2732 p
= fs_path_alloc();
2736 path
= alloc_path_for_send();
2743 key
.type
= BTRFS_INODE_ITEM_KEY
;
2745 ret
= btrfs_search_slot(NULL
, sctx
->send_root
, &key
, path
, 0, 0);
2751 eb
= path
->nodes
[0];
2752 slot
= path
->slots
[0];
2753 ii
= btrfs_item_ptr(eb
, slot
, struct btrfs_inode_item
);
2755 ret
= begin_cmd(sctx
, BTRFS_SEND_C_UTIMES
);
2759 ret
= get_cur_path(sctx
, ino
, gen
, p
);
2762 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH
, p
);
2763 TLV_PUT_BTRFS_TIMESPEC(sctx
, BTRFS_SEND_A_ATIME
, eb
, &ii
->atime
);
2764 TLV_PUT_BTRFS_TIMESPEC(sctx
, BTRFS_SEND_A_MTIME
, eb
, &ii
->mtime
);
2765 TLV_PUT_BTRFS_TIMESPEC(sctx
, BTRFS_SEND_A_CTIME
, eb
, &ii
->ctime
);
2766 if (sctx
->proto
>= 2)
2767 TLV_PUT_BTRFS_TIMESPEC(sctx
, BTRFS_SEND_A_OTIME
, eb
, &ii
->otime
);
2769 ret
= send_cmd(sctx
);
2774 btrfs_free_path(path
);
/*
 * If the cache is full, we can't remove entries from it and do a call to
 * send_utimes() for each respective inode, because we might be finishing
 * processing an inode that is a directory and it just got renamed, and existing
 * entries in the cache may refer to inodes that have the directory in their
 * full path - in which case we would generate outdated paths (pre-rename)
 * for the inodes that the cache entries point to. Instead of pruning the
 * cache when inserting, do it after we finish processing each inode at
 * finish_inode_if_needed().
 */
2788 static int cache_dir_utimes(struct send_ctx
*sctx
, u64 dir
, u64 gen
)
2790 struct btrfs_lru_cache_entry
*entry
;
2793 entry
= btrfs_lru_cache_lookup(&sctx
->dir_utimes_cache
, dir
, gen
);
2797 /* Caching is optional, don't fail if we can't allocate memory. */
2798 entry
= kmalloc(sizeof(*entry
), GFP_KERNEL
);
2800 return send_utimes(sctx
, dir
, gen
);
2805 ret
= btrfs_lru_cache_store(&sctx
->dir_utimes_cache
, entry
, GFP_KERNEL
);
2806 ASSERT(ret
!= -EEXIST
);
2809 return send_utimes(sctx
, dir
, gen
);
2815 static int trim_dir_utimes_cache(struct send_ctx
*sctx
)
2817 while (sctx
->dir_utimes_cache
.size
> SEND_MAX_DIR_UTIMES_CACHE_SIZE
) {
2818 struct btrfs_lru_cache_entry
*lru
;
2821 lru
= btrfs_lru_cache_lru_entry(&sctx
->dir_utimes_cache
);
2822 ASSERT(lru
!= NULL
);
2824 ret
= send_utimes(sctx
, lru
->key
, lru
->gen
);
2828 btrfs_lru_cache_remove(&sctx
->dir_utimes_cache
, lru
);
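/*
 * A minimal sketch of the deferred utimes flow built from the two helpers
 * above (assuming both are driven from finish_inode_if_needed(), as described
 * in the comment before cache_dir_utimes()):
 *
 *	ret = cache_dir_utimes(sctx, dir, gen);	// falls back to send_utimes() on ENOMEM
 *	if (ret < 0)
 *		return ret;
 *	ret = trim_dir_utimes_cache(sctx);	// evicts LRU entries through send_utimes()
 */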
/*
 * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have
 * a valid path yet because we did not process the refs yet. So, the inode
 * is created as orphan.
 */
2839 static int send_create_inode(struct send_ctx
*sctx
, u64 ino
)
2841 struct btrfs_fs_info
*fs_info
= sctx
->send_root
->fs_info
;
2845 struct btrfs_inode_info info
;
2850 btrfs_debug(fs_info
, "send_create_inode %llu", ino
);
2852 p
= fs_path_alloc();
2856 if (ino
!= sctx
->cur_ino
) {
2857 ret
= get_inode_info(sctx
->send_root
, ino
, &info
);
2864 gen
= sctx
->cur_inode_gen
;
2865 mode
= sctx
->cur_inode_mode
;
2866 rdev
= sctx
->cur_inode_rdev
;
2869 if (S_ISREG(mode
)) {
2870 cmd
= BTRFS_SEND_C_MKFILE
;
2871 } else if (S_ISDIR(mode
)) {
2872 cmd
= BTRFS_SEND_C_MKDIR
;
2873 } else if (S_ISLNK(mode
)) {
2874 cmd
= BTRFS_SEND_C_SYMLINK
;
2875 } else if (S_ISCHR(mode
) || S_ISBLK(mode
)) {
2876 cmd
= BTRFS_SEND_C_MKNOD
;
2877 } else if (S_ISFIFO(mode
)) {
2878 cmd
= BTRFS_SEND_C_MKFIFO
;
2879 } else if (S_ISSOCK(mode
)) {
2880 cmd
= BTRFS_SEND_C_MKSOCK
;
2882 btrfs_warn(sctx
->send_root
->fs_info
, "unexpected inode type %o",
2883 (int)(mode
& S_IFMT
));
2888 ret
= begin_cmd(sctx
, cmd
);
2892 ret
= gen_unique_name(sctx
, ino
, gen
, p
);
2896 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH
, p
);
2897 TLV_PUT_U64(sctx
, BTRFS_SEND_A_INO
, ino
);
2899 if (S_ISLNK(mode
)) {
2901 ret
= read_symlink(sctx
->send_root
, ino
, p
);
2904 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH_LINK
, p
);
2905 } else if (S_ISCHR(mode
) || S_ISBLK(mode
) ||
2906 S_ISFIFO(mode
) || S_ISSOCK(mode
)) {
2907 TLV_PUT_U64(sctx
, BTRFS_SEND_A_RDEV
, new_encode_dev(rdev
));
2908 TLV_PUT_U64(sctx
, BTRFS_SEND_A_MODE
, mode
);
2911 ret
= send_cmd(sctx
);
2922 static void cache_dir_created(struct send_ctx
*sctx
, u64 dir
)
2924 struct btrfs_lru_cache_entry
*entry
;
2927 /* Caching is optional, ignore any failures. */
2928 entry
= kmalloc(sizeof(*entry
), GFP_KERNEL
);
2934 ret
= btrfs_lru_cache_store(&sctx
->dir_created_cache
, entry
, GFP_KERNEL
);
/*
 * We need some special handling for inodes that get processed before the parent
 * directory got created. See process_recorded_refs for details.
 * This function checks whether we already created the dir out of order.
 */
2944 static int did_create_dir(struct send_ctx
*sctx
, u64 dir
)
2948 struct btrfs_path
*path
= NULL
;
2949 struct btrfs_key key
;
2950 struct btrfs_key found_key
;
2951 struct btrfs_key di_key
;
2952 struct btrfs_dir_item
*di
;
2954 if (btrfs_lru_cache_lookup(&sctx
->dir_created_cache
, dir
, 0))
2957 path
= alloc_path_for_send();
2962 key
.type
= BTRFS_DIR_INDEX_KEY
;
2965 btrfs_for_each_slot(sctx
->send_root
, &key
, &found_key
, path
, iter_ret
) {
2966 struct extent_buffer
*eb
= path
->nodes
[0];
2968 if (found_key
.objectid
!= key
.objectid
||
2969 found_key
.type
!= key
.type
) {
2974 di
= btrfs_item_ptr(eb
, path
->slots
[0], struct btrfs_dir_item
);
2975 btrfs_dir_item_key_to_cpu(eb
, di
, &di_key
);
2977 if (di_key
.type
!= BTRFS_ROOT_ITEM_KEY
&&
2978 di_key
.objectid
< sctx
->send_progress
) {
2980 cache_dir_created(sctx
, dir
);
2984 /* Catch error found during iteration */
2988 btrfs_free_path(path
);
/*
 * Only creates the inode if it is:
 * 1. Not a directory
 * 2. Or a directory which was not created already due to out of order
 *    directories. See did_create_dir and process_recorded_refs for details.
 */
2998 static int send_create_inode_if_needed(struct send_ctx
*sctx
)
3002 if (S_ISDIR(sctx
->cur_inode_mode
)) {
3003 ret
= did_create_dir(sctx
, sctx
->cur_ino
);
3010 ret
= send_create_inode(sctx
, sctx
->cur_ino
);
3012 if (ret
== 0 && S_ISDIR(sctx
->cur_inode_mode
))
3013 cache_dir_created(sctx
, sctx
->cur_ino
);
3018 struct recorded_ref
{
3019 struct list_head list
;
3021 struct fs_path
*full_path
;
3025 struct rb_node node
;
3026 struct rb_root
*root
;
static struct recorded_ref *recorded_ref_alloc(void)
{
	struct recorded_ref *ref;

	ref = kzalloc(sizeof(*ref), GFP_KERNEL);
	if (!ref)
		return NULL;
	RB_CLEAR_NODE(&ref->node);
	INIT_LIST_HEAD(&ref->list);
	return ref;
}
static void recorded_ref_free(struct recorded_ref *ref)
{
	if (!ref)
		return;

	if (!RB_EMPTY_NODE(&ref->node))
		rb_erase(&ref->node, ref->root);
	list_del(&ref->list);
	fs_path_free(ref->full_path);
	kfree(ref);
}
static void set_ref_path(struct recorded_ref *ref, struct fs_path *path)
{
	ref->full_path = path;
	ref->name = (char *)kbasename(ref->full_path->start);
	ref->name_len = ref->full_path->end - ref->name;
}
3059 static int dup_ref(struct recorded_ref
*ref
, struct list_head
*list
)
3061 struct recorded_ref
*new;
3063 new = recorded_ref_alloc();
3067 new->dir
= ref
->dir
;
3068 new->dir_gen
= ref
->dir_gen
;
3069 list_add_tail(&new->list
, list
);
static void __free_recorded_refs(struct list_head *head)
{
	struct recorded_ref *cur;

	while (!list_empty(head)) {
		cur = list_entry(head->next, struct recorded_ref, list);
		recorded_ref_free(cur);
	}
}
static void free_recorded_refs(struct send_ctx *sctx)
{
	__free_recorded_refs(&sctx->new_refs);
	__free_recorded_refs(&sctx->deleted_refs);
}
/*
 * Renames/moves a file/dir to its orphan name. Used when the first
 * ref of an unprocessed inode gets overwritten and for all non empty
 * directories.
 */
3094 static int orphanize_inode(struct send_ctx
*sctx
, u64 ino
, u64 gen
,
3095 struct fs_path
*path
)
3098 struct fs_path
*orphan
;
3100 orphan
= fs_path_alloc();
3104 ret
= gen_unique_name(sctx
, ino
, gen
, orphan
);
3108 ret
= send_rename(sctx
, path
, orphan
);
3111 fs_path_free(orphan
);
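/*
 * For illustration: gen_unique_name() builds orphan names of the form
 * "o<ino>-<gen>-<idx>" (e.g. "o259-6-0", the name used in the example of the
 * refresh_ref_path() comment below), picking the first <idx> that collides in
 * neither the send nor the parent root. Orphanizing inode 259 with generation
 * 6 therefore amounts to:
 *
 *	ret = orphanize_inode(sctx, 259, 6, path_of_259);	// path_of_259 is a placeholder
 *	// the receiver then knows the inode as "o259-6-0" until its final rename
 */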
3115 static struct orphan_dir_info
*add_orphan_dir_info(struct send_ctx
*sctx
,
3116 u64 dir_ino
, u64 dir_gen
)
3118 struct rb_node
**p
= &sctx
->orphan_dirs
.rb_node
;
3119 struct rb_node
*parent
= NULL
;
3120 struct orphan_dir_info
*entry
, *odi
;
3124 entry
= rb_entry(parent
, struct orphan_dir_info
, node
);
3125 if (dir_ino
< entry
->ino
)
3127 else if (dir_ino
> entry
->ino
)
3128 p
= &(*p
)->rb_right
;
3129 else if (dir_gen
< entry
->gen
)
3131 else if (dir_gen
> entry
->gen
)
3132 p
= &(*p
)->rb_right
;
3137 odi
= kmalloc(sizeof(*odi
), GFP_KERNEL
);
3139 return ERR_PTR(-ENOMEM
);
3142 odi
->last_dir_index_offset
= 0;
3143 odi
->dir_high_seq_ino
= 0;
3145 rb_link_node(&odi
->node
, parent
, p
);
3146 rb_insert_color(&odi
->node
, &sctx
->orphan_dirs
);
3150 static struct orphan_dir_info
*get_orphan_dir_info(struct send_ctx
*sctx
,
3151 u64 dir_ino
, u64 gen
)
3153 struct rb_node
*n
= sctx
->orphan_dirs
.rb_node
;
3154 struct orphan_dir_info
*entry
;
3157 entry
= rb_entry(n
, struct orphan_dir_info
, node
);
3158 if (dir_ino
< entry
->ino
)
3160 else if (dir_ino
> entry
->ino
)
3162 else if (gen
< entry
->gen
)
3164 else if (gen
> entry
->gen
)
3172 static int is_waiting_for_rm(struct send_ctx
*sctx
, u64 dir_ino
, u64 gen
)
3174 struct orphan_dir_info
*odi
= get_orphan_dir_info(sctx
, dir_ino
, gen
);
3179 static void free_orphan_dir_info(struct send_ctx
*sctx
,
3180 struct orphan_dir_info
*odi
)
3184 rb_erase(&odi
->node
, &sctx
->orphan_dirs
);
/*
 * Returns 1 if a directory can be removed at this point in time.
 * We check this by iterating all dir items and checking if the inode behind
 * the dir item was already processed.
 */
3193 static int can_rmdir(struct send_ctx
*sctx
, u64 dir
, u64 dir_gen
)
3197 struct btrfs_root
*root
= sctx
->parent_root
;
3198 struct btrfs_path
*path
;
3199 struct btrfs_key key
;
3200 struct btrfs_key found_key
;
3201 struct btrfs_key loc
;
3202 struct btrfs_dir_item
*di
;
3203 struct orphan_dir_info
*odi
= NULL
;
3204 u64 dir_high_seq_ino
= 0;
3205 u64 last_dir_index_offset
= 0;
3208 * Don't try to rmdir the top/root subvolume dir.
3210 if (dir
== BTRFS_FIRST_FREE_OBJECTID
)
3213 odi
= get_orphan_dir_info(sctx
, dir
, dir_gen
);
3214 if (odi
&& sctx
->cur_ino
< odi
->dir_high_seq_ino
)
3217 path
= alloc_path_for_send();
3223 * Find the inode number associated with the last dir index
3224 * entry. This is very likely the inode with the highest number
3225 * of all inodes that have an entry in the directory. We can
3226 * then use it to avoid future calls to can_rmdir(), when
3227 * processing inodes with a lower number, from having to search
3228 * the parent root b+tree for dir index keys.
3231 key
.type
= BTRFS_DIR_INDEX_KEY
;
3232 key
.offset
= (u64
)-1;
3234 ret
= btrfs_search_slot(NULL
, root
, &key
, path
, 0, 0);
3237 } else if (ret
> 0) {
3238 /* Can't happen, the root is never empty. */
3239 ASSERT(path
->slots
[0] > 0);
3240 if (WARN_ON(path
->slots
[0] == 0)) {
3247 btrfs_item_key_to_cpu(path
->nodes
[0], &key
, path
->slots
[0]);
3248 if (key
.objectid
!= dir
|| key
.type
!= BTRFS_DIR_INDEX_KEY
) {
3249 /* No index keys, dir can be removed. */
3254 di
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
3255 struct btrfs_dir_item
);
3256 btrfs_dir_item_key_to_cpu(path
->nodes
[0], di
, &loc
);
3257 dir_high_seq_ino
= loc
.objectid
;
3258 if (sctx
->cur_ino
< dir_high_seq_ino
) {
3263 btrfs_release_path(path
);
3267 key
.type
= BTRFS_DIR_INDEX_KEY
;
3268 key
.offset
= (odi
? odi
->last_dir_index_offset
: 0);
3270 btrfs_for_each_slot(root
, &key
, &found_key
, path
, iter_ret
) {
3271 struct waiting_dir_move
*dm
;
3273 if (found_key
.objectid
!= key
.objectid
||
3274 found_key
.type
!= key
.type
)
3277 di
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
3278 struct btrfs_dir_item
);
3279 btrfs_dir_item_key_to_cpu(path
->nodes
[0], di
, &loc
);
3281 dir_high_seq_ino
= max(dir_high_seq_ino
, loc
.objectid
);
3282 last_dir_index_offset
= found_key
.offset
;
3284 dm
= get_waiting_dir_move(sctx
, loc
.objectid
);
3286 dm
->rmdir_ino
= dir
;
3287 dm
->rmdir_gen
= dir_gen
;
3292 if (loc
.objectid
> sctx
->cur_ino
) {
3301 free_orphan_dir_info(sctx
, odi
);
3306 btrfs_free_path(path
);
3312 odi
= add_orphan_dir_info(sctx
, dir
, dir_gen
);
3314 return PTR_ERR(odi
);
3319 odi
->last_dir_index_offset
= last_dir_index_offset
;
3320 odi
->dir_high_seq_ino
= max(odi
->dir_high_seq_ino
, dir_high_seq_ino
);
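/*
 * Sketch of the shortcut the cached dir_high_seq_ino enables: once recorded,
 * later can_rmdir() calls made while processing lower numbered inodes can bail
 * out without searching the parent root b+tree, exactly as the early return at
 * the top of can_rmdir() does:
 *
 *	odi = get_orphan_dir_info(sctx, dir, dir_gen);
 *	if (odi && sctx->cur_ino < odi->dir_high_seq_ino)
 *		return 0;	// some child inode is still unprocessed, can't rmdir yet
 */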
static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
{
	struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino);

	return entry != NULL;
}
3332 static int add_waiting_dir_move(struct send_ctx
*sctx
, u64 ino
, bool orphanized
)
3334 struct rb_node
**p
= &sctx
->waiting_dir_moves
.rb_node
;
3335 struct rb_node
*parent
= NULL
;
3336 struct waiting_dir_move
*entry
, *dm
;
3338 dm
= kmalloc(sizeof(*dm
), GFP_KERNEL
);
3344 dm
->orphanized
= orphanized
;
3348 entry
= rb_entry(parent
, struct waiting_dir_move
, node
);
3349 if (ino
< entry
->ino
) {
3351 } else if (ino
> entry
->ino
) {
3352 p
= &(*p
)->rb_right
;
3359 rb_link_node(&dm
->node
, parent
, p
);
3360 rb_insert_color(&dm
->node
, &sctx
->waiting_dir_moves
);
3364 static struct waiting_dir_move
*
3365 get_waiting_dir_move(struct send_ctx
*sctx
, u64 ino
)
3367 struct rb_node
*n
= sctx
->waiting_dir_moves
.rb_node
;
3368 struct waiting_dir_move
*entry
;
3371 entry
= rb_entry(n
, struct waiting_dir_move
, node
);
3372 if (ino
< entry
->ino
)
3374 else if (ino
> entry
->ino
)
3382 static void free_waiting_dir_move(struct send_ctx
*sctx
,
3383 struct waiting_dir_move
*dm
)
3387 rb_erase(&dm
->node
, &sctx
->waiting_dir_moves
);
3391 static int add_pending_dir_move(struct send_ctx
*sctx
,
3395 struct list_head
*new_refs
,
3396 struct list_head
*deleted_refs
,
3397 const bool is_orphan
)
3399 struct rb_node
**p
= &sctx
->pending_dir_moves
.rb_node
;
3400 struct rb_node
*parent
= NULL
;
3401 struct pending_dir_move
*entry
= NULL
, *pm
;
3402 struct recorded_ref
*cur
;
3406 pm
= kmalloc(sizeof(*pm
), GFP_KERNEL
);
3409 pm
->parent_ino
= parent_ino
;
3412 INIT_LIST_HEAD(&pm
->list
);
3413 INIT_LIST_HEAD(&pm
->update_refs
);
3414 RB_CLEAR_NODE(&pm
->node
);
3418 entry
= rb_entry(parent
, struct pending_dir_move
, node
);
3419 if (parent_ino
< entry
->parent_ino
) {
3421 } else if (parent_ino
> entry
->parent_ino
) {
3422 p
= &(*p
)->rb_right
;
3429 list_for_each_entry(cur
, deleted_refs
, list
) {
3430 ret
= dup_ref(cur
, &pm
->update_refs
);
3434 list_for_each_entry(cur
, new_refs
, list
) {
3435 ret
= dup_ref(cur
, &pm
->update_refs
);
3440 ret
= add_waiting_dir_move(sctx
, pm
->ino
, is_orphan
);
3445 list_add_tail(&pm
->list
, &entry
->list
);
3447 rb_link_node(&pm
->node
, parent
, p
);
3448 rb_insert_color(&pm
->node
, &sctx
->pending_dir_moves
);
3453 __free_recorded_refs(&pm
->update_refs
);
3459 static struct pending_dir_move
*get_pending_dir_moves(struct send_ctx
*sctx
,
3462 struct rb_node
*n
= sctx
->pending_dir_moves
.rb_node
;
3463 struct pending_dir_move
*entry
;
3466 entry
= rb_entry(n
, struct pending_dir_move
, node
);
3467 if (parent_ino
< entry
->parent_ino
)
3469 else if (parent_ino
> entry
->parent_ino
)
3477 static int path_loop(struct send_ctx
*sctx
, struct fs_path
*name
,
3478 u64 ino
, u64 gen
, u64
*ancestor_ino
)
3481 u64 parent_inode
= 0;
3483 u64 start_ino
= ino
;
3486 while (ino
!= BTRFS_FIRST_FREE_OBJECTID
) {
3487 fs_path_reset(name
);
3489 if (is_waiting_for_rm(sctx
, ino
, gen
))
3491 if (is_waiting_for_move(sctx
, ino
)) {
3492 if (*ancestor_ino
== 0)
3493 *ancestor_ino
= ino
;
3494 ret
= get_first_ref(sctx
->parent_root
, ino
,
3495 &parent_inode
, &parent_gen
, name
);
3497 ret
= __get_cur_name_and_parent(sctx
, ino
, gen
,
3507 if (parent_inode
== start_ino
) {
3509 if (*ancestor_ino
== 0)
3510 *ancestor_ino
= ino
;
3519 static int apply_dir_move(struct send_ctx
*sctx
, struct pending_dir_move
*pm
)
3521 struct fs_path
*from_path
= NULL
;
3522 struct fs_path
*to_path
= NULL
;
3523 struct fs_path
*name
= NULL
;
3524 u64 orig_progress
= sctx
->send_progress
;
3525 struct recorded_ref
*cur
;
3526 u64 parent_ino
, parent_gen
;
3527 struct waiting_dir_move
*dm
= NULL
;
3534 name
= fs_path_alloc();
3535 from_path
= fs_path_alloc();
3536 if (!name
|| !from_path
) {
3541 dm
= get_waiting_dir_move(sctx
, pm
->ino
);
3543 rmdir_ino
= dm
->rmdir_ino
;
3544 rmdir_gen
= dm
->rmdir_gen
;
3545 is_orphan
= dm
->orphanized
;
3546 free_waiting_dir_move(sctx
, dm
);
3549 ret
= gen_unique_name(sctx
, pm
->ino
,
3550 pm
->gen
, from_path
);
3552 ret
= get_first_ref(sctx
->parent_root
, pm
->ino
,
3553 &parent_ino
, &parent_gen
, name
);
3556 ret
= get_cur_path(sctx
, parent_ino
, parent_gen
,
3560 ret
= fs_path_add_path(from_path
, name
);
3565 sctx
->send_progress
= sctx
->cur_ino
+ 1;
3566 ret
= path_loop(sctx
, name
, pm
->ino
, pm
->gen
, &ancestor
);
3570 LIST_HEAD(deleted_refs
);
3571 ASSERT(ancestor
> BTRFS_FIRST_FREE_OBJECTID
);
3572 ret
= add_pending_dir_move(sctx
, pm
->ino
, pm
->gen
, ancestor
,
3573 &pm
->update_refs
, &deleted_refs
,
3578 dm
= get_waiting_dir_move(sctx
, pm
->ino
);
3580 dm
->rmdir_ino
= rmdir_ino
;
3581 dm
->rmdir_gen
= rmdir_gen
;
3585 fs_path_reset(name
);
3588 ret
= get_cur_path(sctx
, pm
->ino
, pm
->gen
, to_path
);
3592 ret
= send_rename(sctx
, from_path
, to_path
);
3597 struct orphan_dir_info
*odi
;
3600 odi
= get_orphan_dir_info(sctx
, rmdir_ino
, rmdir_gen
);
3602 /* already deleted */
3607 ret
= can_rmdir(sctx
, rmdir_ino
, gen
);
3613 name
= fs_path_alloc();
3618 ret
= get_cur_path(sctx
, rmdir_ino
, gen
, name
);
3621 ret
= send_rmdir(sctx
, name
);
3627 ret
= cache_dir_utimes(sctx
, pm
->ino
, pm
->gen
);
3632 * After rename/move, need to update the utimes of both new parent(s)
3633 * and old parent(s).
3635 list_for_each_entry(cur
, &pm
->update_refs
, list
) {
3637 * The parent inode might have been deleted in the send snapshot
3639 ret
= get_inode_info(sctx
->send_root
, cur
->dir
, NULL
);
3640 if (ret
== -ENOENT
) {
3647 ret
= cache_dir_utimes(sctx
, cur
->dir
, cur
->dir_gen
);
3654 fs_path_free(from_path
);
3655 fs_path_free(to_path
);
3656 sctx
->send_progress
= orig_progress
;
3661 static void free_pending_move(struct send_ctx
*sctx
, struct pending_dir_move
*m
)
3663 if (!list_empty(&m
->list
))
3665 if (!RB_EMPTY_NODE(&m
->node
))
3666 rb_erase(&m
->node
, &sctx
->pending_dir_moves
);
3667 __free_recorded_refs(&m
->update_refs
);
3671 static void tail_append_pending_moves(struct send_ctx
*sctx
,
3672 struct pending_dir_move
*moves
,
3673 struct list_head
*stack
)
3675 if (list_empty(&moves
->list
)) {
3676 list_add_tail(&moves
->list
, stack
);
3679 list_splice_init(&moves
->list
, &list
);
3680 list_add_tail(&moves
->list
, stack
);
3681 list_splice_tail(&list
, stack
);
3683 if (!RB_EMPTY_NODE(&moves
->node
)) {
3684 rb_erase(&moves
->node
, &sctx
->pending_dir_moves
);
3685 RB_CLEAR_NODE(&moves
->node
);
3689 static int apply_children_dir_moves(struct send_ctx
*sctx
)
3691 struct pending_dir_move
*pm
;
3693 u64 parent_ino
= sctx
->cur_ino
;
3696 pm
= get_pending_dir_moves(sctx
, parent_ino
);
3700 tail_append_pending_moves(sctx
, pm
, &stack
);
3702 while (!list_empty(&stack
)) {
3703 pm
= list_first_entry(&stack
, struct pending_dir_move
, list
);
3704 parent_ino
= pm
->ino
;
3705 ret
= apply_dir_move(sctx
, pm
);
3706 free_pending_move(sctx
, pm
);
3709 pm
= get_pending_dir_moves(sctx
, parent_ino
);
3711 tail_append_pending_moves(sctx
, pm
, &stack
);
3716 while (!list_empty(&stack
)) {
3717 pm
= list_first_entry(&stack
, struct pending_dir_move
, list
);
3718 free_pending_move(sctx
, pm
);
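/*
 * Descriptive sketch of apply_children_dir_moves() above: pending moves are
 * keyed by parent inode and applied iteratively with an explicit stack rather
 * than recursion:
 *
 *	append moves waiting on sctx->cur_ino to the stack;
 *	while (stack not empty) {
 *		pm = first entry on the stack;
 *		apply_dir_move(sctx, pm);	// may also issue delayed rmdirs
 *		free_pending_move(sctx, pm);
 *		append moves waiting on pm->ino to the stack;
 *	}
 */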
/*
 * We might need to delay a directory rename even when no ancestor directory
 * (in the send root) with a higher inode number than ours (sctx->cur_ino) was
 * renamed. This happens when we rename a directory to the old name (the name
 * in the parent root) of some other unrelated directory that got its rename
 * delayed due to some ancestor with higher number that got renamed.
 *
 * Example:
 *
 * Parent snapshot:
 *
 * .                                       (ino 256)
 * |---- a/                                (ino 257)
 * |     |---- file                        (ino 260)
 * |
 * |---- b/                                (ino 258)
 * |---- c/                                (ino 259)
 *
 * Send snapshot:
 *
 * .                                       (ino 256)
 * |---- a/                                (ino 258)
 * |---- x/                                (ino 259)
 *       |---- y/                          (ino 257)
 *             |----- file                 (ino 260)
 *
 * Here we can not rename 258 from 'b' to 'a' without the rename of inode 257
 * from 'a' to 'x/y' happening first, which in turn depends on the rename of
 * inode 259 from 'c' to 'x'. So the order of rename commands the send stream
 * must issue is:
 *
 * 1 - rename 259 from 'c' to 'x'
 * 2 - rename 257 from 'a' to 'x/y'
 * 3 - rename 258 from 'b' to 'a'
 *
 * Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can
 * be done right away and < 0 on error.
 */
3759 static int wait_for_dest_dir_move(struct send_ctx
*sctx
,
3760 struct recorded_ref
*parent_ref
,
3761 const bool is_orphan
)
3763 struct btrfs_path
*path
;
3764 struct btrfs_key key
;
3765 struct btrfs_key di_key
;
3766 struct btrfs_dir_item
*di
;
3770 struct waiting_dir_move
*wdm
;
3772 if (RB_EMPTY_ROOT(&sctx
->waiting_dir_moves
))
3775 path
= alloc_path_for_send();
3779 key
.objectid
= parent_ref
->dir
;
3780 key
.type
= BTRFS_DIR_ITEM_KEY
;
3781 key
.offset
= btrfs_name_hash(parent_ref
->name
, parent_ref
->name_len
);
3783 ret
= btrfs_search_slot(NULL
, sctx
->parent_root
, &key
, path
, 0, 0);
3786 } else if (ret
> 0) {
3791 di
= btrfs_match_dir_item_name(path
, parent_ref
->name
,
3792 parent_ref
->name_len
);
	/*
	 * di_key.objectid has the number of the inode that has a dentry in the
	 * parent directory with the same name that sctx->cur_ino is being
	 * renamed to. We need to check if that inode is in the send root as
	 * well and if it is currently marked as an inode with a pending rename.
	 * If it is, we need to delay the rename of sctx->cur_ino as well, so
	 * that it happens after that other inode is renamed.
	 */
3805 btrfs_dir_item_key_to_cpu(path
->nodes
[0], di
, &di_key
);
3806 if (di_key
.type
!= BTRFS_INODE_ITEM_KEY
) {
3811 ret
= get_inode_gen(sctx
->parent_root
, di_key
.objectid
, &left_gen
);
3814 ret
= get_inode_gen(sctx
->send_root
, di_key
.objectid
, &right_gen
);
3821 /* Different inode, no need to delay the rename of sctx->cur_ino */
3822 if (right_gen
!= left_gen
) {
3827 wdm
= get_waiting_dir_move(sctx
, di_key
.objectid
);
3828 if (wdm
&& !wdm
->orphanized
) {
3829 ret
= add_pending_dir_move(sctx
,
3831 sctx
->cur_inode_gen
,
3834 &sctx
->deleted_refs
,
3840 btrfs_free_path(path
);
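/*
 * A hedged summary of the decision above, in sketch form (exact arguments of
 * add_pending_dir_move() elided):
 *
 *	look up the dir item (parent_ref->dir, parent_ref->name) in the parent root;
 *	if (no such item, or it points to a different inode/generation)
 *		return 0;				// rename can be issued now
 *	wdm = get_waiting_dir_move(sctx, di_key.objectid);
 *	if (wdm && !wdm->orphanized) {
 *		add_pending_dir_move(sctx, ...);	// queue our rename behind it
 *		return 1;				// delay the rename of sctx->cur_ino
 *	}
 *	return 0;
 */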
/*
 * Check if inode ino2, or any of its ancestors, is inode ino1.
 * Return 1 if true, 0 if false and < 0 on error.
 */
3848 static int check_ino_in_path(struct btrfs_root
*root
,
3853 struct fs_path
*fs_path
)
3858 return ino1_gen
== ino2_gen
;
3860 while (ino
> BTRFS_FIRST_FREE_OBJECTID
) {
3865 fs_path_reset(fs_path
);
3866 ret
= get_first_ref(root
, ino
, &parent
, &parent_gen
, fs_path
);
3870 return parent_gen
== ino1_gen
;
/*
 * Check if inode ino1 is an ancestor of inode ino2 in the given root for any
 * possible path (in case ino2 is not a directory and has multiple hard links).
 * Return 1 if true, 0 if false and < 0 on error.
 */
3881 static int is_ancestor(struct btrfs_root
*root
,
3885 struct fs_path
*fs_path
)
3887 bool free_fs_path
= false;
3890 struct btrfs_path
*path
= NULL
;
3891 struct btrfs_key key
;
3894 fs_path
= fs_path_alloc();
3897 free_fs_path
= true;
3900 path
= alloc_path_for_send();
3906 key
.objectid
= ino2
;
3907 key
.type
= BTRFS_INODE_REF_KEY
;
3910 btrfs_for_each_slot(root
, &key
, &key
, path
, iter_ret
) {
3911 struct extent_buffer
*leaf
= path
->nodes
[0];
3912 int slot
= path
->slots
[0];
3916 if (key
.objectid
!= ino2
)
3918 if (key
.type
!= BTRFS_INODE_REF_KEY
&&
3919 key
.type
!= BTRFS_INODE_EXTREF_KEY
)
3922 item_size
= btrfs_item_size(leaf
, slot
);
3923 while (cur_offset
< item_size
) {
3927 if (key
.type
== BTRFS_INODE_EXTREF_KEY
) {
3929 struct btrfs_inode_extref
*extref
;
3931 ptr
= btrfs_item_ptr_offset(leaf
, slot
);
3932 extref
= (struct btrfs_inode_extref
*)
3934 parent
= btrfs_inode_extref_parent(leaf
,
3936 cur_offset
+= sizeof(*extref
);
3937 cur_offset
+= btrfs_inode_extref_name_len(leaf
,
3940 parent
= key
.offset
;
3941 cur_offset
= item_size
;
3944 ret
= get_inode_gen(root
, parent
, &parent_gen
);
3947 ret
= check_ino_in_path(root
, ino1
, ino1_gen
,
3948 parent
, parent_gen
, fs_path
);
3958 btrfs_free_path(path
);
3960 fs_path_free(fs_path
);
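/*
 * The core of check_ino_in_path() is a first-ref walk towards the subvolume
 * root; a minimal sketch of that loop (error handling elided):
 *
 *	while (ino > BTRFS_FIRST_FREE_OBJECTID) {
 *		u64 parent, parent_gen;
 *
 *		fs_path_reset(fs_path);
 *		ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path);
 *		if (parent == ino1)
 *			return parent_gen == ino1_gen;
 *		ino = parent;
 *	}
 *	return 0;
 */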
3964 static int wait_for_parent_move(struct send_ctx
*sctx
,
3965 struct recorded_ref
*parent_ref
,
3966 const bool is_orphan
)
3969 u64 ino
= parent_ref
->dir
;
3970 u64 ino_gen
= parent_ref
->dir_gen
;
3971 u64 parent_ino_before
, parent_ino_after
;
3972 struct fs_path
*path_before
= NULL
;
3973 struct fs_path
*path_after
= NULL
;
3976 path_after
= fs_path_alloc();
3977 path_before
= fs_path_alloc();
3978 if (!path_after
|| !path_before
) {
	/*
	 * Our current directory inode may not yet be renamed/moved because some
	 * ancestor (immediate or not) has to be renamed/moved first. So find if
	 * such ancestor exists and make sure our own rename/move happens after
	 * that ancestor is processed to avoid path build infinite loops (done
	 * at get_cur_path()).
	 */
3990 while (ino
> BTRFS_FIRST_FREE_OBJECTID
) {
3991 u64 parent_ino_after_gen
;
3993 if (is_waiting_for_move(sctx
, ino
)) {
			/*
			 * If the current inode is an ancestor of ino in the
			 * parent root, we need to delay the rename of the
			 * current inode, otherwise don't delay the rename
			 * because we can end up with a circular dependency
			 * of renames, resulting in some directories never
			 * getting the respective rename operations issued in
			 * the send stream or getting into infinite path build
			 * loops.
			 */
4004 ret
= is_ancestor(sctx
->parent_root
,
4005 sctx
->cur_ino
, sctx
->cur_inode_gen
,
4011 fs_path_reset(path_before
);
4012 fs_path_reset(path_after
);
4014 ret
= get_first_ref(sctx
->send_root
, ino
, &parent_ino_after
,
4015 &parent_ino_after_gen
, path_after
);
4018 ret
= get_first_ref(sctx
->parent_root
, ino
, &parent_ino_before
,
4020 if (ret
< 0 && ret
!= -ENOENT
) {
4022 } else if (ret
== -ENOENT
) {
4027 len1
= fs_path_len(path_before
);
4028 len2
= fs_path_len(path_after
);
4029 if (ino
> sctx
->cur_ino
&&
4030 (parent_ino_before
!= parent_ino_after
|| len1
!= len2
||
4031 memcmp(path_before
->start
, path_after
->start
, len1
))) {
4034 ret
= get_inode_gen(sctx
->parent_root
, ino
, &parent_ino_gen
);
4037 if (ino_gen
== parent_ino_gen
) {
4042 ino
= parent_ino_after
;
4043 ino_gen
= parent_ino_after_gen
;
4047 fs_path_free(path_before
);
4048 fs_path_free(path_after
);
4051 ret
= add_pending_dir_move(sctx
,
4053 sctx
->cur_inode_gen
,
4056 &sctx
->deleted_refs
,
4065 static int update_ref_path(struct send_ctx
*sctx
, struct recorded_ref
*ref
)
4068 struct fs_path
*new_path
;
	/*
	 * Our reference's name member points to its full_path member string, so
	 * here we use a new path.
	 */
4074 new_path
= fs_path_alloc();
4078 ret
= get_cur_path(sctx
, ref
->dir
, ref
->dir_gen
, new_path
);
4080 fs_path_free(new_path
);
4083 ret
= fs_path_add(new_path
, ref
->name
, ref
->name_len
);
4085 fs_path_free(new_path
);
4089 fs_path_free(ref
->full_path
);
4090 set_ref_path(ref
, new_path
);
4096 * When processing the new references for an inode we may orphanize an existing
4097 * directory inode because its old name conflicts with one of the new references
4098 * of the current inode. Later, when processing another new reference of our
4099 * inode, we might need to orphanize another inode, but the path we have in the
4100 * reference reflects the pre-orphanization name of the directory we previously
4101 * orphanized. For example:
4103 * parent snapshot looks like:
4106 * |----- f1 (ino 257)
4107 * |----- f2 (ino 258)
4108 * |----- d1/ (ino 259)
4109 * |----- d2/ (ino 260)
4111 * send snapshot looks like:
4114 * |----- d1 (ino 258)
4115 * |----- f2/ (ino 259)
4116 * |----- f2_link/ (ino 260)
4117 * | |----- f1 (ino 257)
4119 * |----- d2 (ino 258)
4121 * When processing inode 257 we compute the name for inode 259 as "d1", and we
4122 * cache it in the name cache. Later when we start processing inode 258, when
4123 * collecting all its new references we set a full path of "d1/d2" for its new
4124 * reference with name "d2". When we start processing the new references we
4125 * start by processing the new reference with name "d1", and this results in
4126 * orphanizing inode 259, since its old reference causes a conflict. Then we
 * move on to the next new reference, with name "d2", and we find out we must
4128 * orphanize inode 260, as its old reference conflicts with ours - but for the
4129 * orphanization we use a source path corresponding to the path we stored in the
4130 * new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the
4131 * receiver fail since the path component "d1/" no longer exists, it was renamed
4132 * to "o259-6-0/" when processing the previous new reference. So in this case we
4133 * must recompute the path in the new reference and use it for the new
4134 * orphanization operation.
4136 static int refresh_ref_path(struct send_ctx
*sctx
, struct recorded_ref
*ref
)
4141 name
= kmemdup(ref
->name
, ref
->name_len
, GFP_KERNEL
);
4145 fs_path_reset(ref
->full_path
);
4146 ret
= get_cur_path(sctx
, ref
->dir
, ref
->dir_gen
, ref
->full_path
);
4150 ret
= fs_path_add(ref
->full_path
, name
, ref
->name_len
);
4154 /* Update the reference's base name pointer. */
4155 set_ref_path(ref
, ref
->full_path
);
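/*
 * Usage sketch, matching the pattern in process_recorded_refs() below: the
 * path is refreshed whenever an earlier reference of the current inode already
 * caused a directory to be orphanized, so that the next orphanization uses a
 * source path with post-orphanization names:
 *
 *	if (orphanized_dir) {
 *		ret = refresh_ref_path(sctx, cur);
 *		if (ret < 0)
 *			goto out;
 *	}
 *	ret = orphanize_inode(sctx, ow_inode, ow_gen, cur->full_path);
 */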
4162 * This does all the move/link/unlink/rmdir magic.
4164 static int process_recorded_refs(struct send_ctx
*sctx
, int *pending_move
)
4166 struct btrfs_fs_info
*fs_info
= sctx
->send_root
->fs_info
;
4168 struct recorded_ref
*cur
;
4169 struct recorded_ref
*cur2
;
4170 LIST_HEAD(check_dirs
);
4171 struct fs_path
*valid_path
= NULL
;
4175 int did_overwrite
= 0;
4177 u64 last_dir_ino_rm
= 0;
4178 bool can_rename
= true;
4179 bool orphanized_dir
= false;
4180 bool orphanized_ancestor
= false;
4182 btrfs_debug(fs_info
, "process_recorded_refs %llu", sctx
->cur_ino
);
4185 * This should never happen as the root dir always has the same ref
4186 * which is always '..'
4188 if (unlikely(sctx
->cur_ino
<= BTRFS_FIRST_FREE_OBJECTID
)) {
4190 "send: unexpected inode %llu in process_recorded_refs()",
4196 valid_path
= fs_path_alloc();
4203 * First, check if the first ref of the current inode was overwritten
4204 * before. If yes, we know that the current inode was already orphanized
4205 * and thus use the orphan name. If not, we can use get_cur_path to
 * get the path of the first ref as it would look while receiving at
4207 * this point in time.
4208 * New inodes are always orphan at the beginning, so force to use the
4209 * orphan name in this case.
4210 * The first ref is stored in valid_path and will be updated if it
4211 * gets moved around.
4213 if (!sctx
->cur_inode_new
) {
4214 ret
= did_overwrite_first_ref(sctx
, sctx
->cur_ino
,
4215 sctx
->cur_inode_gen
);
4221 if (sctx
->cur_inode_new
|| did_overwrite
) {
4222 ret
= gen_unique_name(sctx
, sctx
->cur_ino
,
4223 sctx
->cur_inode_gen
, valid_path
);
4228 ret
= get_cur_path(sctx
, sctx
->cur_ino
, sctx
->cur_inode_gen
,
4235 * Before doing any rename and link operations, do a first pass on the
4236 * new references to orphanize any unprocessed inodes that may have a
4237 * reference that conflicts with one of the new references of the current
4238 * inode. This needs to happen first because a new reference may conflict
4239 * with the old reference of a parent directory, so we must make sure
 * that the paths used for link and rename commands don't use an
 * orphanized name when an ancestor was not yet orphanized.
4248 * |----- testdir/ (ino 259)
4249 * | |----- a (ino 257)
4251 * |----- b (ino 258)
4256 * |----- testdir_2/ (ino 259)
4257 * | |----- a (ino 260)
4259 * |----- testdir (ino 257)
4260 * |----- b (ino 257)
4261 * |----- b2 (ino 258)
4263 * Processing the new reference for inode 257 with name "b" may happen
4264 * before processing the new reference with name "testdir". If so, we
4265 * must make sure that by the time we send a link command to create the
4266 * hard link "b", inode 259 was already orphanized, since the generated
4267 * path in "valid_path" already contains the orphanized name for 259.
4268 * We are processing inode 257, so only later when processing 259 we do
 * the rename operation to change its temporary (orphanized) name to
 * its final name ("testdir_2").
4272 list_for_each_entry(cur
, &sctx
->new_refs
, list
) {
4273 ret
= get_cur_inode_state(sctx
, cur
->dir
, cur
->dir_gen
, NULL
, NULL
);
4276 if (ret
== inode_state_will_create
)
		/*
		 * Check if this new ref would overwrite the first ref of another
		 * unprocessed inode. If yes, orphanize the overwritten inode.
		 * If we find an overwritten ref that is not the first ref,
		 * simply unlink it.
		 */
4285 ret
= will_overwrite_ref(sctx
, cur
->dir
, cur
->dir_gen
,
4286 cur
->name
, cur
->name_len
,
4287 &ow_inode
, &ow_gen
, &ow_mode
);
4291 ret
= is_first_ref(sctx
->parent_root
,
4292 ow_inode
, cur
->dir
, cur
->name
,
4297 struct name_cache_entry
*nce
;
4298 struct waiting_dir_move
*wdm
;
4300 if (orphanized_dir
) {
4301 ret
= refresh_ref_path(sctx
, cur
);
4306 ret
= orphanize_inode(sctx
, ow_inode
, ow_gen
,
4310 if (S_ISDIR(ow_mode
))
4311 orphanized_dir
= true;
				/*
				 * If ow_inode has its rename operation delayed
				 * make sure that its orphanized name is used in
				 * the source path when performing its rename
				 * operation.
				 */
4319 wdm
= get_waiting_dir_move(sctx
, ow_inode
);
4321 wdm
->orphanized
= true;
				/*
				 * Make sure we clear our orphanized inode's
				 * name from the name cache. This is because the
				 * inode ow_inode might be an ancestor of some
				 * other inode that will be orphanized as well
				 * later and has an inode number greater than
				 * sctx->send_progress. We need to prevent
				 * future name lookups from using the old name
				 * and instead get the orphan name.
				 */
4333 nce
= name_cache_search(sctx
, ow_inode
, ow_gen
);
4335 btrfs_lru_cache_remove(&sctx
->name_cache
,
4339 * ow_inode might currently be an ancestor of
4340 * cur_ino, therefore compute valid_path (the
4341 * current path of cur_ino) again because it
4342 * might contain the pre-orphanization name of
4343 * ow_inode, which is no longer valid.
4345 ret
= is_ancestor(sctx
->parent_root
,
4347 sctx
->cur_ino
, NULL
);
4349 orphanized_ancestor
= true;
4350 fs_path_reset(valid_path
);
4351 ret
= get_cur_path(sctx
, sctx
->cur_ino
,
4352 sctx
->cur_inode_gen
,
4359 * If we previously orphanized a directory that
4360 * collided with a new reference that we already
4361 * processed, recompute the current path because
4362 * that directory may be part of the path.
4364 if (orphanized_dir
) {
4365 ret
= refresh_ref_path(sctx
, cur
);
4369 ret
= send_unlink(sctx
, cur
->full_path
);
4377 list_for_each_entry(cur
, &sctx
->new_refs
, list
) {
		/*
		 * We may have refs where the parent directory does not exist
		 * yet. This happens if the parent directory's inum is higher
		 * than the current inum. To handle this case, we create the
		 * parent directory out of order. But we need to check if this
		 * already happened before due to other refs in the same dir.
		 */
4385 ret
= get_cur_inode_state(sctx
, cur
->dir
, cur
->dir_gen
, NULL
, NULL
);
4388 if (ret
== inode_state_will_create
) {
4391 * First check if any of the current inodes refs did
4392 * already create the dir.
4394 list_for_each_entry(cur2
, &sctx
->new_refs
, list
) {
4397 if (cur2
->dir
== cur
->dir
) {
4404 * If that did not happen, check if a previous inode
4405 * did already create the dir.
4408 ret
= did_create_dir(sctx
, cur
->dir
);
4412 ret
= send_create_inode(sctx
, cur
->dir
);
4415 cache_dir_created(sctx
, cur
->dir
);
4419 if (S_ISDIR(sctx
->cur_inode_mode
) && sctx
->parent_root
) {
4420 ret
= wait_for_dest_dir_move(sctx
, cur
, is_orphan
);
4429 if (S_ISDIR(sctx
->cur_inode_mode
) && sctx
->parent_root
&&
4431 ret
= wait_for_parent_move(sctx
, cur
, is_orphan
);
4441 * link/move the ref to the new place. If we have an orphan
4442 * inode, move it and update valid_path. If not, link or move
4443 * it depending on the inode mode.
4445 if (is_orphan
&& can_rename
) {
4446 ret
= send_rename(sctx
, valid_path
, cur
->full_path
);
4450 ret
= fs_path_copy(valid_path
, cur
->full_path
);
4453 } else if (can_rename
) {
4454 if (S_ISDIR(sctx
->cur_inode_mode
)) {
4456 * Dirs can't be linked, so move it. For moved
4457 * dirs, we always have one new and one deleted
4458 * ref. The deleted ref is ignored later.
4460 ret
= send_rename(sctx
, valid_path
,
4463 ret
= fs_path_copy(valid_path
,
				/*
				 * We might have previously orphanized an inode
				 * which is an ancestor of our current inode,
				 * so our reference's full path, which was
				 * computed before any such orphanizations, must
				 * be updated.
				 */
4475 if (orphanized_dir
) {
4476 ret
= update_ref_path(sctx
, cur
);
4480 ret
= send_link(sctx
, cur
->full_path
,
4486 ret
= dup_ref(cur
, &check_dirs
);
4491 if (S_ISDIR(sctx
->cur_inode_mode
) && sctx
->cur_inode_deleted
) {
4493 * Check if we can already rmdir the directory. If not,
4494 * orphanize it. For every dir item inside that gets deleted
4495 * later, we do this check again and rmdir it then if possible.
4496 * See the use of check_dirs for more details.
4498 ret
= can_rmdir(sctx
, sctx
->cur_ino
, sctx
->cur_inode_gen
);
4502 ret
= send_rmdir(sctx
, valid_path
);
4505 } else if (!is_orphan
) {
4506 ret
= orphanize_inode(sctx
, sctx
->cur_ino
,
4507 sctx
->cur_inode_gen
, valid_path
);
4513 list_for_each_entry(cur
, &sctx
->deleted_refs
, list
) {
4514 ret
= dup_ref(cur
, &check_dirs
);
4518 } else if (S_ISDIR(sctx
->cur_inode_mode
) &&
4519 !list_empty(&sctx
->deleted_refs
)) {
4521 * We have a moved dir. Add the old parent to check_dirs
4523 cur
= list_entry(sctx
->deleted_refs
.next
, struct recorded_ref
,
4525 ret
= dup_ref(cur
, &check_dirs
);
4528 } else if (!S_ISDIR(sctx
->cur_inode_mode
)) {
		/*
		 * We have a non dir inode. Go through all deleted refs and
		 * unlink them if they were not already overwritten by other
		 * inodes.
		 */
4534 list_for_each_entry(cur
, &sctx
->deleted_refs
, list
) {
4535 ret
= did_overwrite_ref(sctx
, cur
->dir
, cur
->dir_gen
,
4536 sctx
->cur_ino
, sctx
->cur_inode_gen
,
4537 cur
->name
, cur
->name_len
);
			/*
			 * If we orphanized any ancestor before, we need
			 * to recompute the full path for deleted names,
			 * since any such path was computed before we
			 * processed any references and orphanized any
			 * ancestor inode.
			 */
4548 if (orphanized_ancestor
) {
4549 ret
= update_ref_path(sctx
, cur
);
4553 ret
= send_unlink(sctx
, cur
->full_path
);
4557 ret
= dup_ref(cur
, &check_dirs
);
		/*
		 * If the inode is still orphan, unlink the orphan. This may
		 * happen when a previous inode did overwrite the first ref
		 * of this inode and no new refs were added for the current
		 * inode. Unlinking does not mean that the inode is deleted in
		 * all cases. There may still be links to this inode in other
		 * places.
		 */
4570 ret
= send_unlink(sctx
, valid_path
);
4577 * We did collect all parent dirs where cur_inode was once located. We
4578 * now go through all these dirs and check if they are pending for
4579 * deletion and if it's finally possible to perform the rmdir now.
4580 * We also update the inode stats of the parent dirs here.
4582 list_for_each_entry(cur
, &check_dirs
, list
) {
4584 * In case we had refs into dirs that were not processed yet,
4585 * we don't need to do the utime and rmdir logic for these dirs.
4586 * The dir will be processed later.
4588 if (cur
->dir
> sctx
->cur_ino
)
4591 ret
= get_cur_inode_state(sctx
, cur
->dir
, cur
->dir_gen
, NULL
, NULL
);
4595 if (ret
== inode_state_did_create
||
4596 ret
== inode_state_no_change
) {
4597 ret
= cache_dir_utimes(sctx
, cur
->dir
, cur
->dir_gen
);
4600 } else if (ret
== inode_state_did_delete
&&
4601 cur
->dir
!= last_dir_ino_rm
) {
4602 ret
= can_rmdir(sctx
, cur
->dir
, cur
->dir_gen
);
4606 ret
= get_cur_path(sctx
, cur
->dir
,
4607 cur
->dir_gen
, valid_path
);
4610 ret
= send_rmdir(sctx
, valid_path
);
4613 last_dir_ino_rm
= cur
->dir
;
4621 __free_recorded_refs(&check_dirs
);
4622 free_recorded_refs(sctx
);
4623 fs_path_free(valid_path
);
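/*
 * A rough outline of the phases of process_recorded_refs() above (descriptive
 * only, the names are not identifiers from this file):
 *
 *	compute valid_path: orphan name for new/overwritten inodes, else get_cur_path();
 *	pass 1 over new_refs: orphanize unprocessed inodes whose first ref conflicts;
 *	pass 2 over new_refs: create missing parent dirs, then rename (orphan case) or
 *			      link/rename, honouring delayed directory moves;
 *	deleted dir:	      rmdir now if possible, otherwise orphanize it;
 *	other deleted refs:   unlink unless the name was already overwritten;
 *	check_dirs:	      cache utimes updates and rmdir parents that became empty.
 */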
4627 static int rbtree_ref_comp(const void *k
, const struct rb_node
*node
)
4629 const struct recorded_ref
*data
= k
;
4630 const struct recorded_ref
*ref
= rb_entry(node
, struct recorded_ref
, node
);
4633 if (data
->dir
> ref
->dir
)
4635 if (data
->dir
< ref
->dir
)
4637 if (data
->dir_gen
> ref
->dir_gen
)
4639 if (data
->dir_gen
< ref
->dir_gen
)
4641 if (data
->name_len
> ref
->name_len
)
4643 if (data
->name_len
< ref
->name_len
)
4645 result
= strcmp(data
->name
, ref
->name
);
static bool rbtree_ref_less(struct rb_node *node, const struct rb_node *parent)
{
	const struct recorded_ref *entry = rb_entry(node, struct recorded_ref, node);

	return rbtree_ref_comp(entry, parent) < 0;
}
4660 static int record_ref_in_tree(struct rb_root
*root
, struct list_head
*refs
,
4661 struct fs_path
*name
, u64 dir
, u64 dir_gen
,
4662 struct send_ctx
*sctx
)
4665 struct fs_path
*path
= NULL
;
4666 struct recorded_ref
*ref
= NULL
;
4668 path
= fs_path_alloc();
4674 ref
= recorded_ref_alloc();
4680 ret
= get_cur_path(sctx
, dir
, dir_gen
, path
);
4683 ret
= fs_path_add_path(path
, name
);
4688 ref
->dir_gen
= dir_gen
;
4689 set_ref_path(ref
, path
);
4690 list_add_tail(&ref
->list
, refs
);
4691 rb_add(&ref
->node
, root
, rbtree_ref_less
);
4695 if (path
&& (!ref
|| !ref
->full_path
))
4697 recorded_ref_free(ref
);
4702 static int record_new_ref_if_needed(u64 dir
, struct fs_path
*name
, void *ctx
)
4705 struct send_ctx
*sctx
= ctx
;
4706 struct rb_node
*node
= NULL
;
4707 struct recorded_ref data
;
4708 struct recorded_ref
*ref
;
4711 ret
= get_inode_gen(sctx
->send_root
, dir
, &dir_gen
);
4716 data
.dir_gen
= dir_gen
;
4717 set_ref_path(&data
, name
);
4718 node
= rb_find(&data
, &sctx
->rbtree_deleted_refs
, rbtree_ref_comp
);
4720 ref
= rb_entry(node
, struct recorded_ref
, node
);
4721 recorded_ref_free(ref
);
4723 ret
= record_ref_in_tree(&sctx
->rbtree_new_refs
,
4724 &sctx
->new_refs
, name
, dir
, dir_gen
,
4731 static int record_deleted_ref_if_needed(u64 dir
, struct fs_path
*name
, void *ctx
)
4734 struct send_ctx
*sctx
= ctx
;
4735 struct rb_node
*node
= NULL
;
4736 struct recorded_ref data
;
4737 struct recorded_ref
*ref
;
4740 ret
= get_inode_gen(sctx
->parent_root
, dir
, &dir_gen
);
4745 data
.dir_gen
= dir_gen
;
4746 set_ref_path(&data
, name
);
4747 node
= rb_find(&data
, &sctx
->rbtree_new_refs
, rbtree_ref_comp
);
4749 ref
= rb_entry(node
, struct recorded_ref
, node
);
4750 recorded_ref_free(ref
);
4752 ret
= record_ref_in_tree(&sctx
->rbtree_deleted_refs
,
4753 &sctx
->deleted_refs
, name
, dir
,
4760 static int record_new_ref(struct send_ctx
*sctx
)
4764 ret
= iterate_inode_ref(sctx
->send_root
, sctx
->left_path
,
4765 sctx
->cmp_key
, 0, record_new_ref_if_needed
, sctx
);
4774 static int record_deleted_ref(struct send_ctx
*sctx
)
4778 ret
= iterate_inode_ref(sctx
->parent_root
, sctx
->right_path
,
4779 sctx
->cmp_key
, 0, record_deleted_ref_if_needed
,
4789 static int record_changed_ref(struct send_ctx
*sctx
)
4793 ret
= iterate_inode_ref(sctx
->send_root
, sctx
->left_path
,
4794 sctx
->cmp_key
, 0, record_new_ref_if_needed
, sctx
);
4797 ret
= iterate_inode_ref(sctx
->parent_root
, sctx
->right_path
,
4798 sctx
->cmp_key
, 0, record_deleted_ref_if_needed
, sctx
);
4808 * Record and process all refs at once. Needed when an inode changes the
4809 * generation number, which means that it was deleted and recreated.
4811 static int process_all_refs(struct send_ctx
*sctx
,
4812 enum btrfs_compare_tree_result cmd
)
4816 struct btrfs_root
*root
;
4817 struct btrfs_path
*path
;
4818 struct btrfs_key key
;
4819 struct btrfs_key found_key
;
4820 iterate_inode_ref_t cb
;
4821 int pending_move
= 0;
4823 path
= alloc_path_for_send();
4827 if (cmd
== BTRFS_COMPARE_TREE_NEW
) {
4828 root
= sctx
->send_root
;
4829 cb
= record_new_ref_if_needed
;
4830 } else if (cmd
== BTRFS_COMPARE_TREE_DELETED
) {
4831 root
= sctx
->parent_root
;
4832 cb
= record_deleted_ref_if_needed
;
4834 btrfs_err(sctx
->send_root
->fs_info
,
4835 "Wrong command %d in process_all_refs", cmd
);
4840 key
.objectid
= sctx
->cmp_key
->objectid
;
4841 key
.type
= BTRFS_INODE_REF_KEY
;
4843 btrfs_for_each_slot(root
, &key
, &found_key
, path
, iter_ret
) {
4844 if (found_key
.objectid
!= key
.objectid
||
4845 (found_key
.type
!= BTRFS_INODE_REF_KEY
&&
4846 found_key
.type
!= BTRFS_INODE_EXTREF_KEY
))
4849 ret
= iterate_inode_ref(root
, path
, &found_key
, 0, cb
, sctx
);
4853 /* Catch error found during iteration */
4858 btrfs_release_path(path
);
4861 * We don't actually care about pending_move as we are simply
4862 * re-creating this inode and will be rename'ing it into place once we
4863 * rename the parent directory.
4865 ret
= process_recorded_refs(sctx
, &pending_move
);
4867 btrfs_free_path(path
);
4871 static int send_set_xattr(struct send_ctx
*sctx
,
4872 struct fs_path
*path
,
4873 const char *name
, int name_len
,
4874 const char *data
, int data_len
)
4878 ret
= begin_cmd(sctx
, BTRFS_SEND_C_SET_XATTR
);
4882 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH
, path
);
4883 TLV_PUT_STRING(sctx
, BTRFS_SEND_A_XATTR_NAME
, name
, name_len
);
4884 TLV_PUT(sctx
, BTRFS_SEND_A_XATTR_DATA
, data
, data_len
);
4886 ret
= send_cmd(sctx
);
4893 static int send_remove_xattr(struct send_ctx
*sctx
,
4894 struct fs_path
*path
,
4895 const char *name
, int name_len
)
4899 ret
= begin_cmd(sctx
, BTRFS_SEND_C_REMOVE_XATTR
);
4903 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH
, path
);
4904 TLV_PUT_STRING(sctx
, BTRFS_SEND_A_XATTR_NAME
, name
, name_len
);
4906 ret
= send_cmd(sctx
);
4913 static int __process_new_xattr(int num
, struct btrfs_key
*di_key
,
4914 const char *name
, int name_len
, const char *data
,
4915 int data_len
, void *ctx
)
4918 struct send_ctx
*sctx
= ctx
;
4920 struct posix_acl_xattr_header dummy_acl
;
4922 /* Capabilities are emitted by finish_inode_if_needed */
4923 if (!strncmp(name
, XATTR_NAME_CAPS
, name_len
))
4926 p
= fs_path_alloc();
	/*
	 * This hack is needed because empty acls are stored as zero byte
	 * data in xattrs. The problem with that is that receiving these zero
	 * byte acls will fail later. To fix this, we send a dummy acl list
	 * that only contains the version number and no entries.
	 */
4936 if (!strncmp(name
, XATTR_NAME_POSIX_ACL_ACCESS
, name_len
) ||
4937 !strncmp(name
, XATTR_NAME_POSIX_ACL_DEFAULT
, name_len
)) {
4938 if (data_len
== 0) {
4939 dummy_acl
.a_version
=
4940 cpu_to_le32(POSIX_ACL_XATTR_VERSION
);
4941 data
= (char *)&dummy_acl
;
4942 data_len
= sizeof(dummy_acl
);
4946 ret
= get_cur_path(sctx
, sctx
->cur_ino
, sctx
->cur_inode_gen
, p
);
4950 ret
= send_set_xattr(sctx
, p
, name
, name_len
, data
, data_len
);
4957 static int __process_deleted_xattr(int num
, struct btrfs_key
*di_key
,
4958 const char *name
, int name_len
,
4959 const char *data
, int data_len
, void *ctx
)
4962 struct send_ctx
*sctx
= ctx
;
4965 p
= fs_path_alloc();
4969 ret
= get_cur_path(sctx
, sctx
->cur_ino
, sctx
->cur_inode_gen
, p
);
4973 ret
= send_remove_xattr(sctx
, p
, name
, name_len
);
4980 static int process_new_xattr(struct send_ctx
*sctx
)
4984 ret
= iterate_dir_item(sctx
->send_root
, sctx
->left_path
,
4985 __process_new_xattr
, sctx
);
static int process_deleted_xattr(struct send_ctx *sctx)
{
	return iterate_dir_item(sctx->parent_root, sctx->right_path,
				__process_deleted_xattr, sctx);
}
4996 struct find_xattr_ctx
{
5004 static int __find_xattr(int num
, struct btrfs_key
*di_key
, const char *name
,
5005 int name_len
, const char *data
, int data_len
, void *vctx
)
5007 struct find_xattr_ctx
*ctx
= vctx
;
5009 if (name_len
== ctx
->name_len
&&
5010 strncmp(name
, ctx
->name
, name_len
) == 0) {
5011 ctx
->found_idx
= num
;
5012 ctx
->found_data_len
= data_len
;
5013 ctx
->found_data
= kmemdup(data
, data_len
, GFP_KERNEL
);
5014 if (!ctx
->found_data
)
5021 static int find_xattr(struct btrfs_root
*root
,
5022 struct btrfs_path
*path
,
5023 struct btrfs_key
*key
,
5024 const char *name
, int name_len
,
5025 char **data
, int *data_len
)
5028 struct find_xattr_ctx ctx
;
5031 ctx
.name_len
= name_len
;
5033 ctx
.found_data
= NULL
;
5034 ctx
.found_data_len
= 0;
5036 ret
= iterate_dir_item(root
, path
, __find_xattr
, &ctx
);
5040 if (ctx
.found_idx
== -1)
5043 *data
= ctx
.found_data
;
5044 *data_len
= ctx
.found_data_len
;
5046 kfree(ctx
.found_data
);
5048 return ctx
.found_idx
;
5052 static int __process_changed_new_xattr(int num
, struct btrfs_key
*di_key
,
5053 const char *name
, int name_len
,
5054 const char *data
, int data_len
,
5058 struct send_ctx
*sctx
= ctx
;
5059 char *found_data
= NULL
;
5060 int found_data_len
= 0;
5062 ret
= find_xattr(sctx
->parent_root
, sctx
->right_path
,
5063 sctx
->cmp_key
, name
, name_len
, &found_data
,
5065 if (ret
== -ENOENT
) {
5066 ret
= __process_new_xattr(num
, di_key
, name
, name_len
, data
,
5068 } else if (ret
>= 0) {
5069 if (data_len
!= found_data_len
||
5070 memcmp(data
, found_data
, data_len
)) {
5071 ret
= __process_new_xattr(num
, di_key
, name
, name_len
,
5072 data
, data_len
, ctx
);
5082 static int __process_changed_deleted_xattr(int num
, struct btrfs_key
*di_key
,
5083 const char *name
, int name_len
,
5084 const char *data
, int data_len
,
5088 struct send_ctx
*sctx
= ctx
;
5090 ret
= find_xattr(sctx
->send_root
, sctx
->left_path
, sctx
->cmp_key
,
5091 name
, name_len
, NULL
, NULL
);
5093 ret
= __process_deleted_xattr(num
, di_key
, name
, name_len
, data
,
5101 static int process_changed_xattr(struct send_ctx
*sctx
)
5105 ret
= iterate_dir_item(sctx
->send_root
, sctx
->left_path
,
5106 __process_changed_new_xattr
, sctx
);
5109 ret
= iterate_dir_item(sctx
->parent_root
, sctx
->right_path
,
5110 __process_changed_deleted_xattr
, sctx
);
5116 static int process_all_new_xattrs(struct send_ctx
*sctx
)
5120 struct btrfs_root
*root
;
5121 struct btrfs_path
*path
;
5122 struct btrfs_key key
;
5123 struct btrfs_key found_key
;
5125 path
= alloc_path_for_send();
5129 root
= sctx
->send_root
;
5131 key
.objectid
= sctx
->cmp_key
->objectid
;
5132 key
.type
= BTRFS_XATTR_ITEM_KEY
;
5134 btrfs_for_each_slot(root
, &key
, &found_key
, path
, iter_ret
) {
5135 if (found_key
.objectid
!= key
.objectid
||
5136 found_key
.type
!= key
.type
) {
5141 ret
= iterate_dir_item(root
, path
, __process_new_xattr
, sctx
);
5145 /* Catch error found during iteration */
5149 btrfs_free_path(path
);
5153 static int send_verity(struct send_ctx
*sctx
, struct fs_path
*path
,
5154 struct fsverity_descriptor
*desc
)
5158 ret
= begin_cmd(sctx
, BTRFS_SEND_C_ENABLE_VERITY
);
5162 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH
, path
);
5163 TLV_PUT_U8(sctx
, BTRFS_SEND_A_VERITY_ALGORITHM
,
5164 le8_to_cpu(desc
->hash_algorithm
));
5165 TLV_PUT_U32(sctx
, BTRFS_SEND_A_VERITY_BLOCK_SIZE
,
5166 1U << le8_to_cpu(desc
->log_blocksize
));
5167 TLV_PUT(sctx
, BTRFS_SEND_A_VERITY_SALT_DATA
, desc
->salt
,
5168 le8_to_cpu(desc
->salt_size
));
5169 TLV_PUT(sctx
, BTRFS_SEND_A_VERITY_SIG_DATA
, desc
->signature
,
5170 le32_to_cpu(desc
->sig_size
));
5172 ret
= send_cmd(sctx
);
5179 static int process_verity(struct send_ctx
*sctx
)
5182 struct inode
*inode
;
5185 inode
= btrfs_iget(sctx
->cur_ino
, sctx
->send_root
);
5187 return PTR_ERR(inode
);
5189 ret
= btrfs_get_verity_descriptor(inode
, NULL
, 0);
5193 if (ret
> FS_VERITY_MAX_DESCRIPTOR_SIZE
) {
5197 if (!sctx
->verity_descriptor
) {
5198 sctx
->verity_descriptor
= kvmalloc(FS_VERITY_MAX_DESCRIPTOR_SIZE
,
5200 if (!sctx
->verity_descriptor
) {
5206 ret
= btrfs_get_verity_descriptor(inode
, sctx
->verity_descriptor
, ret
);
5210 p
= fs_path_alloc();
5215 ret
= get_cur_path(sctx
, sctx
->cur_ino
, sctx
->cur_inode_gen
, p
);
5219 ret
= send_verity(sctx
, p
, sctx
->verity_descriptor
);
static inline u64 max_send_read_size(const struct send_ctx *sctx)
{
	return sctx->send_max_size - SZ_16K;
}
5235 static int put_data_header(struct send_ctx
*sctx
, u32 len
)
5237 if (WARN_ON_ONCE(sctx
->put_data
))
5239 sctx
->put_data
= true;
5240 if (sctx
->proto
>= 2) {
		/*
		 * Since v2, the data attribute header doesn't include a length:
		 * the data implicitly extends to the end of the command.
		 */
5245 if (sctx
->send_max_size
- sctx
->send_size
< sizeof(__le16
) + len
)
5247 put_unaligned_le16(BTRFS_SEND_A_DATA
, sctx
->send_buf
+ sctx
->send_size
);
5248 sctx
->send_size
+= sizeof(__le16
);
5250 struct btrfs_tlv_header
*hdr
;
5252 if (sctx
->send_max_size
- sctx
->send_size
< sizeof(*hdr
) + len
)
5254 hdr
= (struct btrfs_tlv_header
*)(sctx
->send_buf
+ sctx
->send_size
);
5255 put_unaligned_le16(BTRFS_SEND_A_DATA
, &hdr
->tlv_type
);
5256 put_unaligned_le16(len
, &hdr
->tlv_len
);
5257 sctx
->send_size
+= sizeof(*hdr
);
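/*
 * Layout summary of the data attribute emitted above (illustrative, derived
 * from put_data_header()):
 *
 *	proto v1:  [__le16 tlv_type = BTRFS_SEND_A_DATA][__le16 tlv_len][len bytes]
 *	proto v2:  [__le16 attr     = BTRFS_SEND_A_DATA][data up to the end of the command]
 *
 * With v2 the data is no longer limited by a 16 bit length field and can fill
 * the remainder of the send buffer.
 */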
static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
{
	struct btrfs_root *root = sctx->send_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct folio *folio;
	pgoff_t index = offset >> PAGE_SHIFT;
	pgoff_t last_index;
	unsigned pg_offset = offset_in_page(offset);
	struct address_space *mapping = sctx->cur_inode->i_mapping;
	int ret;

	ret = put_data_header(sctx, len);
	if (ret)
		return ret;

	last_index = (offset + len - 1) >> PAGE_SHIFT;

	while (index <= last_index) {
		unsigned cur_len = min_t(unsigned, len,
					 PAGE_SIZE - pg_offset);

		folio = filemap_lock_folio(mapping, index);
		if (IS_ERR(folio)) {
			page_cache_sync_readahead(mapping,
						  &sctx->ra, NULL, index,
						  last_index + 1 - index);

			folio = filemap_grab_folio(mapping, index);
			if (IS_ERR(folio)) {
				ret = PTR_ERR(folio);
				break;
			}
		}

		WARN_ON(folio_order(folio));

		if (folio_test_readahead(folio))
			page_cache_async_readahead(mapping, &sctx->ra, NULL, folio,
						   last_index + 1 - index);

		if (!folio_test_uptodate(folio)) {
			btrfs_read_folio(NULL, folio);
			folio_lock(folio);
			if (!folio_test_uptodate(folio)) {
				folio_unlock(folio);
				btrfs_err(fs_info,
			"send: IO error at offset %llu for inode %llu root %llu",
					  folio_pos(folio), sctx->cur_ino,
					  btrfs_root_id(sctx->send_root));
				folio_put(folio);
				ret = -EIO;
				break;
			}
		}

		memcpy_from_folio(sctx->send_buf + sctx->send_size, folio,
				  pg_offset, cur_len);
		folio_unlock(folio);
		folio_put(folio);
		index++;
		pg_offset = 0;
		len -= cur_len;
		sctx->send_size += cur_len;
	}

	return ret;
}
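
/*
 * put_file_data() fills the send buffer one page sized chunk at a time: it
 * first tries a cached folio, falls back to triggering readahead and reading
 * the folio synchronously, and only then copies the bytes right after the
 * data attribute header written by put_data_header().
 */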
/*
 * Read some bytes from the current inode/file and send a write command to
 * user space.
 */
static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
{
	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
	int ret = 0;
	struct fs_path *p;

	p = fs_path_alloc();
	if (!p)
		return -ENOMEM;

	btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len);

	ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
	if (ret < 0)
		goto out;

	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
	if (ret < 0)
		goto out;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
	ret = put_file_data(sctx, offset, len);
	if (ret < 0)
		goto out;

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	fs_path_free(p);
	return ret;
}
/*
 * Send a clone command to user space.
 */
static int send_clone(struct send_ctx *sctx,
		      u64 offset, u32 len,
		      struct clone_root *clone_root)
{
	int ret = 0;
	struct fs_path *p;
	u64 gen;

	btrfs_debug(sctx->send_root->fs_info,
		    "send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu",
		    offset, len, btrfs_root_id(clone_root->root),
		    clone_root->ino, clone_root->offset);

	p = fs_path_alloc();
	if (!p)
		return -ENOMEM;

	ret = begin_cmd(sctx, BTRFS_SEND_C_CLONE);
	if (ret < 0)
		goto out;

	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
	if (ret < 0)
		goto out;

	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);

	if (clone_root->root == sctx->send_root) {
		ret = get_inode_gen(sctx->send_root, clone_root->ino, &gen);
		if (ret < 0)
			goto out;
		ret = get_cur_path(sctx, clone_root->ino, gen, p);
	} else {
		ret = get_inode_path(clone_root->root, clone_root->ino, p);
	}
	if (ret < 0)
		goto out;

	/*
	 * If the parent we're using has a received_uuid set then use that as
	 * our clone source as that is what we will look for when doing a
	 * receive.
	 *
	 * This covers the case that we create a snapshot off of a received
	 * subvolume and then use that as the parent and try to receive on a
	 * different host.
	 */
	if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid))
		TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
			     clone_root->root->root_item.received_uuid);
	else
		TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
			     clone_root->root->root_item.uuid);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
		    btrfs_root_ctransid(&clone_root->root->root_item));
	TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
		    clone_root->offset);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	fs_path_free(p);
	return ret;
}
/*
 * Send an update extent command to user space.
 */
static int send_update_extent(struct send_ctx *sctx,
			      u64 offset, u32 len)
{
	int ret = 0;
	struct fs_path *p;

	p = fs_path_alloc();
	if (!p)
		return -ENOMEM;

	ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT);
	if (ret < 0)
		goto out;

	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
	if (ret < 0)
		goto out;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	fs_path_free(p);
	return ret;
}
static int send_hole(struct send_ctx *sctx, u64 end)
{
	struct fs_path *p = NULL;
	u64 read_size = max_send_read_size(sctx);
	u64 offset = sctx->cur_inode_last_extent;
	int ret = 0;

	/*
	 * A hole that starts at EOF or beyond it. Since we do not yet support
	 * fallocate (for extent preallocation and hole punching), sending a
	 * write of zeroes starting at EOF or beyond would later require issuing
	 * a truncate operation which would undo the write and achieve nothing.
	 */
	if (offset >= sctx->cur_inode_size)
		return 0;

	/*
	 * Don't go beyond the inode's i_size due to prealloc extents that start
	 * after the i_size.
	 */
	end = min_t(u64, end, sctx->cur_inode_size);

	if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
		return send_update_extent(sctx, offset, end - offset);

	p = fs_path_alloc();
	if (!p)
		return -ENOMEM;
	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
	if (ret < 0)
		goto tlv_put_failure;
	while (offset < end) {
		u64 len = min(end - offset, read_size);

		ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
		if (ret < 0)
			break;
		TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
		TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
		ret = put_data_header(sctx, len);
		if (ret < 0)
			break;
		memset(sctx->send_buf + sctx->send_size, 0, len);
		sctx->send_size += len;
		ret = send_cmd(sctx);
		if (ret < 0)
			break;
		offset += len;
	}
	sctx->cur_inode_next_write_offset = offset;
tlv_put_failure:
	fs_path_free(p);
	return ret;
}
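
/*
 * Holes are therefore emitted as plain zero filled write commands, split into
 * chunks of at most max_send_read_size() bytes. For example, a 64K command
 * buffer leaves roughly 48K of payload per command, so a 1M hole becomes
 * about 22 separate write commands.
 */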
static int send_encoded_inline_extent(struct send_ctx *sctx,
				      struct btrfs_path *path, u64 offset,
				      u64 len)
{
	struct btrfs_root *root = sctx->send_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode *inode;
	struct fs_path *fspath;
	struct extent_buffer *leaf = path->nodes[0];
	struct btrfs_key key;
	struct btrfs_file_extent_item *ei;
	u64 ram_bytes;
	size_t inline_size;
	int ret;

	inode = btrfs_iget(sctx->cur_ino, root);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	fspath = fs_path_alloc();
	if (!fspath) {
		ret = -ENOMEM;
		goto out;
	}

	ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE);
	if (ret < 0)
		goto out;

	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
	if (ret < 0)
		goto out;

	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
	ram_bytes = btrfs_file_extent_ram_bytes(leaf, ei);
	inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN,
		    min(key.offset + ram_bytes - offset, len));
	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, ram_bytes);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, offset - key.offset);
	ret = btrfs_encoded_io_compression_from_extent(fs_info,
				btrfs_file_extent_compression(leaf, ei));
	if (ret < 0)
		goto out;
	TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);

	ret = put_data_header(sctx, inline_size);
	if (ret < 0)
		goto out;
	read_extent_buffer(leaf, sctx->send_buf + sctx->send_size,
			   btrfs_file_extent_inline_start(ei), inline_size);
	sctx->send_size += inline_size;

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	fs_path_free(fspath);
	iput(inode);
	return ret;
}
static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
			       u64 offset, u64 len)
{
	struct btrfs_root *root = sctx->send_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode *inode;
	struct fs_path *fspath;
	struct extent_buffer *leaf = path->nodes[0];
	struct btrfs_key key;
	struct btrfs_file_extent_item *ei;
	u64 disk_bytenr, disk_num_bytes;
	u32 data_offset;
	struct btrfs_cmd_header *hdr;
	u32 crc;
	int ret;

	inode = btrfs_iget(sctx->cur_ino, root);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	fspath = fs_path_alloc();
	if (!fspath) {
		ret = -ENOMEM;
		goto out;
	}

	ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE);
	if (ret < 0)
		goto out;

	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
	if (ret < 0)
		goto out;

	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
	disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, ei);

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN,
		    min(key.offset + btrfs_file_extent_num_bytes(leaf, ei) - offset,
			len));
	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN,
		    btrfs_file_extent_ram_bytes(leaf, ei));
	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET,
		    offset - key.offset + btrfs_file_extent_offset(leaf, ei));
	ret = btrfs_encoded_io_compression_from_extent(fs_info,
				btrfs_file_extent_compression(leaf, ei));
	if (ret < 0)
		goto out;
	TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);
	TLV_PUT_U32(sctx, BTRFS_SEND_A_ENCRYPTION, 0);

	ret = put_data_header(sctx, disk_num_bytes);
	if (ret < 0)
		goto out;

	/*
	 * We want to do I/O directly into the send buffer, so get the next page
	 * boundary in the send buffer. This means that there may be a gap
	 * between the beginning of the command and the file data.
	 */
	data_offset = PAGE_ALIGN(sctx->send_size);
	if (data_offset > sctx->send_max_size ||
	    sctx->send_max_size - data_offset < disk_num_bytes) {
		ret = -EOVERFLOW;
		goto out;
	}

	/*
	 * Note that send_buf is a mapping of send_buf_pages, so this is really
	 * reading into send_buf.
	 */
	ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode),
						    disk_bytenr, disk_num_bytes,
						    sctx->send_buf_pages +
						    (data_offset >> PAGE_SHIFT),
						    NULL);
	if (ret)
		goto out;

	hdr = (struct btrfs_cmd_header *)sctx->send_buf;
	hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr));
	hdr->crc = 0;
	crc = crc32c(0, sctx->send_buf, sctx->send_size);
	crc = crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes);
	hdr->crc = cpu_to_le32(crc);

	ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
			&sctx->send_off);
	if (!ret) {
		ret = write_buf(sctx->send_filp, sctx->send_buf + data_offset,
				disk_num_bytes, &sctx->send_off);
	}
	sctx->send_size = 0;
	sctx->put_data = false;

tlv_put_failure:
out:
	fs_path_free(fspath);
	iput(inode);
	return ret;
}
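
/*
 * Unlike the other send commands, an encoded write is flushed here in two
 * write_buf() calls: first the command header plus the TLV attributes, then
 * the raw (still compressed) extent data that was read at a page aligned
 * offset inside the send buffer. The CRC in the command header covers both
 * pieces and skips the alignment gap between them.
 */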
static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
			    const u64 offset, const u64 len)
{
	const u64 end = offset + len;
	struct extent_buffer *leaf = path->nodes[0];
	struct btrfs_file_extent_item *ei;
	u64 read_size = max_send_read_size(sctx);
	u64 sent = 0;

	if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
		return send_update_extent(sctx, offset, len);

	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) &&
	    btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) {
		bool is_inline = (btrfs_file_extent_type(leaf, ei) ==
				  BTRFS_FILE_EXTENT_INLINE);

		/*
		 * Send the compressed extent unless the compressed data is
		 * larger than the decompressed data. This can happen if we're
		 * not sending the entire extent, either because it has been
		 * partially overwritten/truncated or because this is a part of
		 * the extent that we couldn't clone in clone_range().
		 */
		if (is_inline &&
		    btrfs_file_extent_inline_item_len(leaf,
						      path->slots[0]) <= len) {
			return send_encoded_inline_extent(sctx, path, offset,
							  len);
		} else if (!is_inline &&
			   btrfs_file_extent_disk_num_bytes(leaf, ei) <= len) {
			return send_encoded_extent(sctx, path, offset, len);
		}
	}

	if (sctx->cur_inode == NULL) {
		struct btrfs_root *root = sctx->send_root;

		sctx->cur_inode = btrfs_iget(sctx->cur_ino, root);
		if (IS_ERR(sctx->cur_inode)) {
			int err = PTR_ERR(sctx->cur_inode);

			sctx->cur_inode = NULL;
			return err;
		}
		memset(&sctx->ra, 0, sizeof(struct file_ra_state));
		file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping);

		/*
		 * It's very likely there are no pages from this inode in the page
		 * cache, so after reading extents and sending their data, we clean
		 * the page cache to avoid trashing the page cache (adding pressure
		 * to the page cache and forcing eviction of other data more useful
		 * for applications).
		 *
		 * We decide if we should clean the page cache simply by checking
		 * if the inode's mapping nrpages is 0 when we first open it, and
		 * not by using something like filemap_range_has_page() before
		 * reading an extent because when we ask the readahead code to
		 * read a given file range, it may (and almost always does) read
		 * pages from beyond that range (see the documentation for
		 * page_cache_sync_readahead()), so it would not be reliable,
		 * because after reading the first extent future calls to
		 * filemap_range_has_page() would return true because the readahead
		 * on the previous extent resulted in reading pages of the current
		 * extent as well.
		 */
		sctx->clean_page_cache = (sctx->cur_inode->i_mapping->nrpages == 0);
		sctx->page_cache_clear_start = round_down(offset, PAGE_SIZE);
	}

	while (sent < len) {
		u64 size = min(len - sent, read_size);
		int ret;

		ret = send_write(sctx, offset + sent, size);
		if (ret < 0)
			return ret;
		sent += size;
	}

	if (sctx->clean_page_cache && PAGE_ALIGNED(end)) {
		/*
		 * Always operate only on ranges that are a multiple of the page
		 * size. This is not only to prevent zeroing parts of a page in
		 * the case of subpage sector size, but also to guarantee we evict
		 * pages, as passing a range that is smaller than page size does
		 * not evict the respective page (only zeroes part of its content).
		 *
		 * Always start from the end offset of the last range cleared.
		 * This is because the readahead code may (and very often does)
		 * read pages beyond the range we request for readahead. So if
		 * we have an extent layout like this:
		 *
		 *            [ extent A ] [ extent B ] [ extent C ]
		 *
		 * When we ask page_cache_sync_readahead() to read extent A, it
		 * may also trigger reads for pages of extent B. If we are doing
		 * an incremental send and extent B has not changed between the
		 * parent and send snapshots, some or all of its pages may end
		 * up being read and placed in the page cache. So when truncating
		 * the page cache we always start from the end offset of the
		 * previously processed extent up to the end of the current
		 * extent.
		 */
		truncate_inode_pages_range(&sctx->cur_inode->i_data,
					   sctx->page_cache_clear_start,
					   end - 1);
		sctx->page_cache_clear_start = end;
	}

	return 0;
}
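
/*
 * Summary of the fast paths above: with BTRFS_SEND_FLAG_NO_FILE_DATA only the
 * extent metadata is emitted, with BTRFS_SEND_FLAG_COMPRESSED a compressed
 * extent whose on-disk size fits in the requested range is forwarded as an
 * encoded write, and everything else falls back to plain write commands built
 * from the page cache in read_size sized chunks.
 */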
/*
 * Search for a capability xattr related to sctx->cur_ino. If the capability is
 * found, call send_set_xattr function to emit it.
 *
 * Return 0 if there isn't a capability, or when the capability was emitted
 * successfully, or < 0 if an error occurred.
 */
static int send_capabilities(struct send_ctx *sctx)
{
	struct fs_path *fspath = NULL;
	struct btrfs_path *path;
	struct btrfs_dir_item *di;
	struct extent_buffer *leaf;
	unsigned long data_ptr;
	char *buf = NULL;
	int buf_len;
	int ret = 0;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	di = btrfs_lookup_xattr(NULL, sctx->send_root, path, sctx->cur_ino,
				XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), 0);
	if (!di) {
		/* There is no xattr for this inode */
		goto out;
	} else if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto out;
	}

	leaf = path->nodes[0];
	buf_len = btrfs_dir_data_len(leaf, di);

	fspath = fs_path_alloc();
	buf = kmalloc(buf_len, GFP_KERNEL);
	if (!fspath || !buf) {
		ret = -ENOMEM;
		goto out;
	}

	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
	if (ret < 0)
		goto out;

	data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di);
	read_extent_buffer(leaf, buf, data_ptr, buf_len);

	ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS,
			     strlen(XATTR_NAME_CAPS), buf, buf_len);
out:
	kfree(buf);
	fs_path_free(fspath);
	btrfs_free_path(path);
	return ret;
}
static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
		       struct clone_root *clone_root, const u64 disk_byte,
		       u64 data_offset, u64 offset, u64 len)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_inode_info info;
	u64 clone_src_i_size = 0;
	int ret;

	/*
	 * Prevent cloning from a zero offset with a length matching the sector
	 * size because in some scenarios this will make the receiver fail.
	 *
	 * For example, if in the source filesystem the extent at offset 0
	 * has a length of sectorsize and it was written using direct IO, then
	 * it can never be an inline extent (even if compression is enabled).
	 * Then this extent can be cloned in the original filesystem to a non
	 * zero file offset, but it may not be possible to clone in the
	 * destination filesystem because it can be inlined due to compression
	 * on the destination filesystem (as the receiver's write operations are
	 * always done using buffered IO). The same happens when the original
	 * filesystem does not have compression enabled but the destination
	 * filesystem does.
	 */
	if (clone_root->offset == 0 &&
	    len == sctx->send_root->fs_info->sectorsize)
		return send_extent_data(sctx, dst_path, offset, len);

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	/*
	 * There are inodes that have extents that lie behind its i_size. Don't
	 * accept clones from these extents.
	 */
	ret = get_inode_info(clone_root->root, clone_root->ino, &info);
	btrfs_release_path(path);
	if (ret < 0)
		goto out;
	clone_src_i_size = info.size;

	/*
	 * We can't send a clone operation for the entire range if we find
	 * extent items in the respective range in the source file that
	 * refer to different extents or if we find holes.
	 * So check for that and do a mix of clone and regular write/copy
	 * operations if needed.
	 *
	 * Example:
	 *
	 * mkfs.btrfs -f /dev/sda
	 * mount /dev/sda /mnt
	 * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo
	 * cp --reflink=always /mnt/foo /mnt/bar
	 * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo
	 * btrfs subvolume snapshot -r /mnt /mnt/snap
	 *
	 * If when we send the snapshot and we are processing file bar (which
	 * has a higher inode number than foo) we blindly send a clone operation
	 * for the [0, 100K[ range from foo to bar, the receiver ends up getting
	 * a file bar that matches the content of file foo - iow, doesn't match
	 * the content from bar in the original filesystem.
	 */
	key.objectid = clone_root->ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = clone_root->offset;
	ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0 && path->slots[0] > 0) {
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
		if (key.objectid == clone_root->ino &&
		    key.type == BTRFS_EXTENT_DATA_KEY)
			path->slots[0]--;
	}

	while (true) {
		struct extent_buffer *leaf = path->nodes[0];
		int slot = path->slots[0];
		struct btrfs_file_extent_item *ei;
		u8 type;
		u64 ext_len;
		u64 clone_len;
		u64 clone_data_offset;
		bool crossed_src_i_size = false;

		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(clone_root->root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				break;
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, slot);

		/*
		 * We might have an implicit trailing hole (NO_HOLES feature
		 * enabled). We deal with it after leaving this loop.
		 */
		if (key.objectid != clone_root->ino ||
		    key.type != BTRFS_EXTENT_DATA_KEY)
			break;

		ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
		type = btrfs_file_extent_type(leaf, ei);
		if (type == BTRFS_FILE_EXTENT_INLINE) {
			ext_len = btrfs_file_extent_ram_bytes(leaf, ei);
			ext_len = PAGE_ALIGN(ext_len);
		} else {
			ext_len = btrfs_file_extent_num_bytes(leaf, ei);
		}

		if (key.offset + ext_len <= clone_root->offset)
			goto next;

		if (key.offset > clone_root->offset) {
			/* Implicit hole, NO_HOLES feature enabled. */
			u64 hole_len = key.offset - clone_root->offset;

			if (hole_len > len)
				hole_len = len;
			ret = send_extent_data(sctx, dst_path, offset,
					       hole_len);
			if (ret < 0)
				goto out;

			len -= hole_len;
			if (len == 0)
				break;
			offset += hole_len;
			clone_root->offset += hole_len;
			data_offset += hole_len;
		}

		if (key.offset >= clone_root->offset + len)
			break;

		if (key.offset >= clone_src_i_size)
			break;

		if (key.offset + ext_len > clone_src_i_size) {
			ext_len = clone_src_i_size - key.offset;
			crossed_src_i_size = true;
		}

		clone_data_offset = btrfs_file_extent_offset(leaf, ei);
		if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) {
			clone_root->offset = key.offset;
			if (clone_data_offset < data_offset &&
			    clone_data_offset + ext_len > data_offset) {
				u64 extent_offset;

				extent_offset = data_offset - clone_data_offset;
				ext_len -= extent_offset;
				clone_data_offset += extent_offset;
				clone_root->offset += extent_offset;
			}
		}

		clone_len = min_t(u64, ext_len, len);

		if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte &&
		    clone_data_offset == data_offset) {
			const u64 src_end = clone_root->offset + clone_len;
			const u64 sectorsize = SZ_64K;

			/*
			 * We can't clone the last block, when its size is not
			 * sector size aligned, into the middle of a file. If we
			 * do so, the receiver will get a failure (-EINVAL) when
			 * trying to clone or will silently corrupt the data in
			 * the destination file if it's on a kernel without the
			 * fix introduced by commit ac765f83f1397646
			 * ("Btrfs: fix data corruption due to cloning of eof
			 * block").
			 *
			 * So issue a clone of the aligned down range plus a
			 * regular write for the eof block, if we hit that case.
			 *
			 * Also, we use the maximum possible sector size, 64K,
			 * because we don't know what's the sector size of the
			 * filesystem that receives the stream, so we have to
			 * assume the largest possible sector size.
			 */
			if (src_end == clone_src_i_size &&
			    !IS_ALIGNED(src_end, sectorsize) &&
			    offset + clone_len < sctx->cur_inode_size) {
				u64 slen;

				slen = ALIGN_DOWN(src_end - clone_root->offset,
						  sectorsize);
				if (slen > 0) {
					ret = send_clone(sctx, offset, slen,
							 clone_root);
					if (ret < 0)
						goto out;
				}
				ret = send_extent_data(sctx, dst_path,
						       offset + slen,
						       clone_len - slen);
			} else {
				ret = send_clone(sctx, offset, clone_len,
						 clone_root);
			}
		} else if (crossed_src_i_size && clone_len < len) {
			/*
			 * If we are at i_size of the clone source inode and we
			 * can not clone from it, terminate the loop. This is
			 * to avoid sending two write operations, one with a
			 * length matching clone_len and the final one after
			 * this loop with a length of len - clone_len.
			 *
			 * When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED
			 * was passed to the send ioctl), this helps avoid
			 * sending an encoded write for an offset that is not
			 * sector size aligned, in case the i_size of the source
			 * inode is not sector size aligned. That will make the
			 * receiver fallback to decompression of the data and
			 * writing it using regular buffered IO, therefore while
			 * not incorrect, it's not optimal due to decompression
			 * and possible re-compression at the receiver.
			 */
			break;
		} else {
			ret = send_extent_data(sctx, dst_path, offset,
					       clone_len);
		}

		if (ret < 0)
			goto out;

		len -= clone_len;
		if (len == 0)
			break;
		offset += clone_len;
		clone_root->offset += clone_len;

		/*
		 * If we are cloning from the file we are currently processing,
		 * and using the send root as the clone root, we must stop once
		 * the current clone offset reaches the current eof of the file
		 * at the receiver, otherwise we would issue an invalid clone
		 * operation (source range going beyond eof) and cause the
		 * receiver to fail. So if we reach the current eof, bail out
		 * and fallback to a regular write.
		 */
		if (clone_root->root == sctx->send_root &&
		    clone_root->ino == sctx->cur_ino &&
		    clone_root->offset >= sctx->cur_inode_next_write_offset)
			break;

		data_offset += clone_len;
next:
		path->slots[0]++;
	}

	if (len > 0)
		ret = send_extent_data(sctx, dst_path, offset, len);
	else
		ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
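
/*
 * The net effect is that a single range may be emitted as a mix of clone and
 * write commands: clones wherever the source file still references the same
 * disk extent at a compatible offset, and plain data writes for holes,
 * trailing unaligned blocks and any part of the range that no longer matches.
 */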
static int send_write_or_clone(struct send_ctx *sctx,
			       struct btrfs_path *path,
			       struct btrfs_key *key,
			       struct clone_root *clone_root)
{
	int ret = 0;
	u64 offset = key->offset;
	u64 end;
	u64 bs = sctx->send_root->fs_info->sectorsize;
	struct btrfs_file_extent_item *ei;
	u64 disk_byte;
	u64 data_offset;
	u64 num_bytes;
	struct btrfs_inode_info info = { 0 };

	end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size);
	if (offset >= end)
		return 0;

	num_bytes = end - offset;

	if (!clone_root)
		goto write_data;

	if (IS_ALIGNED(end, bs))
		goto clone_data;

	/*
	 * If the extent end is not aligned, we can clone if the extent ends at
	 * the i_size of the inode and the clone range ends at the i_size of the
	 * source inode, otherwise the clone operation fails with -EINVAL.
	 */
	if (end != sctx->cur_inode_size)
		goto write_data;

	ret = get_inode_info(clone_root->root, clone_root->ino, &info);
	if (ret < 0)
		return ret;

	if (clone_root->offset + num_bytes == info.size) {
		/*
		 * The final size of our file matches the end offset, but it may
		 * be that its current size is larger, so we have to truncate it
		 * to any value between the start offset of the range and the
		 * final i_size, otherwise the clone operation is invalid
		 * because it's unaligned and it ends before the current EOF.
		 * We do this truncate to the final i_size when we finish
		 * processing the inode, but it's too late by then. And here we
		 * truncate to the start offset of the range because it's always
		 * sector size aligned while if it were the final i_size it
		 * would result in dirtying part of a page, filling part of a
		 * page with zeroes and then having the clone operation at the
		 * receiver trigger IO and wait for it due to the dirty page.
		 */
		if (sctx->parent_root != NULL) {
			ret = send_truncate(sctx, sctx->cur_ino,
					    sctx->cur_inode_gen, offset);
			if (ret < 0)
				return ret;
		}
		goto clone_data;
	}

write_data:
	ret = send_extent_data(sctx, path, offset, num_bytes);
	sctx->cur_inode_next_write_offset = end;
	return ret;

clone_data:
	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
			    struct btrfs_file_extent_item);
	disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
	data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
	ret = clone_range(sctx, path, clone_root, disk_byte, data_offset, offset,
			  num_bytes);
	sctx->cur_inode_next_write_offset = end;
	return ret;
}
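
/*
 * Example of the unaligned end special case above: with a 4K sector size, an
 * extent covering [0, 10K) at the end of a 10K file can still be cloned, but
 * only because both the source range and the destination range end at their
 * respective i_size; cloning that same unaligned range into the middle of a
 * larger file would make the receiver fail with -EINVAL.
 */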
static int is_extent_unchanged(struct send_ctx *sctx,
			       struct btrfs_path *left_path,
			       struct btrfs_key *ekey)
{
	int ret = 0;
	struct btrfs_key key;
	struct btrfs_path *path = NULL;
	struct extent_buffer *eb;
	int slot;
	struct btrfs_key found_key;
	struct btrfs_file_extent_item *ei;
	u64 left_disknr;
	u64 right_disknr;
	u64 left_offset;
	u64 right_offset;
	u64 left_offset_fixed;
	u64 left_len;
	u64 right_len;
	u64 left_gen;
	u64 right_gen;
	u8 left_type;
	u8 right_type;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	eb = left_path->nodes[0];
	slot = left_path->slots[0];
	ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	left_type = btrfs_file_extent_type(eb, ei);

	if (left_type != BTRFS_FILE_EXTENT_REG) {
		ret = 0;
		goto out;
	}
	left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
	left_len = btrfs_file_extent_num_bytes(eb, ei);
	left_offset = btrfs_file_extent_offset(eb, ei);
	left_gen = btrfs_file_extent_generation(eb, ei);

	/*
	 * Following comments will refer to these graphics. L is the left
	 * extents which we are checking at the moment. 1-8 are the right
	 * extents that we iterate.
	 *
	 *       |-----L-----|
	 * |-1-|-2a-|-3-|-4-|-5-|-6-|
	 *
	 *       |-----L-----|
	 * |--1--|-2b-|...(same as above)
	 *
	 * Alternative situation. Happens on files where extents got split.
	 *       |-----L-----|
	 * |-----------7-----------|-6-|
	 *
	 * Alternative situation. Happens on files which got larger.
	 *       |-----L-----|
	 * |-8-|
	 * Nothing follows after 8.
	 */

	key.objectid = ekey->objectid;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = ekey->offset;
	ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret) {
		ret = 0;
		goto out;
	}

	/*
	 * Handle special case where the right side has no extents at all.
	 */
	eb = path->nodes[0];
	slot = path->slots[0];
	btrfs_item_key_to_cpu(eb, &found_key, slot);
	if (found_key.objectid != key.objectid ||
	    found_key.type != key.type) {
		/* If we're a hole then just pretend nothing changed */
		ret = (left_disknr) ? 0 : 1;
		goto out;
	}

	/*
	 * We're now on 2a, 2b or 7.
	 */
	key = found_key;
	while (key.offset < ekey->offset + left_len) {
		ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
		right_type = btrfs_file_extent_type(eb, ei);
		if (right_type != BTRFS_FILE_EXTENT_REG &&
		    right_type != BTRFS_FILE_EXTENT_INLINE) {
			ret = 0;
			goto out;
		}

		if (right_type == BTRFS_FILE_EXTENT_INLINE) {
			right_len = btrfs_file_extent_ram_bytes(eb, ei);
			right_len = PAGE_ALIGN(right_len);
		} else {
			right_len = btrfs_file_extent_num_bytes(eb, ei);
		}

		/*
		 * Are we at extent 8? If yes, we know the extent is changed.
		 * This may only happen on the first iteration.
		 */
		if (found_key.offset + right_len <= ekey->offset) {
			/* If we're a hole just pretend nothing changed */
			ret = (left_disknr) ? 0 : 1;
			goto out;
		}

		/*
		 * We just wanted to see if when we have an inline extent, what
		 * follows it is a regular extent (wanted to check the above
		 * condition for inline extents too). This should normally not
		 * happen but it's possible for example when we have an inline
		 * compressed extent representing data with a size matching
		 * the page size (currently the same as sector size).
		 */
		if (right_type == BTRFS_FILE_EXTENT_INLINE) {
			ret = 0;
			goto out;
		}

		right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
		right_offset = btrfs_file_extent_offset(eb, ei);
		right_gen = btrfs_file_extent_generation(eb, ei);

		left_offset_fixed = left_offset;
		if (key.offset < ekey->offset) {
			/* Fix the right offset for 2a and 7. */
			right_offset += ekey->offset - key.offset;
		} else {
			/* Fix the left offset for all behind 2a and 2b */
			left_offset_fixed += key.offset - ekey->offset;
		}

		/*
		 * Check if we have the same extent.
		 */
		if (left_disknr != right_disknr ||
		    left_offset_fixed != right_offset ||
		    left_gen != right_gen) {
			ret = 0;
			goto out;
		}

		/*
		 * Go to the next extent.
		 */
		ret = btrfs_next_item(sctx->parent_root, path);
		if (ret < 0)
			goto out;
		if (!ret) {
			eb = path->nodes[0];
			slot = path->slots[0];
			btrfs_item_key_to_cpu(eb, &found_key, slot);
		}
		if (ret || found_key.objectid != key.objectid ||
		    found_key.type != key.type) {
			key.offset += right_len;
			break;
		}
		if (found_key.offset != key.offset + right_len) {
			ret = 0;
			goto out;
		}
		key = found_key;
	}

	/*
	 * We're now behind the left extent (treat as unchanged) or at the end
	 * of the right side (treat as changed).
	 */
	if (key.offset >= ekey->offset + left_len)
		ret = 1;
	else
		ret = 0;

out:
	btrfs_free_path(path);
	return ret;
}
static int get_last_extent(struct send_ctx *sctx, u64 offset)
{
	struct btrfs_path *path;
	struct btrfs_root *root = sctx->send_root;
	struct btrfs_key key;
	int ret;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	sctx->cur_inode_last_extent = 0;

	key.objectid = sctx->cur_ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = offset;
	ret = btrfs_search_slot_for_read(root, &key, path, 0, 1);
	if (ret < 0)
		goto out;
	ret = 0;
	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY)
		goto out;

	sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
out:
	btrfs_free_path(path);
	return ret;
}
static int range_is_hole_in_parent(struct send_ctx *sctx,
				   const u64 start,
				   const u64 end)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_root *root = sctx->parent_root;
	u64 search_start = start;
	int ret;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	key.objectid = sctx->cur_ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = search_start;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0 && path->slots[0] > 0)
		path->slots[0]--;

	while (search_start < end) {
		struct extent_buffer *leaf = path->nodes[0];
		int slot = path->slots[0];
		struct btrfs_file_extent_item *fi;
		u64 extent_end;

		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				break;
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.objectid < sctx->cur_ino ||
		    key.type < BTRFS_EXTENT_DATA_KEY)
			goto next;
		if (key.objectid > sctx->cur_ino ||
		    key.type > BTRFS_EXTENT_DATA_KEY ||
		    key.offset >= end)
			break;

		fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
		extent_end = btrfs_file_extent_end(path);
		if (extent_end <= start)
			goto next;
		if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) {
			search_start = extent_end;
			goto next;
		}
		ret = 0;
		goto out;
next:
		path->slots[0]++;
	}
	ret = 1;
out:
	btrfs_free_path(path);
	return ret;
}
static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
			   struct btrfs_key *key)
{
	int ret = 0;

	if (sctx->cur_ino != key->objectid || !need_send_hole(sctx))
		return 0;

	/*
	 * Get last extent's end offset (exclusive) if we haven't determined it
	 * yet (we're processing the first file extent item that is new), or if
	 * we're at the first slot of a leaf and the last extent's end is less
	 * than the current extent's offset, because we might have skipped
	 * entire leaves that contained only file extent items for our current
	 * inode. These leaves have a generation number smaller (older) than the
	 * one in the current leaf and the leaf our last extent came from, and
	 * are located between these 2 leaves.
	 */
	if ((sctx->cur_inode_last_extent == (u64)-1) ||
	    (path->slots[0] == 0 && sctx->cur_inode_last_extent < key->offset)) {
		ret = get_last_extent(sctx, key->offset - 1);
		if (ret)
			return ret;
	}

	if (sctx->cur_inode_last_extent < key->offset) {
		ret = range_is_hole_in_parent(sctx,
					      sctx->cur_inode_last_extent,
					      key->offset);
		if (ret < 0)
			return ret;
		else if (ret == 0)
			ret = send_hole(sctx, key->offset);
		else
			ret = 0;
	}
	sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
	return ret;
}
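
/*
 * In other words, a gap between the end of the last processed extent and the
 * start of the current one is only turned into zero writes when the same
 * range is not already a hole in the parent snapshot; otherwise the receiver
 * already has the hole and nothing needs to be sent.
 */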
static int process_extent(struct send_ctx *sctx,
			  struct btrfs_path *path,
			  struct btrfs_key *key)
{
	struct clone_root *found_clone = NULL;
	int ret = 0;

	if (S_ISLNK(sctx->cur_inode_mode))
		goto out;

	if (sctx->parent_root && !sctx->cur_inode_new) {
		ret = is_extent_unchanged(sctx, path, key);
		if (ret < 0)
			goto out;
		if (ret) {
			ret = 0;
			goto out_hole;
		}
	} else {
		struct btrfs_file_extent_item *ei;
		u8 type;

		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
				    struct btrfs_file_extent_item);
		type = btrfs_file_extent_type(path->nodes[0], ei);
		if (type == BTRFS_FILE_EXTENT_PREALLOC ||
		    type == BTRFS_FILE_EXTENT_REG) {
			/*
			 * The send spec does not have a prealloc command yet,
			 * so just leave a hole for prealloc'ed extents until
			 * we have enough commands queued up to justify rev'ing
			 * the send spec.
			 */
			if (type == BTRFS_FILE_EXTENT_PREALLOC) {
				ret = 0;
				goto out;
			}

			/* Have a hole, just skip it. */
			if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) {
				ret = 0;
				goto out;
			}
		}
	}

	ret = find_extent_clone(sctx, path, key->objectid, key->offset,
				sctx->cur_inode_size, &found_clone);
	if (ret != -ENOENT && ret < 0)
		goto out;

	ret = send_write_or_clone(sctx, path, key, found_clone);
	if (ret)
		goto out;
out_hole:
	ret = maybe_send_hole(sctx, path, key);
out:
	return ret;
}
static int process_all_extents(struct send_ctx *sctx)
{
	int ret = 0;
	int iter_ret = 0;
	struct btrfs_root *root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;

	root = sctx->send_root;
	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	key.objectid = sctx->cmp_key->objectid;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = 0;
	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
		if (found_key.objectid != key.objectid ||
		    found_key.type != key.type) {
			ret = 0;
			break;
		}

		ret = process_extent(sctx, path, &found_key);
		if (ret < 0)
			break;
	}
	/* Catch error found during iteration */
	if (iter_ret < 0)
		ret = iter_ret;

	btrfs_free_path(path);
	return ret;
}
static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end,
					   int *pending_move,
					   int *refs_processed)
{
	int ret = 0;

	if (sctx->cur_ino == 0)
		goto out;
	if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid &&
	    sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY)
		goto out;
	if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs))
		goto out;

	ret = process_recorded_refs(sctx, pending_move);
	if (ret < 0)
		goto out;

	*refs_processed = 1;
out:
	return ret;
}
static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
{
	int ret = 0;
	struct btrfs_inode_info info;
	u64 left_mode;
	u64 left_uid;
	u64 left_gid;
	u64 left_fileattr;
	u64 right_mode;
	u64 right_uid;
	u64 right_gid;
	u64 right_fileattr;
	u64 old_size;
	int need_chown = 0;
	int need_chmod = 0;
	bool need_fileattr = false;
	int need_truncate = 1;
	int pending_move = 0;
	int refs_processed = 0;

	if (sctx->ignore_cur_inode)
		return 0;

	ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move,
					      &refs_processed);
	if (ret < 0)
		goto out;

	/*
	 * We have processed the refs and thus need to advance send_progress.
	 * Now, calls to get_cur_xxx will take the updated refs of the current
	 * inode into account.
	 *
	 * On the other hand, if our current inode is a directory and couldn't
	 * be moved/renamed because its parent was renamed/moved too and it has
	 * a higher inode number, we can only move/rename our current inode
	 * after we moved/renamed its parent. Therefore in this case operate on
	 * the old path (pre move/rename) of our current inode, and the
	 * move/rename will be performed later.
	 */
	if (refs_processed && !pending_move)
		sctx->send_progress = sctx->cur_ino + 1;

	if (sctx->cur_ino == 0 || sctx->cur_inode_deleted)
		goto out;
	if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
		goto out;
	ret = get_inode_info(sctx->send_root, sctx->cur_ino, &info);
	if (ret < 0)
		goto out;
	left_mode = info.mode;
	left_uid = info.uid;
	left_gid = info.gid;
	left_fileattr = info.fileattr;

	if (!sctx->parent_root || sctx->cur_inode_new) {
		need_chown = 1;
		if (!S_ISLNK(sctx->cur_inode_mode))
			need_chmod = 1;
		if (sctx->cur_inode_next_write_offset == sctx->cur_inode_size)
			need_truncate = 0;
	} else {
		ret = get_inode_info(sctx->parent_root, sctx->cur_ino, &info);
		if (ret < 0)
			goto out;
		old_size = info.size;
		right_mode = info.mode;
		right_uid = info.uid;
		right_gid = info.gid;
		right_fileattr = info.fileattr;

		if (left_uid != right_uid || left_gid != right_gid)
			need_chown = 1;
		if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode)
			need_chmod = 1;
		if (!S_ISLNK(sctx->cur_inode_mode) && left_fileattr != right_fileattr)
			need_fileattr = true;
		if ((old_size == sctx->cur_inode_size) ||
		    (sctx->cur_inode_size > old_size &&
		     sctx->cur_inode_next_write_offset == sctx->cur_inode_size))
			need_truncate = 0;
	}

	if (S_ISREG(sctx->cur_inode_mode)) {
		if (need_send_hole(sctx)) {
			if (sctx->cur_inode_last_extent == (u64)-1 ||
			    sctx->cur_inode_last_extent <
			    sctx->cur_inode_size) {
				ret = get_last_extent(sctx, (u64)-1);
				if (ret < 0)
					goto out;
			}
			if (sctx->cur_inode_last_extent < sctx->cur_inode_size) {
				ret = range_is_hole_in_parent(sctx,
						      sctx->cur_inode_last_extent,
						      sctx->cur_inode_size);
				if (ret < 0) {
					goto out;
				} else if (ret == 0) {
					ret = send_hole(sctx, sctx->cur_inode_size);
					if (ret < 0)
						goto out;
				} else {
					/* Range is already a hole, skip. */
					ret = 0;
				}
			}
		}
		if (need_truncate) {
			ret = send_truncate(sctx, sctx->cur_ino,
					    sctx->cur_inode_gen,
					    sctx->cur_inode_size);
			if (ret < 0)
				goto out;
		}
	}

	if (need_chown) {
		ret = send_chown(sctx, sctx->cur_ino, sctx->cur_inode_gen,
				 left_uid, left_gid);
		if (ret < 0)
			goto out;
	}
	if (need_chmod) {
		ret = send_chmod(sctx, sctx->cur_ino, sctx->cur_inode_gen,
				 left_mode);
		if (ret < 0)
			goto out;
	}
	if (need_fileattr) {
		ret = send_fileattr(sctx, sctx->cur_ino, sctx->cur_inode_gen,
				    left_fileattr);
		if (ret < 0)
			goto out;
	}

	if (proto_cmd_ok(sctx, BTRFS_SEND_C_ENABLE_VERITY)
	    && sctx->cur_inode_needs_verity) {
		ret = process_verity(sctx);
		if (ret < 0)
			goto out;
	}

	ret = send_capabilities(sctx);
	if (ret < 0)
		goto out;

	/*
	 * If other directory inodes depended on our current directory
	 * inode's move/rename, now do their move/rename operations.
	 */
	if (!is_waiting_for_move(sctx, sctx->cur_ino)) {
		ret = apply_children_dir_moves(sctx);
		if (ret)
			goto out;
		/*
		 * Need to send that every time, no matter if it actually
		 * changed between the two trees as we have done changes to
		 * the inode before. If our inode is a directory and it's
		 * waiting to be moved/renamed, we will send its utimes when
		 * it's moved/renamed, therefore we don't need to do it here.
		 */
		sctx->send_progress = sctx->cur_ino + 1;

		/*
		 * If the current inode is a non-empty directory, delay issuing
		 * the utimes command for it, as it's very likely we have inodes
		 * with a higher number inside it. We want to issue the utimes
		 * command only after adding all dentries to it.
		 */
		if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_size > 0)
			ret = cache_dir_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
		else
			ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);

		if (ret < 0)
			goto out;
	}

	ret = trim_dir_utimes_cache(sctx);
out:
	return ret;
}
static void close_current_inode(struct send_ctx *sctx)
{
	u64 i_size;

	if (sctx->cur_inode == NULL)
		return;

	i_size = i_size_read(sctx->cur_inode);

	/*
	 * If we are doing an incremental send, we may have extents between the
	 * last processed extent and the i_size that have not been processed
	 * because they haven't changed but we may have read some of their pages
	 * through readahead, see the comments at send_extent_data().
	 */
	if (sctx->clean_page_cache && sctx->page_cache_clear_start < i_size)
		truncate_inode_pages_range(&sctx->cur_inode->i_data,
					   sctx->page_cache_clear_start,
					   round_up(i_size, PAGE_SIZE) - 1);

	iput(sctx->cur_inode);
	sctx->cur_inode = NULL;
}
static int changed_inode(struct send_ctx *sctx,
			 enum btrfs_compare_tree_result result)
{
	int ret = 0;
	struct btrfs_key *key = sctx->cmp_key;
	struct btrfs_inode_item *left_ii = NULL;
	struct btrfs_inode_item *right_ii = NULL;
	u64 left_gen = 0;
	u64 right_gen = 0;

	close_current_inode(sctx);

	sctx->cur_ino = key->objectid;
	sctx->cur_inode_new_gen = false;
	sctx->cur_inode_last_extent = (u64)-1;
	sctx->cur_inode_next_write_offset = 0;
	sctx->ignore_cur_inode = false;

	/*
	 * Set send_progress to current inode. This will tell all get_cur_xxx
	 * functions that the current inode's refs are not updated yet. Later,
	 * when process_recorded_refs is finished, it is set to cur_ino + 1.
	 */
	sctx->send_progress = sctx->cur_ino;

	if (result == BTRFS_COMPARE_TREE_NEW ||
	    result == BTRFS_COMPARE_TREE_CHANGED) {
		left_ii = btrfs_item_ptr(sctx->left_path->nodes[0],
					 sctx->left_path->slots[0],
					 struct btrfs_inode_item);
		left_gen = btrfs_inode_generation(sctx->left_path->nodes[0],
						  left_ii);
	} else {
		right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
					  sctx->right_path->slots[0],
					  struct btrfs_inode_item);
		right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
						   right_ii);
	}
	if (result == BTRFS_COMPARE_TREE_CHANGED) {
		right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
					  sctx->right_path->slots[0],
					  struct btrfs_inode_item);

		right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
						   right_ii);

		/*
		 * The cur_ino = root dir case is special here. We can't treat
		 * the inode as deleted+reused because it would generate a
		 * stream that tries to delete/mkdir the root dir.
		 */
		if (left_gen != right_gen &&
		    sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
			sctx->cur_inode_new_gen = true;
	}

	/*
	 * Normally we do not find inodes with a link count of zero (orphans)
	 * because the most common case is to create a snapshot and use it
	 * for a send operation. However other less common use cases involve
	 * using a subvolume and send it after turning it to RO mode just
	 * after deleting all hard links of a file while holding an open
	 * file descriptor against it or turning a RO snapshot into RW mode,
	 * keep an open file descriptor against a file, delete it and then
	 * turn the snapshot back to RO mode before using it for a send
	 * operation. The former is what the receiver operation does.
	 * Therefore, if we want to send these snapshots soon after they're
	 * received, we need to handle orphan inodes as well. Moreover, orphans
	 * can appear not only in the send snapshot but also in the parent
	 * snapshot. Here are several cases:
	 *
	 * Case 1: BTRFS_COMPARE_TREE_NEW
	 *       |  send snapshot  | action
	 * --------------------------------
	 * nlink |        0        | ignore
	 *
	 * Case 2: BTRFS_COMPARE_TREE_DELETED
	 *       | parent snapshot | action
	 * ----------------------------------
	 * nlink |        0        | as usual
	 * Note: No unlinks will be sent because there're no paths for it.
	 *
	 * Case 3: BTRFS_COMPARE_TREE_CHANGED
	 *           |       | parent snapshot | send snapshot | action
	 * -----------------------------------------------------------------------
	 * subcase 1 | nlink |        0        |       0       | ignore
	 * subcase 2 | nlink |       >0        |       0       | new_gen(deletion)
	 * subcase 3 | nlink |        0        |      >0       | new_gen(creation)
	 */
	if (result == BTRFS_COMPARE_TREE_NEW) {
		if (btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii) == 0) {
			sctx->ignore_cur_inode = true;
			goto out;
		}
		sctx->cur_inode_gen = left_gen;
		sctx->cur_inode_new = true;
		sctx->cur_inode_deleted = false;
		sctx->cur_inode_size = btrfs_inode_size(
				sctx->left_path->nodes[0], left_ii);
		sctx->cur_inode_mode = btrfs_inode_mode(
				sctx->left_path->nodes[0], left_ii);
		sctx->cur_inode_rdev = btrfs_inode_rdev(
				sctx->left_path->nodes[0], left_ii);
		if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
			ret = send_create_inode_if_needed(sctx);
	} else if (result == BTRFS_COMPARE_TREE_DELETED) {
		sctx->cur_inode_gen = right_gen;
		sctx->cur_inode_new = false;
		sctx->cur_inode_deleted = true;
		sctx->cur_inode_size = btrfs_inode_size(
				sctx->right_path->nodes[0], right_ii);
		sctx->cur_inode_mode = btrfs_inode_mode(
				sctx->right_path->nodes[0], right_ii);
	} else if (result == BTRFS_COMPARE_TREE_CHANGED) {
		u32 new_nlinks, old_nlinks;

		new_nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii);
		old_nlinks = btrfs_inode_nlink(sctx->right_path->nodes[0], right_ii);
		if (new_nlinks == 0 && old_nlinks == 0) {
			sctx->ignore_cur_inode = true;
			goto out;
		} else if (new_nlinks == 0 || old_nlinks == 0) {
			sctx->cur_inode_new_gen = 1;
		}
		/*
		 * We need to do some special handling in case the inode was
		 * reported as changed with a changed generation number. This
		 * means that the original inode was deleted and new inode
		 * reused the same inum. So we have to treat the old inode as
		 * deleted and the new one as new.
		 */
		if (sctx->cur_inode_new_gen) {
			/*
			 * First, process the inode as if it was deleted.
			 */
			if (old_nlinks > 0) {
				sctx->cur_inode_gen = right_gen;
				sctx->cur_inode_new = false;
				sctx->cur_inode_deleted = true;
				sctx->cur_inode_size = btrfs_inode_size(
						sctx->right_path->nodes[0], right_ii);
				sctx->cur_inode_mode = btrfs_inode_mode(
						sctx->right_path->nodes[0], right_ii);
				ret = process_all_refs(sctx,
						BTRFS_COMPARE_TREE_DELETED);
				if (ret < 0)
					goto out;
			}

			/*
			 * Now process the inode as if it was new.
			 */
			if (new_nlinks > 0) {
				sctx->cur_inode_gen = left_gen;
				sctx->cur_inode_new = true;
				sctx->cur_inode_deleted = false;
				sctx->cur_inode_size = btrfs_inode_size(
						sctx->left_path->nodes[0],
						left_ii);
				sctx->cur_inode_mode = btrfs_inode_mode(
						sctx->left_path->nodes[0],
						left_ii);
				sctx->cur_inode_rdev = btrfs_inode_rdev(
						sctx->left_path->nodes[0],
						left_ii);
				ret = send_create_inode_if_needed(sctx);
				if (ret < 0)
					goto out;

				ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
				if (ret < 0)
					goto out;
				/*
				 * Advance send_progress now as we did not get
				 * into process_recorded_refs_if_needed in the
				 * new_gen case.
				 */
				sctx->send_progress = sctx->cur_ino + 1;

				/*
				 * Now process all extents and xattrs of the
				 * inode as if they were all new.
				 */
				ret = process_all_extents(sctx);
				if (ret < 0)
					goto out;
				ret = process_all_new_xattrs(sctx);
				if (ret < 0)
					goto out;
			}
		} else {
			sctx->cur_inode_gen = left_gen;
			sctx->cur_inode_new = false;
			sctx->cur_inode_new_gen = false;
			sctx->cur_inode_deleted = false;
			sctx->cur_inode_size = btrfs_inode_size(
					sctx->left_path->nodes[0], left_ii);
			sctx->cur_inode_mode = btrfs_inode_mode(
					sctx->left_path->nodes[0], left_ii);
		}
	}

out:
	return ret;
}
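
/*
 * The new_gen path above is what turns an inode number that was deleted and
 * reused between the two snapshots into a delete then recreate sequence in
 * the stream: all refs of the old inode are processed as deletions first,
 * then the new inode is created and its refs, extents and xattrs are emitted
 * as if the inode were brand new.
 */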
/*
 * We have to process new refs before deleted refs, but compare_trees gives us
 * the new and deleted refs mixed. To fix this, we record the new/deleted refs
 * first and later process them in process_recorded_refs.
 * For the cur_inode_new_gen case, we skip recording completely because
 * changed_inode did already initiate processing of refs. The reason for this is
 * that in this case, compare_tree actually compares the refs of 2 different
 * inodes. To fix this, process_all_refs is used in changed_inode to handle all
 * refs of the right tree as deleted and all refs of the left tree as new.
 */
static int changed_ref(struct send_ctx *sctx,
		       enum btrfs_compare_tree_result result)
{
	int ret = 0;

	if (sctx->cur_ino != sctx->cmp_key->objectid) {
		inconsistent_snapshot_error(sctx, result, "reference");
		return -EIO;
	}

	if (!sctx->cur_inode_new_gen &&
	    sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) {
		if (result == BTRFS_COMPARE_TREE_NEW)
			ret = record_new_ref(sctx);
		else if (result == BTRFS_COMPARE_TREE_DELETED)
			ret = record_deleted_ref(sctx);
		else if (result == BTRFS_COMPARE_TREE_CHANGED)
			ret = record_changed_ref(sctx);
	}

	return ret;
}
/*
 * Process new/deleted/changed xattrs. We skip processing in the
 * cur_inode_new_gen case because changed_inode did already initiate processing
 * of xattrs. The reason is the same as in changed_ref.
 */
static int changed_xattr(struct send_ctx *sctx,
			 enum btrfs_compare_tree_result result)
{
	int ret = 0;

	if (sctx->cur_ino != sctx->cmp_key->objectid) {
		inconsistent_snapshot_error(sctx, result, "xattr");
		return -EIO;
	}

	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
		if (result == BTRFS_COMPARE_TREE_NEW)
			ret = process_new_xattr(sctx);
		else if (result == BTRFS_COMPARE_TREE_DELETED)
			ret = process_deleted_xattr(sctx);
		else if (result == BTRFS_COMPARE_TREE_CHANGED)
			ret = process_changed_xattr(sctx);
	}

	return ret;
}
/*
 * Process new/deleted/changed extents. We skip processing in the
 * cur_inode_new_gen case because changed_inode did already initiate processing
 * of extents. The reason is the same as in changed_ref.
 */
static int changed_extent(struct send_ctx *sctx,
			  enum btrfs_compare_tree_result result)
{
	int ret = 0;

	/*
	 * We have found an extent item that changed without the inode item
	 * having changed. This can happen either after relocation (where the
	 * disk_bytenr of an extent item is replaced at
	 * relocation.c:replace_file_extents()) or after deduplication into a
	 * file in both the parent and send snapshots (where an extent item can
	 * get modified or replaced with a new one). Note that deduplication
	 * updates the inode item, but it only changes the iversion (sequence
	 * field in the inode item) of the inode, so if a file is deduplicated
	 * the same amount of times in both the parent and send snapshots, its
	 * iversion becomes the same in both snapshots, whence the inode item is
	 * the same on both snapshots.
	 */
	if (sctx->cur_ino != sctx->cmp_key->objectid)
		return 0;

	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
		if (result != BTRFS_COMPARE_TREE_DELETED)
			ret = process_extent(sctx, sctx->left_path,
					     sctx->cmp_key);
	}

	return ret;
}
static int changed_verity(struct send_ctx *sctx, enum btrfs_compare_tree_result result)
{
	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
		if (result == BTRFS_COMPARE_TREE_NEW)
			sctx->cur_inode_needs_verity = true;
	}
	return 0;
}
static int dir_changed(struct send_ctx *sctx, u64 dir)
{
	u64 orig_gen, new_gen;
	int ret;

	ret = get_inode_gen(sctx->send_root, dir, &new_gen);
	if (ret)
		return ret;

	ret = get_inode_gen(sctx->parent_root, dir, &orig_gen);
	if (ret)
		return ret;

	return (orig_gen != new_gen) ? 1 : 0;
}
static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path,
			struct btrfs_key *key)
{
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;
	u64 dirid = 0, last_dirid = 0;
	unsigned long ptr;
	u32 item_size;
	u32 cur_offset = 0;
	int ref_name_len;
	int ret = 0;

	/* Easy case, just check this one dirid */
	if (key->type == BTRFS_INODE_REF_KEY) {
		dirid = key->offset;

		ret = dir_changed(sctx, dirid);
		goto out;
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size(leaf, path->slots[0]);
	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
	while (cur_offset < item_size) {
		extref = (struct btrfs_inode_extref *)(ptr +
						       cur_offset);
		dirid = btrfs_inode_extref_parent(leaf, extref);
		ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
		cur_offset += ref_name_len + sizeof(*extref);
		if (dirid == last_dirid)
			continue;
		ret = dir_changed(sctx, dirid);
		if (ret)
			break;
		last_dirid = dirid;
	}
out:
	return ret;
}
/*
 * Updates compare related fields in sctx and simply forwards to the actual
 * changed_xxx functions.
 */
static int changed_cb(struct btrfs_path *left_path,
		      struct btrfs_path *right_path,
		      struct btrfs_key *key,
		      enum btrfs_compare_tree_result result,
		      struct send_ctx *sctx)
{
	int ret;

	/*
	 * We can not hold the commit root semaphore here. This is because in
	 * the case of sending and receiving to the same filesystem, using a
	 * pipe, could result in a deadlock:
	 *
	 * 1) The task running send blocks on the pipe because it's full;
	 *
	 * 2) The task running receive, which is the only consumer of the pipe,
	 *    is waiting for a transaction commit (for example due to a space
	 *    reservation when doing a write or triggering a transaction commit
	 *    when creating a subvolume);
	 *
	 * 3) The transaction is waiting to write lock the commit root semaphore,
	 *    but can not acquire it since it's being held at 1).
	 *
	 * Down this call chain we write to the pipe through kernel_write().
	 * The same type of problem can also happen when sending to a file that
	 * is stored in the same filesystem - when reserving space for a write
	 * into the file, we can trigger a transaction commit.
	 *
	 * Our caller has supplied us with clones of leaves from the send and
	 * parent roots, so we're safe here from a concurrent relocation and
	 * further reallocation of metadata extents while we are here. Below we
	 * also assert that the leaves are clones.
	 */
	lockdep_assert_not_held(&sctx->send_root->fs_info->commit_root_sem);

	/*
	 * We always have a send root, so left_path is never NULL. We will not
	 * have a leaf when we have reached the end of the send root but have
	 * not yet reached the end of the parent root.
	 */
	if (left_path->nodes[0])
		ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
				&left_path->nodes[0]->bflags));
	/*
	 * When doing a full send we don't have a parent root, so right_path is
	 * NULL. When doing an incremental send, we may have reached the end of
	 * the parent root already, so we don't have a leaf at right_path.
	 */
	if (right_path && right_path->nodes[0])
		ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
				&right_path->nodes[0]->bflags));

	if (result == BTRFS_COMPARE_TREE_SAME) {
		if (key->type == BTRFS_INODE_REF_KEY ||
		    key->type == BTRFS_INODE_EXTREF_KEY) {
			ret = compare_refs(sctx, left_path, key);
			if (!ret)
				return 0;
			if (ret < 0)
				return ret;
		} else if (key->type == BTRFS_EXTENT_DATA_KEY) {
			return maybe_send_hole(sctx, left_path, key);
		} else {
			return 0;
		}
		result = BTRFS_COMPARE_TREE_CHANGED;
	}

	sctx->left_path = left_path;
	sctx->right_path = right_path;
	sctx->cmp_key = key;

	ret = finish_inode_if_needed(sctx, 0);
	if (ret < 0)
		goto out;

	/* Ignore non-FS objects */
	if (key->objectid == BTRFS_FREE_INO_OBJECTID ||
	    key->objectid == BTRFS_FREE_SPACE_OBJECTID)
		goto out;

	if (key->type == BTRFS_INODE_ITEM_KEY) {
		ret = changed_inode(sctx, result);
	} else if (!sctx->ignore_cur_inode) {
		if (key->type == BTRFS_INODE_REF_KEY ||
		    key->type == BTRFS_INODE_EXTREF_KEY)
			ret = changed_ref(sctx, result);
		else if (key->type == BTRFS_XATTR_ITEM_KEY)
			ret = changed_xattr(sctx, result);
		else if (key->type == BTRFS_EXTENT_DATA_KEY)
			ret = changed_extent(sctx, result);
		else if (key->type == BTRFS_VERITY_DESC_ITEM_KEY &&
			 key->offset == 0)
			ret = changed_verity(sctx, result);
	}

out:
	return ret;
}
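
/*
 * changed_cb() is the single entry point used both by full_send_tree() and by
 * the incremental tree comparison: it first flushes the previous inode via
 * finish_inode_if_needed() and then routes the key to changed_inode(),
 * changed_ref(), changed_xattr(), changed_extent() or changed_verity()
 * according to the item type.
 */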
static int search_key_again(const struct send_ctx *sctx,
			    struct btrfs_root *root,
			    struct btrfs_path *path,
			    const struct btrfs_key *key)
{
	int ret;

	if (!path->need_commit_sem)
		lockdep_assert_held_read(&root->fs_info->commit_root_sem);

	/*
	 * Roots used for send operations are readonly and no one can add,
	 * update or remove keys from them, so we should be able to find our
	 * key again. The only exception is deduplication, which can operate on
	 * readonly roots and add, update or remove keys to/from them - but at
	 * the moment we don't allow it to run in parallel with send.
	 */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	ASSERT(ret <= 0);
	if (ret > 0) {
		btrfs_print_tree(path->nodes[path->lowest_level], false);
		btrfs_err(root->fs_info,
"send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d",
			  key->objectid, key->type, key->offset,
			  (root == sctx->parent_root ? "parent" : "send"),
			  btrfs_root_id(root), path->lowest_level,
			  path->slots[path->lowest_level]);
		ret = -EUCLEAN;
	}

	return ret;
}
static int full_send_tree(struct send_ctx *sctx)
{
	int ret;
	struct btrfs_root *send_root = sctx->send_root;
	struct btrfs_key key;
	struct btrfs_fs_info *fs_info = send_root->fs_info;
	struct btrfs_path *path;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD_ALWAYS;

	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	down_read(&fs_info->commit_root_sem);
	sctx->last_reloc_trans = fs_info->last_reloc_trans;
	up_read(&fs_info->commit_root_sem);

	ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
	if (ret < 0)
		goto out;
	if (ret)
		goto out_finish;

	while (1) {
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

		ret = changed_cb(path, NULL, &key,
				 BTRFS_COMPARE_TREE_NEW, sctx);
		if (ret < 0)
			goto out;

		down_read(&fs_info->commit_root_sem);
		if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
			sctx->last_reloc_trans = fs_info->last_reloc_trans;
			up_read(&fs_info->commit_root_sem);
			/*
			 * A transaction used for relocating a block group was
			 * committed or is about to finish its commit. Release
			 * our path (leaf) and restart the search, so that we
			 * avoid operating on any file extent items that are
			 * stale, with a disk_bytenr that reflects a pre
			 * relocation value. This way we avoid as much as
			 * possible to fallback to regular writes when checking
			 * if we can clone file ranges.
			 */
			btrfs_release_path(path);
			ret = search_key_again(sctx, send_root, path, &key);
			if (ret < 0)
				goto out;
		} else {
			up_read(&fs_info->commit_root_sem);
		}

		ret = btrfs_next_item(send_root, path);
		if (ret < 0)
			goto out;
		if (ret)
			break;
	}

out_finish:
	ret = finish_inode_if_needed(sctx, 1);

out:
	btrfs_free_path(path);
	return ret;
}
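
/*
 * Replace the extent buffer at the given level of a path with a private
 * clone, so that the path remains usable after the commit root semaphore is
 * released and the original buffer may be freed or reused.
 */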
static int replace_node_with_clone(struct btrfs_path *path, int level)
{
	struct extent_buffer *clone;

	clone = btrfs_clone_extent_buffer(path->nodes[level]);
	if (!clone)
		return -ENOMEM;

	free_extent_buffer(path->nodes[level]);
	path->nodes[level] = clone;

	return 0;
}
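
/*
 * Step down one level in the tree: read the child node at the current slot,
 * trigger readahead for the siblings that will be processed next, and clone
 * the child if it is a leaf so it can be used outside the commit root
 * semaphore.
 */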
static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen)
{
	struct extent_buffer *eb;
	struct extent_buffer *parent = path->nodes[*level];
	int slot = path->slots[*level];
	const int nritems = btrfs_header_nritems(parent);
	u64 reada_max;
	u64 reada_done = 0;

	lockdep_assert_held_read(&parent->fs_info->commit_root_sem);
	ASSERT(*level != 0);

	eb = btrfs_read_node_slot(parent, slot);
	if (IS_ERR(eb))
		return PTR_ERR(eb);

	/*
	 * Trigger readahead for the next leaves we will process, so that it is
	 * very likely that when we need them they are already in memory and we
	 * will not block on disk IO. For nodes we only do readahead for one,
	 * since the time window between processing nodes is typically larger.
	 */
	reada_max = (*level == 1 ? SZ_128K : eb->fs_info->nodesize);
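	/*
	 * The budget is in bytes: up to 128K worth of leaves are read ahead
	 * when descending to level 1 (for example 8 leaves with a 16K
	 * nodesize), but only one node's worth when descending to an internal
	 * node.
	 */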
	for (slot++; slot < nritems && reada_done < reada_max; slot++) {
		if (btrfs_node_ptr_generation(parent, slot) > reada_min_gen) {
			btrfs_readahead_node_child(parent, slot);
			reada_done += eb->fs_info->nodesize;
		}
	}

	path->nodes[*level - 1] = eb;
	path->slots[*level - 1] = 0;
	(*level)--;

	if (*level == 0)
		return replace_node_with_clone(path, 0);

	return 0;
}
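
/*
 * Move to the next slot at the current level, walking up the tree whenever
 * the end of a node is reached. Returns 0 if only the slot advanced, 1 if we
 * had to move up, and -1 when the end of the root node was reached.
 */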
static int tree_move_next_or_upnext(struct btrfs_path *path,
				    int *level, int root_level)
{
	int ret = 0;
	int nritems;

	nritems = btrfs_header_nritems(path->nodes[*level]);

	path->slots[*level]++;

	while (path->slots[*level] >= nritems) {
		if (*level == root_level) {
			path->slots[*level] = nritems - 1;
			return -1;
		}

		/* move upnext */
		path->slots[*level] = 0;
		free_extent_buffer(path->nodes[*level]);
		path->nodes[*level] = NULL;
		(*level)++;
		path->slots[*level]++;

		nritems = btrfs_header_nritems(path->nodes[*level]);
		ret = 1;
	}
	return ret;
}
/*
 * Returns 1 if it had to move up and next. 0 is returned if it moved only next
 * or down.
 */
static int tree_advance(struct btrfs_path *path,
			int *level, int root_level,
			int allow_down,
			struct btrfs_key *key,
			u64 reada_min_gen)
{
	int ret;

	if (*level == 0 || !allow_down) {
		ret = tree_move_next_or_upnext(path, level, root_level);
	} else {
		ret = tree_move_down(path, level, reada_min_gen);
	}

	/*
	 * Even if we have reached the end of a tree, ret is -1, update the key
	 * anyway, so that in case we need to restart due to a block group
	 * relocation, we can assert that the last key of the root node still
	 * exists in the tree.
	 */
	if (*level == 0)
		btrfs_item_key_to_cpu(path->nodes[*level], key,
				      path->slots[*level]);
	else
		btrfs_node_key_to_cpu(path->nodes[*level], key,
				      path->slots[*level]);

	return ret;
}
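
/*
 * Compare the items at the current slots of both paths, using @tmp_buf as a
 * scratch buffer large enough to hold one item. Returns 0 if the items have
 * the same size and content, non-zero otherwise.
 */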
static int tree_compare_item(struct btrfs_path *left_path,
			     struct btrfs_path *right_path,
			     char *tmp_buf)
{
	int cmp;
	int len1, len2;
	unsigned long off1, off2;

	len1 = btrfs_item_size(left_path->nodes[0], left_path->slots[0]);
	len2 = btrfs_item_size(right_path->nodes[0], right_path->slots[0]);
	if (len1 != len2)
		return 1;

	off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
	off2 = btrfs_item_ptr_offset(right_path->nodes[0],
				     right_path->slots[0]);

	read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);

	cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
	if (cmp)
		return 1;
	return 0;
}
/*
 * A transaction used for relocating a block group was committed or is about to
 * finish its commit. Release our paths and restart the search, so that we are
 * not using stale extent buffers:
 *
 * 1) For levels > 0, we are only holding references of extent buffers, without
 *    any locks on them, which does not prevent them from having been relocated
 *    and reallocated after the last time we released the commit root semaphore.
 *    The exception are the root nodes, for which we always have a clone, see
 *    the comment at btrfs_compare_trees();
 *
 * 2) For leaves, level 0, we are holding copies (clones) of extent buffers, so
 *    we are safe from the concurrent relocation and reallocation. However they
 *    can have file extent items with a pre relocation disk_bytenr value, so we
 *    restart the search from the current commit roots and clone the new leaves
 *    so that we get the post relocation disk_bytenr values. Not doing so could
 *    make us clone the wrong data in case there are new extents using the old
 *    disk_bytenr that happen to be shared.
 */
static int restart_after_relocation(struct btrfs_path *left_path,
				    struct btrfs_path *right_path,
				    const struct btrfs_key *left_key,
				    const struct btrfs_key *right_key,
				    int left_level, int right_level,
				    const struct send_ctx *sctx)
{
	int root_level;
	int ret;

	lockdep_assert_held_read(&sctx->send_root->fs_info->commit_root_sem);

	btrfs_release_path(left_path);
	btrfs_release_path(right_path);

	/*
	 * Since keys can not be added or removed to/from our roots because they
	 * are readonly and we do not allow deduplication to run in parallel
	 * (which can add, remove or change keys), the layout of the trees should
	 * not change.
	 */
	left_path->lowest_level = left_level;
	ret = search_key_again(sctx, sctx->send_root, left_path, left_key);
	if (ret < 0)
		return ret;

	right_path->lowest_level = right_level;
	ret = search_key_again(sctx, sctx->parent_root, right_path, right_key);
	if (ret < 0)
		return ret;

	/*
	 * If the lowest level nodes are leaves, clone them so that they can be
	 * safely used by changed_cb() while not under the protection of the
	 * commit root semaphore, even if relocation and reallocation happens in
	 * parallel.
	 */
	if (left_level == 0) {
		ret = replace_node_with_clone(left_path, 0);
		if (ret < 0)
			return ret;
	}

	if (right_level == 0) {
		ret = replace_node_with_clone(right_path, 0);
		if (ret < 0)
			return ret;
	}

	/*
	 * Now clone the root nodes (unless they happen to be the leaves we have
	 * already cloned). This is to protect against concurrent snapshotting of
	 * the send and parent roots (see the comment at btrfs_compare_trees()).
	 */
	root_level = btrfs_header_level(sctx->send_root->commit_root);
	if (root_level > 0) {
		ret = replace_node_with_clone(left_path, root_level);
		if (ret < 0)
			return ret;
	}

	root_level = btrfs_header_level(sctx->parent_root->commit_root);
	if (root_level > 0) {
		ret = replace_node_with_clone(right_path, root_level);
		if (ret < 0)
			return ret;
	}

	return 0;
}
/*
 * This function compares two trees and calls the provided callback for
 * every changed/new/deleted item it finds.
 * If shared tree blocks are encountered, whole subtrees are skipped, making
 * the compare pretty fast on snapshotted subvolumes.
 *
 * This currently works on commit roots only. As commit roots are read only,
 * we don't do any locking. The commit roots are protected with transactions.
 * Transactions are ended and rejoined when a commit is tried in between.
 *
 * This function checks for modifications done to the trees while comparing.
 * If it detects a change, it aborts immediately.
 */
static int btrfs_compare_trees(struct btrfs_root *left_root,
			       struct btrfs_root *right_root, struct send_ctx *sctx)
{
	struct btrfs_fs_info *fs_info = left_root->fs_info;
	int ret;
	int cmp;
	struct btrfs_path *left_path = NULL;
	struct btrfs_path *right_path = NULL;
	struct btrfs_key left_key;
	struct btrfs_key right_key;
	char *tmp_buf = NULL;
	int left_root_level;
	int right_root_level;
	int left_level;
	int right_level;
	int left_end_reached = 0;
	int right_end_reached = 0;
	int advance_left = 0;
	int advance_right = 0;
	u64 left_blockptr;
	u64 right_blockptr;
	u64 left_gen;
	u64 right_gen;
	u64 reada_min_gen;

	left_path = btrfs_alloc_path();
	if (!left_path) {
		ret = -ENOMEM;
		goto out;
	}
	right_path = btrfs_alloc_path();
	if (!right_path) {
		ret = -ENOMEM;
		goto out;
	}

	tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
	if (!tmp_buf) {
		ret = -ENOMEM;
		goto out;
	}

	left_path->search_commit_root = 1;
	left_path->skip_locking = 1;
	right_path->search_commit_root = 1;
	right_path->skip_locking = 1;
	/*
	 * Strategy: Go to the first items of both trees. Then do
	 *
	 * If both trees are at level 0
	 *   Compare keys of current items
	 *     If left < right treat left item as new, advance left tree
	 *       and repeat
	 *     If left > right treat right item as deleted, advance right tree
	 *       and repeat
	 *     If left == right do deep compare of items, treat as changed if
	 *       needed, advance both trees and repeat
	 * If both trees are at the same level but not at level 0
	 *   Compare keys of current nodes/leafs
	 *     If left < right advance left tree and repeat
	 *     If left > right advance right tree and repeat
	 *     If left == right compare blockptrs of the next nodes/leafs
	 *       If they match advance both trees but stay at the same level
	 *         and repeat
	 *       If they don't match advance both trees while allowing to go
	 *         deeper and repeat
	 * If tree levels are different
	 *   Advance the tree that needs it and repeat
	 *
	 * Advancing a tree means:
	 *   If we are at level 0, try to go to the next slot. If that's not
	 *   possible, go one level up and repeat. Stop when we found a level
	 *   where we could go to the next slot. We may at this point be on a
	 *   node or a leaf.
	 *
	 *   If we are not at level 0 and not on shared tree blocks, go one
	 *   level deeper.
	 *
	 *   If we are not at level 0 and on shared tree blocks, go one slot to
	 *   the right if possible or go up and right.
	 */
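	/*
	 * The "advancing" described above is implemented by tree_advance(),
	 * which uses tree_move_down() to go one level deeper and
	 * tree_move_next_or_upnext() to go to the next slot or up and next.
	 */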
	down_read(&fs_info->commit_root_sem);
	left_level = btrfs_header_level(left_root->commit_root);
	left_root_level = left_level;
	/*
	 * We clone the root node of the send and parent roots to prevent races
	 * with snapshot creation of these roots. Snapshot creation COWs the
	 * root node of a tree, so after the transaction is committed the old
	 * extent can be reallocated while this send operation is still ongoing.
	 * So we clone them, under the commit root semaphore, to be race free.
	 */
	left_path->nodes[left_level] =
			btrfs_clone_extent_buffer(left_root->commit_root);
	if (!left_path->nodes[left_level]) {
		ret = -ENOMEM;
		goto out_unlock;
	}

	right_level = btrfs_header_level(right_root->commit_root);
	right_root_level = right_level;
	right_path->nodes[right_level] =
			btrfs_clone_extent_buffer(right_root->commit_root);
	if (!right_path->nodes[right_level]) {
		ret = -ENOMEM;
		goto out_unlock;
	}
	/*
	 * Our right root is the parent root, while the left root is the "send"
	 * root. We know that all new nodes/leaves in the left root must have
	 * a generation greater than the right root's generation, so we trigger
	 * readahead for those nodes and leaves of the left root, as we know we
	 * will need to read them at some point.
	 */
	reada_min_gen = btrfs_header_generation(right_root->commit_root);

	if (left_level == 0)
		btrfs_item_key_to_cpu(left_path->nodes[left_level],
				&left_key, left_path->slots[left_level]);
	else
		btrfs_node_key_to_cpu(left_path->nodes[left_level],
				&left_key, left_path->slots[left_level]);
	if (right_level == 0)
		btrfs_item_key_to_cpu(right_path->nodes[right_level],
				&right_key, right_path->slots[right_level]);
	else
		btrfs_node_key_to_cpu(right_path->nodes[right_level],
				&right_key, right_path->slots[right_level]);

	sctx->last_reloc_trans = fs_info->last_reloc_trans;
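
	/*
	 * Main comparison loop: walk both trees in lockstep under the commit
	 * root semaphore, dropping it around callback invocations and when
	 * rescheduling is needed, and restarting if a block group relocation
	 * transaction committed in the meantime.
	 */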
	while (1) {
		if (need_resched() ||
		    rwsem_is_contended(&fs_info->commit_root_sem)) {
			up_read(&fs_info->commit_root_sem);
			cond_resched();
			down_read(&fs_info->commit_root_sem);
		}

		if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
			ret = restart_after_relocation(left_path, right_path,
						       &left_key, &right_key,
						       left_level, right_level,
						       sctx);
			if (ret < 0)
				goto out_unlock;
			sctx->last_reloc_trans = fs_info->last_reloc_trans;
		}

		if (advance_left && !left_end_reached) {
			ret = tree_advance(left_path, &left_level,
					left_root_level,
					advance_left != ADVANCE_ONLY_NEXT,
					&left_key, reada_min_gen);
			if (ret == -1)
				left_end_reached = ADVANCE;
			else if (ret < 0)
				goto out_unlock;
			advance_left = 0;
		}
		if (advance_right && !right_end_reached) {
			ret = tree_advance(right_path, &right_level,
					right_root_level,
					advance_right != ADVANCE_ONLY_NEXT,
					&right_key, reada_min_gen);
			if (ret == -1)
				right_end_reached = ADVANCE;
			else if (ret < 0)
				goto out_unlock;
			advance_right = 0;
		}
		if (left_end_reached && right_end_reached) {
			ret = 0;
			goto out_unlock;
		} else if (left_end_reached) {
			if (right_level == 0) {
				up_read(&fs_info->commit_root_sem);
				ret = changed_cb(left_path, right_path,
						&right_key,
						BTRFS_COMPARE_TREE_DELETED,
						sctx);
				if (ret < 0)
					goto out;
				down_read(&fs_info->commit_root_sem);
			}
			advance_right = ADVANCE;
			continue;
		} else if (right_end_reached) {
			if (left_level == 0) {
				up_read(&fs_info->commit_root_sem);
				ret = changed_cb(left_path, right_path,
						&left_key,
						BTRFS_COMPARE_TREE_NEW,
						sctx);
				if (ret < 0)
					goto out;
				down_read(&fs_info->commit_root_sem);
			}
			advance_left = ADVANCE;
			continue;
		}

		if (left_level == 0 && right_level == 0) {
			up_read(&fs_info->commit_root_sem);
			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
			if (cmp < 0) {
				ret = changed_cb(left_path, right_path,
						&left_key,
						BTRFS_COMPARE_TREE_NEW,
						sctx);
				advance_left = ADVANCE;
			} else if (cmp > 0) {
				ret = changed_cb(left_path, right_path,
						&right_key,
						BTRFS_COMPARE_TREE_DELETED,
						sctx);
				advance_right = ADVANCE;
			} else {
				enum btrfs_compare_tree_result result;

				WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
				ret = tree_compare_item(left_path, right_path,
							tmp_buf);
				if (ret)
					result = BTRFS_COMPARE_TREE_CHANGED;
				else
					result = BTRFS_COMPARE_TREE_SAME;
				ret = changed_cb(left_path, right_path,
						 &left_key, result, sctx);
				advance_left = ADVANCE;
				advance_right = ADVANCE;
			}

			if (ret < 0)
				goto out;
			down_read(&fs_info->commit_root_sem);
		} else if (left_level == right_level) {
			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
			if (cmp < 0) {
				advance_left = ADVANCE;
			} else if (cmp > 0) {
				advance_right = ADVANCE;
			} else {
				left_blockptr = btrfs_node_blockptr(
						left_path->nodes[left_level],
						left_path->slots[left_level]);
				right_blockptr = btrfs_node_blockptr(
						right_path->nodes[right_level],
						right_path->slots[right_level]);
				left_gen = btrfs_node_ptr_generation(
						left_path->nodes[left_level],
						left_path->slots[left_level]);
				right_gen = btrfs_node_ptr_generation(
						right_path->nodes[right_level],
						right_path->slots[right_level]);
				if (left_blockptr == right_blockptr &&
				    left_gen == right_gen) {
					/*
					 * As we're on a shared block, don't
					 * allow to go deeper.
					 */
					advance_left = ADVANCE_ONLY_NEXT;
					advance_right = ADVANCE_ONLY_NEXT;
				} else {
					advance_left = ADVANCE;
					advance_right = ADVANCE;
				}
			}
		} else if (left_level < right_level) {
			advance_right = ADVANCE;
		} else {
			advance_left = ADVANCE;
		}
	}
out_unlock:
	up_read(&fs_info->commit_root_sem);
out:
	btrfs_free_path(left_path);
	btrfs_free_path(right_path);
	kvfree(tmp_buf);
	return ret;
}
static int send_subvol(struct send_ctx *sctx)
{
	int ret;

	if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_STREAM_HEADER)) {
		ret = send_header(sctx);
		if (ret < 0)
			goto out;
	}

	ret = send_subvol_begin(sctx);
	if (ret < 0)
		goto out;

	if (sctx->parent_root) {
		ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx);
		if (ret < 0)
			goto out;
		ret = finish_inode_if_needed(sctx, 1);
		if (ret < 0)
			goto out;
	} else {
		ret = full_send_tree(sctx);
		if (ret < 0)
			goto out;
	}

out:
	free_recorded_refs(sctx);
	return ret;
}
/*
 * If orphan cleanup did remove any orphans from a root, it means the tree
 * was modified and therefore the commit root is not the same as the current
 * root anymore. This is a problem, because send uses the commit root and
 * therefore can see inode items that don't exist in the current root anymore,
 * and for example make calls to btrfs_iget, which will do tree lookups based
 * on the current root and not on the commit root. Those lookups will fail,
 * returning a -ESTALE error, and making send fail with that error. So make
 * sure a send does not see any orphans we have just removed, and that it will
 * see the same inodes regardless of whether a transaction commit happened
 * before it started (meaning that the commit root will be the same as the
 * current root) or not.
 */
static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
{
	struct btrfs_root *root = sctx->parent_root;

	if (root && root->node != root->commit_root)
		return btrfs_commit_current_transaction(root);

	for (int i = 0; i < sctx->clone_roots_cnt; i++) {
		root = sctx->clone_roots[i].root;
		if (root->node != root->commit_root)
			return btrfs_commit_current_transaction(root);
	}

	return 0;
}
/*
 * Make sure any existing delalloc is flushed for any root used by a send
 * operation so that we do not miss any data and we do not race with writeback
 * finishing and changing a tree while send is using the tree. This could
 * happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and
 * a send operation then uses the subvolume.
 * After flushing delalloc ensure_commit_roots_uptodate() must be called.
 */
static int flush_delalloc_roots(struct send_ctx *sctx)
{
	struct btrfs_root *root = sctx->parent_root;
	int ret;
	int i;

	if (root) {
		ret = btrfs_start_delalloc_snapshot(root, false);
		if (ret)
			return ret;
		btrfs_wait_ordered_extents(root, U64_MAX, NULL);
	}

	for (i = 0; i < sctx->clone_roots_cnt; i++) {
		root = sctx->clone_roots[i].root;
		ret = btrfs_start_delalloc_snapshot(root, false);
		if (ret)
			return ret;
		btrfs_wait_ordered_extents(root, U64_MAX, NULL);
	}

	return 0;
}
static void btrfs_root_dec_send_in_progress(struct btrfs_root *root)
{
	spin_lock(&root->root_item_lock);
	root->send_in_progress--;
	/*
	 * Not much left to do, we don't know why it's unbalanced and
	 * can't blindly reset it to 0.
	 */
	if (root->send_in_progress < 0)
		btrfs_err(root->fs_info,
			  "send_in_progress unbalanced %d root %llu",
			  root->send_in_progress, btrfs_root_id(root));
	spin_unlock(&root->root_item_lock);
}
static void dedupe_in_progress_warn(const struct btrfs_root *root)
{
	btrfs_warn_rl(root->fs_info,
"cannot use root %llu for send while deduplications on it are in progress (%d in progress)",
		      btrfs_root_id(root), root->dedupe_in_progress);
}
long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg)
{
	int ret = 0;
	struct btrfs_root *send_root = inode->root;
	struct btrfs_fs_info *fs_info = send_root->fs_info;
	struct btrfs_root *clone_root;
	struct send_ctx *sctx = NULL;
	u32 i;
	u64 *clone_sources_tmp = NULL;
	int clone_sources_to_rollback = 0;
	size_t alloc_size;
	int sort_clone_roots = 0;
	struct btrfs_lru_cache_entry *entry;
	struct btrfs_lru_cache_entry *tmp;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/*
	 * The subvolume must remain read-only during send, protect against
	 * making it RW. This also protects against deletion.
	 */
	spin_lock(&send_root->root_item_lock);
	/*
	 * Unlikely but possible, if the subvolume is marked for deletion but
	 * is slow to remove the directory entry, send can still be started.
	 */
	if (btrfs_root_dead(send_root)) {
		spin_unlock(&send_root->root_item_lock);
		return -EPERM;
	}
	/* Userspace tools do the checks and warn the user if it's not RO. */
	if (!btrfs_root_readonly(send_root)) {
		spin_unlock(&send_root->root_item_lock);
		return -EPERM;
	}
	if (send_root->dedupe_in_progress) {
		dedupe_in_progress_warn(send_root);
		spin_unlock(&send_root->root_item_lock);
		return -EAGAIN;
	}

	send_root->send_in_progress++;
	spin_unlock(&send_root->root_item_lock);
	/*
	 * Check that we don't overflow at later allocations, we request
	 * clone_sources_count + 1 items, and compare to unsigned long inside
	 * access_ok. Also set an upper limit for allocation size so this can't
	 * easily exhaust memory. Max number of clone sources is about 200K.
	 */
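	/*
	 * As a rough sanity check of the figure above: assuming struct
	 * clone_root is around 40 bytes on x86_64, SZ_8M / sizeof(struct
	 * clone_root) comes out to roughly 200K entries.
	 */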
	if (arg->clone_sources_count > SZ_8M / sizeof(struct clone_root)) {
		ret = -EINVAL;
		goto out;
	}

	if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
		ret = -EOPNOTSUPP;
		goto out;
	}

	sctx = kzalloc(sizeof(struct send_ctx), GFP_KERNEL);
	if (!sctx) {
		ret = -ENOMEM;
		goto out;
	}

	INIT_LIST_HEAD(&sctx->new_refs);
	INIT_LIST_HEAD(&sctx->deleted_refs);

	btrfs_lru_cache_init(&sctx->name_cache, SEND_MAX_NAME_CACHE_SIZE);
	btrfs_lru_cache_init(&sctx->backref_cache, SEND_MAX_BACKREF_CACHE_SIZE);
	btrfs_lru_cache_init(&sctx->dir_created_cache,
			     SEND_MAX_DIR_CREATED_CACHE_SIZE);
	/*
	 * This cache is periodically trimmed to a fixed size elsewhere, see
	 * cache_dir_utimes() and trim_dir_utimes_cache().
	 */
	btrfs_lru_cache_init(&sctx->dir_utimes_cache, 0);
	sctx->pending_dir_moves = RB_ROOT;
	sctx->waiting_dir_moves = RB_ROOT;
	sctx->orphan_dirs = RB_ROOT;
	sctx->rbtree_new_refs = RB_ROOT;
	sctx->rbtree_deleted_refs = RB_ROOT;

	sctx->flags = arg->flags;

	if (arg->flags & BTRFS_SEND_FLAG_VERSION) {
		if (arg->version > BTRFS_SEND_STREAM_VERSION) {
			ret = -EPROTO;
			goto out;
		}
		/* Zero means "use the highest version" */
		sctx->proto = arg->version ?: BTRFS_SEND_STREAM_VERSION;
	} else {
		sctx->proto = 1;
	}
	if ((arg->flags & BTRFS_SEND_FLAG_COMPRESSED) && sctx->proto < 2) {
		ret = -EINVAL;
		goto out;
	}

	sctx->send_filp = fget(arg->send_fd);
	if (!sctx->send_filp || !(sctx->send_filp->f_mode & FMODE_WRITE)) {
		ret = -EBADF;
		goto out;
	}

	sctx->send_root = send_root;
	sctx->clone_roots_cnt = arg->clone_sources_count;
	if (sctx->proto >= 2) {
		u32 send_buf_num_pages;

		sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V2;
		sctx->send_buf = vmalloc(sctx->send_max_size);
		if (!sctx->send_buf) {
			ret = -ENOMEM;
			goto out;
		}
		send_buf_num_pages = sctx->send_max_size >> PAGE_SHIFT;
		sctx->send_buf_pages = kcalloc(send_buf_num_pages,
					       sizeof(*sctx->send_buf_pages),
					       GFP_KERNEL);
		if (!sctx->send_buf_pages) {
			ret = -ENOMEM;
			goto out;
		}
		for (i = 0; i < send_buf_num_pages; i++) {
			sctx->send_buf_pages[i] =
				vmalloc_to_page(sctx->send_buf + (i << PAGE_SHIFT));
		}
	} else {
		sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V1;
		sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL);
	}
	if (!sctx->send_buf) {
		ret = -ENOMEM;
		goto out;
	}

	sctx->clone_roots = kvcalloc(arg->clone_sources_count + 1,
				     sizeof(*sctx->clone_roots),
				     GFP_KERNEL);
	if (!sctx->clone_roots) {
		ret = -ENOMEM;
		goto out;
	}
= array_size(sizeof(*arg
->clone_sources
),
8255 arg
->clone_sources_count
);
8257 if (arg
->clone_sources_count
) {
8258 clone_sources_tmp
= kvmalloc(alloc_size
, GFP_KERNEL
);
8259 if (!clone_sources_tmp
) {
8264 ret
= copy_from_user(clone_sources_tmp
, arg
->clone_sources
,
8271 for (i
= 0; i
< arg
->clone_sources_count
; i
++) {
8272 clone_root
= btrfs_get_fs_root(fs_info
,
8273 clone_sources_tmp
[i
], true);
8274 if (IS_ERR(clone_root
)) {
8275 ret
= PTR_ERR(clone_root
);
8278 spin_lock(&clone_root
->root_item_lock
);
8279 if (!btrfs_root_readonly(clone_root
) ||
8280 btrfs_root_dead(clone_root
)) {
8281 spin_unlock(&clone_root
->root_item_lock
);
8282 btrfs_put_root(clone_root
);
8286 if (clone_root
->dedupe_in_progress
) {
8287 dedupe_in_progress_warn(clone_root
);
8288 spin_unlock(&clone_root
->root_item_lock
);
8289 btrfs_put_root(clone_root
);
8293 clone_root
->send_in_progress
++;
8294 spin_unlock(&clone_root
->root_item_lock
);
8296 sctx
->clone_roots
[i
].root
= clone_root
;
8297 clone_sources_to_rollback
= i
+ 1;
8299 kvfree(clone_sources_tmp
);
8300 clone_sources_tmp
= NULL
;
	if (arg->parent_root) {
		sctx->parent_root = btrfs_get_fs_root(fs_info, arg->parent_root,
						      true);
		if (IS_ERR(sctx->parent_root)) {
			ret = PTR_ERR(sctx->parent_root);
			goto out;
		}

		spin_lock(&sctx->parent_root->root_item_lock);
		sctx->parent_root->send_in_progress++;
		if (!btrfs_root_readonly(sctx->parent_root) ||
				btrfs_root_dead(sctx->parent_root)) {
			spin_unlock(&sctx->parent_root->root_item_lock);
			ret = -EPERM;
			goto out;
		}
		if (sctx->parent_root->dedupe_in_progress) {
			dedupe_in_progress_warn(sctx->parent_root);
			spin_unlock(&sctx->parent_root->root_item_lock);
			ret = -EAGAIN;
			goto out;
		}
		spin_unlock(&sctx->parent_root->root_item_lock);
	}
	/*
	 * Clones from send_root are allowed, but only if the clone source
	 * is behind the current send position. This is checked while searching
	 * for possible clone sources.
	 */
	sctx->clone_roots[sctx->clone_roots_cnt++].root =
		btrfs_grab_root(sctx->send_root);

	/* We do a bsearch later */
	sort(sctx->clone_roots, sctx->clone_roots_cnt,
			sizeof(*sctx->clone_roots), __clone_root_cmp_sort,
			NULL);
	sort_clone_roots = 1;

	ret = flush_delalloc_roots(sctx);
	if (ret)
		goto out;

	ret = ensure_commit_roots_uptodate(sctx);
	if (ret)
		goto out;

	ret = send_subvol(sctx);
	if (ret < 0)
		goto out;
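
	/*
	 * Send utimes for any directories still tracked in the utimes cache,
	 * now that all other changes for the subvolume have been streamed.
	 */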
	btrfs_lru_cache_for_each_entry_safe(&sctx->dir_utimes_cache, entry, tmp) {
		ret = send_utimes(sctx, entry->key, entry->gen);
		if (ret < 0)
			goto out;
		btrfs_lru_cache_remove(&sctx->dir_utimes_cache, entry);
	}

	if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) {
		ret = begin_cmd(sctx, BTRFS_SEND_C_END);
		if (ret < 0)
			goto out;
		ret = send_cmd(sctx);
		if (ret < 0)
			goto out;
	}
out:
	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves));
	while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) {
		struct rb_node *n;
		struct pending_dir_move *pm;

		n = rb_first(&sctx->pending_dir_moves);
		pm = rb_entry(n, struct pending_dir_move, node);
		while (!list_empty(&pm->list)) {
			struct pending_dir_move *pm2;

			pm2 = list_first_entry(&pm->list,
					       struct pending_dir_move, list);
			free_pending_move(sctx, pm2);
		}
		free_pending_move(sctx, pm);
	}

	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves));
	while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) {
		struct rb_node *n;
		struct waiting_dir_move *dm;

		n = rb_first(&sctx->waiting_dir_moves);
		dm = rb_entry(n, struct waiting_dir_move, node);
		rb_erase(&dm->node, &sctx->waiting_dir_moves);
		kfree(dm);
	}

	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
	while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
		struct rb_node *n;
		struct orphan_dir_info *odi;

		n = rb_first(&sctx->orphan_dirs);
		odi = rb_entry(n, struct orphan_dir_info, node);
		free_orphan_dir_info(sctx, odi);
	}
	if (sort_clone_roots) {
		for (i = 0; i < sctx->clone_roots_cnt; i++) {
			btrfs_root_dec_send_in_progress(
					sctx->clone_roots[i].root);
			btrfs_put_root(sctx->clone_roots[i].root);
		}
	} else {
		for (i = 0; sctx && i < clone_sources_to_rollback; i++) {
			btrfs_root_dec_send_in_progress(
					sctx->clone_roots[i].root);
			btrfs_put_root(sctx->clone_roots[i].root);
		}

		btrfs_root_dec_send_in_progress(send_root);
	}
	if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) {
		btrfs_root_dec_send_in_progress(sctx->parent_root);
		btrfs_put_root(sctx->parent_root);
	}

	kvfree(clone_sources_tmp);

	if (sctx) {
		if (sctx->send_filp)
			fput(sctx->send_filp);

		kvfree(sctx->clone_roots);
		kfree(sctx->send_buf_pages);
		kvfree(sctx->send_buf);
		kvfree(sctx->verity_descriptor);

		close_current_inode(sctx);

		btrfs_lru_cache_clear(&sctx->name_cache);
		btrfs_lru_cache_clear(&sctx->backref_cache);
		btrfs_lru_cache_clear(&sctx->dir_created_cache);
		btrfs_lru_cache_clear(&sctx->dir_utimes_cache);