// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2012 Alexander Block.  All rights reserved.
 */

#include <linux/bsearch.h>
#include <linux/file.h>
#include <linux/sort.h>
#include <linux/mount.h>
#include <linux/xattr.h>
#include <linux/posix_acl_xattr.h>
#include <linux/radix-tree.h>
#include <linux/vmalloc.h>
#include <linux/string.h>
#include <linux/compat.h>
#include <linux/crc32c.h>
#include <linux/fsverity.h>

#include "btrfs_inode.h"
#include "transaction.h"
#include "compression.h"
#include "print-tree.h"
#include "accessors.h"
#include "file-item.h"
#include "lru_cache.h"

/*
 * Maximum number of references an extent can have in order for us to attempt to
 * issue clone operations instead of write operations. This currently exists to
 * avoid hitting limitations of the backreference walking code (taking a lot of
 * time and using too much memory for extents with large number of references).
 */
#define SEND_MAX_EXTENT_REFS	1024

/*
 * A fs_path is a helper to dynamically build path names with unknown size.
 * It reallocates the internal buffer on demand.
 * It allows fast adding of path elements on the right side (normal path) and
 * fast adding to the left side (reversed path). A reversed path can also be
 * unreversed if needed.
 */
struct fs_path {
        union {
                struct {
                        char *start;
                        char *end;

                        char *buf;
                        unsigned short buf_len:15;
                        unsigned short reversed:1;
                        char inline_buf[];
                };
                /*
                 * Average path length does not exceed 200 bytes, we'll have
                 * better packing in the slab and higher chance to satisfy
                 * an allocation later during send.
                 */
                char pad[256];
        };
};
#define FS_PATH_INLINE_SIZE \
        (sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf))
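
/*
 * An illustrative sketch (not taken verbatim from the original sources) of how
 * a reversed fs_path is typically used, see get_cur_path() further below:
 * path components are discovered child-first while walking up towards the
 * subvolume root, so they are prepended on the left and the final path can be
 * read from p->start once done.
 *
 *	struct fs_path *p = fs_path_alloc_reversed();
 *
 *	fs_path_add(p, "file", 4);	// p->start == "file"
 *	fs_path_add(p, "dir", 3);	// p->start == "dir/file"
 *	fs_path_add(p, "a", 1);		// p->start == "a/dir/file"
 *	fs_path_free(p);
 */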

/* reused for each extent */
struct clone_root {
        struct btrfs_root *root;
        u64 ino;
        u64 offset;
        u64 num_bytes;
        bool found_ref;
};

#define SEND_MAX_NAME_CACHE_SIZE	256

/*
 * Limit the root_ids array of struct backref_cache_entry to 17 elements.
 * This makes the size of a cache entry to be exactly 192 bytes on x86_64, which
 * can be satisfied from the kmalloc-192 slab, without wasting any space.
 * The most common case is to have a single root for cloning, which corresponds
 * to the send root. Having the user specify more than 16 clone roots is not
 * common, and in such rare cases we simply don't use caching if the number of
 * cloning roots that lead down to a leaf is more than 17.
 */
#define SEND_MAX_BACKREF_CACHE_ROOTS	17

/*
 * Max number of entries in the cache.
 * With SEND_MAX_BACKREF_CACHE_ROOTS as 17, the size in bytes, excluding
 * maple tree's internal nodes, is 24K.
 */
#define SEND_MAX_BACKREF_CACHE_SIZE	128

/*
 * A backref cache entry maps a leaf to a list of IDs of roots from which the
 * leaf is accessible and we can use for clone operations.
 * With SEND_MAX_BACKREF_CACHE_ROOTS as 17, each cache entry is 192 bytes (on
 * x86_64).
 */
struct backref_cache_entry {
        struct btrfs_lru_cache_entry entry;
        u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS];
        /* Number of valid elements in the root_ids array. */
        int num_roots;
};

/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */
static_assert(offsetof(struct backref_cache_entry, entry) == 0);

/*
 * Max number of entries in the cache that stores directories that were already
 * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses
 * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but
 * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64).
 */
#define SEND_MAX_DIR_CREATED_CACHE_SIZE	64

/*
 * Max number of entries in the cache that stores directories for which we
 * delay sending the utimes update. The cache uses raw struct
 * btrfs_lru_cache_entry entries, so it uses at most 4096 bytes -
 * sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but the kmalloc-64 slab
 * is used, so we get 4096 bytes (64 bytes * 64).
 */
#define SEND_MAX_DIR_UTIMES_CACHE_SIZE	64

struct send_ctx {
        struct file *send_filp;
        loff_t send_off;
        char *send_buf;
        u32 send_size;
        u32 send_max_size;
        /*
         * Whether BTRFS_SEND_A_DATA attribute was already added to current
         * command (since protocol v2, data must be the last attribute).
         */
        bool put_data;
        struct page **send_buf_pages;
        u64 flags;	/* 'flags' member of btrfs_ioctl_send_args is u64 */
        /* Protocol version compatibility requested */
        u32 proto;

        struct btrfs_root *send_root;
        struct btrfs_root *parent_root;
        struct clone_root *clone_roots;
        int clone_roots_cnt;

        /* current state of the compare_tree call */
        struct btrfs_path *left_path;
        struct btrfs_path *right_path;
        struct btrfs_key *cmp_key;

        /*
         * Keep track of the generation of the last transaction that was used
         * for relocating a block group. This is periodically checked in order
         * to detect if a relocation happened since the last check, so that we
         * don't operate on stale extent buffers for nodes (level >= 1) or on
         * stale disk_bytenr values of file extent items.
         */
        u64 last_reloc_trans;

        /*
         * infos of the currently processed inode. In case of deleted inodes,
         * these are the values from the deleted inode.
         */
        u64 cur_ino;
        u64 cur_inode_gen;
        u64 cur_inode_size;
        u64 cur_inode_mode;
        u64 cur_inode_last_extent;
        u64 cur_inode_next_write_offset;
        bool cur_inode_new;
        bool cur_inode_new_gen;
        bool cur_inode_deleted;
        bool ignore_cur_inode;
        bool cur_inode_needs_verity;
        void *verity_descriptor;

        struct list_head new_refs;
        struct list_head deleted_refs;

        struct btrfs_lru_cache name_cache;

        /*
         * The inode we are currently processing. It's not NULL only when we
         * need to issue write commands for data extents from this inode.
         */
        struct inode *cur_inode;
        struct file_ra_state ra;
        u64 page_cache_clear_start;
        bool clean_page_cache;

        /*
         * We process inodes by their increasing order, so if before an
         * incremental send we reverse the parent/child relationship of
         * directories such that a directory with a lower inode number was
         * the parent of a directory with a higher inode number, and the one
         * becoming the new parent got renamed too, we can't rename/move the
         * directory with lower inode number when we finish processing it - we
         * must process the directory with higher inode number first, then
         * rename/move it and then rename/move the directory with lower inode
         * number. Example follows.
         *
         * Tree state when the first send was performed:
         *
         * Tree state when the second (incremental) send is performed:
         *
         * The sequence of steps that lead to the second state was:
         *
         * mv /a/b/c/d /a/b/c2/d2
         * mv /a/b/c /a/b/c2/d2/cc
         *
         * "c" has lower inode number, but we can't move it (2nd mv operation)
         * before we move "d", which has higher inode number.
         *
         * So we just memorize which move/rename operations must be performed
         * later when their respective parent is processed and moved/renamed.
         */

        /* Indexed by parent directory inode number. */
        struct rb_root pending_dir_moves;

        /*
         * Reverse index, indexed by the inode number of a directory that
         * is waiting for the move/rename of its immediate parent before its
         * own move/rename can be performed.
         */
        struct rb_root waiting_dir_moves;

        /*
         * A directory that is going to be rm'ed might have a child directory
         * which is in the pending directory moves index above. In this case,
         * the directory can only be removed after the move/rename of its child
         * is performed. Example:
         *
         * Sequence of steps that lead to the send snapshot:
         * rm -f /a/b/c/foo.txt
         * mv /a/b/c/x /a/b/YY
         *
         * When the child is processed, its move/rename is delayed until its
         * parent is processed (as explained above), but all other operations
         * like update utimes, chown, chgrp, etc, are performed and the paths
         * that it uses for those operations must use the orphanized name of
         * its parent (the directory we're going to rm later), so we need to
         * memorize that name.
         *
         * Indexed by the inode number of the directory to be deleted.
         */
        struct rb_root orphan_dirs;

        struct rb_root rbtree_new_refs;
        struct rb_root rbtree_deleted_refs;

        struct btrfs_lru_cache backref_cache;
        u64 backref_cache_last_reloc_trans;

        struct btrfs_lru_cache dir_created_cache;
        struct btrfs_lru_cache dir_utimes_cache;
};

struct pending_dir_move {
        struct rb_node node;
        struct list_head list;
        u64 parent_ino;
        u64 ino;
        u64 gen;
        struct list_head update_refs;
};

struct waiting_dir_move {
        struct rb_node node;
        u64 ino;
        /*
         * There might be some directory that could not be removed because it
         * was waiting for this directory inode to be moved first. Therefore
         * after this directory is moved, we can try to rmdir the ino rmdir_ino.
         */
        u64 rmdir_ino;
        u64 rmdir_gen;
        bool orphanized;
};

struct orphan_dir_info {
        struct rb_node node;
        u64 ino;
        u64 gen;
        u64 last_dir_index_offset;
        u64 dir_high_seq_ino;
};

struct name_cache_entry {
        /*
         * The key in the entry is an inode number, and the generation matches
         * the inode's generation.
         */
        struct btrfs_lru_cache_entry entry;
        u64 parent_ino;
        u64 parent_gen;
        int need_later_update;
        int name_len;
        char name[] __counted_by(name_len);
};

/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */
static_assert(offsetof(struct name_cache_entry, entry) == 0);
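
/*
 * For example (inode numbers purely illustrative): after
 * __get_cur_name_and_parent() resolves inode 259 (gen 5) to the name "c"
 * under parent directory inode 258, it stores a name_cache_entry keyed by
 * (259, 5) carrying parent_ino = 258, the parent's generation and the name
 * itself, so later path lookups for the same inode can skip the expensive
 * backref search.
 */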

#define ADVANCE_ONLY_NEXT	-1

enum btrfs_compare_tree_result {
        BTRFS_COMPARE_TREE_NEW,
        BTRFS_COMPARE_TREE_DELETED,
        BTRFS_COMPARE_TREE_CHANGED,
        BTRFS_COMPARE_TREE_SAME,
};

static void inconsistent_snapshot_error(struct send_ctx *sctx,
                                        enum btrfs_compare_tree_result result,
                                        const char *what)
{
        const char *result_string;

        switch (result) {
        case BTRFS_COMPARE_TREE_NEW:
                result_string = "new";
                break;
        case BTRFS_COMPARE_TREE_DELETED:
                result_string = "deleted";
                break;
        case BTRFS_COMPARE_TREE_CHANGED:
                result_string = "updated";
                break;
        case BTRFS_COMPARE_TREE_SAME:
                ASSERT(0);
                result_string = "unchanged";
                break;
        default:
                ASSERT(0);
                result_string = "unexpected";
        }

        btrfs_err(sctx->send_root->fs_info,
                  "Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu",
                  result_string, what, sctx->cmp_key->objectid,
                  btrfs_root_id(sctx->send_root),
                  (sctx->parent_root ? btrfs_root_id(sctx->parent_root) : 0));
}

static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd)
{
        switch (sctx->proto) {
        case 1: return cmd <= BTRFS_SEND_C_MAX_V1;
        case 2: return cmd <= BTRFS_SEND_C_MAX_V2;
        case 3: return cmd <= BTRFS_SEND_C_MAX_V3;
        default: return false;
        }
}
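
/*
 * For example, BTRFS_SEND_C_FILEATTR was introduced with protocol v2, so
 * proto_cmd_ok(sctx, BTRFS_SEND_C_FILEATTR) is only true when sctx->proto is
 * at least 2, and send_fileattr() further below simply returns early when
 * sctx->proto < 2 instead of emitting a command an older receiver would not
 * understand.
 */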

static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);

static struct waiting_dir_move *
get_waiting_dir_move(struct send_ctx *sctx, u64 ino);

static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen);

static int need_send_hole(struct send_ctx *sctx)
{
        return (sctx->parent_root && !sctx->cur_inode_new &&
                !sctx->cur_inode_new_gen && !sctx->cur_inode_deleted &&
                S_ISREG(sctx->cur_inode_mode));
}

static void fs_path_reset(struct fs_path *p)
{
        if (p->reversed) {
                p->start = p->buf + p->buf_len - 1;
                p->end = p->start;
                *p->start = 0;
        } else {
                p->start = p->buf;
                p->end = p->start;
                *p->start = 0;
        }
}

static struct fs_path *fs_path_alloc(void)
{
        struct fs_path *p;

        p = kmalloc(sizeof(*p), GFP_KERNEL);
        if (!p)
                return NULL;
        p->reversed = 0;
        p->buf = p->inline_buf;
        p->buf_len = FS_PATH_INLINE_SIZE;
        fs_path_reset(p);
        return p;
}

static struct fs_path *fs_path_alloc_reversed(void)
{
        struct fs_path *p;

        p = fs_path_alloc();
        if (!p)
                return NULL;
        p->reversed = 1;
        fs_path_reset(p);
        return p;
}

static void fs_path_free(struct fs_path *p)
{
        if (!p)
                return;
        if (p->buf != p->inline_buf)
                kfree(p->buf);
        kfree(p);
}

static int fs_path_len(struct fs_path *p)
{
        return p->end - p->start;
}

static int fs_path_ensure_buf(struct fs_path *p, int len)
{
        char *tmp_buf;
        int path_len;
        int old_buf_len;

        len++;

        if (p->buf_len >= len)
                return 0;

        if (len > PATH_MAX) {
                WARN_ON(1);
                return -ENOMEM;
        }

        path_len = p->end - p->start;
        old_buf_len = p->buf_len;

        /*
         * Allocate to the next largest kmalloc bucket size, to let
         * the fast path happen most of the time.
         */
        len = kmalloc_size_roundup(len);
        /*
         * First time the inline_buf does not suffice
         */
        if (p->buf == p->inline_buf) {
                tmp_buf = kmalloc(len, GFP_KERNEL);
                if (tmp_buf)
                        memcpy(tmp_buf, p->buf, old_buf_len);
        } else {
                tmp_buf = krealloc(p->buf, len, GFP_KERNEL);
        }
        if (!tmp_buf)
                return -ENOMEM;
        p->buf = tmp_buf;
        p->buf_len = len;

        if (p->reversed) {
                tmp_buf = p->buf + old_buf_len - path_len - 1;
                p->end = p->buf + p->buf_len - 1;
                p->start = p->end - path_len;
                memmove(p->start, tmp_buf, path_len + 1);
        } else {
                p->start = p->buf;
                p->end = p->start + path_len;
        }
        return 0;
}

static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
                                   char **prepared)
{
        int ret;
        int new_len;

        new_len = p->end - p->start + name_len;
        if (p->start != p->end)
                new_len++;
        ret = fs_path_ensure_buf(p, new_len);
        if (ret < 0)
                goto out;

        if (p->reversed) {
                if (p->start != p->end)
                        *--p->start = '/';
                p->start -= name_len;
                *prepared = p->start;
        } else {
                if (p->start != p->end)
                        *p->end++ = '/';
                *prepared = p->end;
                p->end += name_len;
                *p->end = 0;
        }

out:
        return ret;
}

static int fs_path_add(struct fs_path *p, const char *name, int name_len)
{
        int ret;
        char *prepared;

        ret = fs_path_prepare_for_add(p, name_len, &prepared);
        if (ret < 0)
                goto out;
        memcpy(prepared, name, name_len);

out:
        return ret;
}

static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
{
        int ret;
        char *prepared;

        ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
        if (ret < 0)
                goto out;
        memcpy(prepared, p2->start, p2->end - p2->start);

out:
        return ret;
}

static int fs_path_add_from_extent_buffer(struct fs_path *p,
                                          struct extent_buffer *eb,
                                          unsigned long off, int len)
{
        int ret;
        char *prepared;

        ret = fs_path_prepare_for_add(p, len, &prepared);
        if (ret < 0)
                goto out;

        read_extent_buffer(eb, prepared, off, len);

out:
        return ret;
}

static int fs_path_copy(struct fs_path *p, struct fs_path *from)
{
        p->reversed = from->reversed;
        fs_path_reset(p);

        return fs_path_add_path(p, from);
}

static void fs_path_unreverse(struct fs_path *p)
{
        char *tmp;
        int len;

        if (!p->reversed)
                return;

        tmp = p->start;
        len = p->end - p->start;
        p->start = p->buf;
        p->end = p->start + len;
        memmove(p->start, tmp, len + 1);

        p->reversed = 0;
}

static struct btrfs_path *alloc_path_for_send(void)
{
        struct btrfs_path *path;

        path = btrfs_alloc_path();
        if (!path)
                return NULL;
        path->search_commit_root = 1;
        path->skip_locking = 1;
        path->need_commit_sem = 1;
        return path;
}

static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
{
        int ret;
        u32 pos = 0;

        while (pos < len) {
                ret = kernel_write(filp, buf + pos, len - pos, off);
                if (ret < 0)
                        return ret;
                if (ret == 0)
                        return -EIO;
                pos += ret;
        }

        return 0;
}

static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
{
        struct btrfs_tlv_header *hdr;
        int total_len = sizeof(*hdr) + len;
        int left = sctx->send_max_size - sctx->send_size;

        if (WARN_ON_ONCE(sctx->put_data))
                return -EINVAL;

        if (unlikely(left < total_len))
                return -EOVERFLOW;

        hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size);
        put_unaligned_le16(attr, &hdr->tlv_type);
        put_unaligned_le16(len, &hdr->tlv_len);
        memcpy(hdr + 1, data, len);
        sctx->send_size += total_len;

        return 0;
}

#define TLV_PUT_DEFINE_INT(bits) \
        static int tlv_put_u##bits(struct send_ctx *sctx, \
                                   u##bits attr, u##bits value) \
        { \
                __le##bits __tmp = cpu_to_le##bits(value); \
                return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \
        }

TLV_PUT_DEFINE_INT(8)
TLV_PUT_DEFINE_INT(32)
TLV_PUT_DEFINE_INT(64)
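
/*
 * For example, TLV_PUT_DEFINE_INT(64) above expands to roughly the following
 * helper (shown here only for illustration):
 *
 *	static int tlv_put_u64(struct send_ctx *sctx, u64 attr, u64 value)
 *	{
 *		__le64 __tmp = cpu_to_le64(value);
 *		return tlv_put(sctx, attr, &__tmp, sizeof(__tmp));
 *	}
 *
 * i.e. every integer attribute is stored little-endian in the stream,
 * preceded by a btrfs_tlv_header holding the attribute type and length.
 */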

static int tlv_put_string(struct send_ctx *sctx, u16 attr,
                          const char *str, int len)
{
        if (len == -1)
                len = strlen(str);
        return tlv_put(sctx, attr, str, len);
}

static int tlv_put_uuid(struct send_ctx *sctx, u16 attr,
                        const u8 *uuid)
{
        return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE);
}

static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
                                  struct extent_buffer *eb,
                                  struct btrfs_timespec *ts)
{
        struct btrfs_timespec bts;
        read_extent_buffer(eb, &bts, (unsigned long)ts, sizeof(bts));
        return tlv_put(sctx, attr, &bts, sizeof(bts));
}

#define TLV_PUT(sctx, attrtype, data, attrlen) \
        do { \
                ret = tlv_put(sctx, attrtype, data, attrlen); \
                if (ret < 0) \
                        goto tlv_put_failure; \
        } while (0)

#define TLV_PUT_INT(sctx, attrtype, bits, value) \
        do { \
                ret = tlv_put_u##bits(sctx, attrtype, value); \
                if (ret < 0) \
                        goto tlv_put_failure; \
        } while (0)

#define TLV_PUT_U8(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 8, data)
#define TLV_PUT_U16(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 16, data)
#define TLV_PUT_U32(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 32, data)
#define TLV_PUT_U64(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 64, data)
#define TLV_PUT_STRING(sctx, attrtype, str, len) \
        do { \
                ret = tlv_put_string(sctx, attrtype, str, len); \
                if (ret < 0) \
                        goto tlv_put_failure; \
        } while (0)
#define TLV_PUT_PATH(sctx, attrtype, p) \
        do { \
                ret = tlv_put_string(sctx, attrtype, p->start, \
                                     p->end - p->start); \
                if (ret < 0) \
                        goto tlv_put_failure; \
        } while (0)
#define TLV_PUT_UUID(sctx, attrtype, uuid) \
        do { \
                ret = tlv_put_uuid(sctx, attrtype, uuid); \
                if (ret < 0) \
                        goto tlv_put_failure; \
        } while (0)
#define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \
        do { \
                ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \
                if (ret < 0) \
                        goto tlv_put_failure; \
        } while (0)
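
/*
 * Putting the pieces together, every command emitted into the stream looks
 * roughly like this (a send_rename() below used as an illustrative example):
 *
 *	struct btrfs_cmd_header  { len, cmd = BTRFS_SEND_C_RENAME, crc }
 *	struct btrfs_tlv_header  { tlv_type = BTRFS_SEND_A_PATH,    tlv_len }
 *	<path bytes>
 *	struct btrfs_tlv_header  { tlv_type = BTRFS_SEND_A_PATH_TO, tlv_len }
 *	<path bytes>
 *
 * begin_cmd() reserves room for the command header, the TLV_PUT* macros
 * append the attributes, and send_cmd() fills in the length and crc32c
 * before writing the buffer out with write_buf().
 */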

static int send_header(struct send_ctx *sctx)
{
        struct btrfs_stream_header hdr;

        strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
        hdr.version = cpu_to_le32(sctx->proto);
        return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
                         &sctx->send_off);
}

/*
 * For each command/item we want to send to userspace, we call this function.
 */
static int begin_cmd(struct send_ctx *sctx, int cmd)
{
        struct btrfs_cmd_header *hdr;

        if (WARN_ON(!sctx->send_buf))
                return -EINVAL;

        if (unlikely(sctx->send_size != 0)) {
                btrfs_err(sctx->send_root->fs_info,
                          "send: command header buffer not empty cmd %d offset %llu",
                          cmd, sctx->send_off);
                return -EINVAL;
        }

        sctx->send_size += sizeof(*hdr);
        hdr = (struct btrfs_cmd_header *)sctx->send_buf;
        put_unaligned_le16(cmd, &hdr->cmd);

        return 0;
}

static int send_cmd(struct send_ctx *sctx)
{
        int ret;
        struct btrfs_cmd_header *hdr;
        u32 crc;

        hdr = (struct btrfs_cmd_header *)sctx->send_buf;
        put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len);
        put_unaligned_le32(0, &hdr->crc);

        crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
        put_unaligned_le32(crc, &hdr->crc);

        ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
                        &sctx->send_off);

        sctx->send_size = 0;
        sctx->put_data = false;

        return ret;
}

/*
 * Sends a move instruction to user space
 */
static int send_rename(struct send_ctx *sctx,
                       struct fs_path *from, struct fs_path *to)
{
        struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
        int ret;

        btrfs_debug(fs_info, "send_rename %s -> %s", from->start, to->start);

        ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME);
        if (ret < 0)
                goto out;

        TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from);
        TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to);

        ret = send_cmd(sctx);

tlv_put_failure:
out:
        return ret;
}

/*
 * Sends a link instruction to user space
 */
static int send_link(struct send_ctx *sctx,
                     struct fs_path *path, struct fs_path *lnk)
{
        struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
        int ret;

        btrfs_debug(fs_info, "send_link %s -> %s", path->start, lnk->start);

        ret = begin_cmd(sctx, BTRFS_SEND_C_LINK);
        if (ret < 0)
                goto out;

        TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
        TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk);

        ret = send_cmd(sctx);

tlv_put_failure:
out:
        return ret;
}

/*
 * Sends an unlink instruction to user space
 */
static int send_unlink(struct send_ctx *sctx, struct fs_path *path)
{
        struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
        int ret;

        btrfs_debug(fs_info, "send_unlink %s", path->start);

        ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK);
        if (ret < 0)
                goto out;

        TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);

        ret = send_cmd(sctx);

tlv_put_failure:
out:
        return ret;
}

/*
 * Sends a rmdir instruction to user space
 */
static int send_rmdir(struct send_ctx *sctx, struct fs_path *path)
{
        struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
        int ret;

        btrfs_debug(fs_info, "send_rmdir %s", path->start);

        ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR);
        if (ret < 0)
                goto out;

        TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);

        ret = send_cmd(sctx);

tlv_put_failure:
out:
        return ret;
}

struct btrfs_inode_info {
        u64 size;
        u64 gen;
        u64 mode;
        u64 uid;
        u64 gid;
        u64 rdev;
        u64 fileattr;
        u64 nlink;
};

/*
 * Helper function to retrieve some fields from an inode item.
 */
static int get_inode_info(struct btrfs_root *root, u64 ino,
                          struct btrfs_inode_info *info)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_inode_item *ii;
        struct btrfs_key key;

        path = alloc_path_for_send();
        if (!path)
                return -ENOMEM;

        key.objectid = ino;
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret) {
                if (ret > 0)
                        ret = -ENOENT;
                goto out;
        }

        if (!info)
                goto out;

        ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
                        struct btrfs_inode_item);
        info->size = btrfs_inode_size(path->nodes[0], ii);
        info->gen = btrfs_inode_generation(path->nodes[0], ii);
        info->mode = btrfs_inode_mode(path->nodes[0], ii);
        info->uid = btrfs_inode_uid(path->nodes[0], ii);
        info->gid = btrfs_inode_gid(path->nodes[0], ii);
        info->rdev = btrfs_inode_rdev(path->nodes[0], ii);
        info->nlink = btrfs_inode_nlink(path->nodes[0], ii);
        /*
         * Transfer the unchanged u64 value of btrfs_inode_item::flags, that's
         * otherwise logically split to 32/32 parts.
         */
        info->fileattr = btrfs_inode_flags(path->nodes[0], ii);

out:
        btrfs_free_path(path);
        return ret;
}

static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen)
{
        int ret;
        struct btrfs_inode_info info = { 0 };

        ASSERT(gen);

        ret = get_inode_info(root, ino, &info);
        if (!ret)
                *gen = info.gen;
        return ret;
}
981 typedef int (*iterate_inode_ref_t
)(int num
, u64 dir
, int index
,
986 * Helper function to iterate the entries in ONE btrfs_inode_ref or
987 * btrfs_inode_extref.
988 * The iterate callback may return a non zero value to stop iteration. This can
989 * be a negative value for error codes or 1 to simply stop it.
991 * path must point to the INODE_REF or INODE_EXTREF when called.
993 static int iterate_inode_ref(struct btrfs_root
*root
, struct btrfs_path
*path
,
994 struct btrfs_key
*found_key
, int resolve
,
995 iterate_inode_ref_t iterate
, void *ctx
)
997 struct extent_buffer
*eb
= path
->nodes
[0];
998 struct btrfs_inode_ref
*iref
;
999 struct btrfs_inode_extref
*extref
;
1000 struct btrfs_path
*tmp_path
;
1004 int slot
= path
->slots
[0];
1011 unsigned long name_off
;
1012 unsigned long elem_size
;
1015 p
= fs_path_alloc_reversed();
1019 tmp_path
= alloc_path_for_send();
1026 if (found_key
->type
== BTRFS_INODE_REF_KEY
) {
1027 ptr
= (unsigned long)btrfs_item_ptr(eb
, slot
,
1028 struct btrfs_inode_ref
);
1029 total
= btrfs_item_size(eb
, slot
);
1030 elem_size
= sizeof(*iref
);
1032 ptr
= btrfs_item_ptr_offset(eb
, slot
);
1033 total
= btrfs_item_size(eb
, slot
);
1034 elem_size
= sizeof(*extref
);
1037 while (cur
< total
) {
1040 if (found_key
->type
== BTRFS_INODE_REF_KEY
) {
1041 iref
= (struct btrfs_inode_ref
*)(ptr
+ cur
);
1042 name_len
= btrfs_inode_ref_name_len(eb
, iref
);
1043 name_off
= (unsigned long)(iref
+ 1);
1044 index
= btrfs_inode_ref_index(eb
, iref
);
1045 dir
= found_key
->offset
;
1047 extref
= (struct btrfs_inode_extref
*)(ptr
+ cur
);
1048 name_len
= btrfs_inode_extref_name_len(eb
, extref
);
1049 name_off
= (unsigned long)&extref
->name
;
1050 index
= btrfs_inode_extref_index(eb
, extref
);
1051 dir
= btrfs_inode_extref_parent(eb
, extref
);
1055 start
= btrfs_ref_to_path(root
, tmp_path
, name_len
,
1057 p
->buf
, p
->buf_len
);
1058 if (IS_ERR(start
)) {
1059 ret
= PTR_ERR(start
);
1062 if (start
< p
->buf
) {
1063 /* overflow , try again with larger buffer */
1064 ret
= fs_path_ensure_buf(p
,
1065 p
->buf_len
+ p
->buf
- start
);
1068 start
= btrfs_ref_to_path(root
, tmp_path
,
1071 p
->buf
, p
->buf_len
);
1072 if (IS_ERR(start
)) {
1073 ret
= PTR_ERR(start
);
1076 if (unlikely(start
< p
->buf
)) {
1077 btrfs_err(root
->fs_info
,
1078 "send: path ref buffer underflow for key (%llu %u %llu)",
1079 found_key
->objectid
,
1088 ret
= fs_path_add_from_extent_buffer(p
, eb
, name_off
,
1094 cur
+= elem_size
+ name_len
;
1095 ret
= iterate(num
, dir
, index
, p
, ctx
);
1102 btrfs_free_path(tmp_path
);
1107 typedef int (*iterate_dir_item_t
)(int num
, struct btrfs_key
*di_key
,
1108 const char *name
, int name_len
,
1109 const char *data
, int data_len
,
1113 * Helper function to iterate the entries in ONE btrfs_dir_item.
1114 * The iterate callback may return a non zero value to stop iteration. This can
1115 * be a negative value for error codes or 1 to simply stop it.
1117 * path must point to the dir item when called.
1119 static int iterate_dir_item(struct btrfs_root
*root
, struct btrfs_path
*path
,
1120 iterate_dir_item_t iterate
, void *ctx
)
1123 struct extent_buffer
*eb
;
1124 struct btrfs_dir_item
*di
;
1125 struct btrfs_key di_key
;
1137 * Start with a small buffer (1 page). If later we end up needing more
1138 * space, which can happen for xattrs on a fs with a leaf size greater
1139 * than the page size, attempt to increase the buffer. Typically xattr
1143 buf
= kmalloc(buf_len
, GFP_KERNEL
);
1149 eb
= path
->nodes
[0];
1150 slot
= path
->slots
[0];
1151 di
= btrfs_item_ptr(eb
, slot
, struct btrfs_dir_item
);
1154 total
= btrfs_item_size(eb
, slot
);
1157 while (cur
< total
) {
1158 name_len
= btrfs_dir_name_len(eb
, di
);
1159 data_len
= btrfs_dir_data_len(eb
, di
);
1160 btrfs_dir_item_key_to_cpu(eb
, di
, &di_key
);
1162 if (btrfs_dir_ftype(eb
, di
) == BTRFS_FT_XATTR
) {
1163 if (name_len
> XATTR_NAME_MAX
) {
1164 ret
= -ENAMETOOLONG
;
1167 if (name_len
+ data_len
>
1168 BTRFS_MAX_XATTR_SIZE(root
->fs_info
)) {
1176 if (name_len
+ data_len
> PATH_MAX
) {
1177 ret
= -ENAMETOOLONG
;
1182 if (name_len
+ data_len
> buf_len
) {
1183 buf_len
= name_len
+ data_len
;
1184 if (is_vmalloc_addr(buf
)) {
1188 char *tmp
= krealloc(buf
, buf_len
,
1189 GFP_KERNEL
| __GFP_NOWARN
);
1196 buf
= kvmalloc(buf_len
, GFP_KERNEL
);
1204 read_extent_buffer(eb
, buf
, (unsigned long)(di
+ 1),
1205 name_len
+ data_len
);
1207 len
= sizeof(*di
) + name_len
+ data_len
;
1208 di
= (struct btrfs_dir_item
*)((char *)di
+ len
);
1211 ret
= iterate(num
, &di_key
, buf
, name_len
, buf
+ name_len
,
1228 static int __copy_first_ref(int num
, u64 dir
, int index
,
1229 struct fs_path
*p
, void *ctx
)
1232 struct fs_path
*pt
= ctx
;
1234 ret
= fs_path_copy(pt
, p
);
1238 /* we want the first only */
1243 * Retrieve the first path of an inode. If an inode has more then one
1244 * ref/hardlink, this is ignored.
1246 static int get_inode_path(struct btrfs_root
*root
,
1247 u64 ino
, struct fs_path
*path
)
1250 struct btrfs_key key
, found_key
;
1251 struct btrfs_path
*p
;
1253 p
= alloc_path_for_send();
1257 fs_path_reset(path
);
1260 key
.type
= BTRFS_INODE_REF_KEY
;
1263 ret
= btrfs_search_slot_for_read(root
, &key
, p
, 1, 0);
1270 btrfs_item_key_to_cpu(p
->nodes
[0], &found_key
, p
->slots
[0]);
1271 if (found_key
.objectid
!= ino
||
1272 (found_key
.type
!= BTRFS_INODE_REF_KEY
&&
1273 found_key
.type
!= BTRFS_INODE_EXTREF_KEY
)) {
1278 ret
= iterate_inode_ref(root
, p
, &found_key
, 1,
1279 __copy_first_ref
, path
);
1289 struct backref_ctx
{
1290 struct send_ctx
*sctx
;
1292 /* number of total found references */
1296 * used for clones found in send_root. clones found behind cur_objectid
1297 * and cur_offset are not considered as allowed clones.
1302 /* may be truncated in case it's the last extent in a file */
1305 /* The bytenr the file extent item we are processing refers to. */
1307 /* The owner (root id) of the data backref for the current extent. */
1309 /* The offset of the data backref for the current extent. */
1313 static int __clone_root_cmp_bsearch(const void *key
, const void *elt
)
1315 u64 root
= (u64
)(uintptr_t)key
;
1316 const struct clone_root
*cr
= elt
;
1318 if (root
< btrfs_root_id(cr
->root
))
1320 if (root
> btrfs_root_id(cr
->root
))
1325 static int __clone_root_cmp_sort(const void *e1
, const void *e2
)
1327 const struct clone_root
*cr1
= e1
;
1328 const struct clone_root
*cr2
= e2
;
1330 if (btrfs_root_id(cr1
->root
) < btrfs_root_id(cr2
->root
))
1332 if (btrfs_root_id(cr1
->root
) > btrfs_root_id(cr2
->root
))
1338 * Called for every backref that is found for the current extent.
1339 * Results are collected in sctx->clone_roots->ino/offset.
1341 static int iterate_backrefs(u64 ino
, u64 offset
, u64 num_bytes
, u64 root_id
,
1344 struct backref_ctx
*bctx
= ctx_
;
1345 struct clone_root
*clone_root
;
1347 /* First check if the root is in the list of accepted clone sources */
1348 clone_root
= bsearch((void *)(uintptr_t)root_id
, bctx
->sctx
->clone_roots
,
1349 bctx
->sctx
->clone_roots_cnt
,
1350 sizeof(struct clone_root
),
1351 __clone_root_cmp_bsearch
);
1355 /* This is our own reference, bail out as we can't clone from it. */
1356 if (clone_root
->root
== bctx
->sctx
->send_root
&&
1357 ino
== bctx
->cur_objectid
&&
1358 offset
== bctx
->cur_offset
)
1362 * Make sure we don't consider clones from send_root that are
1363 * behind the current inode/offset.
1365 if (clone_root
->root
== bctx
->sctx
->send_root
) {
1367 * If the source inode was not yet processed we can't issue a
1368 * clone operation, as the source extent does not exist yet at
1369 * the destination of the stream.
1371 if (ino
> bctx
->cur_objectid
)
1374 * We clone from the inode currently being sent as long as the
1375 * source extent is already processed, otherwise we could try
1376 * to clone from an extent that does not exist yet at the
1377 * destination of the stream.
1379 if (ino
== bctx
->cur_objectid
&&
1380 offset
+ bctx
->extent_len
>
1381 bctx
->sctx
->cur_inode_next_write_offset
)
1386 clone_root
->found_ref
= true;
1389 * If the given backref refers to a file extent item with a larger
1390 * number of bytes than what we found before, use the new one so that
1391 * we clone more optimally and end up doing less writes and getting
1392 * less exclusive, non-shared extents at the destination.
1394 if (num_bytes
> clone_root
->num_bytes
) {
1395 clone_root
->ino
= ino
;
1396 clone_root
->offset
= offset
;
1397 clone_root
->num_bytes
= num_bytes
;
1400 * Found a perfect candidate, so there's no need to continue
1403 if (num_bytes
>= bctx
->extent_len
)
1404 return BTRFS_ITERATE_EXTENT_INODES_STOP
;
1410 static bool lookup_backref_cache(u64 leaf_bytenr
, void *ctx
,
1411 const u64
**root_ids_ret
, int *root_count_ret
)
1413 struct backref_ctx
*bctx
= ctx
;
1414 struct send_ctx
*sctx
= bctx
->sctx
;
1415 struct btrfs_fs_info
*fs_info
= sctx
->send_root
->fs_info
;
1416 const u64 key
= leaf_bytenr
>> fs_info
->sectorsize_bits
;
1417 struct btrfs_lru_cache_entry
*raw_entry
;
1418 struct backref_cache_entry
*entry
;
1420 if (sctx
->backref_cache
.size
== 0)
1424 * If relocation happened since we first filled the cache, then we must
1425 * empty the cache and can not use it, because even though we operate on
1426 * read-only roots, their leaves and nodes may have been reallocated and
1427 * now be used for different nodes/leaves of the same tree or some other
1430 * We are called from iterate_extent_inodes() while either holding a
1431 * transaction handle or holding fs_info->commit_root_sem, so no need
1432 * to take any lock here.
1434 if (fs_info
->last_reloc_trans
> sctx
->backref_cache_last_reloc_trans
) {
1435 btrfs_lru_cache_clear(&sctx
->backref_cache
);
1439 raw_entry
= btrfs_lru_cache_lookup(&sctx
->backref_cache
, key
, 0);
1443 entry
= container_of(raw_entry
, struct backref_cache_entry
, entry
);
1444 *root_ids_ret
= entry
->root_ids
;
1445 *root_count_ret
= entry
->num_roots
;
1450 static void store_backref_cache(u64 leaf_bytenr
, const struct ulist
*root_ids
,
1453 struct backref_ctx
*bctx
= ctx
;
1454 struct send_ctx
*sctx
= bctx
->sctx
;
1455 struct btrfs_fs_info
*fs_info
= sctx
->send_root
->fs_info
;
1456 struct backref_cache_entry
*new_entry
;
1457 struct ulist_iterator uiter
;
1458 struct ulist_node
*node
;
1462 * We're called while holding a transaction handle or while holding
1463 * fs_info->commit_root_sem (at iterate_extent_inodes()), so must do a
1466 new_entry
= kmalloc(sizeof(struct backref_cache_entry
), GFP_NOFS
);
1467 /* No worries, cache is optional. */
1471 new_entry
->entry
.key
= leaf_bytenr
>> fs_info
->sectorsize_bits
;
1472 new_entry
->entry
.gen
= 0;
1473 new_entry
->num_roots
= 0;
1474 ULIST_ITER_INIT(&uiter
);
1475 while ((node
= ulist_next(root_ids
, &uiter
)) != NULL
) {
1476 const u64 root_id
= node
->val
;
1477 struct clone_root
*root
;
1479 root
= bsearch((void *)(uintptr_t)root_id
, sctx
->clone_roots
,
1480 sctx
->clone_roots_cnt
, sizeof(struct clone_root
),
1481 __clone_root_cmp_bsearch
);
1485 /* Too many roots, just exit, no worries as caching is optional. */
1486 if (new_entry
->num_roots
>= SEND_MAX_BACKREF_CACHE_ROOTS
) {
1491 new_entry
->root_ids
[new_entry
->num_roots
] = root_id
;
1492 new_entry
->num_roots
++;
1496 * We may have not added any roots to the new cache entry, which means
1497 * none of the roots is part of the list of roots from which we are
1498 * allowed to clone. Cache the new entry as it's still useful to avoid
1499 * backref walking to determine which roots have a path to the leaf.
1501 * Also use GFP_NOFS because we're called while holding a transaction
1502 * handle or while holding fs_info->commit_root_sem.
1504 ret
= btrfs_lru_cache_store(&sctx
->backref_cache
, &new_entry
->entry
,
1506 ASSERT(ret
== 0 || ret
== -ENOMEM
);
1508 /* Caching is optional, no worries. */
1514 * We are called from iterate_extent_inodes() while either holding a
1515 * transaction handle or holding fs_info->commit_root_sem, so no need
1516 * to take any lock here.
1518 if (sctx
->backref_cache
.size
== 1)
1519 sctx
->backref_cache_last_reloc_trans
= fs_info
->last_reloc_trans
;
1522 static int check_extent_item(u64 bytenr
, const struct btrfs_extent_item
*ei
,
1523 const struct extent_buffer
*leaf
, void *ctx
)
1525 const u64 refs
= btrfs_extent_refs(leaf
, ei
);
1526 const struct backref_ctx
*bctx
= ctx
;
1527 const struct send_ctx
*sctx
= bctx
->sctx
;
1529 if (bytenr
== bctx
->bytenr
) {
1530 const u64 flags
= btrfs_extent_flags(leaf
, ei
);
1532 if (WARN_ON(flags
& BTRFS_EXTENT_FLAG_TREE_BLOCK
))
1536 * If we have only one reference and only the send root as a
1537 * clone source - meaning no clone roots were given in the
1538 * struct btrfs_ioctl_send_args passed to the send ioctl - then
1539 * it's our reference and there's no point in doing backref
1540 * walking which is expensive, so exit early.
1542 if (refs
== 1 && sctx
->clone_roots_cnt
== 1)
1547 * Backreference walking (iterate_extent_inodes() below) is currently
1548 * too expensive when an extent has a large number of references, both
1549 * in time spent and used memory. So for now just fallback to write
1550 * operations instead of clone operations when an extent has more than
1551 * a certain amount of references.
1553 if (refs
> SEND_MAX_EXTENT_REFS
)
1559 static bool skip_self_data_ref(u64 root
, u64 ino
, u64 offset
, void *ctx
)
1561 const struct backref_ctx
*bctx
= ctx
;
1563 if (ino
== bctx
->cur_objectid
&&
1564 root
== bctx
->backref_owner
&&
1565 offset
== bctx
->backref_offset
)
1572 * Given an inode, offset and extent item, it finds a good clone for a clone
1573 * instruction. Returns -ENOENT when none could be found. The function makes
1574 * sure that the returned clone is usable at the point where sending is at the
1575 * moment. This means, that no clones are accepted which lie behind the current
1578 * path must point to the extent item when called.
1580 static int find_extent_clone(struct send_ctx
*sctx
,
1581 struct btrfs_path
*path
,
1582 u64 ino
, u64 data_offset
,
1584 struct clone_root
**found
)
1586 struct btrfs_fs_info
*fs_info
= sctx
->send_root
->fs_info
;
1592 struct btrfs_file_extent_item
*fi
;
1593 struct extent_buffer
*eb
= path
->nodes
[0];
1594 struct backref_ctx backref_ctx
= { 0 };
1595 struct btrfs_backref_walk_ctx backref_walk_ctx
= { 0 };
1596 struct clone_root
*cur_clone_root
;
1601 * With fallocate we can get prealloc extents beyond the inode's i_size,
1602 * so we don't do anything here because clone operations can not clone
1603 * to a range beyond i_size without increasing the i_size of the
1604 * destination inode.
1606 if (data_offset
>= ino_size
)
1609 fi
= btrfs_item_ptr(eb
, path
->slots
[0], struct btrfs_file_extent_item
);
1610 extent_type
= btrfs_file_extent_type(eb
, fi
);
1611 if (extent_type
== BTRFS_FILE_EXTENT_INLINE
)
1614 disk_byte
= btrfs_file_extent_disk_bytenr(eb
, fi
);
1618 compressed
= btrfs_file_extent_compression(eb
, fi
);
1619 num_bytes
= btrfs_file_extent_num_bytes(eb
, fi
);
1620 logical
= disk_byte
+ btrfs_file_extent_offset(eb
, fi
);
1623 * Setup the clone roots.
1625 for (i
= 0; i
< sctx
->clone_roots_cnt
; i
++) {
1626 cur_clone_root
= sctx
->clone_roots
+ i
;
1627 cur_clone_root
->ino
= (u64
)-1;
1628 cur_clone_root
->offset
= 0;
1629 cur_clone_root
->num_bytes
= 0;
1630 cur_clone_root
->found_ref
= false;
1633 backref_ctx
.sctx
= sctx
;
1634 backref_ctx
.cur_objectid
= ino
;
1635 backref_ctx
.cur_offset
= data_offset
;
1636 backref_ctx
.bytenr
= disk_byte
;
1638 * Use the header owner and not the send root's id, because in case of a
1639 * snapshot we can have shared subtrees.
1641 backref_ctx
.backref_owner
= btrfs_header_owner(eb
);
1642 backref_ctx
.backref_offset
= data_offset
- btrfs_file_extent_offset(eb
, fi
);
1645 * The last extent of a file may be too large due to page alignment.
1646 * We need to adjust extent_len in this case so that the checks in
1647 * iterate_backrefs() work.
1649 if (data_offset
+ num_bytes
>= ino_size
)
1650 backref_ctx
.extent_len
= ino_size
- data_offset
;
1652 backref_ctx
.extent_len
= num_bytes
;
1655 * Now collect all backrefs.
1657 backref_walk_ctx
.bytenr
= disk_byte
;
1658 if (compressed
== BTRFS_COMPRESS_NONE
)
1659 backref_walk_ctx
.extent_item_pos
= btrfs_file_extent_offset(eb
, fi
);
1660 backref_walk_ctx
.fs_info
= fs_info
;
1661 backref_walk_ctx
.cache_lookup
= lookup_backref_cache
;
1662 backref_walk_ctx
.cache_store
= store_backref_cache
;
1663 backref_walk_ctx
.indirect_ref_iterator
= iterate_backrefs
;
1664 backref_walk_ctx
.check_extent_item
= check_extent_item
;
1665 backref_walk_ctx
.user_ctx
= &backref_ctx
;
1668 * If have a single clone root, then it's the send root and we can tell
1669 * the backref walking code to skip our own backref and not resolve it,
1670 * since we can not use it for cloning - the source and destination
1671 * ranges can't overlap and in case the leaf is shared through a subtree
1672 * due to snapshots, we can't use those other roots since they are not
1673 * in the list of clone roots.
1675 if (sctx
->clone_roots_cnt
== 1)
1676 backref_walk_ctx
.skip_data_ref
= skip_self_data_ref
;
1678 ret
= iterate_extent_inodes(&backref_walk_ctx
, true, iterate_backrefs
,
1683 down_read(&fs_info
->commit_root_sem
);
1684 if (fs_info
->last_reloc_trans
> sctx
->last_reloc_trans
) {
1686 * A transaction commit for a transaction in which block group
1687 * relocation was done just happened.
1688 * The disk_bytenr of the file extent item we processed is
1689 * possibly stale, referring to the extent's location before
1690 * relocation. So act as if we haven't found any clone sources
1691 * and fallback to write commands, which will read the correct
1692 * data from the new extent location. Otherwise we will fail
1693 * below because we haven't found our own back reference or we
1694 * could be getting incorrect sources in case the old extent
1695 * was already reallocated after the relocation.
1697 up_read(&fs_info
->commit_root_sem
);
1700 up_read(&fs_info
->commit_root_sem
);
1702 btrfs_debug(fs_info
,
1703 "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu",
1704 data_offset
, ino
, num_bytes
, logical
);
1706 if (!backref_ctx
.found
) {
1707 btrfs_debug(fs_info
, "no clones found");
1711 cur_clone_root
= NULL
;
1712 for (i
= 0; i
< sctx
->clone_roots_cnt
; i
++) {
1713 struct clone_root
*clone_root
= &sctx
->clone_roots
[i
];
1715 if (!clone_root
->found_ref
)
1719 * Choose the root from which we can clone more bytes, to
1720 * minimize write operations and therefore have more extent
1721 * sharing at the destination (the same as in the source).
1723 if (!cur_clone_root
||
1724 clone_root
->num_bytes
> cur_clone_root
->num_bytes
) {
1725 cur_clone_root
= clone_root
;
1728 * We found an optimal clone candidate (any inode from
1729 * any root is fine), so we're done.
1731 if (clone_root
->num_bytes
>= backref_ctx
.extent_len
)
1736 if (cur_clone_root
) {
1737 *found
= cur_clone_root
;
1746 static int read_symlink(struct btrfs_root
*root
,
1748 struct fs_path
*dest
)
1751 struct btrfs_path
*path
;
1752 struct btrfs_key key
;
1753 struct btrfs_file_extent_item
*ei
;
1759 path
= alloc_path_for_send();
1764 key
.type
= BTRFS_EXTENT_DATA_KEY
;
1766 ret
= btrfs_search_slot(NULL
, root
, &key
, path
, 0, 0);
1771 * An empty symlink inode. Can happen in rare error paths when
1772 * creating a symlink (transaction committed before the inode
1773 * eviction handler removed the symlink inode items and a crash
1774 * happened in between or the subvol was snapshoted in between).
1775 * Print an informative message to dmesg/syslog so that the user
1776 * can delete the symlink.
1778 btrfs_err(root
->fs_info
,
1779 "Found empty symlink inode %llu at root %llu",
1780 ino
, btrfs_root_id(root
));
1785 ei
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
1786 struct btrfs_file_extent_item
);
1787 type
= btrfs_file_extent_type(path
->nodes
[0], ei
);
1788 if (unlikely(type
!= BTRFS_FILE_EXTENT_INLINE
)) {
1790 btrfs_crit(root
->fs_info
,
1791 "send: found symlink extent that is not inline, ino %llu root %llu extent type %d",
1792 ino
, btrfs_root_id(root
), type
);
1795 compression
= btrfs_file_extent_compression(path
->nodes
[0], ei
);
1796 if (unlikely(compression
!= BTRFS_COMPRESS_NONE
)) {
1798 btrfs_crit(root
->fs_info
,
1799 "send: found symlink extent with compression, ino %llu root %llu compression type %d",
1800 ino
, btrfs_root_id(root
), compression
);
1804 off
= btrfs_file_extent_inline_start(ei
);
1805 len
= btrfs_file_extent_ram_bytes(path
->nodes
[0], ei
);
1807 ret
= fs_path_add_from_extent_buffer(dest
, path
->nodes
[0], off
, len
);
1810 btrfs_free_path(path
);
1815 * Helper function to generate a file name that is unique in the root of
1816 * send_root and parent_root. This is used to generate names for orphan inodes.
1818 static int gen_unique_name(struct send_ctx
*sctx
,
1820 struct fs_path
*dest
)
1823 struct btrfs_path
*path
;
1824 struct btrfs_dir_item
*di
;
1829 path
= alloc_path_for_send();
1834 struct fscrypt_str tmp_name
;
1836 len
= snprintf(tmp
, sizeof(tmp
), "o%llu-%llu-%llu",
1838 ASSERT(len
< sizeof(tmp
));
1839 tmp_name
.name
= tmp
;
1840 tmp_name
.len
= strlen(tmp
);
1842 di
= btrfs_lookup_dir_item(NULL
, sctx
->send_root
,
1843 path
, BTRFS_FIRST_FREE_OBJECTID
,
1845 btrfs_release_path(path
);
1851 /* not unique, try again */
1856 if (!sctx
->parent_root
) {
1862 di
= btrfs_lookup_dir_item(NULL
, sctx
->parent_root
,
1863 path
, BTRFS_FIRST_FREE_OBJECTID
,
1865 btrfs_release_path(path
);
1871 /* not unique, try again */
1879 ret
= fs_path_add(dest
, tmp
, strlen(tmp
));
1882 btrfs_free_path(path
);
1887 inode_state_no_change
,
1888 inode_state_will_create
,
1889 inode_state_did_create
,
1890 inode_state_will_delete
,
1891 inode_state_did_delete
,
1894 static int get_cur_inode_state(struct send_ctx
*sctx
, u64 ino
, u64 gen
,
1895 u64
*send_gen
, u64
*parent_gen
)
1902 struct btrfs_inode_info info
;
1904 ret
= get_inode_info(sctx
->send_root
, ino
, &info
);
1905 if (ret
< 0 && ret
!= -ENOENT
)
1907 left_ret
= (info
.nlink
== 0) ? -ENOENT
: ret
;
1908 left_gen
= info
.gen
;
1910 *send_gen
= ((left_ret
== -ENOENT
) ? 0 : info
.gen
);
1912 if (!sctx
->parent_root
) {
1913 right_ret
= -ENOENT
;
1915 ret
= get_inode_info(sctx
->parent_root
, ino
, &info
);
1916 if (ret
< 0 && ret
!= -ENOENT
)
1918 right_ret
= (info
.nlink
== 0) ? -ENOENT
: ret
;
1919 right_gen
= info
.gen
;
1921 *parent_gen
= ((right_ret
== -ENOENT
) ? 0 : info
.gen
);
1924 if (!left_ret
&& !right_ret
) {
1925 if (left_gen
== gen
&& right_gen
== gen
) {
1926 ret
= inode_state_no_change
;
1927 } else if (left_gen
== gen
) {
1928 if (ino
< sctx
->send_progress
)
1929 ret
= inode_state_did_create
;
1931 ret
= inode_state_will_create
;
1932 } else if (right_gen
== gen
) {
1933 if (ino
< sctx
->send_progress
)
1934 ret
= inode_state_did_delete
;
1936 ret
= inode_state_will_delete
;
1940 } else if (!left_ret
) {
1941 if (left_gen
== gen
) {
1942 if (ino
< sctx
->send_progress
)
1943 ret
= inode_state_did_create
;
1945 ret
= inode_state_will_create
;
1949 } else if (!right_ret
) {
1950 if (right_gen
== gen
) {
1951 if (ino
< sctx
->send_progress
)
1952 ret
= inode_state_did_delete
;
1954 ret
= inode_state_will_delete
;
1966 static int is_inode_existent(struct send_ctx
*sctx
, u64 ino
, u64 gen
,
1967 u64
*send_gen
, u64
*parent_gen
)
1971 if (ino
== BTRFS_FIRST_FREE_OBJECTID
)
1974 ret
= get_cur_inode_state(sctx
, ino
, gen
, send_gen
, parent_gen
);
1978 if (ret
== inode_state_no_change
||
1979 ret
== inode_state_did_create
||
1980 ret
== inode_state_will_delete
)
1990 * Helper function to lookup a dir item in a dir.
1992 static int lookup_dir_item_inode(struct btrfs_root
*root
,
1993 u64 dir
, const char *name
, int name_len
,
1997 struct btrfs_dir_item
*di
;
1998 struct btrfs_key key
;
1999 struct btrfs_path
*path
;
2000 struct fscrypt_str name_str
= FSTR_INIT((char *)name
, name_len
);
2002 path
= alloc_path_for_send();
2006 di
= btrfs_lookup_dir_item(NULL
, root
, path
, dir
, &name_str
, 0);
2007 if (IS_ERR_OR_NULL(di
)) {
2008 ret
= di
? PTR_ERR(di
) : -ENOENT
;
2011 btrfs_dir_item_key_to_cpu(path
->nodes
[0], di
, &key
);
2012 if (key
.type
== BTRFS_ROOT_ITEM_KEY
) {
2016 *found_inode
= key
.objectid
;
2019 btrfs_free_path(path
);
2024 * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
2025 * generation of the parent dir and the name of the dir entry.
2027 static int get_first_ref(struct btrfs_root
*root
, u64 ino
,
2028 u64
*dir
, u64
*dir_gen
, struct fs_path
*name
)
2031 struct btrfs_key key
;
2032 struct btrfs_key found_key
;
2033 struct btrfs_path
*path
;
2037 path
= alloc_path_for_send();
2042 key
.type
= BTRFS_INODE_REF_KEY
;
2045 ret
= btrfs_search_slot_for_read(root
, &key
, path
, 1, 0);
2049 btrfs_item_key_to_cpu(path
->nodes
[0], &found_key
,
2051 if (ret
|| found_key
.objectid
!= ino
||
2052 (found_key
.type
!= BTRFS_INODE_REF_KEY
&&
2053 found_key
.type
!= BTRFS_INODE_EXTREF_KEY
)) {
2058 if (found_key
.type
== BTRFS_INODE_REF_KEY
) {
2059 struct btrfs_inode_ref
*iref
;
2060 iref
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
2061 struct btrfs_inode_ref
);
2062 len
= btrfs_inode_ref_name_len(path
->nodes
[0], iref
);
2063 ret
= fs_path_add_from_extent_buffer(name
, path
->nodes
[0],
2064 (unsigned long)(iref
+ 1),
2066 parent_dir
= found_key
.offset
;
2068 struct btrfs_inode_extref
*extref
;
2069 extref
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
2070 struct btrfs_inode_extref
);
2071 len
= btrfs_inode_extref_name_len(path
->nodes
[0], extref
);
2072 ret
= fs_path_add_from_extent_buffer(name
, path
->nodes
[0],
2073 (unsigned long)&extref
->name
, len
);
2074 parent_dir
= btrfs_inode_extref_parent(path
->nodes
[0], extref
);
2078 btrfs_release_path(path
);
2081 ret
= get_inode_gen(root
, parent_dir
, dir_gen
);
2089 btrfs_free_path(path
);
2093 static int is_first_ref(struct btrfs_root
*root
,
2095 const char *name
, int name_len
)
2098 struct fs_path
*tmp_name
;
2101 tmp_name
= fs_path_alloc();
2105 ret
= get_first_ref(root
, ino
, &tmp_dir
, NULL
, tmp_name
);
2109 if (dir
!= tmp_dir
|| name_len
!= fs_path_len(tmp_name
)) {
2114 ret
= !memcmp(tmp_name
->start
, name
, name_len
);
2117 fs_path_free(tmp_name
);
2122 * Used by process_recorded_refs to determine if a new ref would overwrite an
2123 * already existing ref. In case it detects an overwrite, it returns the
2124 * inode/gen in who_ino/who_gen.
2125 * When an overwrite is detected, process_recorded_refs does proper orphanizing
2126 * to make sure later references to the overwritten inode are possible.
2127 * Orphanizing is however only required for the first ref of an inode.
2128 * process_recorded_refs does an additional is_first_ref check to see if
2129 * orphanizing is really required.
2131 static int will_overwrite_ref(struct send_ctx
*sctx
, u64 dir
, u64 dir_gen
,
2132 const char *name
, int name_len
,
2133 u64
*who_ino
, u64
*who_gen
, u64
*who_mode
)
2136 u64 parent_root_dir_gen
;
2137 u64 other_inode
= 0;
2138 struct btrfs_inode_info info
;
2140 if (!sctx
->parent_root
)
2143 ret
= is_inode_existent(sctx
, dir
, dir_gen
, NULL
, &parent_root_dir_gen
);
2148 * If we have a parent root we need to verify that the parent dir was
2149 * not deleted and then re-created, if it was then we have no overwrite
2150 * and we can just unlink this entry.
2152 * @parent_root_dir_gen was set to 0 if the inode does not exist in the
2155 if (sctx
->parent_root
&& dir
!= BTRFS_FIRST_FREE_OBJECTID
&&
2156 parent_root_dir_gen
!= dir_gen
)
2159 ret
= lookup_dir_item_inode(sctx
->parent_root
, dir
, name
, name_len
,
2167 * Check if the overwritten ref was already processed. If yes, the ref
2168 * was already unlinked/moved, so we can safely assume that we will not
2169 * overwrite anything at this point in time.
2171 if (other_inode
> sctx
->send_progress
||
2172 is_waiting_for_move(sctx
, other_inode
)) {
2173 ret
= get_inode_info(sctx
->parent_root
, other_inode
, &info
);
2177 *who_ino
= other_inode
;
2178 *who_gen
= info
.gen
;
2179 *who_mode
= info
.mode
;
2187 * Checks if the ref was overwritten by an already processed inode. This is
2188 * used by __get_cur_name_and_parent to find out if the ref was orphanized and
2189 * thus the orphan name needs be used.
2190 * process_recorded_refs also uses it to avoid unlinking of refs that were
2193 static int did_overwrite_ref(struct send_ctx
*sctx
,
2194 u64 dir
, u64 dir_gen
,
2195 u64 ino
, u64 ino_gen
,
2196 const char *name
, int name_len
)
2201 u64 send_root_dir_gen
;
2203 if (!sctx
->parent_root
)
2206 ret
= is_inode_existent(sctx
, dir
, dir_gen
, &send_root_dir_gen
, NULL
);
2211 * @send_root_dir_gen was set to 0 if the inode does not exist in the
2214 if (dir
!= BTRFS_FIRST_FREE_OBJECTID
&& send_root_dir_gen
!= dir_gen
)
2217 /* check if the ref was overwritten by another ref */
2218 ret
= lookup_dir_item_inode(sctx
->send_root
, dir
, name
, name_len
,
2220 if (ret
== -ENOENT
) {
2221 /* was never and will never be overwritten */
2223 } else if (ret
< 0) {
2227 if (ow_inode
== ino
) {
2228 ret
= get_inode_gen(sctx
->send_root
, ow_inode
, &ow_gen
);
2232 /* It's the same inode, so no overwrite happened. */
2233 if (ow_gen
== ino_gen
)
2238 * We know that it is or will be overwritten. Check this now.
2239 * The current inode being processed might have been the one that caused
2240 * inode 'ino' to be orphanized, therefore check if ow_inode matches
2241 * the current inode being processed.
2243 if (ow_inode
< sctx
->send_progress
)
2246 if (ino
!= sctx
->cur_ino
&& ow_inode
== sctx
->cur_ino
) {
2248 ret
= get_inode_gen(sctx
->send_root
, ow_inode
, &ow_gen
);
2252 if (ow_gen
== sctx
->cur_inode_gen
)
2260 * Same as did_overwrite_ref, but also checks if it is the first ref of an inode
2261 * that got overwritten. This is used by process_recorded_refs to determine
2262 * if it has to use the path as returned by get_cur_path or the orphan name.
2264 static int did_overwrite_first_ref(struct send_ctx
*sctx
, u64 ino
, u64 gen
)
2267 struct fs_path
*name
= NULL
;
2271 if (!sctx
->parent_root
)
2274 name
= fs_path_alloc();
2278 ret
= get_first_ref(sctx
->parent_root
, ino
, &dir
, &dir_gen
, name
);
2282 ret
= did_overwrite_ref(sctx
, dir
, dir_gen
, ino
, gen
,
2283 name
->start
, fs_path_len(name
));
2290 static inline struct name_cache_entry
*name_cache_search(struct send_ctx
*sctx
,
2293 struct btrfs_lru_cache_entry
*entry
;
2295 entry
= btrfs_lru_cache_lookup(&sctx
->name_cache
, ino
, gen
);
2299 return container_of(entry
, struct name_cache_entry
, entry
);
2303 * Used by get_cur_path for each ref up to the root.
2304 * Returns 0 if it succeeded.
2305 * Returns 1 if the inode is not existent or got overwritten. In that case, the
2306 * name is an orphan name. This instructs get_cur_path to stop iterating. If 1
2307 * is returned, parent_ino/parent_gen are not guaranteed to be valid.
2308 * Returns <0 in case of error.
2310 static int __get_cur_name_and_parent(struct send_ctx
*sctx
,
2314 struct fs_path
*dest
)
2318 struct name_cache_entry
*nce
;
2321 * First check if we already did a call to this function with the same
2322 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes
2323 * return the cached result.
2325 nce
= name_cache_search(sctx
, ino
, gen
);
2327 if (ino
< sctx
->send_progress
&& nce
->need_later_update
) {
2328 btrfs_lru_cache_remove(&sctx
->name_cache
, &nce
->entry
);
2331 *parent_ino
= nce
->parent_ino
;
2332 *parent_gen
= nce
->parent_gen
;
2333 ret
= fs_path_add(dest
, nce
->name
, nce
->name_len
);
2342 * If the inode is not existent yet, add the orphan name and return 1.
2343 * This should only happen for the parent dir that we determine in
2344 * record_new_ref_if_needed().
2346 ret
= is_inode_existent(sctx
, ino
, gen
, NULL
, NULL
);
2351 ret
= gen_unique_name(sctx
, ino
, gen
, dest
);
2359 * Depending on whether the inode was already processed or not, use
2360 * send_root or parent_root for ref lookup.
2362 if (ino
< sctx
->send_progress
)
2363 ret
= get_first_ref(sctx
->send_root
, ino
,
2364 parent_ino
, parent_gen
, dest
);
2366 ret
= get_first_ref(sctx
->parent_root
, ino
,
2367 parent_ino
, parent_gen
, dest
);
2372 * Check if the ref was overwritten by an inode's ref that was processed
2373 * earlier. If yes, treat as orphan and return 1.
2375 ret
= did_overwrite_ref(sctx
, *parent_ino
, *parent_gen
, ino
, gen
,
2376 dest
->start
, dest
->end
- dest
->start
);
2380 fs_path_reset(dest
);
2381 ret
= gen_unique_name(sctx
, ino
, gen
, dest
);
2389 * Store the result of the lookup in the name cache.
2391 nce
= kmalloc(sizeof(*nce
) + fs_path_len(dest
) + 1, GFP_KERNEL
);
2397 nce
->entry
.key
= ino
;
2398 nce
->entry
.gen
= gen
;
2399 nce
->parent_ino
= *parent_ino
;
2400 nce
->parent_gen
= *parent_gen
;
2401 nce
->name_len
= fs_path_len(dest
);
2403 strcpy(nce
->name
, dest
->start
);
2405 if (ino
< sctx
->send_progress
)
2406 nce
->need_later_update
= 0;
2408 nce
->need_later_update
= 1;
2410 nce_ret
= btrfs_lru_cache_store(&sctx
->name_cache
, &nce
->entry
, GFP_KERNEL
);
2421 * Magic happens here. This function returns the first ref to an inode as it
2422 * would look like while receiving the stream at this point in time.
2423 * We walk the path up to the root. For every inode in between, we check if it
2424 * was already processed/sent. If yes, we continue with the parent as found
2425 * in send_root. If not, we continue with the parent as found in parent_root.
2426 * If we encounter an inode that was deleted at this point in time, we use the
2427 * inodes "orphan" name instead of the real name and stop. Same with new inodes
2428 * that were not created yet and overwritten inodes/refs.
2430 * When do we have orphan inodes:
2431 * 1. When an inode is freshly created and thus no valid refs are available yet
2432 * 2. When a directory lost all it's refs (deleted) but still has dir items
2433 * inside which were not processed yet (pending for move/delete). If anyone
2434 * tried to get the path to the dir items, it would get a path inside that
2436 * 3. When an inode is moved around or gets new links, it may overwrite the ref
2437 * of an unprocessed inode. If in that case the first ref would be
2438 * overwritten, the overwritten inode gets "orphanized". Later when we
2439 * process this overwritten inode, it is restored at a new place by moving
2442 * sctx->send_progress tells this function at which point in time receiving
2445 static int get_cur_path(struct send_ctx
*sctx
, u64 ino
, u64 gen
,
2446 struct fs_path
*dest
)
2449 struct fs_path
*name
= NULL
;
2450 u64 parent_inode
= 0;
2454 name
= fs_path_alloc();
2461 fs_path_reset(dest
);
2463 while (!stop
&& ino
!= BTRFS_FIRST_FREE_OBJECTID
) {
2464 struct waiting_dir_move
*wdm
;
2466 fs_path_reset(name
);
2468 if (is_waiting_for_rm(sctx
, ino
, gen
)) {
2469 ret
= gen_unique_name(sctx
, ino
, gen
, name
);
2472 ret
= fs_path_add_path(dest
, name
);
2476 wdm
= get_waiting_dir_move(sctx
, ino
);
2477 if (wdm
&& wdm
->orphanized
) {
2478 ret
= gen_unique_name(sctx
, ino
, gen
, name
);
2481 ret
= get_first_ref(sctx
->parent_root
, ino
,
2482 &parent_inode
, &parent_gen
, name
);
2484 ret
= __get_cur_name_and_parent(sctx
, ino
, gen
,
2494 ret
= fs_path_add_path(dest
, name
);
2505 fs_path_unreverse(dest
);
/*
 * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace
 */
static int send_subvol_begin(struct send_ctx *sctx)
{
	int ret;
	struct btrfs_root *send_root = sctx->send_root;
	struct btrfs_root *parent_root = sctx->parent_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_root_ref *ref;
	struct extent_buffer *leaf;
	char *name = NULL;
	int namelen;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
	if (!name) {
		btrfs_free_path(path);
		return -ENOMEM;
	}

	key.objectid = btrfs_root_id(send_root);
	key.type = BTRFS_ROOT_BACKREF_KEY;
	key.offset = 0;

	ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root,
				&key, path, 1, 0);
	if (ret < 0)
		goto out;
	if (ret) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (key.type != BTRFS_ROOT_BACKREF_KEY ||
	    key.objectid != btrfs_root_id(send_root)) {
		ret = -ENOENT;
		goto out;
	}
	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
	namelen = btrfs_root_ref_name_len(leaf, ref);
	read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen);
	btrfs_release_path(path);

	if (parent_root) {
		ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);
		if (ret < 0)
			goto out;
	} else {
		ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL);
		if (ret < 0)
			goto out;
	}

	TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);

	if (!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid))
		TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
			     sctx->send_root->root_item.received_uuid);
	else
		TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
			     sctx->send_root->root_item.uuid);

	TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
		    btrfs_root_ctransid(&sctx->send_root->root_item));
	if (parent_root) {
		if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid))
			TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
				     parent_root->root_item.received_uuid);
		else
			TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
				     parent_root->root_item.uuid);
		TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
			    btrfs_root_ctransid(&sctx->parent_root->root_item));
	}

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	btrfs_free_path(path);
	kfree(name);
	return ret;
}
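
/*
 * Illustrative example, assumed and not taken from the original source: a
 * full send starts the stream with a SUBVOL command, while an incremental
 * send (sctx->parent_root set) starts with a SNAPSHOT command that also
 * carries the clone source identification:
 *
 *	full send:        BTRFS_SEND_C_SUBVOL   (PATH, UUID, CTRANSID)
 *	incremental send: BTRFS_SEND_C_SNAPSHOT (PATH, UUID, CTRANSID,
 *	                                         CLONE_UUID, CLONE_CTRANSID)
 */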
static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
{
	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
	int ret = 0;
	struct fs_path *p;

	btrfs_debug(fs_info, "send_truncate %llu size=%llu", ino, size);

	p = fs_path_alloc();
	if (!p)
		return -ENOMEM;

	ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE);
	if (ret < 0)
		goto out;

	ret = get_cur_path(sctx, ino, gen, p);
	if (ret < 0)
		goto out;
	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	fs_path_free(p);
	return ret;
}
static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
{
	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
	int ret = 0;
	struct fs_path *p;

	btrfs_debug(fs_info, "send_chmod %llu mode=%llu", ino, mode);

	p = fs_path_alloc();
	if (!p)
		return -ENOMEM;

	ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD);
	if (ret < 0)
		goto out;

	ret = get_cur_path(sctx, ino, gen, p);
	if (ret < 0)
		goto out;
	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	fs_path_free(p);
	return ret;
}
static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr)
{
	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
	int ret = 0;
	struct fs_path *p;

	if (sctx->proto < 2)
		return 0;

	btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr);

	p = fs_path_alloc();
	if (!p)
		return -ENOMEM;

	ret = begin_cmd(sctx, BTRFS_SEND_C_FILEATTR);
	if (ret < 0)
		goto out;

	ret = get_cur_path(sctx, ino, gen, p);
	if (ret < 0)
		goto out;
	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	fs_path_free(p);
	return ret;
}
static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
{
	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
	int ret = 0;
	struct fs_path *p;

	btrfs_debug(fs_info, "send_chown %llu uid=%llu, gid=%llu",
		    ino, uid, gid);

	p = fs_path_alloc();
	if (!p)
		return -ENOMEM;

	ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN);
	if (ret < 0)
		goto out;

	ret = get_cur_path(sctx, ino, gen, p);
	if (ret < 0)
		goto out;
	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	fs_path_free(p);
	return ret;
}
static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
{
	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
	int ret = 0;
	struct fs_path *p = NULL;
	struct btrfs_inode_item *ii;
	struct btrfs_path *path = NULL;
	struct extent_buffer *eb;
	struct btrfs_key key;
	int slot;

	btrfs_debug(fs_info, "send_utimes %llu", ino);

	p = fs_path_alloc();
	if (!p)
		return -ENOMEM;

	path = alloc_path_for_send();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	key.objectid = ino;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
	if (ret > 0)
		ret = -ENOENT;
	if (ret < 0)
		goto out;

	eb = path->nodes[0];
	slot = path->slots[0];
	ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);

	ret = begin_cmd(sctx, BTRFS_SEND_C_UTIMES);
	if (ret < 0)
		goto out;

	ret = get_cur_path(sctx, ino, gen, p);
	if (ret < 0)
		goto out;
	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime);
	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime);
	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime);
	if (sctx->proto >= 2)
		TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_OTIME, eb, &ii->otime);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	fs_path_free(p);
	btrfs_free_path(path);
	return ret;
}
/*
 * If the cache is full, we can't remove entries from it and just call
 * send_utimes() for each respective inode, because we might be finishing
 * processing an inode that is a directory and it just got renamed, and existing
 * entries in the cache may refer to inodes that have the directory in their
 * full path - in which case we would generate outdated paths (pre-rename)
 * for the inodes that the cache entries point to. Instead of pruning the
 * cache when inserting, do it after we finish processing each inode at
 * finish_inode_if_needed().
 */
static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen)
{
	struct btrfs_lru_cache_entry *entry;
	int ret;

	entry = btrfs_lru_cache_lookup(&sctx->dir_utimes_cache, dir, gen);
	if (entry != NULL)
		return 0;

	/* Caching is optional, don't fail if we can't allocate memory. */
	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry)
		return send_utimes(sctx, dir, gen);

	entry->key = dir;
	entry->gen = gen;

	ret = btrfs_lru_cache_store(&sctx->dir_utimes_cache, entry, GFP_KERNEL);
	ASSERT(ret != -EEXIST);
	if (ret) {
		kfree(entry);
		return send_utimes(sctx, dir, gen);
	}

	return 0;
}
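
/*
 * Minimal usage sketch (illustrative only): instead of calling send_utimes()
 * directly after touching a directory, callers are expected to do
 *
 *	ret = cache_dir_utimes(sctx, dir_ino, dir_gen);
 *
 * and rely on a later trim_dir_utimes_cache()/cache flush to emit the UTIMES
 * commands, so that the paths are generated only after any pending renames of
 * ancestor directories have been processed.
 */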
static int trim_dir_utimes_cache(struct send_ctx *sctx)
{
	while (sctx->dir_utimes_cache.size > SEND_MAX_DIR_UTIMES_CACHE_SIZE) {
		struct btrfs_lru_cache_entry *lru;
		int ret;

		lru = btrfs_lru_cache_lru_entry(&sctx->dir_utimes_cache);
		ASSERT(lru != NULL);

		ret = send_utimes(sctx, lru->key, lru->gen);
		if (ret)
			return ret;

		btrfs_lru_cache_remove(&sctx->dir_utimes_cache, lru);
	}

	return 0;
}
/*
 * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have
 * a valid path yet because we did not process the refs yet. So, the inode
 * is created as orphan.
 */
static int send_create_inode(struct send_ctx *sctx, u64 ino)
{
	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
	int ret = 0;
	struct fs_path *p;
	int cmd;
	struct btrfs_inode_info info;
	u64 gen;
	u64 mode;
	u64 rdev;

	btrfs_debug(fs_info, "send_create_inode %llu", ino);

	p = fs_path_alloc();
	if (!p)
		return -ENOMEM;

	if (ino != sctx->cur_ino) {
		ret = get_inode_info(sctx->send_root, ino, &info);
		if (ret < 0)
			goto out;
		gen = info.gen;
		mode = info.mode;
		rdev = info.rdev;
	} else {
		gen = sctx->cur_inode_gen;
		mode = sctx->cur_inode_mode;
		rdev = sctx->cur_inode_rdev;
	}

	if (S_ISREG(mode)) {
		cmd = BTRFS_SEND_C_MKFILE;
	} else if (S_ISDIR(mode)) {
		cmd = BTRFS_SEND_C_MKDIR;
	} else if (S_ISLNK(mode)) {
		cmd = BTRFS_SEND_C_SYMLINK;
	} else if (S_ISCHR(mode) || S_ISBLK(mode)) {
		cmd = BTRFS_SEND_C_MKNOD;
	} else if (S_ISFIFO(mode)) {
		cmd = BTRFS_SEND_C_MKFIFO;
	} else if (S_ISSOCK(mode)) {
		cmd = BTRFS_SEND_C_MKSOCK;
	} else {
		btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o",
			   (int)(mode & S_IFMT));
		ret = -EOPNOTSUPP;
		goto out;
	}

	ret = begin_cmd(sctx, cmd);
	if (ret < 0)
		goto out;

	ret = gen_unique_name(sctx, ino, gen, p);
	if (ret < 0)
		goto out;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino);

	if (S_ISLNK(mode)) {
		fs_path_reset(p);
		ret = read_symlink(sctx->send_root, ino, p);
		if (ret < 0)
			goto out;
		TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
	} else if (S_ISCHR(mode) || S_ISBLK(mode) ||
		   S_ISFIFO(mode) || S_ISSOCK(mode)) {
		TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, new_encode_dev(rdev));
		TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode);
	}

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	fs_path_free(p);
	return ret;
}
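
/*
 * Illustrative example with assumed numbers: creating a regular file with
 * inode number 259 and generation 5 would emit roughly
 *
 *	BTRFS_SEND_C_MKFILE  PATH="o259-5-0"  INO=259
 *
 * i.e. the inode is always created under a unique orphan name first and only
 * receives its real name(s) later, when its refs are processed.
 */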
static void cache_dir_created(struct send_ctx *sctx, u64 dir)
{
	struct btrfs_lru_cache_entry *entry;
	int ret;

	/* Caching is optional, ignore any failures. */
	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry)
		return;

	entry->key = dir;
	entry->gen = 0;
	ret = btrfs_lru_cache_store(&sctx->dir_created_cache, entry, GFP_KERNEL);
	if (ret < 0)
		kfree(entry);
}
/*
 * We need some special handling for inodes that get processed before the parent
 * directory got created. See process_recorded_refs for details.
 * This function does the check if we already created the dir out of order.
 */
static int did_create_dir(struct send_ctx *sctx, u64 dir)
{
	int ret = 0;
	int iter_ret = 0;
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_key di_key;
	struct btrfs_dir_item *di;

	if (btrfs_lru_cache_lookup(&sctx->dir_created_cache, dir, 0))
		return 1;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	key.objectid = dir;
	key.type = BTRFS_DIR_INDEX_KEY;
	key.offset = 0;

	btrfs_for_each_slot(sctx->send_root, &key, &found_key, path, iter_ret) {
		struct extent_buffer *eb = path->nodes[0];

		if (found_key.objectid != key.objectid ||
		    found_key.type != key.type) {
			ret = 0;
			break;
		}

		di = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dir_item);
		btrfs_dir_item_key_to_cpu(eb, di, &di_key);

		if (di_key.type != BTRFS_ROOT_ITEM_KEY &&
		    di_key.objectid < sctx->send_progress) {
			ret = 1;
			cache_dir_created(sctx, dir);
			break;
		}
	}
	/* Catch error found during iteration */
	if (iter_ret < 0)
		ret = iter_ret;

	btrfs_free_path(path);
	return ret;
}
/*
 * Only creates the inode if it is:
 * 1. Not a directory
 * 2. Or a directory which was not created already due to out of order
 *    directories. See did_create_dir and process_recorded_refs for details.
 */
static int send_create_inode_if_needed(struct send_ctx *sctx)
{
	int ret;

	if (S_ISDIR(sctx->cur_inode_mode)) {
		ret = did_create_dir(sctx, sctx->cur_ino);
		if (ret < 0)
			return ret;
		else if (ret > 0)
			return 0;
	}

	ret = send_create_inode(sctx, sctx->cur_ino);

	if (ret == 0 && S_ISDIR(sctx->cur_inode_mode))
		cache_dir_created(sctx, sctx->cur_ino);

	return ret;
}
struct recorded_ref {
	struct list_head list;
	char *name;
	struct fs_path *full_path;
	u64 dir;
	u64 dir_gen;
	int name_len;
	struct rb_node node;
	struct rb_root *root;
};

static struct recorded_ref *recorded_ref_alloc(void)
{
	struct recorded_ref *ref;

	ref = kzalloc(sizeof(*ref), GFP_KERNEL);
	if (!ref)
		return NULL;
	RB_CLEAR_NODE(&ref->node);
	INIT_LIST_HEAD(&ref->list);
	return ref;
}

static void recorded_ref_free(struct recorded_ref *ref)
{
	if (!ref)
		return;
	if (!RB_EMPTY_NODE(&ref->node))
		rb_erase(&ref->node, ref->root);
	list_del(&ref->list);
	fs_path_free(ref->full_path);
	kfree(ref);
}

static void set_ref_path(struct recorded_ref *ref, struct fs_path *path)
{
	ref->full_path = path;
	ref->name = (char *)kbasename(ref->full_path->start);
	ref->name_len = ref->full_path->end - ref->name;
}

static int dup_ref(struct recorded_ref *ref, struct list_head *list)
{
	struct recorded_ref *new;

	new = recorded_ref_alloc();
	if (!new)
		return -ENOMEM;

	new->dir = ref->dir;
	new->dir_gen = ref->dir_gen;
	list_add_tail(&new->list, list);
	return 0;
}

static void __free_recorded_refs(struct list_head *head)
{
	struct recorded_ref *cur;

	while (!list_empty(head)) {
		cur = list_entry(head->next, struct recorded_ref, list);
		recorded_ref_free(cur);
	}
}

static void free_recorded_refs(struct send_ctx *sctx)
{
	__free_recorded_refs(&sctx->new_refs);
	__free_recorded_refs(&sctx->deleted_refs);
}
/*
 * Renames/moves a file/dir to its orphan name. Used when the first
 * ref of an unprocessed inode gets overwritten and for all non empty
 * directories.
 */
static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
			   struct fs_path *path)
{
	int ret;
	struct fs_path *orphan;

	orphan = fs_path_alloc();
	if (!orphan)
		return -ENOMEM;

	ret = gen_unique_name(sctx, ino, gen, orphan);
	if (ret < 0)
		goto out;

	ret = send_rename(sctx, path, orphan);

out:
	fs_path_free(orphan);
	return ret;
}
3121 static struct orphan_dir_info
*add_orphan_dir_info(struct send_ctx
*sctx
,
3122 u64 dir_ino
, u64 dir_gen
)
3124 struct rb_node
**p
= &sctx
->orphan_dirs
.rb_node
;
3125 struct rb_node
*parent
= NULL
;
3126 struct orphan_dir_info
*entry
, *odi
;
3130 entry
= rb_entry(parent
, struct orphan_dir_info
, node
);
3131 if (dir_ino
< entry
->ino
)
3133 else if (dir_ino
> entry
->ino
)
3134 p
= &(*p
)->rb_right
;
3135 else if (dir_gen
< entry
->gen
)
3137 else if (dir_gen
> entry
->gen
)
3138 p
= &(*p
)->rb_right
;
3143 odi
= kmalloc(sizeof(*odi
), GFP_KERNEL
);
3145 return ERR_PTR(-ENOMEM
);
3148 odi
->last_dir_index_offset
= 0;
3149 odi
->dir_high_seq_ino
= 0;
3151 rb_link_node(&odi
->node
, parent
, p
);
3152 rb_insert_color(&odi
->node
, &sctx
->orphan_dirs
);
3156 static struct orphan_dir_info
*get_orphan_dir_info(struct send_ctx
*sctx
,
3157 u64 dir_ino
, u64 gen
)
3159 struct rb_node
*n
= sctx
->orphan_dirs
.rb_node
;
3160 struct orphan_dir_info
*entry
;
3163 entry
= rb_entry(n
, struct orphan_dir_info
, node
);
3164 if (dir_ino
< entry
->ino
)
3166 else if (dir_ino
> entry
->ino
)
3168 else if (gen
< entry
->gen
)
3170 else if (gen
> entry
->gen
)
3178 static int is_waiting_for_rm(struct send_ctx
*sctx
, u64 dir_ino
, u64 gen
)
3180 struct orphan_dir_info
*odi
= get_orphan_dir_info(sctx
, dir_ino
, gen
);
3185 static void free_orphan_dir_info(struct send_ctx
*sctx
,
3186 struct orphan_dir_info
*odi
)
3190 rb_erase(&odi
->node
, &sctx
->orphan_dirs
);
/*
 * Returns 1 if a directory can be removed at this point in time.
 * We check this by iterating all dir items and checking if the inode behind
 * the dir item was already processed.
 */
3199 static int can_rmdir(struct send_ctx
*sctx
, u64 dir
, u64 dir_gen
)
3203 struct btrfs_root
*root
= sctx
->parent_root
;
3204 struct btrfs_path
*path
;
3205 struct btrfs_key key
;
3206 struct btrfs_key found_key
;
3207 struct btrfs_key loc
;
3208 struct btrfs_dir_item
*di
;
3209 struct orphan_dir_info
*odi
= NULL
;
3210 u64 dir_high_seq_ino
= 0;
3211 u64 last_dir_index_offset
= 0;
3214 * Don't try to rmdir the top/root subvolume dir.
3216 if (dir
== BTRFS_FIRST_FREE_OBJECTID
)
3219 odi
= get_orphan_dir_info(sctx
, dir
, dir_gen
);
3220 if (odi
&& sctx
->cur_ino
< odi
->dir_high_seq_ino
)
3223 path
= alloc_path_for_send();
3229 * Find the inode number associated with the last dir index
3230 * entry. This is very likely the inode with the highest number
3231 * of all inodes that have an entry in the directory. We can
3232 * then use it to avoid future calls to can_rmdir(), when
3233 * processing inodes with a lower number, from having to search
3234 * the parent root b+tree for dir index keys.
3237 key
.type
= BTRFS_DIR_INDEX_KEY
;
3238 key
.offset
= (u64
)-1;
3240 ret
= btrfs_search_slot(NULL
, root
, &key
, path
, 0, 0);
3243 } else if (ret
> 0) {
3244 /* Can't happen, the root is never empty. */
3245 ASSERT(path
->slots
[0] > 0);
3246 if (WARN_ON(path
->slots
[0] == 0)) {
3253 btrfs_item_key_to_cpu(path
->nodes
[0], &key
, path
->slots
[0]);
3254 if (key
.objectid
!= dir
|| key
.type
!= BTRFS_DIR_INDEX_KEY
) {
3255 /* No index keys, dir can be removed. */
3260 di
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
3261 struct btrfs_dir_item
);
3262 btrfs_dir_item_key_to_cpu(path
->nodes
[0], di
, &loc
);
3263 dir_high_seq_ino
= loc
.objectid
;
3264 if (sctx
->cur_ino
< dir_high_seq_ino
) {
3269 btrfs_release_path(path
);
3273 key
.type
= BTRFS_DIR_INDEX_KEY
;
3274 key
.offset
= (odi
? odi
->last_dir_index_offset
: 0);
3276 btrfs_for_each_slot(root
, &key
, &found_key
, path
, iter_ret
) {
3277 struct waiting_dir_move
*dm
;
3279 if (found_key
.objectid
!= key
.objectid
||
3280 found_key
.type
!= key
.type
)
3283 di
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
3284 struct btrfs_dir_item
);
3285 btrfs_dir_item_key_to_cpu(path
->nodes
[0], di
, &loc
);
3287 dir_high_seq_ino
= max(dir_high_seq_ino
, loc
.objectid
);
3288 last_dir_index_offset
= found_key
.offset
;
3290 dm
= get_waiting_dir_move(sctx
, loc
.objectid
);
3292 dm
->rmdir_ino
= dir
;
3293 dm
->rmdir_gen
= dir_gen
;
3298 if (loc
.objectid
> sctx
->cur_ino
) {
3307 free_orphan_dir_info(sctx
, odi
);
3312 btrfs_free_path(path
);
3318 odi
= add_orphan_dir_info(sctx
, dir
, dir_gen
);
3320 return PTR_ERR(odi
);
3325 odi
->last_dir_index_offset
= last_dir_index_offset
;
3326 odi
->dir_high_seq_ino
= max(odi
->dir_high_seq_ino
, dir_high_seq_ino
);
3331 static int is_waiting_for_move(struct send_ctx
*sctx
, u64 ino
)
3333 struct waiting_dir_move
*entry
= get_waiting_dir_move(sctx
, ino
);
3335 return entry
!= NULL
;
3338 static int add_waiting_dir_move(struct send_ctx
*sctx
, u64 ino
, bool orphanized
)
3340 struct rb_node
**p
= &sctx
->waiting_dir_moves
.rb_node
;
3341 struct rb_node
*parent
= NULL
;
3342 struct waiting_dir_move
*entry
, *dm
;
3344 dm
= kmalloc(sizeof(*dm
), GFP_KERNEL
);
3350 dm
->orphanized
= orphanized
;
3354 entry
= rb_entry(parent
, struct waiting_dir_move
, node
);
3355 if (ino
< entry
->ino
) {
3357 } else if (ino
> entry
->ino
) {
3358 p
= &(*p
)->rb_right
;
3365 rb_link_node(&dm
->node
, parent
, p
);
3366 rb_insert_color(&dm
->node
, &sctx
->waiting_dir_moves
);
3370 static struct waiting_dir_move
*
3371 get_waiting_dir_move(struct send_ctx
*sctx
, u64 ino
)
3373 struct rb_node
*n
= sctx
->waiting_dir_moves
.rb_node
;
3374 struct waiting_dir_move
*entry
;
3377 entry
= rb_entry(n
, struct waiting_dir_move
, node
);
3378 if (ino
< entry
->ino
)
3380 else if (ino
> entry
->ino
)
3388 static void free_waiting_dir_move(struct send_ctx
*sctx
,
3389 struct waiting_dir_move
*dm
)
3393 rb_erase(&dm
->node
, &sctx
->waiting_dir_moves
);
3397 static int add_pending_dir_move(struct send_ctx
*sctx
,
3401 struct list_head
*new_refs
,
3402 struct list_head
*deleted_refs
,
3403 const bool is_orphan
)
3405 struct rb_node
**p
= &sctx
->pending_dir_moves
.rb_node
;
3406 struct rb_node
*parent
= NULL
;
3407 struct pending_dir_move
*entry
= NULL
, *pm
;
3408 struct recorded_ref
*cur
;
3412 pm
= kmalloc(sizeof(*pm
), GFP_KERNEL
);
3415 pm
->parent_ino
= parent_ino
;
3418 INIT_LIST_HEAD(&pm
->list
);
3419 INIT_LIST_HEAD(&pm
->update_refs
);
3420 RB_CLEAR_NODE(&pm
->node
);
3424 entry
= rb_entry(parent
, struct pending_dir_move
, node
);
3425 if (parent_ino
< entry
->parent_ino
) {
3427 } else if (parent_ino
> entry
->parent_ino
) {
3428 p
= &(*p
)->rb_right
;
3435 list_for_each_entry(cur
, deleted_refs
, list
) {
3436 ret
= dup_ref(cur
, &pm
->update_refs
);
3440 list_for_each_entry(cur
, new_refs
, list
) {
3441 ret
= dup_ref(cur
, &pm
->update_refs
);
3446 ret
= add_waiting_dir_move(sctx
, pm
->ino
, is_orphan
);
3451 list_add_tail(&pm
->list
, &entry
->list
);
3453 rb_link_node(&pm
->node
, parent
, p
);
3454 rb_insert_color(&pm
->node
, &sctx
->pending_dir_moves
);
3459 __free_recorded_refs(&pm
->update_refs
);
3465 static struct pending_dir_move
*get_pending_dir_moves(struct send_ctx
*sctx
,
3468 struct rb_node
*n
= sctx
->pending_dir_moves
.rb_node
;
3469 struct pending_dir_move
*entry
;
3472 entry
= rb_entry(n
, struct pending_dir_move
, node
);
3473 if (parent_ino
< entry
->parent_ino
)
3475 else if (parent_ino
> entry
->parent_ino
)
3483 static int path_loop(struct send_ctx
*sctx
, struct fs_path
*name
,
3484 u64 ino
, u64 gen
, u64
*ancestor_ino
)
3487 u64 parent_inode
= 0;
3489 u64 start_ino
= ino
;
3492 while (ino
!= BTRFS_FIRST_FREE_OBJECTID
) {
3493 fs_path_reset(name
);
3495 if (is_waiting_for_rm(sctx
, ino
, gen
))
3497 if (is_waiting_for_move(sctx
, ino
)) {
3498 if (*ancestor_ino
== 0)
3499 *ancestor_ino
= ino
;
3500 ret
= get_first_ref(sctx
->parent_root
, ino
,
3501 &parent_inode
, &parent_gen
, name
);
3503 ret
= __get_cur_name_and_parent(sctx
, ino
, gen
,
3513 if (parent_inode
== start_ino
) {
3515 if (*ancestor_ino
== 0)
3516 *ancestor_ino
= ino
;
3525 static int apply_dir_move(struct send_ctx
*sctx
, struct pending_dir_move
*pm
)
3527 struct fs_path
*from_path
= NULL
;
3528 struct fs_path
*to_path
= NULL
;
3529 struct fs_path
*name
= NULL
;
3530 u64 orig_progress
= sctx
->send_progress
;
3531 struct recorded_ref
*cur
;
3532 u64 parent_ino
, parent_gen
;
3533 struct waiting_dir_move
*dm
= NULL
;
3540 name
= fs_path_alloc();
3541 from_path
= fs_path_alloc();
3542 if (!name
|| !from_path
) {
3547 dm
= get_waiting_dir_move(sctx
, pm
->ino
);
3549 rmdir_ino
= dm
->rmdir_ino
;
3550 rmdir_gen
= dm
->rmdir_gen
;
3551 is_orphan
= dm
->orphanized
;
3552 free_waiting_dir_move(sctx
, dm
);
3555 ret
= gen_unique_name(sctx
, pm
->ino
,
3556 pm
->gen
, from_path
);
3558 ret
= get_first_ref(sctx
->parent_root
, pm
->ino
,
3559 &parent_ino
, &parent_gen
, name
);
3562 ret
= get_cur_path(sctx
, parent_ino
, parent_gen
,
3566 ret
= fs_path_add_path(from_path
, name
);
3571 sctx
->send_progress
= sctx
->cur_ino
+ 1;
3572 ret
= path_loop(sctx
, name
, pm
->ino
, pm
->gen
, &ancestor
);
3576 LIST_HEAD(deleted_refs
);
3577 ASSERT(ancestor
> BTRFS_FIRST_FREE_OBJECTID
);
3578 ret
= add_pending_dir_move(sctx
, pm
->ino
, pm
->gen
, ancestor
,
3579 &pm
->update_refs
, &deleted_refs
,
3584 dm
= get_waiting_dir_move(sctx
, pm
->ino
);
3586 dm
->rmdir_ino
= rmdir_ino
;
3587 dm
->rmdir_gen
= rmdir_gen
;
3591 fs_path_reset(name
);
3594 ret
= get_cur_path(sctx
, pm
->ino
, pm
->gen
, to_path
);
3598 ret
= send_rename(sctx
, from_path
, to_path
);
3603 struct orphan_dir_info
*odi
;
3606 odi
= get_orphan_dir_info(sctx
, rmdir_ino
, rmdir_gen
);
3608 /* already deleted */
3613 ret
= can_rmdir(sctx
, rmdir_ino
, gen
);
3619 name
= fs_path_alloc();
3624 ret
= get_cur_path(sctx
, rmdir_ino
, gen
, name
);
3627 ret
= send_rmdir(sctx
, name
);
3633 ret
= cache_dir_utimes(sctx
, pm
->ino
, pm
->gen
);
3638 * After rename/move, need to update the utimes of both new parent(s)
3639 * and old parent(s).
3641 list_for_each_entry(cur
, &pm
->update_refs
, list
) {
3643 * The parent inode might have been deleted in the send snapshot
3645 ret
= get_inode_info(sctx
->send_root
, cur
->dir
, NULL
);
3646 if (ret
== -ENOENT
) {
3653 ret
= cache_dir_utimes(sctx
, cur
->dir
, cur
->dir_gen
);
3660 fs_path_free(from_path
);
3661 fs_path_free(to_path
);
3662 sctx
->send_progress
= orig_progress
;
3667 static void free_pending_move(struct send_ctx
*sctx
, struct pending_dir_move
*m
)
3669 if (!list_empty(&m
->list
))
3671 if (!RB_EMPTY_NODE(&m
->node
))
3672 rb_erase(&m
->node
, &sctx
->pending_dir_moves
);
3673 __free_recorded_refs(&m
->update_refs
);
3677 static void tail_append_pending_moves(struct send_ctx
*sctx
,
3678 struct pending_dir_move
*moves
,
3679 struct list_head
*stack
)
3681 if (list_empty(&moves
->list
)) {
3682 list_add_tail(&moves
->list
, stack
);
3685 list_splice_init(&moves
->list
, &list
);
3686 list_add_tail(&moves
->list
, stack
);
3687 list_splice_tail(&list
, stack
);
3689 if (!RB_EMPTY_NODE(&moves
->node
)) {
3690 rb_erase(&moves
->node
, &sctx
->pending_dir_moves
);
3691 RB_CLEAR_NODE(&moves
->node
);
3695 static int apply_children_dir_moves(struct send_ctx
*sctx
)
3697 struct pending_dir_move
*pm
;
3699 u64 parent_ino
= sctx
->cur_ino
;
3702 pm
= get_pending_dir_moves(sctx
, parent_ino
);
3706 tail_append_pending_moves(sctx
, pm
, &stack
);
3708 while (!list_empty(&stack
)) {
3709 pm
= list_first_entry(&stack
, struct pending_dir_move
, list
);
3710 parent_ino
= pm
->ino
;
3711 ret
= apply_dir_move(sctx
, pm
);
3712 free_pending_move(sctx
, pm
);
3715 pm
= get_pending_dir_moves(sctx
, parent_ino
);
3717 tail_append_pending_moves(sctx
, pm
, &stack
);
3722 while (!list_empty(&stack
)) {
3723 pm
= list_first_entry(&stack
, struct pending_dir_move
, list
);
3724 free_pending_move(sctx
, pm
);
/*
 * We might need to delay a directory rename even when no ancestor directory
 * (in the send root) with a higher inode number than ours (sctx->cur_ino) was
 * renamed. This happens when we rename a directory to the old name (the name
 * in the parent root) of some other unrelated directory that got its rename
 * delayed due to some ancestor with higher number that got renamed.
 *
 * Example:
 *
 * Parent snapshot:
 * .                                       (ino 256)
 * |---- a/                                (ino 257)
 * |     |---- file                        (ino 260)
 * |
 * |---- b/                                (ino 258)
 * |---- c/                                (ino 259)
 *
 * Send snapshot:
 * .                                       (ino 256)
 * |---- a/                                (ino 258)
 * |---- x/                                (ino 259)
 *       |---- y/                          (ino 257)
 *             |----- file                 (ino 260)
 *
 * Here we can not rename 258 from 'b' to 'a' without the rename of inode 257
 * from 'a' to 'x/y' happening first, which in turn depends on the rename of
 * inode 259 from 'c' to 'x'. So the order of rename commands the send stream
 * must issue is:
 *
 * 1 - rename 259 from 'c' to 'x'
 * 2 - rename 257 from 'a' to 'x/y'
 * 3 - rename 258 from 'b' to 'a'
 *
 * Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can
 * be done right away and < 0 on error.
 */
3765 static int wait_for_dest_dir_move(struct send_ctx
*sctx
,
3766 struct recorded_ref
*parent_ref
,
3767 const bool is_orphan
)
3769 struct btrfs_fs_info
*fs_info
= sctx
->parent_root
->fs_info
;
3770 struct btrfs_path
*path
;
3771 struct btrfs_key key
;
3772 struct btrfs_key di_key
;
3773 struct btrfs_dir_item
*di
;
3777 struct waiting_dir_move
*wdm
;
3779 if (RB_EMPTY_ROOT(&sctx
->waiting_dir_moves
))
3782 path
= alloc_path_for_send();
3786 key
.objectid
= parent_ref
->dir
;
3787 key
.type
= BTRFS_DIR_ITEM_KEY
;
3788 key
.offset
= btrfs_name_hash(parent_ref
->name
, parent_ref
->name_len
);
3790 ret
= btrfs_search_slot(NULL
, sctx
->parent_root
, &key
, path
, 0, 0);
3793 } else if (ret
> 0) {
3798 di
= btrfs_match_dir_item_name(fs_info
, path
, parent_ref
->name
,
3799 parent_ref
->name_len
);
3805 * di_key.objectid has the number of the inode that has a dentry in the
3806 * parent directory with the same name that sctx->cur_ino is being
3807 * renamed to. We need to check if that inode is in the send root as
3808 * well and if it is currently marked as an inode with a pending rename,
3809 * if it is, we need to delay the rename of sctx->cur_ino as well, so
3810 * that it happens after that other inode is renamed.
3812 btrfs_dir_item_key_to_cpu(path
->nodes
[0], di
, &di_key
);
3813 if (di_key
.type
!= BTRFS_INODE_ITEM_KEY
) {
3818 ret
= get_inode_gen(sctx
->parent_root
, di_key
.objectid
, &left_gen
);
3821 ret
= get_inode_gen(sctx
->send_root
, di_key
.objectid
, &right_gen
);
3828 /* Different inode, no need to delay the rename of sctx->cur_ino */
3829 if (right_gen
!= left_gen
) {
3834 wdm
= get_waiting_dir_move(sctx
, di_key
.objectid
);
3835 if (wdm
&& !wdm
->orphanized
) {
3836 ret
= add_pending_dir_move(sctx
,
3838 sctx
->cur_inode_gen
,
3841 &sctx
->deleted_refs
,
3847 btrfs_free_path(path
);
/*
 * Check if inode ino2, or any of its ancestors, is inode ino1.
 * Return 1 if true, 0 if false and < 0 on error.
 */
3855 static int check_ino_in_path(struct btrfs_root
*root
,
3860 struct fs_path
*fs_path
)
3865 return ino1_gen
== ino2_gen
;
3867 while (ino
> BTRFS_FIRST_FREE_OBJECTID
) {
3872 fs_path_reset(fs_path
);
3873 ret
= get_first_ref(root
, ino
, &parent
, &parent_gen
, fs_path
);
3877 return parent_gen
== ino1_gen
;
/*
 * Check if inode ino1 is an ancestor of inode ino2 in the given root for any
 * possible path (in case ino2 is not a directory and has multiple hard links).
 * Return 1 if true, 0 if false and < 0 on error.
 */
3888 static int is_ancestor(struct btrfs_root
*root
,
3892 struct fs_path
*fs_path
)
3894 bool free_fs_path
= false;
3897 struct btrfs_path
*path
= NULL
;
3898 struct btrfs_key key
;
3901 fs_path
= fs_path_alloc();
3904 free_fs_path
= true;
3907 path
= alloc_path_for_send();
3913 key
.objectid
= ino2
;
3914 key
.type
= BTRFS_INODE_REF_KEY
;
3917 btrfs_for_each_slot(root
, &key
, &key
, path
, iter_ret
) {
3918 struct extent_buffer
*leaf
= path
->nodes
[0];
3919 int slot
= path
->slots
[0];
3923 if (key
.objectid
!= ino2
)
3925 if (key
.type
!= BTRFS_INODE_REF_KEY
&&
3926 key
.type
!= BTRFS_INODE_EXTREF_KEY
)
3929 item_size
= btrfs_item_size(leaf
, slot
);
3930 while (cur_offset
< item_size
) {
3934 if (key
.type
== BTRFS_INODE_EXTREF_KEY
) {
3936 struct btrfs_inode_extref
*extref
;
3938 ptr
= btrfs_item_ptr_offset(leaf
, slot
);
3939 extref
= (struct btrfs_inode_extref
*)
3941 parent
= btrfs_inode_extref_parent(leaf
,
3943 cur_offset
+= sizeof(*extref
);
3944 cur_offset
+= btrfs_inode_extref_name_len(leaf
,
3947 parent
= key
.offset
;
3948 cur_offset
= item_size
;
3951 ret
= get_inode_gen(root
, parent
, &parent_gen
);
3954 ret
= check_ino_in_path(root
, ino1
, ino1_gen
,
3955 parent
, parent_gen
, fs_path
);
3965 btrfs_free_path(path
);
3967 fs_path_free(fs_path
);
3971 static int wait_for_parent_move(struct send_ctx
*sctx
,
3972 struct recorded_ref
*parent_ref
,
3973 const bool is_orphan
)
3976 u64 ino
= parent_ref
->dir
;
3977 u64 ino_gen
= parent_ref
->dir_gen
;
3978 u64 parent_ino_before
, parent_ino_after
;
3979 struct fs_path
*path_before
= NULL
;
3980 struct fs_path
*path_after
= NULL
;
3983 path_after
= fs_path_alloc();
3984 path_before
= fs_path_alloc();
3985 if (!path_after
|| !path_before
) {
	/*
	 * Our current directory inode may not yet be renamed/moved because some
	 * ancestor (immediate or not) has to be renamed/moved first. So find if
	 * such ancestor exists and make sure our own rename/move happens after
	 * that ancestor is processed to avoid path build infinite loops (done
	 * at get_cur_path()).
	 */
3997 while (ino
> BTRFS_FIRST_FREE_OBJECTID
) {
3998 u64 parent_ino_after_gen
;
4000 if (is_waiting_for_move(sctx
, ino
)) {
			/*
			 * If the current inode is an ancestor of ino in the
			 * parent root, we need to delay the rename of the
			 * current inode, otherwise don't delay the rename
			 * because we can end up with a circular dependency
			 * of renames, resulting in some directories never
			 * getting the respective rename operations issued in
			 * the send stream or getting into infinite path build
			 * loops.
			 */
4011 ret
= is_ancestor(sctx
->parent_root
,
4012 sctx
->cur_ino
, sctx
->cur_inode_gen
,
4018 fs_path_reset(path_before
);
4019 fs_path_reset(path_after
);
4021 ret
= get_first_ref(sctx
->send_root
, ino
, &parent_ino_after
,
4022 &parent_ino_after_gen
, path_after
);
4025 ret
= get_first_ref(sctx
->parent_root
, ino
, &parent_ino_before
,
4027 if (ret
< 0 && ret
!= -ENOENT
) {
4029 } else if (ret
== -ENOENT
) {
4034 len1
= fs_path_len(path_before
);
4035 len2
= fs_path_len(path_after
);
4036 if (ino
> sctx
->cur_ino
&&
4037 (parent_ino_before
!= parent_ino_after
|| len1
!= len2
||
4038 memcmp(path_before
->start
, path_after
->start
, len1
))) {
4041 ret
= get_inode_gen(sctx
->parent_root
, ino
, &parent_ino_gen
);
4044 if (ino_gen
== parent_ino_gen
) {
4049 ino
= parent_ino_after
;
4050 ino_gen
= parent_ino_after_gen
;
4054 fs_path_free(path_before
);
4055 fs_path_free(path_after
);
4058 ret
= add_pending_dir_move(sctx
,
4060 sctx
->cur_inode_gen
,
4063 &sctx
->deleted_refs
,
4072 static int update_ref_path(struct send_ctx
*sctx
, struct recorded_ref
*ref
)
4075 struct fs_path
*new_path
;
4078 * Our reference's name member points to its full_path member string, so
4079 * we use here a new path.
4081 new_path
= fs_path_alloc();
4085 ret
= get_cur_path(sctx
, ref
->dir
, ref
->dir_gen
, new_path
);
4087 fs_path_free(new_path
);
4090 ret
= fs_path_add(new_path
, ref
->name
, ref
->name_len
);
4092 fs_path_free(new_path
);
4096 fs_path_free(ref
->full_path
);
4097 set_ref_path(ref
, new_path
);
/*
 * When processing the new references for an inode we may orphanize an existing
 * directory inode because its old name conflicts with one of the new references
 * of the current inode. Later, when processing another new reference of our
 * inode, we might need to orphanize another inode, but the path we have in the
 * reference reflects the pre-orphanization name of the directory we previously
 * orphanized. For example:
 *
 * parent snapshot looks like:
 *
 * .                                     (ino 256)
 * |----- f1                             (ino 257)
 * |----- f2                             (ino 258)
 * |----- d1/                            (ino 259)
 * |----- d2/                            (ino 260)
 *
 * send snapshot looks like:
 *
 * .                                     (ino 256)
 * |----- d1                             (ino 258)
 * |----- f2/                            (ino 259)
 *        |----- f2_link/                (ino 260)
 *        |      |----- f1               (ino 257)
 *        |
 *        |----- d2                      (ino 258)
 *
 * When processing inode 257 we compute the name for inode 259 as "d1", and we
 * cache it in the name cache. Later when we start processing inode 258, when
 * collecting all its new references we set a full path of "d1/d2" for its new
 * reference with name "d2". When we start processing the new references we
 * start by processing the new reference with name "d1", and this results in
 * orphanizing inode 259, since its old reference causes a conflict. Then we
 * move on to the next new reference, with name "d2", and we find out we must
 * orphanize inode 260, as its old reference conflicts with ours - but for the
 * orphanization we use a source path corresponding to the path we stored in the
 * new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the
 * receiver fail since the path component "d1/" no longer exists, it was renamed
 * to "o259-6-0/" when processing the previous new reference. So in this case we
 * must recompute the path in the new reference and use it for the new
 * orphanization operation.
 */
4143 static int refresh_ref_path(struct send_ctx
*sctx
, struct recorded_ref
*ref
)
4148 name
= kmemdup(ref
->name
, ref
->name_len
, GFP_KERNEL
);
4152 fs_path_reset(ref
->full_path
);
4153 ret
= get_cur_path(sctx
, ref
->dir
, ref
->dir_gen
, ref
->full_path
);
4157 ret
= fs_path_add(ref
->full_path
, name
, ref
->name_len
);
4161 /* Update the reference's base name pointer. */
4162 set_ref_path(ref
, ref
->full_path
);
/*
 * This does all the move/link/unlink/rmdir magic.
 */
static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
{
4173 struct btrfs_fs_info
*fs_info
= sctx
->send_root
->fs_info
;
4175 struct recorded_ref
*cur
;
4176 struct recorded_ref
*cur2
;
4177 LIST_HEAD(check_dirs
);
4178 struct fs_path
*valid_path
= NULL
;
4182 int did_overwrite
= 0;
4184 u64 last_dir_ino_rm
= 0;
4185 bool can_rename
= true;
4186 bool orphanized_dir
= false;
4187 bool orphanized_ancestor
= false;
4189 btrfs_debug(fs_info
, "process_recorded_refs %llu", sctx
->cur_ino
);
4192 * This should never happen as the root dir always has the same ref
4193 * which is always '..'
4195 if (unlikely(sctx
->cur_ino
<= BTRFS_FIRST_FREE_OBJECTID
)) {
4197 "send: unexpected inode %llu in process_recorded_refs()",
4203 valid_path
= fs_path_alloc();
4210 * First, check if the first ref of the current inode was overwritten
4211 * before. If yes, we know that the current inode was already orphanized
4212 * and thus use the orphan name. If not, we can use get_cur_path to
4213 * get the path of the first ref as it would like while receiving at
4214 * this point in time.
4215 * New inodes are always orphan at the beginning, so force to use the
4216 * orphan name in this case.
4217 * The first ref is stored in valid_path and will be updated if it
4218 * gets moved around.
4220 if (!sctx
->cur_inode_new
) {
4221 ret
= did_overwrite_first_ref(sctx
, sctx
->cur_ino
,
4222 sctx
->cur_inode_gen
);
4228 if (sctx
->cur_inode_new
|| did_overwrite
) {
4229 ret
= gen_unique_name(sctx
, sctx
->cur_ino
,
4230 sctx
->cur_inode_gen
, valid_path
);
4235 ret
= get_cur_path(sctx
, sctx
->cur_ino
, sctx
->cur_inode_gen
,
4242 * Before doing any rename and link operations, do a first pass on the
4243 * new references to orphanize any unprocessed inodes that may have a
4244 * reference that conflicts with one of the new references of the current
4245 * inode. This needs to happen first because a new reference may conflict
4246 * with the old reference of a parent directory, so we must make sure
4247 * that the path used for link and rename commands don't use an
4248 * orphanized name when an ancestor was not yet orphanized.
4255 * |----- testdir/ (ino 259)
4256 * | |----- a (ino 257)
4258 * |----- b (ino 258)
4263 * |----- testdir_2/ (ino 259)
4264 * | |----- a (ino 260)
4266 * |----- testdir (ino 257)
4267 * |----- b (ino 257)
4268 * |----- b2 (ino 258)
4270 * Processing the new reference for inode 257 with name "b" may happen
4271 * before processing the new reference with name "testdir". If so, we
4272 * must make sure that by the time we send a link command to create the
4273 * hard link "b", inode 259 was already orphanized, since the generated
4274 * path in "valid_path" already contains the orphanized name for 259.
4275 * We are processing inode 257, so only later when processing 259 we do
4276 * the rename operation to change its temporary (orphanized) name to
4279 list_for_each_entry(cur
, &sctx
->new_refs
, list
) {
4280 ret
= get_cur_inode_state(sctx
, cur
->dir
, cur
->dir_gen
, NULL
, NULL
);
4283 if (ret
== inode_state_will_create
)
4287 * Check if this new ref would overwrite the first ref of another
4288 * unprocessed inode. If yes, orphanize the overwritten inode.
4289 * If we find an overwritten ref that is not the first ref,
4292 ret
= will_overwrite_ref(sctx
, cur
->dir
, cur
->dir_gen
,
4293 cur
->name
, cur
->name_len
,
4294 &ow_inode
, &ow_gen
, &ow_mode
);
4298 ret
= is_first_ref(sctx
->parent_root
,
4299 ow_inode
, cur
->dir
, cur
->name
,
4304 struct name_cache_entry
*nce
;
4305 struct waiting_dir_move
*wdm
;
4307 if (orphanized_dir
) {
4308 ret
= refresh_ref_path(sctx
, cur
);
4313 ret
= orphanize_inode(sctx
, ow_inode
, ow_gen
,
4317 if (S_ISDIR(ow_mode
))
4318 orphanized_dir
= true;
4321 * If ow_inode has its rename operation delayed
4322 * make sure that its orphanized name is used in
4323 * the source path when performing its rename
4326 wdm
= get_waiting_dir_move(sctx
, ow_inode
);
4328 wdm
->orphanized
= true;
4331 * Make sure we clear our orphanized inode's
4332 * name from the name cache. This is because the
4333 * inode ow_inode might be an ancestor of some
4334 * other inode that will be orphanized as well
4335 * later and has an inode number greater than
4336 * sctx->send_progress. We need to prevent
4337 * future name lookups from using the old name
4338 * and get instead the orphan name.
4340 nce
= name_cache_search(sctx
, ow_inode
, ow_gen
);
4342 btrfs_lru_cache_remove(&sctx
->name_cache
,
4346 * ow_inode might currently be an ancestor of
4347 * cur_ino, therefore compute valid_path (the
4348 * current path of cur_ino) again because it
4349 * might contain the pre-orphanization name of
4350 * ow_inode, which is no longer valid.
4352 ret
= is_ancestor(sctx
->parent_root
,
4354 sctx
->cur_ino
, NULL
);
4356 orphanized_ancestor
= true;
4357 fs_path_reset(valid_path
);
4358 ret
= get_cur_path(sctx
, sctx
->cur_ino
,
4359 sctx
->cur_inode_gen
,
4366 * If we previously orphanized a directory that
4367 * collided with a new reference that we already
4368 * processed, recompute the current path because
4369 * that directory may be part of the path.
4371 if (orphanized_dir
) {
4372 ret
= refresh_ref_path(sctx
, cur
);
4376 ret
= send_unlink(sctx
, cur
->full_path
);
4384 list_for_each_entry(cur
, &sctx
->new_refs
, list
) {
4386 * We may have refs where the parent directory does not exist
4387 * yet. This happens if the parent directories inum is higher
4388 * than the current inum. To handle this case, we create the
4389 * parent directory out of order. But we need to check if this
4390 * did already happen before due to other refs in the same dir.
4392 ret
= get_cur_inode_state(sctx
, cur
->dir
, cur
->dir_gen
, NULL
, NULL
);
4395 if (ret
== inode_state_will_create
) {
4398 * First check if any of the current inodes refs did
4399 * already create the dir.
4401 list_for_each_entry(cur2
, &sctx
->new_refs
, list
) {
4404 if (cur2
->dir
== cur
->dir
) {
4411 * If that did not happen, check if a previous inode
4412 * did already create the dir.
4415 ret
= did_create_dir(sctx
, cur
->dir
);
4419 ret
= send_create_inode(sctx
, cur
->dir
);
4422 cache_dir_created(sctx
, cur
->dir
);
4426 if (S_ISDIR(sctx
->cur_inode_mode
) && sctx
->parent_root
) {
4427 ret
= wait_for_dest_dir_move(sctx
, cur
, is_orphan
);
4436 if (S_ISDIR(sctx
->cur_inode_mode
) && sctx
->parent_root
&&
4438 ret
= wait_for_parent_move(sctx
, cur
, is_orphan
);
4448 * link/move the ref to the new place. If we have an orphan
4449 * inode, move it and update valid_path. If not, link or move
4450 * it depending on the inode mode.
4452 if (is_orphan
&& can_rename
) {
4453 ret
= send_rename(sctx
, valid_path
, cur
->full_path
);
4457 ret
= fs_path_copy(valid_path
, cur
->full_path
);
4460 } else if (can_rename
) {
4461 if (S_ISDIR(sctx
->cur_inode_mode
)) {
4463 * Dirs can't be linked, so move it. For moved
4464 * dirs, we always have one new and one deleted
4465 * ref. The deleted ref is ignored later.
4467 ret
= send_rename(sctx
, valid_path
,
4470 ret
= fs_path_copy(valid_path
,
4476 * We might have previously orphanized an inode
4477 * which is an ancestor of our current inode,
4478 * so our reference's full path, which was
4479 * computed before any such orphanizations, must
4482 if (orphanized_dir
) {
4483 ret
= update_ref_path(sctx
, cur
);
4487 ret
= send_link(sctx
, cur
->full_path
,
4493 ret
= dup_ref(cur
, &check_dirs
);
4498 if (S_ISDIR(sctx
->cur_inode_mode
) && sctx
->cur_inode_deleted
) {
4500 * Check if we can already rmdir the directory. If not,
4501 * orphanize it. For every dir item inside that gets deleted
4502 * later, we do this check again and rmdir it then if possible.
4503 * See the use of check_dirs for more details.
4505 ret
= can_rmdir(sctx
, sctx
->cur_ino
, sctx
->cur_inode_gen
);
4509 ret
= send_rmdir(sctx
, valid_path
);
4512 } else if (!is_orphan
) {
4513 ret
= orphanize_inode(sctx
, sctx
->cur_ino
,
4514 sctx
->cur_inode_gen
, valid_path
);
4520 list_for_each_entry(cur
, &sctx
->deleted_refs
, list
) {
4521 ret
= dup_ref(cur
, &check_dirs
);
4525 } else if (S_ISDIR(sctx
->cur_inode_mode
) &&
4526 !list_empty(&sctx
->deleted_refs
)) {
4528 * We have a moved dir. Add the old parent to check_dirs
4530 cur
= list_entry(sctx
->deleted_refs
.next
, struct recorded_ref
,
4532 ret
= dup_ref(cur
, &check_dirs
);
4535 } else if (!S_ISDIR(sctx
->cur_inode_mode
)) {
4537 * We have a non dir inode. Go through all deleted refs and
4538 * unlink them if they were not already overwritten by other
4541 list_for_each_entry(cur
, &sctx
->deleted_refs
, list
) {
4542 ret
= did_overwrite_ref(sctx
, cur
->dir
, cur
->dir_gen
,
4543 sctx
->cur_ino
, sctx
->cur_inode_gen
,
4544 cur
->name
, cur
->name_len
);
4549 * If we orphanized any ancestor before, we need
4550 * to recompute the full path for deleted names,
4551 * since any such path was computed before we
4552 * processed any references and orphanized any
4555 if (orphanized_ancestor
) {
4556 ret
= update_ref_path(sctx
, cur
);
4560 ret
= send_unlink(sctx
, cur
->full_path
);
4564 ret
= dup_ref(cur
, &check_dirs
);
4569 * If the inode is still orphan, unlink the orphan. This may
4570 * happen when a previous inode did overwrite the first ref
4571 * of this inode and no new refs were added for the current
4572 * inode. Unlinking does not mean that the inode is deleted in
4573 * all cases. There may still be links to this inode in other
4577 ret
= send_unlink(sctx
, valid_path
);
4584 * We did collect all parent dirs where cur_inode was once located. We
4585 * now go through all these dirs and check if they are pending for
4586 * deletion and if it's finally possible to perform the rmdir now.
4587 * We also update the inode stats of the parent dirs here.
4589 list_for_each_entry(cur
, &check_dirs
, list
) {
4591 * In case we had refs into dirs that were not processed yet,
4592 * we don't need to do the utime and rmdir logic for these dirs.
4593 * The dir will be processed later.
4595 if (cur
->dir
> sctx
->cur_ino
)
4598 ret
= get_cur_inode_state(sctx
, cur
->dir
, cur
->dir_gen
, NULL
, NULL
);
4602 if (ret
== inode_state_did_create
||
4603 ret
== inode_state_no_change
) {
4604 ret
= cache_dir_utimes(sctx
, cur
->dir
, cur
->dir_gen
);
4607 } else if (ret
== inode_state_did_delete
&&
4608 cur
->dir
!= last_dir_ino_rm
) {
4609 ret
= can_rmdir(sctx
, cur
->dir
, cur
->dir_gen
);
4613 ret
= get_cur_path(sctx
, cur
->dir
,
4614 cur
->dir_gen
, valid_path
);
4617 ret
= send_rmdir(sctx
, valid_path
);
4620 last_dir_ino_rm
= cur
->dir
;
4628 __free_recorded_refs(&check_dirs
);
4629 free_recorded_refs(sctx
);
4630 fs_path_free(valid_path
);
static int rbtree_ref_comp(const void *k, const struct rb_node *node)
{
	const struct recorded_ref *data = k;
	const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node);
	int result;

	if (data->dir > ref->dir)
		return 1;
	if (data->dir < ref->dir)
		return -1;
	if (data->dir_gen > ref->dir_gen)
		return 1;
	if (data->dir_gen < ref->dir_gen)
		return -1;
	if (data->name_len > ref->name_len)
		return 1;
	if (data->name_len < ref->name_len)
		return -1;
	result = strcmp(data->name, ref->name);
	if (result > 0)
		return 1;
	if (result < 0)
		return -1;
	return 0;
}

static bool rbtree_ref_less(struct rb_node *node, const struct rb_node *parent)
{
	const struct recorded_ref *entry = rb_entry(node, struct recorded_ref, node);

	return rbtree_ref_comp(entry, parent) < 0;
}
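
/*
 * Usage sketch (illustrative only): these comparators are meant for the
 * generic rbtree helpers, e.g.
 *
 *	rb_add(&ref->node, &sctx->rbtree_new_refs, rbtree_ref_less);
 *	node = rb_find(&data, &sctx->rbtree_deleted_refs, rbtree_ref_comp);
 *
 * so that a new ref and a deleted ref describing the same (dir, dir_gen, name)
 * tuple can be matched and cancelled out without scanning the full ref lists.
 */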
4667 static int record_ref_in_tree(struct rb_root
*root
, struct list_head
*refs
,
4668 struct fs_path
*name
, u64 dir
, u64 dir_gen
,
4669 struct send_ctx
*sctx
)
4672 struct fs_path
*path
= NULL
;
4673 struct recorded_ref
*ref
= NULL
;
4675 path
= fs_path_alloc();
4681 ref
= recorded_ref_alloc();
4687 ret
= get_cur_path(sctx
, dir
, dir_gen
, path
);
4690 ret
= fs_path_add_path(path
, name
);
4695 ref
->dir_gen
= dir_gen
;
4696 set_ref_path(ref
, path
);
4697 list_add_tail(&ref
->list
, refs
);
4698 rb_add(&ref
->node
, root
, rbtree_ref_less
);
4702 if (path
&& (!ref
|| !ref
->full_path
))
4704 recorded_ref_free(ref
);
4709 static int record_new_ref_if_needed(int num
, u64 dir
, int index
,
4710 struct fs_path
*name
, void *ctx
)
4713 struct send_ctx
*sctx
= ctx
;
4714 struct rb_node
*node
= NULL
;
4715 struct recorded_ref data
;
4716 struct recorded_ref
*ref
;
4719 ret
= get_inode_gen(sctx
->send_root
, dir
, &dir_gen
);
4724 data
.dir_gen
= dir_gen
;
4725 set_ref_path(&data
, name
);
4726 node
= rb_find(&data
, &sctx
->rbtree_deleted_refs
, rbtree_ref_comp
);
4728 ref
= rb_entry(node
, struct recorded_ref
, node
);
4729 recorded_ref_free(ref
);
4731 ret
= record_ref_in_tree(&sctx
->rbtree_new_refs
,
4732 &sctx
->new_refs
, name
, dir
, dir_gen
,
4739 static int record_deleted_ref_if_needed(int num
, u64 dir
, int index
,
4740 struct fs_path
*name
, void *ctx
)
4743 struct send_ctx
*sctx
= ctx
;
4744 struct rb_node
*node
= NULL
;
4745 struct recorded_ref data
;
4746 struct recorded_ref
*ref
;
4749 ret
= get_inode_gen(sctx
->parent_root
, dir
, &dir_gen
);
4754 data
.dir_gen
= dir_gen
;
4755 set_ref_path(&data
, name
);
4756 node
= rb_find(&data
, &sctx
->rbtree_new_refs
, rbtree_ref_comp
);
4758 ref
= rb_entry(node
, struct recorded_ref
, node
);
4759 recorded_ref_free(ref
);
4761 ret
= record_ref_in_tree(&sctx
->rbtree_deleted_refs
,
4762 &sctx
->deleted_refs
, name
, dir
,
4769 static int record_new_ref(struct send_ctx
*sctx
)
4773 ret
= iterate_inode_ref(sctx
->send_root
, sctx
->left_path
,
4774 sctx
->cmp_key
, 0, record_new_ref_if_needed
, sctx
);
4783 static int record_deleted_ref(struct send_ctx
*sctx
)
4787 ret
= iterate_inode_ref(sctx
->parent_root
, sctx
->right_path
,
4788 sctx
->cmp_key
, 0, record_deleted_ref_if_needed
,
4798 static int record_changed_ref(struct send_ctx
*sctx
)
4802 ret
= iterate_inode_ref(sctx
->send_root
, sctx
->left_path
,
4803 sctx
->cmp_key
, 0, record_new_ref_if_needed
, sctx
);
4806 ret
= iterate_inode_ref(sctx
->parent_root
, sctx
->right_path
,
4807 sctx
->cmp_key
, 0, record_deleted_ref_if_needed
, sctx
);
4817 * Record and process all refs at once. Needed when an inode changes the
4818 * generation number, which means that it was deleted and recreated.
4820 static int process_all_refs(struct send_ctx
*sctx
,
4821 enum btrfs_compare_tree_result cmd
)
4825 struct btrfs_root
*root
;
4826 struct btrfs_path
*path
;
4827 struct btrfs_key key
;
4828 struct btrfs_key found_key
;
4829 iterate_inode_ref_t cb
;
4830 int pending_move
= 0;
4832 path
= alloc_path_for_send();
4836 if (cmd
== BTRFS_COMPARE_TREE_NEW
) {
4837 root
= sctx
->send_root
;
4838 cb
= record_new_ref_if_needed
;
4839 } else if (cmd
== BTRFS_COMPARE_TREE_DELETED
) {
4840 root
= sctx
->parent_root
;
4841 cb
= record_deleted_ref_if_needed
;
4843 btrfs_err(sctx
->send_root
->fs_info
,
4844 "Wrong command %d in process_all_refs", cmd
);
4849 key
.objectid
= sctx
->cmp_key
->objectid
;
4850 key
.type
= BTRFS_INODE_REF_KEY
;
4852 btrfs_for_each_slot(root
, &key
, &found_key
, path
, iter_ret
) {
4853 if (found_key
.objectid
!= key
.objectid
||
4854 (found_key
.type
!= BTRFS_INODE_REF_KEY
&&
4855 found_key
.type
!= BTRFS_INODE_EXTREF_KEY
))
4858 ret
= iterate_inode_ref(root
, path
, &found_key
, 0, cb
, sctx
);
4862 /* Catch error found during iteration */
4867 btrfs_release_path(path
);
4870 * We don't actually care about pending_move as we are simply
4871 * re-creating this inode and will be rename'ing it into place once we
4872 * rename the parent directory.
4874 ret
= process_recorded_refs(sctx
, &pending_move
);
4876 btrfs_free_path(path
);
static int send_set_xattr(struct send_ctx *sctx,
			  struct fs_path *path,
			  const char *name, int name_len,
			  const char *data, int data_len)
{
	int ret = 0;

	ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR);
	if (ret < 0)
		goto out;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
	TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
	TLV_PUT(sctx, BTRFS_SEND_A_XATTR_DATA, data, data_len);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	return ret;
}

static int send_remove_xattr(struct send_ctx *sctx,
			     struct fs_path *path,
			     const char *name, int name_len)
{
	int ret = 0;

	ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR);
	if (ret < 0)
		goto out;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
	TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	return ret;
}
4922 static int __process_new_xattr(int num
, struct btrfs_key
*di_key
,
4923 const char *name
, int name_len
, const char *data
,
4924 int data_len
, void *ctx
)
4927 struct send_ctx
*sctx
= ctx
;
4929 struct posix_acl_xattr_header dummy_acl
;
4931 /* Capabilities are emitted by finish_inode_if_needed */
4932 if (!strncmp(name
, XATTR_NAME_CAPS
, name_len
))
4935 p
= fs_path_alloc();
4940 * This hack is needed because empty acls are stored as zero byte
4941 * data in xattrs. Problem with that is, that receiving these zero byte
4942 * acls will fail later. To fix this, we send a dummy acl list that
4943 * only contains the version number and no entries.
4945 if (!strncmp(name
, XATTR_NAME_POSIX_ACL_ACCESS
, name_len
) ||
4946 !strncmp(name
, XATTR_NAME_POSIX_ACL_DEFAULT
, name_len
)) {
4947 if (data_len
== 0) {
4948 dummy_acl
.a_version
=
4949 cpu_to_le32(POSIX_ACL_XATTR_VERSION
);
4950 data
= (char *)&dummy_acl
;
4951 data_len
= sizeof(dummy_acl
);
4955 ret
= get_cur_path(sctx
, sctx
->cur_ino
, sctx
->cur_inode_gen
, p
);
4959 ret
= send_set_xattr(sctx
, p
, name
, name_len
, data
, data_len
);
4966 static int __process_deleted_xattr(int num
, struct btrfs_key
*di_key
,
4967 const char *name
, int name_len
,
4968 const char *data
, int data_len
, void *ctx
)
4971 struct send_ctx
*sctx
= ctx
;
4974 p
= fs_path_alloc();
4978 ret
= get_cur_path(sctx
, sctx
->cur_ino
, sctx
->cur_inode_gen
, p
);
4982 ret
= send_remove_xattr(sctx
, p
, name
, name_len
);
4989 static int process_new_xattr(struct send_ctx
*sctx
)
4993 ret
= iterate_dir_item(sctx
->send_root
, sctx
->left_path
,
4994 __process_new_xattr
, sctx
);
4999 static int process_deleted_xattr(struct send_ctx
*sctx
)
5001 return iterate_dir_item(sctx
->parent_root
, sctx
->right_path
,
5002 __process_deleted_xattr
, sctx
);
5005 struct find_xattr_ctx
{
5013 static int __find_xattr(int num
, struct btrfs_key
*di_key
, const char *name
,
5014 int name_len
, const char *data
, int data_len
, void *vctx
)
5016 struct find_xattr_ctx
*ctx
= vctx
;
5018 if (name_len
== ctx
->name_len
&&
5019 strncmp(name
, ctx
->name
, name_len
) == 0) {
5020 ctx
->found_idx
= num
;
5021 ctx
->found_data_len
= data_len
;
5022 ctx
->found_data
= kmemdup(data
, data_len
, GFP_KERNEL
);
5023 if (!ctx
->found_data
)
5030 static int find_xattr(struct btrfs_root
*root
,
5031 struct btrfs_path
*path
,
5032 struct btrfs_key
*key
,
5033 const char *name
, int name_len
,
5034 char **data
, int *data_len
)
5037 struct find_xattr_ctx ctx
;
5040 ctx
.name_len
= name_len
;
5042 ctx
.found_data
= NULL
;
5043 ctx
.found_data_len
= 0;
5045 ret
= iterate_dir_item(root
, path
, __find_xattr
, &ctx
);
5049 if (ctx
.found_idx
== -1)
5052 *data
= ctx
.found_data
;
5053 *data_len
= ctx
.found_data_len
;
5055 kfree(ctx
.found_data
);
5057 return ctx
.found_idx
;
5061 static int __process_changed_new_xattr(int num
, struct btrfs_key
*di_key
,
5062 const char *name
, int name_len
,
5063 const char *data
, int data_len
,
5067 struct send_ctx
*sctx
= ctx
;
5068 char *found_data
= NULL
;
5069 int found_data_len
= 0;
5071 ret
= find_xattr(sctx
->parent_root
, sctx
->right_path
,
5072 sctx
->cmp_key
, name
, name_len
, &found_data
,
5074 if (ret
== -ENOENT
) {
5075 ret
= __process_new_xattr(num
, di_key
, name
, name_len
, data
,
5077 } else if (ret
>= 0) {
5078 if (data_len
!= found_data_len
||
5079 memcmp(data
, found_data
, data_len
)) {
5080 ret
= __process_new_xattr(num
, di_key
, name
, name_len
,
5081 data
, data_len
, ctx
);
5091 static int __process_changed_deleted_xattr(int num
, struct btrfs_key
*di_key
,
5092 const char *name
, int name_len
,
5093 const char *data
, int data_len
,
5097 struct send_ctx
*sctx
= ctx
;
5099 ret
= find_xattr(sctx
->send_root
, sctx
->left_path
, sctx
->cmp_key
,
5100 name
, name_len
, NULL
, NULL
);
5102 ret
= __process_deleted_xattr(num
, di_key
, name
, name_len
, data
,
5110 static int process_changed_xattr(struct send_ctx
*sctx
)
5114 ret
= iterate_dir_item(sctx
->send_root
, sctx
->left_path
,
5115 __process_changed_new_xattr
, sctx
);
5118 ret
= iterate_dir_item(sctx
->parent_root
, sctx
->right_path
,
5119 __process_changed_deleted_xattr
, sctx
);
5125 static int process_all_new_xattrs(struct send_ctx
*sctx
)
5129 struct btrfs_root
*root
;
5130 struct btrfs_path
*path
;
5131 struct btrfs_key key
;
5132 struct btrfs_key found_key
;
5134 path
= alloc_path_for_send();
5138 root
= sctx
->send_root
;
5140 key
.objectid
= sctx
->cmp_key
->objectid
;
5141 key
.type
= BTRFS_XATTR_ITEM_KEY
;
5143 btrfs_for_each_slot(root
, &key
, &found_key
, path
, iter_ret
) {
5144 if (found_key
.objectid
!= key
.objectid
||
5145 found_key
.type
!= key
.type
) {
5150 ret
= iterate_dir_item(root
, path
, __process_new_xattr
, sctx
);
5154 /* Catch error found during iteration */
5158 btrfs_free_path(path
);
5162 static int send_verity(struct send_ctx
*sctx
, struct fs_path
*path
,
5163 struct fsverity_descriptor
*desc
)
5167 ret
= begin_cmd(sctx
, BTRFS_SEND_C_ENABLE_VERITY
);
5171 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH
, path
);
5172 TLV_PUT_U8(sctx
, BTRFS_SEND_A_VERITY_ALGORITHM
,
5173 le8_to_cpu(desc
->hash_algorithm
));
5174 TLV_PUT_U32(sctx
, BTRFS_SEND_A_VERITY_BLOCK_SIZE
,
5175 1U << le8_to_cpu(desc
->log_blocksize
));
5176 TLV_PUT(sctx
, BTRFS_SEND_A_VERITY_SALT_DATA
, desc
->salt
,
5177 le8_to_cpu(desc
->salt_size
));
5178 TLV_PUT(sctx
, BTRFS_SEND_A_VERITY_SIG_DATA
, desc
->signature
,
5179 le32_to_cpu(desc
->sig_size
));
5181 ret
= send_cmd(sctx
);
5188 static int process_verity(struct send_ctx
*sctx
)
5191 struct inode
*inode
;
5194 inode
= btrfs_iget(sctx
->cur_ino
, sctx
->send_root
);
5196 return PTR_ERR(inode
);
5198 ret
= btrfs_get_verity_descriptor(inode
, NULL
, 0);
5202 if (ret
> FS_VERITY_MAX_DESCRIPTOR_SIZE
) {
5206 if (!sctx
->verity_descriptor
) {
5207 sctx
->verity_descriptor
= kvmalloc(FS_VERITY_MAX_DESCRIPTOR_SIZE
,
5209 if (!sctx
->verity_descriptor
) {
5215 ret
= btrfs_get_verity_descriptor(inode
, sctx
->verity_descriptor
, ret
);
5219 p
= fs_path_alloc();
5224 ret
= get_cur_path(sctx
, sctx
->cur_ino
, sctx
->cur_inode_gen
, p
);
5228 ret
= send_verity(sctx
, p
, sctx
->verity_descriptor
);
static inline u64 max_send_read_size(const struct send_ctx *sctx)
{
	return sctx->send_max_size - SZ_16K;
}

static int put_data_header(struct send_ctx *sctx, u32 len)
{
	if (WARN_ON_ONCE(sctx->put_data))
		return -EINVAL;
	sctx->put_data = true;
	if (sctx->proto >= 2) {
		/*
		 * Since v2, the data attribute header doesn't include a length,
		 * it is implicitly to the end of the command.
		 */
		if (sctx->send_max_size - sctx->send_size < sizeof(__le16) + len)
			return -EOVERFLOW;
		put_unaligned_le16(BTRFS_SEND_A_DATA, sctx->send_buf + sctx->send_size);
		sctx->send_size += sizeof(__le16);
	} else {
		struct btrfs_tlv_header *hdr;

		if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)
			return -EOVERFLOW;
		hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size);
		put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type);
		put_unaligned_le16(len, &hdr->tlv_len);
		sctx->send_size += sizeof(*hdr);
	}
	return 0;
}
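
/*
 * Layout sketch, illustrative only, of the data attribute emitted above:
 *
 *	v1: | tlv_type = A_DATA | tlv_len = len | len bytes of file data |
 *	v2: | tlv_type = A_DATA | data up to the end of the command      |
 *
 * In v2 the payload length is implied by the command header's total size,
 * which is what allows data payloads larger than a 16-bit tlv_len can hold.
 */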
5271 static int put_file_data(struct send_ctx
*sctx
, u64 offset
, u32 len
)
5273 struct btrfs_root
*root
= sctx
->send_root
;
5274 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
5275 struct folio
*folio
;
5276 pgoff_t index
= offset
>> PAGE_SHIFT
;
5278 unsigned pg_offset
= offset_in_page(offset
);
5279 struct address_space
*mapping
= sctx
->cur_inode
->i_mapping
;
5282 ret
= put_data_header(sctx
, len
);
5286 last_index
= (offset
+ len
- 1) >> PAGE_SHIFT
;
5288 while (index
<= last_index
) {
5289 unsigned cur_len
= min_t(unsigned, len
,
5290 PAGE_SIZE
- pg_offset
);
5292 folio
= filemap_lock_folio(mapping
, index
);
5293 if (IS_ERR(folio
)) {
5294 page_cache_sync_readahead(mapping
,
5295 &sctx
->ra
, NULL
, index
,
5296 last_index
+ 1 - index
);
5298 folio
= filemap_grab_folio(mapping
, index
);
5299 if (IS_ERR(folio
)) {
5300 ret
= PTR_ERR(folio
);
5305 WARN_ON(folio_order(folio
));
5307 if (folio_test_readahead(folio
))
5308 page_cache_async_readahead(mapping
, &sctx
->ra
, NULL
, folio
,
5309 last_index
+ 1 - index
);
5311 if (!folio_test_uptodate(folio
)) {
5312 btrfs_read_folio(NULL
, folio
);
5314 if (!folio_test_uptodate(folio
)) {
5315 folio_unlock(folio
);
5317 "send: IO error at offset %llu for inode %llu root %llu",
5318 folio_pos(folio
), sctx
->cur_ino
,
5319 btrfs_root_id(sctx
->send_root
));
5326 memcpy_from_folio(sctx
->send_buf
+ sctx
->send_size
, folio
,
5327 pg_offset
, cur_len
);
5328 folio_unlock(folio
);
5333 sctx
->send_size
+= cur_len
;
5340 * Read some bytes from the current inode/file and send a write command to
5343 static int send_write(struct send_ctx
*sctx
, u64 offset
, u32 len
)
5345 struct btrfs_fs_info
*fs_info
= sctx
->send_root
->fs_info
;
5349 p
= fs_path_alloc();
5353 btrfs_debug(fs_info
, "send_write offset=%llu, len=%d", offset
, len
);
5355 ret
= begin_cmd(sctx
, BTRFS_SEND_C_WRITE
);
5359 ret
= get_cur_path(sctx
, sctx
->cur_ino
, sctx
->cur_inode_gen
, p
);
5363 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH
, p
);
5364 TLV_PUT_U64(sctx
, BTRFS_SEND_A_FILE_OFFSET
, offset
);
5365 ret
= put_file_data(sctx
, offset
, len
);
5369 ret
= send_cmd(sctx
);
5378 * Send a clone command to user space.
5380 static int send_clone(struct send_ctx
*sctx
,
5381 u64 offset
, u32 len
,
5382 struct clone_root
*clone_root
)
5388 btrfs_debug(sctx
->send_root
->fs_info
,
5389 "send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu",
5390 offset
, len
, btrfs_root_id(clone_root
->root
),
5391 clone_root
->ino
, clone_root
->offset
);
5393 p
= fs_path_alloc();
5397 ret
= begin_cmd(sctx
, BTRFS_SEND_C_CLONE
);
5401 ret
= get_cur_path(sctx
, sctx
->cur_ino
, sctx
->cur_inode_gen
, p
);
5405 TLV_PUT_U64(sctx
, BTRFS_SEND_A_FILE_OFFSET
, offset
);
5406 TLV_PUT_U64(sctx
, BTRFS_SEND_A_CLONE_LEN
, len
);
5407 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH
, p
);
5409 if (clone_root
->root
== sctx
->send_root
) {
5410 ret
= get_inode_gen(sctx
->send_root
, clone_root
->ino
, &gen
);
5413 ret
= get_cur_path(sctx
, clone_root
->ino
, gen
, p
);
5415 ret
= get_inode_path(clone_root
->root
, clone_root
->ino
, p
);
5421 * If the parent we're using has a received_uuid set then use that as
5422 * our clone source as that is what we will look for when doing a
5425 * This covers the case that we create a snapshot off of a received
5426 * subvolume and then use that as the parent and try to receive on a
5429 if (!btrfs_is_empty_uuid(clone_root
->root
->root_item
.received_uuid
))
5430 TLV_PUT_UUID(sctx
, BTRFS_SEND_A_CLONE_UUID
,
5431 clone_root
->root
->root_item
.received_uuid
);
5433 TLV_PUT_UUID(sctx
, BTRFS_SEND_A_CLONE_UUID
,
5434 clone_root
->root
->root_item
.uuid
);
5435 TLV_PUT_U64(sctx
, BTRFS_SEND_A_CLONE_CTRANSID
,
5436 btrfs_root_ctransid(&clone_root
->root
->root_item
));
5437 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_CLONE_PATH
, p
);
5438 TLV_PUT_U64(sctx
, BTRFS_SEND_A_CLONE_OFFSET
,
5439 clone_root
->offset
);
5441 ret
= send_cmd(sctx
);
5450 * Send an update extent command to user space.
5452 static int send_update_extent(struct send_ctx
*sctx
,
5453 u64 offset
, u32 len
)
5458 p
= fs_path_alloc();
5462 ret
= begin_cmd(sctx
, BTRFS_SEND_C_UPDATE_EXTENT
);
5466 ret
= get_cur_path(sctx
, sctx
->cur_ino
, sctx
->cur_inode_gen
, p
);
5470 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH
, p
);
5471 TLV_PUT_U64(sctx
, BTRFS_SEND_A_FILE_OFFSET
, offset
);
5472 TLV_PUT_U64(sctx
, BTRFS_SEND_A_SIZE
, len
);
5474 ret
= send_cmd(sctx
);
5482 static int send_hole(struct send_ctx
*sctx
, u64 end
)
5484 struct fs_path
*p
= NULL
;
5485 u64 read_size
= max_send_read_size(sctx
);
5486 u64 offset
= sctx
->cur_inode_last_extent
;
5490 * A hole that starts at EOF or beyond it. Since we do not yet support
5491 * fallocate (for extent preallocation and hole punching), sending a
5492 * write of zeroes starting at EOF or beyond would later require issuing
5493 * a truncate operation which would undo the write and achieve nothing.
5495 if (offset
>= sctx
->cur_inode_size
)
5499 * Don't go beyond the inode's i_size due to prealloc extents that start
5502 end
= min_t(u64
, end
, sctx
->cur_inode_size
);
5504 if (sctx
->flags
& BTRFS_SEND_FLAG_NO_FILE_DATA
)
5505 return send_update_extent(sctx
, offset
, end
- offset
);
5507 p
= fs_path_alloc();
5510 ret
= get_cur_path(sctx
, sctx
->cur_ino
, sctx
->cur_inode_gen
, p
);
5512 goto tlv_put_failure
;
5513 while (offset
< end
) {
5514 u64 len
= min(end
- offset
, read_size
);
5516 ret
= begin_cmd(sctx
, BTRFS_SEND_C_WRITE
);
5519 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH
, p
);
5520 TLV_PUT_U64(sctx
, BTRFS_SEND_A_FILE_OFFSET
, offset
);
5521 ret
= put_data_header(sctx
, len
);
5524 memset(sctx
->send_buf
+ sctx
->send_size
, 0, len
);
5525 sctx
->send_size
+= len
;
5526 ret
= send_cmd(sctx
);
5531 sctx
->cur_inode_next_write_offset
= offset
;
5537 static int send_encoded_inline_extent(struct send_ctx
*sctx
,
5538 struct btrfs_path
*path
, u64 offset
,
5541 struct btrfs_root
*root
= sctx
->send_root
;
5542 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
5543 struct inode
*inode
;
5544 struct fs_path
*fspath
;
5545 struct extent_buffer
*leaf
= path
->nodes
[0];
5546 struct btrfs_key key
;
5547 struct btrfs_file_extent_item
*ei
;
5552 inode
= btrfs_iget(sctx
->cur_ino
, root
);
5554 return PTR_ERR(inode
);
5556 fspath
= fs_path_alloc();
5562 ret
= begin_cmd(sctx
, BTRFS_SEND_C_ENCODED_WRITE
);
5566 ret
= get_cur_path(sctx
, sctx
->cur_ino
, sctx
->cur_inode_gen
, fspath
);
5570 btrfs_item_key_to_cpu(leaf
, &key
, path
->slots
[0]);
5571 ei
= btrfs_item_ptr(leaf
, path
->slots
[0], struct btrfs_file_extent_item
);
5572 ram_bytes
= btrfs_file_extent_ram_bytes(leaf
, ei
);
5573 inline_size
= btrfs_file_extent_inline_item_len(leaf
, path
->slots
[0]);
5575 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH
, fspath
);
5576 TLV_PUT_U64(sctx
, BTRFS_SEND_A_FILE_OFFSET
, offset
);
5577 TLV_PUT_U64(sctx
, BTRFS_SEND_A_UNENCODED_FILE_LEN
,
5578 min(key
.offset
+ ram_bytes
- offset
, len
));
5579 TLV_PUT_U64(sctx
, BTRFS_SEND_A_UNENCODED_LEN
, ram_bytes
);
5580 TLV_PUT_U64(sctx
, BTRFS_SEND_A_UNENCODED_OFFSET
, offset
- key
.offset
);
5581 ret
= btrfs_encoded_io_compression_from_extent(fs_info
,
5582 btrfs_file_extent_compression(leaf
, ei
));
5585 TLV_PUT_U32(sctx
, BTRFS_SEND_A_COMPRESSION
, ret
);
5587 ret
= put_data_header(sctx
, inline_size
);
5590 read_extent_buffer(leaf
, sctx
->send_buf
+ sctx
->send_size
,
5591 btrfs_file_extent_inline_start(ei
), inline_size
);
5592 sctx
->send_size
+= inline_size
;
5594 ret
= send_cmd(sctx
);
5598 fs_path_free(fspath
);
5603 static int send_encoded_extent(struct send_ctx
*sctx
, struct btrfs_path
*path
,
5604 u64 offset
, u64 len
)
5606 struct btrfs_root
*root
= sctx
->send_root
;
5607 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
5608 struct inode
*inode
;
5609 struct fs_path
*fspath
;
5610 struct extent_buffer
*leaf
= path
->nodes
[0];
5611 struct btrfs_key key
;
5612 struct btrfs_file_extent_item
*ei
;
5613 u64 disk_bytenr
, disk_num_bytes
;
5615 struct btrfs_cmd_header
*hdr
;
5619 inode
= btrfs_iget(sctx
->cur_ino
, root
);
5621 return PTR_ERR(inode
);
5623 fspath
= fs_path_alloc();
5629 ret
= begin_cmd(sctx
, BTRFS_SEND_C_ENCODED_WRITE
);
5633 ret
= get_cur_path(sctx
, sctx
->cur_ino
, sctx
->cur_inode_gen
, fspath
);
5637 btrfs_item_key_to_cpu(leaf
, &key
, path
->slots
[0]);
5638 ei
= btrfs_item_ptr(leaf
, path
->slots
[0], struct btrfs_file_extent_item
);
5639 disk_bytenr
= btrfs_file_extent_disk_bytenr(leaf
, ei
);
5640 disk_num_bytes
= btrfs_file_extent_disk_num_bytes(leaf
, ei
);
5642 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH
, fspath
);
5643 TLV_PUT_U64(sctx
, BTRFS_SEND_A_FILE_OFFSET
, offset
);
5644 TLV_PUT_U64(sctx
, BTRFS_SEND_A_UNENCODED_FILE_LEN
,
5645 min(key
.offset
+ btrfs_file_extent_num_bytes(leaf
, ei
) - offset
,
5647 TLV_PUT_U64(sctx
, BTRFS_SEND_A_UNENCODED_LEN
,
5648 btrfs_file_extent_ram_bytes(leaf
, ei
));
5649 TLV_PUT_U64(sctx
, BTRFS_SEND_A_UNENCODED_OFFSET
,
5650 offset
- key
.offset
+ btrfs_file_extent_offset(leaf
, ei
));
5651 ret
= btrfs_encoded_io_compression_from_extent(fs_info
,
5652 btrfs_file_extent_compression(leaf
, ei
));
5655 TLV_PUT_U32(sctx
, BTRFS_SEND_A_COMPRESSION
, ret
);
5656 TLV_PUT_U32(sctx
, BTRFS_SEND_A_ENCRYPTION
, 0);
5658 ret
= put_data_header(sctx
, disk_num_bytes
);
5663 * We want to do I/O directly into the send buffer, so get the next page
5664 * boundary in the send buffer. This means that there may be a gap
5665 * between the beginning of the command and the file data.
5667 data_offset
= PAGE_ALIGN(sctx
->send_size
);
5668 if (data_offset
> sctx
->send_max_size
||
5669 sctx
->send_max_size
- data_offset
< disk_num_bytes
) {
5675 * Note that send_buf is a mapping of send_buf_pages, so this is really
5676 * reading into send_buf.
5678 ret
= btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode
), offset
,
5679 disk_bytenr
, disk_num_bytes
,
5680 sctx
->send_buf_pages
+
5681 (data_offset
>> PAGE_SHIFT
));
5685 hdr
= (struct btrfs_cmd_header
*)sctx
->send_buf
;
5686 hdr
->len
= cpu_to_le32(sctx
->send_size
+ disk_num_bytes
- sizeof(*hdr
));
5688 crc
= crc32c(0, sctx
->send_buf
, sctx
->send_size
);
5689 crc
= crc32c(crc
, sctx
->send_buf
+ data_offset
, disk_num_bytes
);
5690 hdr
->crc
= cpu_to_le32(crc
);
5692 ret
= write_buf(sctx
->send_filp
, sctx
->send_buf
, sctx
->send_size
,
5695 ret
= write_buf(sctx
->send_filp
, sctx
->send_buf
+ data_offset
,
5696 disk_num_bytes
, &sctx
->send_off
);
5698 sctx
->send_size
= 0;
5699 sctx
->put_data
= false;
5703 fs_path_free(fspath
);
5708 static int send_extent_data(struct send_ctx
*sctx
, struct btrfs_path
*path
,
5709 const u64 offset
, const u64 len
)
5711 const u64 end
= offset
+ len
;
5712 struct extent_buffer
*leaf
= path
->nodes
[0];
5713 struct btrfs_file_extent_item
*ei
;
5714 u64 read_size
= max_send_read_size(sctx
);
5717 if (sctx
->flags
& BTRFS_SEND_FLAG_NO_FILE_DATA
)
5718 return send_update_extent(sctx
, offset
, len
);
5720 ei
= btrfs_item_ptr(leaf
, path
->slots
[0],
5721 struct btrfs_file_extent_item
);
5722 if ((sctx
->flags
& BTRFS_SEND_FLAG_COMPRESSED
) &&
5723 btrfs_file_extent_compression(leaf
, ei
) != BTRFS_COMPRESS_NONE
) {
5724 bool is_inline
= (btrfs_file_extent_type(leaf
, ei
) ==
5725 BTRFS_FILE_EXTENT_INLINE
);
5728 * Send the compressed extent unless the compressed data is
5729 * larger than the decompressed data. This can happen if we're
5730 * not sending the entire extent, either because it has been
5731 * partially overwritten/truncated or because this is a part of
5732 * the extent that we couldn't clone in clone_range().
5735 btrfs_file_extent_inline_item_len(leaf
,
5736 path
->slots
[0]) <= len
) {
5737 return send_encoded_inline_extent(sctx
, path
, offset
,
5739 } else if (!is_inline
&&
5740 btrfs_file_extent_disk_num_bytes(leaf
, ei
) <= len
) {
5741 return send_encoded_extent(sctx
, path
, offset
, len
);
5745 if (sctx
->cur_inode
== NULL
) {
5746 struct btrfs_root
*root
= sctx
->send_root
;
5748 sctx
->cur_inode
= btrfs_iget(sctx
->cur_ino
, root
);
5749 if (IS_ERR(sctx
->cur_inode
)) {
5750 int err
= PTR_ERR(sctx
->cur_inode
);
5752 sctx
->cur_inode
= NULL
;
5755 memset(&sctx
->ra
, 0, sizeof(struct file_ra_state
));
5756 file_ra_state_init(&sctx
->ra
, sctx
->cur_inode
->i_mapping
);
5759 * It's very likely there are no pages from this inode in the page
5760 * cache, so after reading extents and sending their data, we clean
5761 * the page cache to avoid trashing the page cache (adding pressure
5762 * to the page cache and forcing eviction of other data more useful
5763 * for applications).
5765 * We decide if we should clean the page cache simply by checking
5766 * if the inode's mapping nrpages is 0 when we first open it, and
5767 * not by using something like filemap_range_has_page() before
5768 * reading an extent because when we ask the readahead code to
5769 * read a given file range, it may (and almost always does) read
5770 * pages from beyond that range (see the documentation for
5771 * page_cache_sync_readahead()), so it would not be reliable,
5772 * because after reading the first extent future calls to
5773 * filemap_range_has_page() would return true because the readahead
5774 * on the previous extent resulted in reading pages of the current
5777 sctx
->clean_page_cache
= (sctx
->cur_inode
->i_mapping
->nrpages
== 0);
5778 sctx
->page_cache_clear_start
= round_down(offset
, PAGE_SIZE
);
5781 while (sent
< len
) {
5782 u64 size
= min(len
- sent
, read_size
);
5785 ret
= send_write(sctx
, offset
+ sent
, size
);
5791 if (sctx
->clean_page_cache
&& PAGE_ALIGNED(end
)) {
5793 * Always operate only on ranges that are a multiple of the page
5794 * size. This is not only to prevent zeroing parts of a page in
5795 * the case of subpage sector size, but also to guarantee we evict
5796 * pages, as passing a range that is smaller than page size does
5797 * not evict the respective page (only zeroes part of its content).
5799 * Always start from the end offset of the last range cleared.
5800 * This is because the readahead code may (and very often does)
5801 * reads pages beyond the range we request for readahead. So if
5802 * we have an extent layout like this:
5804 * [ extent A ] [ extent B ] [ extent C ]
5806 * When we ask page_cache_sync_readahead() to read extent A, it
5807 * may also trigger reads for pages of extent B. If we are doing
5808 * an incremental send and extent B has not changed between the
5809 * parent and send snapshots, some or all of its pages may end
5810 * up being read and placed in the page cache. So when truncating
5811 * the page cache we always start from the end offset of the
5812 * previously processed extent up to the end of the current
5815 truncate_inode_pages_range(&sctx
->cur_inode
->i_data
,
5816 sctx
->page_cache_clear_start
,
5818 sctx
->page_cache_clear_start
= end
;
5825 * Search for a capability xattr related to sctx->cur_ino. If the capability is
5826 * found, call send_set_xattr function to emit it.
5828 * Return 0 if there isn't a capability, or when the capability was emitted
5829 * successfully, or < 0 if an error occurred.
5831 static int send_capabilities(struct send_ctx
*sctx
)
5833 struct fs_path
*fspath
= NULL
;
5834 struct btrfs_path
*path
;
5835 struct btrfs_dir_item
*di
;
5836 struct extent_buffer
*leaf
;
5837 unsigned long data_ptr
;
5842 path
= alloc_path_for_send();
5846 di
= btrfs_lookup_xattr(NULL
, sctx
->send_root
, path
, sctx
->cur_ino
,
5847 XATTR_NAME_CAPS
, strlen(XATTR_NAME_CAPS
), 0);
5849 /* There is no xattr for this inode */
5851 } else if (IS_ERR(di
)) {
5856 leaf
= path
->nodes
[0];
5857 buf_len
= btrfs_dir_data_len(leaf
, di
);
5859 fspath
= fs_path_alloc();
5860 buf
= kmalloc(buf_len
, GFP_KERNEL
);
5861 if (!fspath
|| !buf
) {
5866 ret
= get_cur_path(sctx
, sctx
->cur_ino
, sctx
->cur_inode_gen
, fspath
);
5870 data_ptr
= (unsigned long)(di
+ 1) + btrfs_dir_name_len(leaf
, di
);
5871 read_extent_buffer(leaf
, buf
, data_ptr
, buf_len
);
5873 ret
= send_set_xattr(sctx
, fspath
, XATTR_NAME_CAPS
,
5874 strlen(XATTR_NAME_CAPS
), buf
, buf_len
);
5877 fs_path_free(fspath
);
5878 btrfs_free_path(path
);
5882 static int clone_range(struct send_ctx
*sctx
, struct btrfs_path
*dst_path
,
5883 struct clone_root
*clone_root
, const u64 disk_byte
,
5884 u64 data_offset
, u64 offset
, u64 len
)
5886 struct btrfs_path
*path
;
5887 struct btrfs_key key
;
5889 struct btrfs_inode_info info
;
5890 u64 clone_src_i_size
= 0;
5893 * Prevent cloning from a zero offset with a length matching the sector
5894 * size because in some scenarios this will make the receiver fail.
5896 * For example, if in the source filesystem the extent at offset 0
5897 * has a length of sectorsize and it was written using direct IO, then
5898 * it can never be an inline extent (even if compression is enabled).
5899 * Then this extent can be cloned in the original filesystem to a non
5900 * zero file offset, but it may not be possible to clone in the
5901 * destination filesystem because it can be inlined due to compression
5902 * on the destination filesystem (as the receiver's write operations are
5903 * always done using buffered IO). The same happens when the original
5904 * filesystem does not have compression enabled but the destination
5907 if (clone_root
->offset
== 0 &&
5908 len
== sctx
->send_root
->fs_info
->sectorsize
)
5909 return send_extent_data(sctx
, dst_path
, offset
, len
);
5911 path
= alloc_path_for_send();
5916 * There are inodes that have extents that lie behind its i_size. Don't
5917 * accept clones from these extents.
5919 ret
= get_inode_info(clone_root
->root
, clone_root
->ino
, &info
);
5920 btrfs_release_path(path
);
5923 clone_src_i_size
= info
.size
;
5926 * We can't send a clone operation for the entire range if we find
5927 * extent items in the respective range in the source file that
5928 * refer to different extents or if we find holes.
5929 * So check for that and do a mix of clone and regular write/copy
5930 * operations if needed.
5934 * mkfs.btrfs -f /dev/sda
5935 * mount /dev/sda /mnt
5936 * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo
5937 * cp --reflink=always /mnt/foo /mnt/bar
5938 * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo
5939 * btrfs subvolume snapshot -r /mnt /mnt/snap
5941 * If when we send the snapshot and we are processing file bar (which
5942 * has a higher inode number than foo) we blindly send a clone operation
5943 * for the [0, 100K[ range from foo to bar, the receiver ends up getting
5944 * a file bar that matches the content of file foo - iow, doesn't match
5945 * the content from bar in the original filesystem.
5947 key
.objectid
= clone_root
->ino
;
5948 key
.type
= BTRFS_EXTENT_DATA_KEY
;
5949 key
.offset
= clone_root
->offset
;
5950 ret
= btrfs_search_slot(NULL
, clone_root
->root
, &key
, path
, 0, 0);
5953 if (ret
> 0 && path
->slots
[0] > 0) {
5954 btrfs_item_key_to_cpu(path
->nodes
[0], &key
, path
->slots
[0] - 1);
5955 if (key
.objectid
== clone_root
->ino
&&
5956 key
.type
== BTRFS_EXTENT_DATA_KEY
)
5961 struct extent_buffer
*leaf
= path
->nodes
[0];
5962 int slot
= path
->slots
[0];
5963 struct btrfs_file_extent_item
*ei
;
5967 u64 clone_data_offset
;
5968 bool crossed_src_i_size
= false;
5970 if (slot
>= btrfs_header_nritems(leaf
)) {
5971 ret
= btrfs_next_leaf(clone_root
->root
, path
);
5979 btrfs_item_key_to_cpu(leaf
, &key
, slot
);
5982 * We might have an implicit trailing hole (NO_HOLES feature
5983 * enabled). We deal with it after leaving this loop.
5985 if (key
.objectid
!= clone_root
->ino
||
5986 key
.type
!= BTRFS_EXTENT_DATA_KEY
)
5989 ei
= btrfs_item_ptr(leaf
, slot
, struct btrfs_file_extent_item
);
5990 type
= btrfs_file_extent_type(leaf
, ei
);
5991 if (type
== BTRFS_FILE_EXTENT_INLINE
) {
5992 ext_len
= btrfs_file_extent_ram_bytes(leaf
, ei
);
5993 ext_len
= PAGE_ALIGN(ext_len
);
5995 ext_len
= btrfs_file_extent_num_bytes(leaf
, ei
);
5998 if (key
.offset
+ ext_len
<= clone_root
->offset
)
6001 if (key
.offset
> clone_root
->offset
) {
6002 /* Implicit hole, NO_HOLES feature enabled. */
6003 u64 hole_len
= key
.offset
- clone_root
->offset
;
6007 ret
= send_extent_data(sctx
, dst_path
, offset
,
6016 clone_root
->offset
+= hole_len
;
6017 data_offset
+= hole_len
;
6020 if (key
.offset
>= clone_root
->offset
+ len
)
6023 if (key
.offset
>= clone_src_i_size
)
6026 if (key
.offset
+ ext_len
> clone_src_i_size
) {
6027 ext_len
= clone_src_i_size
- key
.offset
;
6028 crossed_src_i_size
= true;
6031 clone_data_offset
= btrfs_file_extent_offset(leaf
, ei
);
6032 if (btrfs_file_extent_disk_bytenr(leaf
, ei
) == disk_byte
) {
6033 clone_root
->offset
= key
.offset
;
6034 if (clone_data_offset
< data_offset
&&
6035 clone_data_offset
+ ext_len
> data_offset
) {
6038 extent_offset
= data_offset
- clone_data_offset
;
6039 ext_len
-= extent_offset
;
6040 clone_data_offset
+= extent_offset
;
6041 clone_root
->offset
+= extent_offset
;
6045 clone_len
= min_t(u64
, ext_len
, len
);
6047 if (btrfs_file_extent_disk_bytenr(leaf
, ei
) == disk_byte
&&
6048 clone_data_offset
== data_offset
) {
6049 const u64 src_end
= clone_root
->offset
+ clone_len
;
6050 const u64 sectorsize
= SZ_64K
;
6053 * We can't clone the last block, when its size is not
6054 * sector size aligned, into the middle of a file. If we
6055 * do so, the receiver will get a failure (-EINVAL) when
6056 * trying to clone or will silently corrupt the data in
6057 * the destination file if it's on a kernel without the
6058 * fix introduced by commit ac765f83f1397646
6059 * ("Btrfs: fix data corruption due to cloning of eof
6062 * So issue a clone of the aligned down range plus a
6063 * regular write for the eof block, if we hit that case.
6065 * Also, we use the maximum possible sector size, 64K,
6066 * because we don't know what's the sector size of the
6067 * filesystem that receives the stream, so we have to
6068 * assume the largest possible sector size.
6070 if (src_end
== clone_src_i_size
&&
6071 !IS_ALIGNED(src_end
, sectorsize
) &&
6072 offset
+ clone_len
< sctx
->cur_inode_size
) {
6075 slen
= ALIGN_DOWN(src_end
- clone_root
->offset
,
6078 ret
= send_clone(sctx
, offset
, slen
,
6083 ret
= send_extent_data(sctx
, dst_path
,
6087 ret
= send_clone(sctx
, offset
, clone_len
,
6090 } else if (crossed_src_i_size
&& clone_len
< len
) {
6092 * If we are at i_size of the clone source inode and we
6093 * can not clone from it, terminate the loop. This is
6094 * to avoid sending two write operations, one with a
6095 * length matching clone_len and the final one after
6096 * this loop with a length of len - clone_len.
6098 * When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED
6099 * was passed to the send ioctl), this helps avoid
6100 * sending an encoded write for an offset that is not
6101 * sector size aligned, in case the i_size of the source
6102 * inode is not sector size aligned. That will make the
6103 * receiver fallback to decompression of the data and
6104 * writing it using regular buffered IO, therefore while
6105 * not incorrect, it's not optimal due decompression and
6106 * possible re-compression at the receiver.
6110 ret
= send_extent_data(sctx
, dst_path
, offset
,
6120 offset
+= clone_len
;
6121 clone_root
->offset
+= clone_len
;
6124 * If we are cloning from the file we are currently processing,
6125 * and using the send root as the clone root, we must stop once
6126 * the current clone offset reaches the current eof of the file
6127 * at the receiver, otherwise we would issue an invalid clone
6128 * operation (source range going beyond eof) and cause the
6129 * receiver to fail. So if we reach the current eof, bail out
6130 * and fallback to a regular write.
6132 if (clone_root
->root
== sctx
->send_root
&&
6133 clone_root
->ino
== sctx
->cur_ino
&&
6134 clone_root
->offset
>= sctx
->cur_inode_next_write_offset
)
6137 data_offset
+= clone_len
;
6143 ret
= send_extent_data(sctx
, dst_path
, offset
, len
);
6147 btrfs_free_path(path
);
6151 static int send_write_or_clone(struct send_ctx
*sctx
,
6152 struct btrfs_path
*path
,
6153 struct btrfs_key
*key
,
6154 struct clone_root
*clone_root
)
6157 u64 offset
= key
->offset
;
6159 u64 bs
= sctx
->send_root
->fs_info
->sectorsize
;
6160 struct btrfs_file_extent_item
*ei
;
6164 struct btrfs_inode_info info
= { 0 };
6166 end
= min_t(u64
, btrfs_file_extent_end(path
), sctx
->cur_inode_size
);
6170 num_bytes
= end
- offset
;
6175 if (IS_ALIGNED(end
, bs
))
6179 * If the extent end is not aligned, we can clone if the extent ends at
6180 * the i_size of the inode and the clone range ends at the i_size of the
6181 * source inode, otherwise the clone operation fails with -EINVAL.
6183 if (end
!= sctx
->cur_inode_size
)
6186 ret
= get_inode_info(clone_root
->root
, clone_root
->ino
, &info
);
6190 if (clone_root
->offset
+ num_bytes
== info
.size
)
6194 ret
= send_extent_data(sctx
, path
, offset
, num_bytes
);
6195 sctx
->cur_inode_next_write_offset
= end
;
6199 ei
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
6200 struct btrfs_file_extent_item
);
6201 disk_byte
= btrfs_file_extent_disk_bytenr(path
->nodes
[0], ei
);
6202 data_offset
= btrfs_file_extent_offset(path
->nodes
[0], ei
);
6203 ret
= clone_range(sctx
, path
, clone_root
, disk_byte
, data_offset
, offset
,
6205 sctx
->cur_inode_next_write_offset
= end
;
6209 static int is_extent_unchanged(struct send_ctx
*sctx
,
6210 struct btrfs_path
*left_path
,
6211 struct btrfs_key
*ekey
)
6214 struct btrfs_key key
;
6215 struct btrfs_path
*path
= NULL
;
6216 struct extent_buffer
*eb
;
6218 struct btrfs_key found_key
;
6219 struct btrfs_file_extent_item
*ei
;
6224 u64 left_offset_fixed
;
6232 path
= alloc_path_for_send();
6236 eb
= left_path
->nodes
[0];
6237 slot
= left_path
->slots
[0];
6238 ei
= btrfs_item_ptr(eb
, slot
, struct btrfs_file_extent_item
);
6239 left_type
= btrfs_file_extent_type(eb
, ei
);
6241 if (left_type
!= BTRFS_FILE_EXTENT_REG
) {
6245 left_disknr
= btrfs_file_extent_disk_bytenr(eb
, ei
);
6246 left_len
= btrfs_file_extent_num_bytes(eb
, ei
);
6247 left_offset
= btrfs_file_extent_offset(eb
, ei
);
6248 left_gen
= btrfs_file_extent_generation(eb
, ei
);
6251 * Following comments will refer to these graphics. L is the left
6252 * extents which we are checking at the moment. 1-8 are the right
6253 * extents that we iterate.
6256 * |-1-|-2a-|-3-|-4-|-5-|-6-|
6259 * |--1--|-2b-|...(same as above)
6261 * Alternative situation. Happens on files where extents got split.
6263 * |-----------7-----------|-6-|
6265 * Alternative situation. Happens on files which got larger.
6268 * Nothing follows after 8.
6271 key
.objectid
= ekey
->objectid
;
6272 key
.type
= BTRFS_EXTENT_DATA_KEY
;
6273 key
.offset
= ekey
->offset
;
6274 ret
= btrfs_search_slot_for_read(sctx
->parent_root
, &key
, path
, 0, 0);
6283 * Handle special case where the right side has no extents at all.
6285 eb
= path
->nodes
[0];
6286 slot
= path
->slots
[0];
6287 btrfs_item_key_to_cpu(eb
, &found_key
, slot
);
6288 if (found_key
.objectid
!= key
.objectid
||
6289 found_key
.type
!= key
.type
) {
6290 /* If we're a hole then just pretend nothing changed */
6291 ret
= (left_disknr
) ? 0 : 1;
6296 * We're now on 2a, 2b or 7.
6299 while (key
.offset
< ekey
->offset
+ left_len
) {
6300 ei
= btrfs_item_ptr(eb
, slot
, struct btrfs_file_extent_item
);
6301 right_type
= btrfs_file_extent_type(eb
, ei
);
6302 if (right_type
!= BTRFS_FILE_EXTENT_REG
&&
6303 right_type
!= BTRFS_FILE_EXTENT_INLINE
) {
6308 if (right_type
== BTRFS_FILE_EXTENT_INLINE
) {
6309 right_len
= btrfs_file_extent_ram_bytes(eb
, ei
);
6310 right_len
= PAGE_ALIGN(right_len
);
6312 right_len
= btrfs_file_extent_num_bytes(eb
, ei
);
6316 * Are we at extent 8? If yes, we know the extent is changed.
6317 * This may only happen on the first iteration.
6319 if (found_key
.offset
+ right_len
<= ekey
->offset
) {
6320 /* If we're a hole just pretend nothing changed */
6321 ret
= (left_disknr
) ? 0 : 1;
6326 * We just wanted to see if when we have an inline extent, what
6327 * follows it is a regular extent (wanted to check the above
6328 * condition for inline extents too). This should normally not
6329 * happen but it's possible for example when we have an inline
6330 * compressed extent representing data with a size matching
6331 * the page size (currently the same as sector size).
6333 if (right_type
== BTRFS_FILE_EXTENT_INLINE
) {
6338 right_disknr
= btrfs_file_extent_disk_bytenr(eb
, ei
);
6339 right_offset
= btrfs_file_extent_offset(eb
, ei
);
6340 right_gen
= btrfs_file_extent_generation(eb
, ei
);
6342 left_offset_fixed
= left_offset
;
6343 if (key
.offset
< ekey
->offset
) {
6344 /* Fix the right offset for 2a and 7. */
6345 right_offset
+= ekey
->offset
- key
.offset
;
6347 /* Fix the left offset for all behind 2a and 2b */
6348 left_offset_fixed
+= key
.offset
- ekey
->offset
;
6352 * Check if we have the same extent.
6354 if (left_disknr
!= right_disknr
||
6355 left_offset_fixed
!= right_offset
||
6356 left_gen
!= right_gen
) {
6362 * Go to the next extent.
6364 ret
= btrfs_next_item(sctx
->parent_root
, path
);
6368 eb
= path
->nodes
[0];
6369 slot
= path
->slots
[0];
6370 btrfs_item_key_to_cpu(eb
, &found_key
, slot
);
6372 if (ret
|| found_key
.objectid
!= key
.objectid
||
6373 found_key
.type
!= key
.type
) {
6374 key
.offset
+= right_len
;
6377 if (found_key
.offset
!= key
.offset
+ right_len
) {
6385 * We're now behind the left extent (treat as unchanged) or at the end
6386 * of the right side (treat as changed).
6388 if (key
.offset
>= ekey
->offset
+ left_len
)
6395 btrfs_free_path(path
);
6399 static int get_last_extent(struct send_ctx
*sctx
, u64 offset
)
6401 struct btrfs_path
*path
;
6402 struct btrfs_root
*root
= sctx
->send_root
;
6403 struct btrfs_key key
;
6406 path
= alloc_path_for_send();
6410 sctx
->cur_inode_last_extent
= 0;
6412 key
.objectid
= sctx
->cur_ino
;
6413 key
.type
= BTRFS_EXTENT_DATA_KEY
;
6414 key
.offset
= offset
;
6415 ret
= btrfs_search_slot_for_read(root
, &key
, path
, 0, 1);
6419 btrfs_item_key_to_cpu(path
->nodes
[0], &key
, path
->slots
[0]);
6420 if (key
.objectid
!= sctx
->cur_ino
|| key
.type
!= BTRFS_EXTENT_DATA_KEY
)
6423 sctx
->cur_inode_last_extent
= btrfs_file_extent_end(path
);
6425 btrfs_free_path(path
);
6429 static int range_is_hole_in_parent(struct send_ctx
*sctx
,
6433 struct btrfs_path
*path
;
6434 struct btrfs_key key
;
6435 struct btrfs_root
*root
= sctx
->parent_root
;
6436 u64 search_start
= start
;
6439 path
= alloc_path_for_send();
6443 key
.objectid
= sctx
->cur_ino
;
6444 key
.type
= BTRFS_EXTENT_DATA_KEY
;
6445 key
.offset
= search_start
;
6446 ret
= btrfs_search_slot(NULL
, root
, &key
, path
, 0, 0);
6449 if (ret
> 0 && path
->slots
[0] > 0)
6452 while (search_start
< end
) {
6453 struct extent_buffer
*leaf
= path
->nodes
[0];
6454 int slot
= path
->slots
[0];
6455 struct btrfs_file_extent_item
*fi
;
6458 if (slot
>= btrfs_header_nritems(leaf
)) {
6459 ret
= btrfs_next_leaf(root
, path
);
6467 btrfs_item_key_to_cpu(leaf
, &key
, slot
);
6468 if (key
.objectid
< sctx
->cur_ino
||
6469 key
.type
< BTRFS_EXTENT_DATA_KEY
)
6471 if (key
.objectid
> sctx
->cur_ino
||
6472 key
.type
> BTRFS_EXTENT_DATA_KEY
||
6476 fi
= btrfs_item_ptr(leaf
, slot
, struct btrfs_file_extent_item
);
6477 extent_end
= btrfs_file_extent_end(path
);
6478 if (extent_end
<= start
)
6480 if (btrfs_file_extent_disk_bytenr(leaf
, fi
) == 0) {
6481 search_start
= extent_end
;
6491 btrfs_free_path(path
);
6495 static int maybe_send_hole(struct send_ctx
*sctx
, struct btrfs_path
*path
,
6496 struct btrfs_key
*key
)
6500 if (sctx
->cur_ino
!= key
->objectid
|| !need_send_hole(sctx
))
6504 * Get last extent's end offset (exclusive) if we haven't determined it
6505 * yet (we're processing the first file extent item that is new), or if
6506 * we're at the first slot of a leaf and the last extent's end is less
6507 * than the current extent's offset, because we might have skipped
6508 * entire leaves that contained only file extent items for our current
6509 * inode. These leaves have a generation number smaller (older) than the
6510 * one in the current leaf and the leaf our last extent came from, and
6511 * are located between these 2 leaves.
6513 if ((sctx
->cur_inode_last_extent
== (u64
)-1) ||
6514 (path
->slots
[0] == 0 && sctx
->cur_inode_last_extent
< key
->offset
)) {
6515 ret
= get_last_extent(sctx
, key
->offset
- 1);
6520 if (sctx
->cur_inode_last_extent
< key
->offset
) {
6521 ret
= range_is_hole_in_parent(sctx
,
6522 sctx
->cur_inode_last_extent
,
6527 ret
= send_hole(sctx
, key
->offset
);
6531 sctx
->cur_inode_last_extent
= btrfs_file_extent_end(path
);
6535 static int process_extent(struct send_ctx
*sctx
,
6536 struct btrfs_path
*path
,
6537 struct btrfs_key
*key
)
6539 struct clone_root
*found_clone
= NULL
;
6542 if (S_ISLNK(sctx
->cur_inode_mode
))
6545 if (sctx
->parent_root
&& !sctx
->cur_inode_new
) {
6546 ret
= is_extent_unchanged(sctx
, path
, key
);
6554 struct btrfs_file_extent_item
*ei
;
6557 ei
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
6558 struct btrfs_file_extent_item
);
6559 type
= btrfs_file_extent_type(path
->nodes
[0], ei
);
6560 if (type
== BTRFS_FILE_EXTENT_PREALLOC
||
6561 type
== BTRFS_FILE_EXTENT_REG
) {
6563 * The send spec does not have a prealloc command yet,
6564 * so just leave a hole for prealloc'ed extents until
6565 * we have enough commands queued up to justify rev'ing
6568 if (type
== BTRFS_FILE_EXTENT_PREALLOC
) {
6573 /* Have a hole, just skip it. */
6574 if (btrfs_file_extent_disk_bytenr(path
->nodes
[0], ei
) == 0) {
6581 ret
= find_extent_clone(sctx
, path
, key
->objectid
, key
->offset
,
6582 sctx
->cur_inode_size
, &found_clone
);
6583 if (ret
!= -ENOENT
&& ret
< 0)
6586 ret
= send_write_or_clone(sctx
, path
, key
, found_clone
);
6590 ret
= maybe_send_hole(sctx
, path
, key
);
6595 static int process_all_extents(struct send_ctx
*sctx
)
6599 struct btrfs_root
*root
;
6600 struct btrfs_path
*path
;
6601 struct btrfs_key key
;
6602 struct btrfs_key found_key
;
6604 root
= sctx
->send_root
;
6605 path
= alloc_path_for_send();
6609 key
.objectid
= sctx
->cmp_key
->objectid
;
6610 key
.type
= BTRFS_EXTENT_DATA_KEY
;
6612 btrfs_for_each_slot(root
, &key
, &found_key
, path
, iter_ret
) {
6613 if (found_key
.objectid
!= key
.objectid
||
6614 found_key
.type
!= key
.type
) {
6619 ret
= process_extent(sctx
, path
, &found_key
);
6623 /* Catch error found during iteration */
6627 btrfs_free_path(path
);
6631 static int process_recorded_refs_if_needed(struct send_ctx
*sctx
, int at_end
,
6633 int *refs_processed
)
6637 if (sctx
->cur_ino
== 0)
6639 if (!at_end
&& sctx
->cur_ino
== sctx
->cmp_key
->objectid
&&
6640 sctx
->cmp_key
->type
<= BTRFS_INODE_EXTREF_KEY
)
6642 if (list_empty(&sctx
->new_refs
) && list_empty(&sctx
->deleted_refs
))
6645 ret
= process_recorded_refs(sctx
, pending_move
);
6649 *refs_processed
= 1;
6654 static int finish_inode_if_needed(struct send_ctx
*sctx
, int at_end
)
6657 struct btrfs_inode_info info
;
6668 bool need_fileattr
= false;
6669 int need_truncate
= 1;
6670 int pending_move
= 0;
6671 int refs_processed
= 0;
6673 if (sctx
->ignore_cur_inode
)
6676 ret
= process_recorded_refs_if_needed(sctx
, at_end
, &pending_move
,
6682 * We have processed the refs and thus need to advance send_progress.
6683 * Now, calls to get_cur_xxx will take the updated refs of the current
6684 * inode into account.
6686 * On the other hand, if our current inode is a directory and couldn't
6687 * be moved/renamed because its parent was renamed/moved too and it has
6688 * a higher inode number, we can only move/rename our current inode
6689 * after we moved/renamed its parent. Therefore in this case operate on
6690 * the old path (pre move/rename) of our current inode, and the
6691 * move/rename will be performed later.
6693 if (refs_processed
&& !pending_move
)
6694 sctx
->send_progress
= sctx
->cur_ino
+ 1;
6696 if (sctx
->cur_ino
== 0 || sctx
->cur_inode_deleted
)
6698 if (!at_end
&& sctx
->cmp_key
->objectid
== sctx
->cur_ino
)
6700 ret
= get_inode_info(sctx
->send_root
, sctx
->cur_ino
, &info
);
6703 left_mode
= info
.mode
;
6704 left_uid
= info
.uid
;
6705 left_gid
= info
.gid
;
6706 left_fileattr
= info
.fileattr
;
6708 if (!sctx
->parent_root
|| sctx
->cur_inode_new
) {
6710 if (!S_ISLNK(sctx
->cur_inode_mode
))
6712 if (sctx
->cur_inode_next_write_offset
== sctx
->cur_inode_size
)
6717 ret
= get_inode_info(sctx
->parent_root
, sctx
->cur_ino
, &info
);
6720 old_size
= info
.size
;
6721 right_mode
= info
.mode
;
6722 right_uid
= info
.uid
;
6723 right_gid
= info
.gid
;
6724 right_fileattr
= info
.fileattr
;
6726 if (left_uid
!= right_uid
|| left_gid
!= right_gid
)
6728 if (!S_ISLNK(sctx
->cur_inode_mode
) && left_mode
!= right_mode
)
6730 if (!S_ISLNK(sctx
->cur_inode_mode
) && left_fileattr
!= right_fileattr
)
6731 need_fileattr
= true;
6732 if ((old_size
== sctx
->cur_inode_size
) ||
6733 (sctx
->cur_inode_size
> old_size
&&
6734 sctx
->cur_inode_next_write_offset
== sctx
->cur_inode_size
))
6738 if (S_ISREG(sctx
->cur_inode_mode
)) {
6739 if (need_send_hole(sctx
)) {
6740 if (sctx
->cur_inode_last_extent
== (u64
)-1 ||
6741 sctx
->cur_inode_last_extent
<
6742 sctx
->cur_inode_size
) {
6743 ret
= get_last_extent(sctx
, (u64
)-1);
6747 if (sctx
->cur_inode_last_extent
< sctx
->cur_inode_size
) {
6748 ret
= range_is_hole_in_parent(sctx
,
6749 sctx
->cur_inode_last_extent
,
6750 sctx
->cur_inode_size
);
6753 } else if (ret
== 0) {
6754 ret
= send_hole(sctx
, sctx
->cur_inode_size
);
6758 /* Range is already a hole, skip. */
6763 if (need_truncate
) {
6764 ret
= send_truncate(sctx
, sctx
->cur_ino
,
6765 sctx
->cur_inode_gen
,
6766 sctx
->cur_inode_size
);
6773 ret
= send_chown(sctx
, sctx
->cur_ino
, sctx
->cur_inode_gen
,
6774 left_uid
, left_gid
);
6779 ret
= send_chmod(sctx
, sctx
->cur_ino
, sctx
->cur_inode_gen
,
6784 if (need_fileattr
) {
6785 ret
= send_fileattr(sctx
, sctx
->cur_ino
, sctx
->cur_inode_gen
,
6791 if (proto_cmd_ok(sctx
, BTRFS_SEND_C_ENABLE_VERITY
)
6792 && sctx
->cur_inode_needs_verity
) {
6793 ret
= process_verity(sctx
);
6798 ret
= send_capabilities(sctx
);
6803 * If other directory inodes depended on our current directory
6804 * inode's move/rename, now do their move/rename operations.
6806 if (!is_waiting_for_move(sctx
, sctx
->cur_ino
)) {
6807 ret
= apply_children_dir_moves(sctx
);
6811 * Need to send that every time, no matter if it actually
6812 * changed between the two trees as we have done changes to
6813 * the inode before. If our inode is a directory and it's
6814 * waiting to be moved/renamed, we will send its utimes when
6815 * it's moved/renamed, therefore we don't need to do it here.
6817 sctx
->send_progress
= sctx
->cur_ino
+ 1;
6820 * If the current inode is a non-empty directory, delay issuing
6821 * the utimes command for it, as it's very likely we have inodes
6822 * with an higher number inside it. We want to issue the utimes
6823 * command only after adding all dentries to it.
6825 if (S_ISDIR(sctx
->cur_inode_mode
) && sctx
->cur_inode_size
> 0)
6826 ret
= cache_dir_utimes(sctx
, sctx
->cur_ino
, sctx
->cur_inode_gen
);
6828 ret
= send_utimes(sctx
, sctx
->cur_ino
, sctx
->cur_inode_gen
);
6836 ret
= trim_dir_utimes_cache(sctx
);
6841 static void close_current_inode(struct send_ctx
*sctx
)
6845 if (sctx
->cur_inode
== NULL
)
6848 i_size
= i_size_read(sctx
->cur_inode
);
6851 * If we are doing an incremental send, we may have extents between the
6852 * last processed extent and the i_size that have not been processed
6853 * because they haven't changed but we may have read some of their pages
6854 * through readahead, see the comments at send_extent_data().
6856 if (sctx
->clean_page_cache
&& sctx
->page_cache_clear_start
< i_size
)
6857 truncate_inode_pages_range(&sctx
->cur_inode
->i_data
,
6858 sctx
->page_cache_clear_start
,
6859 round_up(i_size
, PAGE_SIZE
) - 1);
6861 iput(sctx
->cur_inode
);
6862 sctx
->cur_inode
= NULL
;
6865 static int changed_inode(struct send_ctx
*sctx
,
6866 enum btrfs_compare_tree_result result
)
6869 struct btrfs_key
*key
= sctx
->cmp_key
;
6870 struct btrfs_inode_item
*left_ii
= NULL
;
6871 struct btrfs_inode_item
*right_ii
= NULL
;
6875 close_current_inode(sctx
);
6877 sctx
->cur_ino
= key
->objectid
;
6878 sctx
->cur_inode_new_gen
= false;
6879 sctx
->cur_inode_last_extent
= (u64
)-1;
6880 sctx
->cur_inode_next_write_offset
= 0;
6881 sctx
->ignore_cur_inode
= false;
6884 * Set send_progress to current inode. This will tell all get_cur_xxx
6885 * functions that the current inode's refs are not updated yet. Later,
6886 * when process_recorded_refs is finished, it is set to cur_ino + 1.
6888 sctx
->send_progress
= sctx
->cur_ino
;
6890 if (result
== BTRFS_COMPARE_TREE_NEW
||
6891 result
== BTRFS_COMPARE_TREE_CHANGED
) {
6892 left_ii
= btrfs_item_ptr(sctx
->left_path
->nodes
[0],
6893 sctx
->left_path
->slots
[0],
6894 struct btrfs_inode_item
);
6895 left_gen
= btrfs_inode_generation(sctx
->left_path
->nodes
[0],
6898 right_ii
= btrfs_item_ptr(sctx
->right_path
->nodes
[0],
6899 sctx
->right_path
->slots
[0],
6900 struct btrfs_inode_item
);
6901 right_gen
= btrfs_inode_generation(sctx
->right_path
->nodes
[0],
6904 if (result
== BTRFS_COMPARE_TREE_CHANGED
) {
6905 right_ii
= btrfs_item_ptr(sctx
->right_path
->nodes
[0],
6906 sctx
->right_path
->slots
[0],
6907 struct btrfs_inode_item
);
6909 right_gen
= btrfs_inode_generation(sctx
->right_path
->nodes
[0],
6913 * The cur_ino = root dir case is special here. We can't treat
6914 * the inode as deleted+reused because it would generate a
6915 * stream that tries to delete/mkdir the root dir.
6917 if (left_gen
!= right_gen
&&
6918 sctx
->cur_ino
!= BTRFS_FIRST_FREE_OBJECTID
)
6919 sctx
->cur_inode_new_gen
= true;
6923 * Normally we do not find inodes with a link count of zero (orphans)
6924 * because the most common case is to create a snapshot and use it
6925 * for a send operation. However other less common use cases involve
6926 * using a subvolume and send it after turning it to RO mode just
6927 * after deleting all hard links of a file while holding an open
6928 * file descriptor against it or turning a RO snapshot into RW mode,
6929 * keep an open file descriptor against a file, delete it and then
6930 * turn the snapshot back to RO mode before using it for a send
6931 * operation. The former is what the receiver operation does.
6932 * Therefore, if we want to send these snapshots soon after they're
6933 * received, we need to handle orphan inodes as well. Moreover, orphans
6934 * can appear not only in the send snapshot but also in the parent
6935 * snapshot. Here are several cases:
6937 * Case 1: BTRFS_COMPARE_TREE_NEW
6938 * | send snapshot | action
6939 * --------------------------------
6940 * nlink | 0 | ignore
6942 * Case 2: BTRFS_COMPARE_TREE_DELETED
6943 * | parent snapshot | action
6944 * ----------------------------------
6945 * nlink | 0 | as usual
6946 * Note: No unlinks will be sent because there're no paths for it.
6948 * Case 3: BTRFS_COMPARE_TREE_CHANGED
6949 * | | parent snapshot | send snapshot | action
6950 * -----------------------------------------------------------------------
6951 * subcase 1 | nlink | 0 | 0 | ignore
6952 * subcase 2 | nlink | >0 | 0 | new_gen(deletion)
6953 * subcase 3 | nlink | 0 | >0 | new_gen(creation)
6956 if (result
== BTRFS_COMPARE_TREE_NEW
) {
6957 if (btrfs_inode_nlink(sctx
->left_path
->nodes
[0], left_ii
) == 0) {
6958 sctx
->ignore_cur_inode
= true;
6961 sctx
->cur_inode_gen
= left_gen
;
6962 sctx
->cur_inode_new
= true;
6963 sctx
->cur_inode_deleted
= false;
6964 sctx
->cur_inode_size
= btrfs_inode_size(
6965 sctx
->left_path
->nodes
[0], left_ii
);
6966 sctx
->cur_inode_mode
= btrfs_inode_mode(
6967 sctx
->left_path
->nodes
[0], left_ii
);
6968 sctx
->cur_inode_rdev
= btrfs_inode_rdev(
6969 sctx
->left_path
->nodes
[0], left_ii
);
6970 if (sctx
->cur_ino
!= BTRFS_FIRST_FREE_OBJECTID
)
6971 ret
= send_create_inode_if_needed(sctx
);
6972 } else if (result
== BTRFS_COMPARE_TREE_DELETED
) {
6973 sctx
->cur_inode_gen
= right_gen
;
6974 sctx
->cur_inode_new
= false;
6975 sctx
->cur_inode_deleted
= true;
6976 sctx
->cur_inode_size
= btrfs_inode_size(
6977 sctx
->right_path
->nodes
[0], right_ii
);
6978 sctx
->cur_inode_mode
= btrfs_inode_mode(
6979 sctx
->right_path
->nodes
[0], right_ii
);
6980 } else if (result
== BTRFS_COMPARE_TREE_CHANGED
) {
6981 u32 new_nlinks
, old_nlinks
;
6983 new_nlinks
= btrfs_inode_nlink(sctx
->left_path
->nodes
[0], left_ii
);
6984 old_nlinks
= btrfs_inode_nlink(sctx
->right_path
->nodes
[0], right_ii
);
6985 if (new_nlinks
== 0 && old_nlinks
== 0) {
6986 sctx
->ignore_cur_inode
= true;
6988 } else if (new_nlinks
== 0 || old_nlinks
== 0) {
6989 sctx
->cur_inode_new_gen
= 1;
6992 * We need to do some special handling in case the inode was
6993 * reported as changed with a changed generation number. This
6994 * means that the original inode was deleted and new inode
6995 * reused the same inum. So we have to treat the old inode as
6996 * deleted and the new one as new.
6998 if (sctx
->cur_inode_new_gen
) {
7000 * First, process the inode as if it was deleted.
7002 if (old_nlinks
> 0) {
7003 sctx
->cur_inode_gen
= right_gen
;
7004 sctx
->cur_inode_new
= false;
7005 sctx
->cur_inode_deleted
= true;
7006 sctx
->cur_inode_size
= btrfs_inode_size(
7007 sctx
->right_path
->nodes
[0], right_ii
);
7008 sctx
->cur_inode_mode
= btrfs_inode_mode(
7009 sctx
->right_path
->nodes
[0], right_ii
);
7010 ret
= process_all_refs(sctx
,
7011 BTRFS_COMPARE_TREE_DELETED
);
7017 * Now process the inode as if it was new.
7019 if (new_nlinks
> 0) {
7020 sctx
->cur_inode_gen
= left_gen
;
7021 sctx
->cur_inode_new
= true;
7022 sctx
->cur_inode_deleted
= false;
7023 sctx
->cur_inode_size
= btrfs_inode_size(
7024 sctx
->left_path
->nodes
[0],
7026 sctx
->cur_inode_mode
= btrfs_inode_mode(
7027 sctx
->left_path
->nodes
[0],
7029 sctx
->cur_inode_rdev
= btrfs_inode_rdev(
7030 sctx
->left_path
->nodes
[0],
7032 ret
= send_create_inode_if_needed(sctx
);
7036 ret
= process_all_refs(sctx
, BTRFS_COMPARE_TREE_NEW
);
7040 * Advance send_progress now as we did not get
7041 * into process_recorded_refs_if_needed in the
7044 sctx
->send_progress
= sctx
->cur_ino
+ 1;
7047 * Now process all extents and xattrs of the
7048 * inode as if they were all new.
7050 ret
= process_all_extents(sctx
);
7053 ret
= process_all_new_xattrs(sctx
);
7058 sctx
->cur_inode_gen
= left_gen
;
7059 sctx
->cur_inode_new
= false;
7060 sctx
->cur_inode_new_gen
= false;
7061 sctx
->cur_inode_deleted
= false;
7062 sctx
->cur_inode_size
= btrfs_inode_size(
7063 sctx
->left_path
->nodes
[0], left_ii
);
7064 sctx
->cur_inode_mode
= btrfs_inode_mode(
7065 sctx
->left_path
->nodes
[0], left_ii
);
7074 * We have to process new refs before deleted refs, but compare_trees gives us
7075 * the new and deleted refs mixed. To fix this, we record the new/deleted refs
7076 * first and later process them in process_recorded_refs.
7077 * For the cur_inode_new_gen case, we skip recording completely because
7078 * changed_inode did already initiate processing of refs. The reason for this is
7079 * that in this case, compare_tree actually compares the refs of 2 different
7080 * inodes. To fix this, process_all_refs is used in changed_inode to handle all
7081 * refs of the right tree as deleted and all refs of the left tree as new.
7083 static int changed_ref(struct send_ctx
*sctx
,
7084 enum btrfs_compare_tree_result result
)
7088 if (sctx
->cur_ino
!= sctx
->cmp_key
->objectid
) {
7089 inconsistent_snapshot_error(sctx
, result
, "reference");
7093 if (!sctx
->cur_inode_new_gen
&&
7094 sctx
->cur_ino
!= BTRFS_FIRST_FREE_OBJECTID
) {
7095 if (result
== BTRFS_COMPARE_TREE_NEW
)
7096 ret
= record_new_ref(sctx
);
7097 else if (result
== BTRFS_COMPARE_TREE_DELETED
)
7098 ret
= record_deleted_ref(sctx
);
7099 else if (result
== BTRFS_COMPARE_TREE_CHANGED
)
7100 ret
= record_changed_ref(sctx
);
7107 * Process new/deleted/changed xattrs. We skip processing in the
7108 * cur_inode_new_gen case because changed_inode did already initiate processing
7109 * of xattrs. The reason is the same as in changed_ref
7111 static int changed_xattr(struct send_ctx
*sctx
,
7112 enum btrfs_compare_tree_result result
)
7116 if (sctx
->cur_ino
!= sctx
->cmp_key
->objectid
) {
7117 inconsistent_snapshot_error(sctx
, result
, "xattr");
7121 if (!sctx
->cur_inode_new_gen
&& !sctx
->cur_inode_deleted
) {
7122 if (result
== BTRFS_COMPARE_TREE_NEW
)
7123 ret
= process_new_xattr(sctx
);
7124 else if (result
== BTRFS_COMPARE_TREE_DELETED
)
7125 ret
= process_deleted_xattr(sctx
);
7126 else if (result
== BTRFS_COMPARE_TREE_CHANGED
)
7127 ret
= process_changed_xattr(sctx
);
7134 * Process new/deleted/changed extents. We skip processing in the
7135 * cur_inode_new_gen case because changed_inode did already initiate processing
7136 * of extents. The reason is the same as in changed_ref
7138 static int changed_extent(struct send_ctx
*sctx
,
7139 enum btrfs_compare_tree_result result
)
7144 * We have found an extent item that changed without the inode item
7145 * having changed. This can happen either after relocation (where the
7146 * disk_bytenr of an extent item is replaced at
7147 * relocation.c:replace_file_extents()) or after deduplication into a
7148 * file in both the parent and send snapshots (where an extent item can
7149 * get modified or replaced with a new one). Note that deduplication
7150 * updates the inode item, but it only changes the iversion (sequence
7151 * field in the inode item) of the inode, so if a file is deduplicated
7152 * the same amount of times in both the parent and send snapshots, its
7153 * iversion becomes the same in both snapshots, whence the inode item is
7154 * the same on both snapshots.
7156 if (sctx
->cur_ino
!= sctx
->cmp_key
->objectid
)
7159 if (!sctx
->cur_inode_new_gen
&& !sctx
->cur_inode_deleted
) {
7160 if (result
!= BTRFS_COMPARE_TREE_DELETED
)
7161 ret
= process_extent(sctx
, sctx
->left_path
,
7168 static int changed_verity(struct send_ctx
*sctx
, enum btrfs_compare_tree_result result
)
7172 if (!sctx
->cur_inode_new_gen
&& !sctx
->cur_inode_deleted
) {
7173 if (result
== BTRFS_COMPARE_TREE_NEW
)
7174 sctx
->cur_inode_needs_verity
= true;
7179 static int dir_changed(struct send_ctx
*sctx
, u64 dir
)
7181 u64 orig_gen
, new_gen
;
7184 ret
= get_inode_gen(sctx
->send_root
, dir
, &new_gen
);
7188 ret
= get_inode_gen(sctx
->parent_root
, dir
, &orig_gen
);
7192 return (orig_gen
!= new_gen
) ? 1 : 0;
7195 static int compare_refs(struct send_ctx
*sctx
, struct btrfs_path
*path
,
7196 struct btrfs_key
*key
)
7198 struct btrfs_inode_extref
*extref
;
7199 struct extent_buffer
*leaf
;
7200 u64 dirid
= 0, last_dirid
= 0;
7207 /* Easy case, just check this one dirid */
7208 if (key
->type
== BTRFS_INODE_REF_KEY
) {
7209 dirid
= key
->offset
;
7211 ret
= dir_changed(sctx
, dirid
);
7215 leaf
= path
->nodes
[0];
7216 item_size
= btrfs_item_size(leaf
, path
->slots
[0]);
7217 ptr
= btrfs_item_ptr_offset(leaf
, path
->slots
[0]);
7218 while (cur_offset
< item_size
) {
7219 extref
= (struct btrfs_inode_extref
*)(ptr
+
7221 dirid
= btrfs_inode_extref_parent(leaf
, extref
);
7222 ref_name_len
= btrfs_inode_extref_name_len(leaf
, extref
);
7223 cur_offset
+= ref_name_len
+ sizeof(*extref
);
7224 if (dirid
== last_dirid
)
7226 ret
= dir_changed(sctx
, dirid
);
7236 * Updates compare related fields in sctx and simply forwards to the actual
7237 * changed_xxx functions.
7239 static int changed_cb(struct btrfs_path
*left_path
,
7240 struct btrfs_path
*right_path
,
7241 struct btrfs_key
*key
,
7242 enum btrfs_compare_tree_result result
,
7243 struct send_ctx
*sctx
)
7248 * We can not hold the commit root semaphore here. This is because in
7249 * the case of sending and receiving to the same filesystem, using a
7250 * pipe, could result in a deadlock:
7252 * 1) The task running send blocks on the pipe because it's full;
7254 * 2) The task running receive, which is the only consumer of the pipe,
7255 * is waiting for a transaction commit (for example due to a space
7256 * reservation when doing a write or triggering a transaction commit
7257 * when creating a subvolume);
7259 * 3) The transaction is waiting to write lock the commit root semaphore,
7260 * but can not acquire it since it's being held at 1).
7262 * Down this call chain we write to the pipe through kernel_write().
7263 * The same type of problem can also happen when sending to a file that
7264 * is stored in the same filesystem - when reserving space for a write
7265 * into the file, we can trigger a transaction commit.
7267 * Our caller has supplied us with clones of leaves from the send and
7268 * parent roots, so we're safe here from a concurrent relocation and
7269 * further reallocation of metadata extents while we are here. Below we
7270 * also assert that the leaves are clones.
7272 lockdep_assert_not_held(&sctx
->send_root
->fs_info
->commit_root_sem
);
7275 * We always have a send root, so left_path is never NULL. We will not
7276 * have a leaf when we have reached the end of the send root but have
7277 * not yet reached the end of the parent root.
7279 if (left_path
->nodes
[0])
7280 ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED
,
7281 &left_path
->nodes
[0]->bflags
));
7283 * When doing a full send we don't have a parent root, so right_path is
7284 * NULL. When doing an incremental send, we may have reached the end of
7285 * the parent root already, so we don't have a leaf at right_path.
7287 if (right_path
&& right_path
->nodes
[0])
7288 ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED
,
7289 &right_path
->nodes
[0]->bflags
));
7291 if (result
== BTRFS_COMPARE_TREE_SAME
) {
7292 if (key
->type
== BTRFS_INODE_REF_KEY
||
7293 key
->type
== BTRFS_INODE_EXTREF_KEY
) {
7294 ret
= compare_refs(sctx
, left_path
, key
);
7299 } else if (key
->type
== BTRFS_EXTENT_DATA_KEY
) {
7300 return maybe_send_hole(sctx
, left_path
, key
);
7304 result
= BTRFS_COMPARE_TREE_CHANGED
;
7308 sctx
->left_path
= left_path
;
7309 sctx
->right_path
= right_path
;
7310 sctx
->cmp_key
= key
;
7312 ret
= finish_inode_if_needed(sctx
, 0);
7316 /* Ignore non-FS objects */
7317 if (key
->objectid
== BTRFS_FREE_INO_OBJECTID
||
7318 key
->objectid
== BTRFS_FREE_SPACE_OBJECTID
)
7321 if (key
->type
== BTRFS_INODE_ITEM_KEY
) {
7322 ret
= changed_inode(sctx
, result
);
7323 } else if (!sctx
->ignore_cur_inode
) {
7324 if (key
->type
== BTRFS_INODE_REF_KEY
||
7325 key
->type
== BTRFS_INODE_EXTREF_KEY
)
7326 ret
= changed_ref(sctx
, result
);
7327 else if (key
->type
== BTRFS_XATTR_ITEM_KEY
)
7328 ret
= changed_xattr(sctx
, result
);
7329 else if (key
->type
== BTRFS_EXTENT_DATA_KEY
)
7330 ret
= changed_extent(sctx
, result
);
7331 else if (key
->type
== BTRFS_VERITY_DESC_ITEM_KEY
&&
7333 ret
= changed_verity(sctx
, result
);
7340 static int search_key_again(const struct send_ctx
*sctx
,
7341 struct btrfs_root
*root
,
7342 struct btrfs_path
*path
,
7343 const struct btrfs_key
*key
)
7347 if (!path
->need_commit_sem
)
7348 lockdep_assert_held_read(&root
->fs_info
->commit_root_sem
);
7351 * Roots used for send operations are readonly and no one can add,
7352 * update or remove keys from them, so we should be able to find our
7353 * key again. The only exception is deduplication, which can operate on
7354 * readonly roots and add, update or remove keys to/from them - but at
7355 * the moment we don't allow it to run in parallel with send.
7357 ret
= btrfs_search_slot(NULL
, root
, key
, path
, 0, 0);
7360 btrfs_print_tree(path
->nodes
[path
->lowest_level
], false);
7361 btrfs_err(root
->fs_info
,
7362 "send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d",
7363 key
->objectid
, key
->type
, key
->offset
,
7364 (root
== sctx
->parent_root
? "parent" : "send"),
7365 btrfs_root_id(root
), path
->lowest_level
,
7366 path
->slots
[path
->lowest_level
]);
7373 static int full_send_tree(struct send_ctx
*sctx
)
7376 struct btrfs_root
*send_root
= sctx
->send_root
;
7377 struct btrfs_key key
;
7378 struct btrfs_fs_info
*fs_info
= send_root
->fs_info
;
7379 struct btrfs_path
*path
;
7381 path
= alloc_path_for_send();
7384 path
->reada
= READA_FORWARD_ALWAYS
;
7386 key
.objectid
= BTRFS_FIRST_FREE_OBJECTID
;
7387 key
.type
= BTRFS_INODE_ITEM_KEY
;
7390 down_read(&fs_info
->commit_root_sem
);
7391 sctx
->last_reloc_trans
= fs_info
->last_reloc_trans
;
7392 up_read(&fs_info
->commit_root_sem
);
7394 ret
= btrfs_search_slot_for_read(send_root
, &key
, path
, 1, 0);
7401 btrfs_item_key_to_cpu(path
->nodes
[0], &key
, path
->slots
[0]);
7403 ret
= changed_cb(path
, NULL
, &key
,
7404 BTRFS_COMPARE_TREE_NEW
, sctx
);
7408 down_read(&fs_info
->commit_root_sem
);
7409 if (fs_info
->last_reloc_trans
> sctx
->last_reloc_trans
) {
7410 sctx
->last_reloc_trans
= fs_info
->last_reloc_trans
;
7411 up_read(&fs_info
->commit_root_sem
);
7413 * A transaction used for relocating a block group was
7414 * committed or is about to finish its commit. Release
7415 * our path (leaf) and restart the search, so that we
7416 * avoid operating on any file extent items that are
7417 * stale, with a disk_bytenr that reflects a pre
7418 * relocation value. This way we avoid as much as
7419 * possible to fallback to regular writes when checking
7420 * if we can clone file ranges.
7422 btrfs_release_path(path
);
7423 ret
= search_key_again(sctx
, send_root
, path
, &key
);
7427 up_read(&fs_info
->commit_root_sem
);
7430 ret
= btrfs_next_item(send_root
, path
);
7440 ret
= finish_inode_if_needed(sctx
, 1);
7443 btrfs_free_path(path
);
static int replace_node_with_clone(struct btrfs_path *path, int level)
{
	struct extent_buffer *clone;

	clone = btrfs_clone_extent_buffer(path->nodes[level]);
	if (!clone)
		return -ENOMEM;

	free_extent_buffer(path->nodes[level]);
	path->nodes[level] = clone;

	return 0;
}
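
/*
 * Descend one level in the tree by reading the child pointed to by the
 * current slot, triggering readahead for the siblings that follow it.
 */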
static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen)
{
	struct extent_buffer *eb;
	struct extent_buffer *parent = path->nodes[*level];
	int slot = path->slots[*level];
	const int nritems = btrfs_header_nritems(parent);
	u64 reada_max;
	u64 reada_done = 0;

	lockdep_assert_held_read(&parent->fs_info->commit_root_sem);
	ASSERT(*level != 0);

	eb = btrfs_read_node_slot(parent, slot);
	if (IS_ERR(eb))
		return PTR_ERR(eb);

	/*
	 * Trigger readahead for the next leaves we will process, so that it is
	 * very likely that when we need them they are already in memory and we
	 * will not block on disk IO. For nodes we only do readahead for one,
	 * since the time window between processing nodes is typically larger.
	 */
	reada_max = (*level == 1 ? SZ_128K : eb->fs_info->nodesize);

	for (slot++; slot < nritems && reada_done < reada_max; slot++) {
		if (btrfs_node_ptr_generation(parent, slot) > reada_min_gen) {
			btrfs_readahead_node_child(parent, slot);
			reada_done += eb->fs_info->nodesize;
		}
	}

	path->nodes[*level - 1] = eb;
	path->slots[*level - 1] = 0;
	(*level)--;

	if (*level == 0)
		return replace_node_with_clone(path, 0);

	return 0;
}
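
/*
 * Move to the next slot at the current level, going up as many levels as
 * needed when the end of a node is reached. Returns 1 if it had to move up,
 * 0 if it only moved to the next slot and -1 when the end of the root node
 * was reached.
 */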
static int tree_move_next_or_upnext(struct btrfs_path *path,
				    int *level, int root_level)
{
	int ret = 0;
	int nritems;
	nritems = btrfs_header_nritems(path->nodes[*level]);

	path->slots[*level]++;

	while (path->slots[*level] >= nritems) {
		if (*level == root_level) {
			path->slots[*level] = nritems - 1;
			return -1;
		}

		/* move upnext */
		path->slots[*level] = 0;
		free_extent_buffer(path->nodes[*level]);
		path->nodes[*level] = NULL;
		(*level)++;
		path->slots[*level]++;

		nritems = btrfs_header_nritems(path->nodes[*level]);
		ret = 1;
	}
	return ret;
}
/*
 * Returns 1 if it had to move up and next. 0 is returned if it moved only next
 * or down. <0 is returned on error.
 */
static int tree_advance(struct btrfs_path *path,
			int *level, int root_level,
			int allow_down,
			struct btrfs_key *key,
			u64 reada_min_gen)
{
	int ret;

	if (*level == 0 || !allow_down) {
		ret = tree_move_next_or_upnext(path, level, root_level);
	} else {
		ret = tree_move_down(path, level, reada_min_gen);
	}

	/*
	 * Even if we have reached the end of a tree, ret is -1, update the key
	 * anyway, so that in case we need to restart due to a block group
	 * relocation, we can assert that the last key of the root node still
	 * exists in the tree.
	 */
	if (*level == 0)
		btrfs_item_key_to_cpu(path->nodes[*level], key,
				      path->slots[*level]);
	else
		btrfs_node_key_to_cpu(path->nodes[*level], key,
				      path->slots[*level]);

	return ret;
}
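
/*
 * Compare the items at the current leaf slots of both paths. Returns 0 if
 * they have the same size and content, 1 otherwise. tmp_buf must be able to
 * hold a full tree block.
 */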
static int tree_compare_item(struct btrfs_path *left_path,
			     struct btrfs_path *right_path,
			     char *tmp_buf)
{
	int cmp;
	int len1, len2;
	unsigned long off1, off2;

	len1 = btrfs_item_size(left_path->nodes[0], left_path->slots[0]);
	len2 = btrfs_item_size(right_path->nodes[0], right_path->slots[0]);
	if (len1 != len2)
		return 1;

	off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
	off2 = btrfs_item_ptr_offset(right_path->nodes[0],
				     right_path->slots[0]);

	read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);

	cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
	if (cmp)
		return 1;
	return 0;
}
/*
 * A transaction used for relocating a block group was committed or is about to
 * finish its commit. Release our paths and restart the search, so that we are
 * not using stale extent buffers:
 *
 * 1) For levels > 0, we are only holding references of extent buffers, without
 *    any locks on them, which does not prevent them from having been relocated
 *    and reallocated after the last time we released the commit root semaphore.
 *    The exception are the root nodes, for which we always have a clone, see
 *    the comment at btrfs_compare_trees();
 *
 * 2) For leaves, level 0, we are holding copies (clones) of extent buffers, so
 *    we are safe from the concurrent relocation and reallocation. However they
 *    can have file extent items with a pre relocation disk_bytenr value, so we
 *    restart the search from the current commit roots and clone the new leaves
 *    so that we get the post relocation disk_bytenr values. Not doing so, could
 *    make us clone the wrong data in case there are new extents using the old
 *    disk_bytenr that happen to be shared.
 */
static int restart_after_relocation(struct btrfs_path *left_path,
				    struct btrfs_path *right_path,
				    const struct btrfs_key *left_key,
				    const struct btrfs_key *right_key,
				    int left_level,
				    int right_level,
				    const struct send_ctx *sctx)
{
	int root_level;
	int ret;

	lockdep_assert_held_read(&sctx->send_root->fs_info->commit_root_sem);

	btrfs_release_path(left_path);
	btrfs_release_path(right_path);

	/*
	 * Since keys can not be added or removed to/from our roots because they
	 * are readonly and we do not allow deduplication to run in parallel
	 * (which can add, remove or change keys), the layout of the trees should
	 * not change.
	 */
	left_path->lowest_level = left_level;
	ret = search_key_again(sctx, sctx->send_root, left_path, left_key);
	if (ret < 0)
		return ret;

	right_path->lowest_level = right_level;
	ret = search_key_again(sctx, sctx->parent_root, right_path, right_key);
	if (ret < 0)
		return ret;

	/*
	 * If the lowest level nodes are leaves, clone them so that they can be
	 * safely used by changed_cb() while not under the protection of the
	 * commit root semaphore, even if relocation and reallocation happens in
	 * parallel.
	 */
	if (left_level == 0) {
		ret = replace_node_with_clone(left_path, 0);
		if (ret < 0)
			return ret;
	}

	if (right_level == 0) {
		ret = replace_node_with_clone(right_path, 0);
		if (ret < 0)
			return ret;
	}

	/*
	 * Now clone the root nodes (unless they happen to be the leaves we have
	 * already cloned). This is to protect against concurrent snapshotting of
	 * the send and parent roots (see the comment at btrfs_compare_trees()).
	 */
	root_level = btrfs_header_level(sctx->send_root->commit_root);
	if (root_level > 0) {
		ret = replace_node_with_clone(left_path, root_level);
		if (ret < 0)
			return ret;
	}

	root_level = btrfs_header_level(sctx->parent_root->commit_root);
	if (root_level > 0) {
		ret = replace_node_with_clone(right_path, root_level);
		if (ret < 0)
			return ret;
	}

	return 0;
}
#define ADVANCE			1
#define ADVANCE_ONLY_NEXT	-1

/*
 * This function compares two trees and calls the provided callback for
 * every changed/new/deleted item it finds.
 * If shared tree blocks are encountered, whole subtrees are skipped, making
 * the compare pretty fast on snapshotted subvolumes.
 *
 * This currently works on commit roots only. As commit roots are read only,
 * we don't do any locking. The commit roots are protected with transactions.
 * Transactions are ended and rejoined when a commit is tried in between.
 *
 * This function checks for modifications done to the trees while comparing.
 * If it detects a change, it aborts immediately.
 */
static int btrfs_compare_trees(struct btrfs_root *left_root,
			struct btrfs_root *right_root, struct send_ctx *sctx)
{
	struct btrfs_fs_info *fs_info = left_root->fs_info;
	int ret;
	int cmp;
	struct btrfs_path *left_path = NULL;
	struct btrfs_path *right_path = NULL;
	struct btrfs_key left_key;
	struct btrfs_key right_key;
	char *tmp_buf = NULL;
	int left_root_level;
	int right_root_level;
	int left_level;
	int right_level;
	int left_end_reached = 0;
	int right_end_reached = 0;
	int advance_left = 0;
	int advance_right = 0;
	u64 left_blockptr;
	u64 right_blockptr;
	u64 left_gen;
	u64 right_gen;
	u64 reada_min_gen;

	left_path = btrfs_alloc_path();
	if (!left_path) {
		ret = -ENOMEM;
		goto out;
	}
	right_path = btrfs_alloc_path();
	if (!right_path) {
		ret = -ENOMEM;
		goto out;
	}

	tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
	if (!tmp_buf) {
		ret = -ENOMEM;
		goto out;
	}

	left_path->search_commit_root = 1;
	left_path->skip_locking = 1;
	right_path->search_commit_root = 1;
	right_path->skip_locking = 1;

	/*
	 * Strategy: Go to the first items of both trees. Then do
	 *
	 * If both trees are at level 0
	 *   Compare keys of current items
	 *     If left < right treat left item as new, advance left tree
	 *       and repeat
	 *     If left > right treat right item as deleted, advance right tree
	 *       and repeat
	 *     If left == right do deep compare of items, treat as changed if
	 *       needed, advance both trees and repeat
	 * If both trees are at the same level but not at level 0
	 *   Compare keys of current nodes/leafs
	 *     If left < right advance left tree and repeat
	 *     If left > right advance right tree and repeat
	 *     If left == right compare blockptrs of the next nodes/leafs
	 *       If they match advance both trees but stay at the same level
	 *         and repeat
	 *       If they don't match advance both trees while allowing to go
	 *         deeper and repeat
	 * If tree levels are different
	 *   Advance the tree that needs it and repeat
	 *
	 * Advancing a tree means:
	 *   If we are at level 0, try to go to the next slot. If that's not
	 *   possible, go one level up and repeat. Stop when we found a level
	 *   where we could go to the next slot. We may at this point be on a
	 *   node or a leaf.
	 *
	 *   If we are not at level 0 and not on shared tree blocks, go one
	 *   slot deeper.
	 *
	 *   If we are not at level 0 and on shared tree blocks, go one slot to
	 *   the right if possible or go up and right.
	 */

	down_read(&fs_info->commit_root_sem);
	left_level = btrfs_header_level(left_root->commit_root);
	left_root_level = left_level;
	/*
	 * We clone the root node of the send and parent roots to prevent races
	 * with snapshot creation of these roots. Snapshot creation COWs the
	 * root node of a tree, so after the transaction is committed the old
	 * extent can be reallocated while this send operation is still ongoing.
	 * So we clone them, under the commit root semaphore, to be race free.
	 */
	left_path->nodes[left_level] =
			btrfs_clone_extent_buffer(left_root->commit_root);
	if (!left_path->nodes[left_level]) {
		ret = -ENOMEM;
		goto out_unlock;
	}

	right_level = btrfs_header_level(right_root->commit_root);
	right_root_level = right_level;
	right_path->nodes[right_level] =
			btrfs_clone_extent_buffer(right_root->commit_root);
	if (!right_path->nodes[right_level]) {
		ret = -ENOMEM;
		goto out_unlock;
	}
	/*
	 * Our right root is the parent root, while the left root is the "send"
	 * root. We know that all new nodes/leaves in the left root must have
	 * a generation greater than the right root's generation, so we trigger
	 * readahead for those nodes and leaves of the left root, as we know we
	 * will need to read them at some point.
	 */
	reada_min_gen = btrfs_header_generation(right_root->commit_root);

	if (left_level == 0)
		btrfs_item_key_to_cpu(left_path->nodes[left_level],
				&left_key, left_path->slots[left_level]);
	else
		btrfs_node_key_to_cpu(left_path->nodes[left_level],
				&left_key, left_path->slots[left_level]);
	if (right_level == 0)
		btrfs_item_key_to_cpu(right_path->nodes[right_level],
				&right_key, right_path->slots[right_level]);
	else
		btrfs_node_key_to_cpu(right_path->nodes[right_level],
				&right_key, right_path->slots[right_level]);

	sctx->last_reloc_trans = fs_info->last_reloc_trans;

	while (1) {
		if (need_resched() ||
		    rwsem_is_contended(&fs_info->commit_root_sem)) {
			up_read(&fs_info->commit_root_sem);
			cond_resched();
			down_read(&fs_info->commit_root_sem);
		}

		if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
			ret = restart_after_relocation(left_path, right_path,
						       &left_key, &right_key,
						       left_level, right_level,
						       sctx);
			if (ret < 0)
				goto out_unlock;
			sctx->last_reloc_trans = fs_info->last_reloc_trans;
		}

		if (advance_left && !left_end_reached) {
			ret = tree_advance(left_path, &left_level,
					left_root_level,
					advance_left != ADVANCE_ONLY_NEXT,
					&left_key, reada_min_gen);
			if (ret == -1)
				left_end_reached = ADVANCE;
			else if (ret < 0)
				goto out_unlock;
			advance_left = 0;
		}
		if (advance_right && !right_end_reached) {
			ret = tree_advance(right_path, &right_level,
					right_root_level,
					advance_right != ADVANCE_ONLY_NEXT,
					&right_key, reada_min_gen);
			if (ret == -1)
				right_end_reached = ADVANCE;
			else if (ret < 0)
				goto out_unlock;
			advance_right = 0;
		}

		if (left_end_reached && right_end_reached) {
			ret = 0;
			goto out_unlock;
		} else if (left_end_reached) {
			if (right_level == 0) {
				up_read(&fs_info->commit_root_sem);
				ret = changed_cb(left_path, right_path,
						&right_key,
						BTRFS_COMPARE_TREE_DELETED,
						sctx);
				if (ret < 0)
					goto out;
				down_read(&fs_info->commit_root_sem);
			}
			advance_right = ADVANCE;
			continue;
		} else if (right_end_reached) {
			if (left_level == 0) {
				up_read(&fs_info->commit_root_sem);
				ret = changed_cb(left_path, right_path,
						&left_key,
						BTRFS_COMPARE_TREE_NEW,
						sctx);
				if (ret < 0)
					goto out;
				down_read(&fs_info->commit_root_sem);
			}
			advance_left = ADVANCE;
			continue;
		}

		if (left_level == 0 && right_level == 0) {
			up_read(&fs_info->commit_root_sem);
			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
			if (cmp < 0) {
				ret = changed_cb(left_path, right_path,
						&left_key,
						BTRFS_COMPARE_TREE_NEW,
						sctx);
				advance_left = ADVANCE;
			} else if (cmp > 0) {
				ret = changed_cb(left_path, right_path,
						&right_key,
						BTRFS_COMPARE_TREE_DELETED,
						sctx);
				advance_right = ADVANCE;
			} else {
				enum btrfs_compare_tree_result result;

				WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
				ret = tree_compare_item(left_path, right_path,
							tmp_buf);
				if (ret)
					result = BTRFS_COMPARE_TREE_CHANGED;
				else
					result = BTRFS_COMPARE_TREE_SAME;
				ret = changed_cb(left_path, right_path,
						 &left_key, result, sctx);
				advance_left = ADVANCE;
				advance_right = ADVANCE;
			}

			if (ret < 0)
				goto out;
			down_read(&fs_info->commit_root_sem);
		} else if (left_level == right_level) {
			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
			if (cmp < 0) {
				advance_left = ADVANCE;
			} else if (cmp > 0) {
				advance_right = ADVANCE;
			} else {
				left_blockptr = btrfs_node_blockptr(
						left_path->nodes[left_level],
						left_path->slots[left_level]);
				right_blockptr = btrfs_node_blockptr(
						right_path->nodes[right_level],
						right_path->slots[right_level]);
				left_gen = btrfs_node_ptr_generation(
						left_path->nodes[left_level],
						left_path->slots[left_level]);
				right_gen = btrfs_node_ptr_generation(
						right_path->nodes[right_level],
						right_path->slots[right_level]);
				if (left_blockptr == right_blockptr &&
				    left_gen == right_gen) {
					/*
					 * As we're on a shared block, don't
					 * allow to go deeper.
					 */
					advance_left = ADVANCE_ONLY_NEXT;
					advance_right = ADVANCE_ONLY_NEXT;
				} else {
					advance_left = ADVANCE;
					advance_right = ADVANCE;
				}
			}
		} else if (left_level < right_level) {
			advance_right = ADVANCE;
		} else {
			advance_left = ADVANCE;
		}
	}

out_unlock:
	up_read(&fs_info->commit_root_sem);
out:
	btrfs_free_path(left_path);
	btrfs_free_path(right_path);
	kvfree(tmp_buf);
	return ret;
}
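
/*
 * Top level of a send operation: emit the stream header and the subvolume
 * start command, then either diff against the parent snapshot or do a full
 * send of the tree.
 */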
static int send_subvol(struct send_ctx *sctx)
{
	int ret;

	if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_STREAM_HEADER)) {
		ret = send_header(sctx);
		if (ret < 0)
			goto out;
	}

	ret = send_subvol_begin(sctx);
	if (ret < 0)
		goto out;

	if (sctx->parent_root) {
		ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx);
		if (ret < 0)
			goto out;
		ret = finish_inode_if_needed(sctx, 1);
		if (ret < 0)
			goto out;
	} else {
		ret = full_send_tree(sctx);
		if (ret < 0)
			goto out;
	}

out:
	free_recorded_refs(sctx);
	return ret;
}
/*
 * If orphan cleanup did remove any orphans from a root, it means the tree
 * was modified and therefore the commit root is not the same as the current
 * root anymore. This is a problem, because send uses the commit root and
 * therefore can see inode items that don't exist in the current root anymore,
 * and for example make calls to btrfs_iget, which will do tree lookups based
 * on the current root and not on the commit root. Those lookups will fail,
 * returning a -ESTALE error, and making send fail with that error. So make
 * sure a send does not see any orphans we have just removed, and that it will
 * see the same inodes regardless of whether a transaction commit happened
 * before it started (meaning that the commit root will be the same as the
 * current root) or not.
 */
static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
{
	struct btrfs_root *root = sctx->parent_root;

	if (root && root->node != root->commit_root)
		return btrfs_commit_current_transaction(root);

	for (int i = 0; i < sctx->clone_roots_cnt; i++) {
		root = sctx->clone_roots[i].root;
		if (root->node != root->commit_root)
			return btrfs_commit_current_transaction(root);
	}

	return 0;
}
/*
 * Make sure any existing delalloc is flushed for any root used by a send
 * operation so that we do not miss any data and we do not race with writeback
 * finishing and changing a tree while send is using the tree. This could
 * happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and
 * a send operation then uses the subvolume.
 * After flushing delalloc ensure_commit_roots_uptodate() must be called.
 */
static int flush_delalloc_roots(struct send_ctx *sctx)
{
	struct btrfs_root *root = sctx->parent_root;
	int ret;
	int i;

	if (root) {
		ret = btrfs_start_delalloc_snapshot(root, false);
		if (ret)
			return ret;
		btrfs_wait_ordered_extents(root, U64_MAX, NULL);
	}

	for (i = 0; i < sctx->clone_roots_cnt; i++) {
		root = sctx->clone_roots[i].root;
		ret = btrfs_start_delalloc_snapshot(root, false);
		if (ret)
			return ret;
		btrfs_wait_ordered_extents(root, U64_MAX, NULL);
	}

	return 0;
}
static void btrfs_root_dec_send_in_progress(struct btrfs_root *root)
{
	spin_lock(&root->root_item_lock);
	root->send_in_progress--;
	/*
	 * Not much left to do, we don't know why it's unbalanced and
	 * can't blindly reset it to 0.
	 */
	if (root->send_in_progress < 0)
		btrfs_err(root->fs_info,
			  "send_in_progress unbalanced %d root %llu",
			  root->send_in_progress, btrfs_root_id(root));
	spin_unlock(&root->root_item_lock);
}
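
/* Rate limited warning for roots that are busy with deduplication. */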
static void dedupe_in_progress_warn(const struct btrfs_root *root)
{
	btrfs_warn_rl(root->fs_info,
"cannot use root %llu for send while deduplications on it are in progress (%d in progress)",
		      btrfs_root_id(root), root->dedupe_in_progress);
}
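
/*
 * Entry point for the send ioctl: validate the arguments, set up the send
 * context (buffers, caches, clone sources and parent root), run the send and
 * tear everything down again on exit.
 */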
long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg)
{
	int ret = 0;
	struct btrfs_root *send_root = inode->root;
	struct btrfs_fs_info *fs_info = send_root->fs_info;
	struct btrfs_root *clone_root;
	struct send_ctx *sctx = NULL;
	u32 i;
	u64 *clone_sources_tmp = NULL;
	int clone_sources_to_rollback = 0;
	size_t alloc_size;
	int sort_clone_roots = 0;
	struct btrfs_lru_cache_entry *entry;
	struct btrfs_lru_cache_entry *tmp;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/*
	 * The subvolume must remain read-only during send, protect against
	 * making it RW. This also protects against deletion.
	 */
	spin_lock(&send_root->root_item_lock);
	if (btrfs_root_readonly(send_root) && send_root->dedupe_in_progress) {
		dedupe_in_progress_warn(send_root);
		spin_unlock(&send_root->root_item_lock);
		return -EAGAIN;
	}
	send_root->send_in_progress++;
	spin_unlock(&send_root->root_item_lock);

	/*
	 * Userspace tools do the checks and warn the user if it's
	 * not RO.
	 */
	if (!btrfs_root_readonly(send_root)) {
		ret = -EPERM;
		goto out;
	}

	/*
	 * Check that we don't overflow at later allocations, we request
	 * clone_sources_count + 1 items, and compare to unsigned long inside
	 * access_ok. Also set an upper limit for allocation size so this can't
	 * easily exhaust memory. Max number of clone sources is about 200K.
	 */
	if (arg->clone_sources_count > SZ_8M / sizeof(struct clone_root)) {
		ret = -EINVAL;
		goto out;
	}

	if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
		ret = -EINVAL;
		goto out;
	}

	sctx = kzalloc(sizeof(struct send_ctx), GFP_KERNEL);
	if (!sctx) {
		ret = -ENOMEM;
		goto out;
	}

	INIT_LIST_HEAD(&sctx->new_refs);
	INIT_LIST_HEAD(&sctx->deleted_refs);

	btrfs_lru_cache_init(&sctx->name_cache, SEND_MAX_NAME_CACHE_SIZE);
	btrfs_lru_cache_init(&sctx->backref_cache, SEND_MAX_BACKREF_CACHE_SIZE);
	btrfs_lru_cache_init(&sctx->dir_created_cache,
			     SEND_MAX_DIR_CREATED_CACHE_SIZE);
	/*
	 * This cache is periodically trimmed to a fixed size elsewhere, see
	 * cache_dir_utimes() and trim_dir_utimes_cache().
	 */
	btrfs_lru_cache_init(&sctx->dir_utimes_cache, 0);

	sctx->pending_dir_moves = RB_ROOT;
	sctx->waiting_dir_moves = RB_ROOT;
	sctx->orphan_dirs = RB_ROOT;
	sctx->rbtree_new_refs = RB_ROOT;
	sctx->rbtree_deleted_refs = RB_ROOT;

	sctx->flags = arg->flags;

	if (arg->flags & BTRFS_SEND_FLAG_VERSION) {
		if (arg->version > BTRFS_SEND_STREAM_VERSION) {
			ret = -EPROTO;
			goto out;
		}
		/* Zero means "use the highest version" */
		sctx->proto = arg->version ?: BTRFS_SEND_STREAM_VERSION;
	} else {
		sctx->proto = 1;
	}
	if ((arg->flags & BTRFS_SEND_FLAG_COMPRESSED) && sctx->proto < 2) {
		ret = -EINVAL;
		goto out;
	}

	sctx->send_filp = fget(arg->send_fd);
	if (!sctx->send_filp || !(sctx->send_filp->f_mode & FMODE_WRITE)) {
		ret = -EBADF;
		goto out;
	}

	sctx->send_root = send_root;
	/*
	 * Unlikely but possible, if the subvolume is marked for deletion but
	 * is slow to remove the directory entry, send can still be started
	 */
	if (btrfs_root_dead(sctx->send_root)) {
		ret = -EPERM;
		goto out;
	}

	sctx->clone_roots_cnt = arg->clone_sources_count;

	if (sctx->proto >= 2) {
		u32 send_buf_num_pages;

		sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V2;
		sctx->send_buf = vmalloc(sctx->send_max_size);
		if (!sctx->send_buf) {
			ret = -ENOMEM;
			goto out;
		}
		send_buf_num_pages = sctx->send_max_size >> PAGE_SHIFT;
		sctx->send_buf_pages = kcalloc(send_buf_num_pages,
					       sizeof(*sctx->send_buf_pages),
					       GFP_KERNEL);
		if (!sctx->send_buf_pages) {
			ret = -ENOMEM;
			goto out;
		}
		for (i = 0; i < send_buf_num_pages; i++) {
			sctx->send_buf_pages[i] =
				vmalloc_to_page(sctx->send_buf + (i << PAGE_SHIFT));
		}
	} else {
		sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V1;
		sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL);
	}
	if (!sctx->send_buf) {
		ret = -ENOMEM;
		goto out;
	}

	sctx->clone_roots = kvcalloc(arg->clone_sources_count + 1,
				     sizeof(*sctx->clone_roots),
				     GFP_KERNEL);
	if (!sctx->clone_roots) {
		ret = -ENOMEM;
		goto out;
	}

	alloc_size = array_size(sizeof(*arg->clone_sources),
				arg->clone_sources_count);

	if (arg->clone_sources_count) {
		clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL);
		if (!clone_sources_tmp) {
			ret = -ENOMEM;
			goto out;
		}

		ret = copy_from_user(clone_sources_tmp, arg->clone_sources,
				     alloc_size);
		if (ret) {
			ret = -EFAULT;
			goto out;
		}

		for (i = 0; i < arg->clone_sources_count; i++) {
			clone_root = btrfs_get_fs_root(fs_info,
						clone_sources_tmp[i], true);
			if (IS_ERR(clone_root)) {
				ret = PTR_ERR(clone_root);
				goto out;
			}
			spin_lock(&clone_root->root_item_lock);
			if (!btrfs_root_readonly(clone_root) ||
			    btrfs_root_dead(clone_root)) {
				spin_unlock(&clone_root->root_item_lock);
				btrfs_put_root(clone_root);
				ret = -EPERM;
				goto out;
			}
			if (clone_root->dedupe_in_progress) {
				dedupe_in_progress_warn(clone_root);
				spin_unlock(&clone_root->root_item_lock);
				btrfs_put_root(clone_root);
				ret = -EAGAIN;
				goto out;
			}
			clone_root->send_in_progress++;
			spin_unlock(&clone_root->root_item_lock);

			sctx->clone_roots[i].root = clone_root;
			clone_sources_to_rollback = i + 1;
		}
		kvfree(clone_sources_tmp);
		clone_sources_tmp = NULL;
	}

	if (arg->parent_root) {
		sctx->parent_root = btrfs_get_fs_root(fs_info, arg->parent_root,
						      true);
		if (IS_ERR(sctx->parent_root)) {
			ret = PTR_ERR(sctx->parent_root);
			goto out;
		}

		spin_lock(&sctx->parent_root->root_item_lock);
		sctx->parent_root->send_in_progress++;
		if (!btrfs_root_readonly(sctx->parent_root) ||
		    btrfs_root_dead(sctx->parent_root)) {
			spin_unlock(&sctx->parent_root->root_item_lock);
			ret = -EPERM;
			goto out;
		}
		if (sctx->parent_root->dedupe_in_progress) {
			dedupe_in_progress_warn(sctx->parent_root);
			spin_unlock(&sctx->parent_root->root_item_lock);
			ret = -EAGAIN;
			goto out;
		}
		spin_unlock(&sctx->parent_root->root_item_lock);
	}

	/*
	 * Clones from send_root are allowed, but only if the clone source
	 * is behind the current send position. This is checked while searching
	 * for possible clone sources.
	 */
	sctx->clone_roots[sctx->clone_roots_cnt++].root =
		btrfs_grab_root(sctx->send_root);

	/* We do a bsearch later */
	sort(sctx->clone_roots, sctx->clone_roots_cnt,
	     sizeof(*sctx->clone_roots), __clone_root_cmp_sort,
	     NULL);
	sort_clone_roots = 1;

	ret = flush_delalloc_roots(sctx);
	if (ret)
		goto out;

	ret = ensure_commit_roots_uptodate(sctx);
	if (ret)
		goto out;

	ret = send_subvol(sctx);
	if (ret < 0)
		goto out;

	btrfs_lru_cache_for_each_entry_safe(&sctx->dir_utimes_cache, entry, tmp) {
		ret = send_utimes(sctx, entry->key, entry->gen);
		if (ret < 0)
			goto out;
		btrfs_lru_cache_remove(&sctx->dir_utimes_cache, entry);
	}

	if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) {
		ret = begin_cmd(sctx, BTRFS_SEND_C_END);
		if (ret < 0)
			goto out;
		ret = send_cmd(sctx);
		if (ret < 0)
			goto out;
	}

out:
	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves));
	while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) {
		struct rb_node *n;
		struct pending_dir_move *pm;

		n = rb_first(&sctx->pending_dir_moves);
		pm = rb_entry(n, struct pending_dir_move, node);
		while (!list_empty(&pm->list)) {
			struct pending_dir_move *pm2;

			pm2 = list_first_entry(&pm->list,
					       struct pending_dir_move, list);
			free_pending_move(sctx, pm2);
		}
		free_pending_move(sctx, pm);
	}

	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves));
	while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) {
		struct rb_node *n;
		struct waiting_dir_move *dm;

		n = rb_first(&sctx->waiting_dir_moves);
		dm = rb_entry(n, struct waiting_dir_move, node);
		rb_erase(&dm->node, &sctx->waiting_dir_moves);
		kfree(dm);
	}

	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
	while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
		struct rb_node *n;
		struct orphan_dir_info *odi;

		n = rb_first(&sctx->orphan_dirs);
		odi = rb_entry(n, struct orphan_dir_info, node);
		free_orphan_dir_info(sctx, odi);
	}

	if (sort_clone_roots) {
		for (i = 0; i < sctx->clone_roots_cnt; i++) {
			btrfs_root_dec_send_in_progress(
					sctx->clone_roots[i].root);
			btrfs_put_root(sctx->clone_roots[i].root);
		}
	} else {
		for (i = 0; sctx && i < clone_sources_to_rollback; i++) {
			btrfs_root_dec_send_in_progress(
					sctx->clone_roots[i].root);
			btrfs_put_root(sctx->clone_roots[i].root);
		}

		btrfs_root_dec_send_in_progress(send_root);
	}
	if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) {
		btrfs_root_dec_send_in_progress(sctx->parent_root);
		btrfs_put_root(sctx->parent_root);
	}

	kvfree(clone_sources_tmp);

	if (sctx) {
		if (sctx->send_filp)
			fput(sctx->send_filp);

		kvfree(sctx->clone_roots);
		kfree(sctx->send_buf_pages);
		kvfree(sctx->send_buf);
		kvfree(sctx->verity_descriptor);

		close_current_inode(sctx);

		btrfs_lru_cache_clear(&sctx->name_cache);
		btrfs_lru_cache_clear(&sctx->backref_cache);
		btrfs_lru_cache_clear(&sctx->dir_created_cache);
		btrfs_lru_cache_clear(&sctx->dir_utimes_cache);