1 // SPDX-License-Identifier: GPL-2.0
3 #include <linux/init.h>
5 #include <linux/slab.h>
6 #include <linux/rwsem.h>
7 #include <linux/xattr.h>
8 #include <linux/security.h>
9 #include <linux/posix_acl_xattr.h>
10 #include <linux/iversion.h>
11 #include <linux/fsverity.h>
12 #include <linux/sched/mm.h>
15 #include "btrfs_inode.h"
16 #include "transaction.h"
19 #include "accessors.h"
25 * Implementation of the interface defined in struct fsverity_operations.
27 * The main question is how and where to store the verity descriptor and the
28 * Merkle tree. We store both in dedicated btree items in the filesystem tree,
29 * together with the rest of the inode metadata. This means we'll need to do
30 * extra work to encrypt them once encryption is supported in btrfs, but btrfs
31 * has a lot of careful code around i_size and it seems better to make a new key
32 * type than try and adjust all of our expectations for i_size.
34 * Note that this differs from the implementation in ext4 and f2fs, where
35 * this data is stored as if it were in the file, but past EOF. However, btrfs
36 * does not have a widespread mechanism for caching opaque metadata pages, so we
37 * do pretend that the Merkle tree pages themselves are past EOF for the
38 * purposes of caching them (as opposed to creating a virtual inode).
40 * fs verity items are stored under two different key types on disk.
41 * The descriptor items:
42 * [ inode objectid, BTRFS_VERITY_DESC_ITEM_KEY, offset ]
44 * At offset 0, we store a btrfs_verity_descriptor_item which tracks the
45 * size of the descriptor item and some extra data for encryption.
46 * Starting at offset 1, these hold the generic fs verity descriptor.
47 * The latter are opaque to btrfs, we just read and write them as a blob for
48 * the higher level verity code. The most common descriptor size is 256 bytes.
50 * The merkle tree items:
51 * [ inode objectid, BTRFS_VERITY_MERKLE_ITEM_KEY, offset ]
53 * These also start at offset 0, and correspond to the merkle tree bytes.
54 * So when fsverity asks for page 0 of the merkle tree, we pull up one page
55 * starting at offset 0 for this key type. These are also opaque to btrfs,
56 * we're blindly storing whatever fsverity sends down.
58 * Another important consideration is the fact that the Merkle tree data scales
59 * linearly with the size of the file (with 4K pages/blocks and SHA-256, it's
60 * ~1/127th the size) so for large files, writing the tree can be a lengthy
61 * operation. For that reason, we guard the whole enable verity operation
62 * (between begin_enable_verity and end_enable_verity) with an orphan item.
63 * Again, because the data can be pretty large, it's quite possible that we
64 * could run out of space writing it, so we try our best to handle errors by
65 * stopping and rolling back rather than aborting the victim transaction.
68 #define MERKLE_START_ALIGN 65536
71 * Compute the logical file offset where we cache the Merkle tree.
73 * @inode: inode of the verity file
75 * For the purposes of caching the Merkle tree pages, as required by
76 * fs-verity, it is convenient to do size computations in terms of a file
77 * offset, rather than in terms of page indices.
79 * Use 64K to be sure it's past the last page in the file, even with 64K pages.
80 * That rounding operation itself can overflow loff_t, so we do it in u64 and check the result against s_maxbytes.
83 * Returns the file offset on success, negative error code on failure.
85 static loff_t
merkle_file_pos(const struct inode
*inode
)
87 u64 sz
= inode
->i_size
;
88 u64 rounded
= round_up(sz
, MERKLE_START_ALIGN
);
90 if (rounded
> inode
->i_sb
->s_maxbytes
)
97 * Drop all the items for this inode with this key_type.
99 * @inode: inode to drop items for
100 * @key_type: type of items to drop (BTRFS_VERITY_DESC_ITEM_KEY or
101 * BTRFS_VERITY_MERKLE_ITEM_KEY)
103 * Before doing a verity enable we cleanup any existing verity items.
104 * This is also used to clean up if a verity enable failed half way through.
106 * Returns number of dropped items on success, negative error code on failure.
108 static int drop_verity_items(struct btrfs_inode
*inode
, u8 key_type
)
110 struct btrfs_trans_handle
*trans
;
111 struct btrfs_root
*root
= inode
->root
;
112 struct btrfs_path
*path
;
113 struct btrfs_key key
;
117 path
= btrfs_alloc_path();
122 /* 1 for the item being dropped */
123 trans
= btrfs_start_transaction(root
, 1);
125 ret
= PTR_ERR(trans
);
130 * Walk backwards through all the items until we find one that
131 * isn't from our key type or objectid
133 key
.objectid
= btrfs_ino(inode
);
135 key
.offset
= (u64
)-1;
137 ret
= btrfs_search_slot(trans
, root
, &key
, path
, -1, 1);
140 /* No more keys of this type, we're done */
141 if (path
->slots
[0] == 0)
144 } else if (ret
< 0) {
145 btrfs_end_transaction(trans
);
149 btrfs_item_key_to_cpu(path
->nodes
[0], &key
, path
->slots
[0]);
151 /* No more keys of this type, we're done */
152 if (key
.objectid
!= btrfs_ino(inode
) || key
.type
!= key_type
)
156 * This shouldn't be a performance sensitive function because
157 * it's not used as part of truncate. If it ever becomes
158 * perf sensitive, change this to walk forward and bulk delete items.
161 ret
= btrfs_del_items(trans
, root
, path
, path
->slots
[0], 1);
163 btrfs_end_transaction(trans
);
167 btrfs_release_path(path
);
168 btrfs_end_transaction(trans
);
171 btrfs_end_transaction(trans
);
173 btrfs_free_path(path
);
178 * Drop all verity items
180 * @inode: inode to drop verity items for
182 * In most contexts where we are dropping verity items, we want to do it for all
183 * the types of verity items, not a particular one.
185 * Returns: 0 on success, negative error code on failure.
187 int btrfs_drop_verity_items(struct btrfs_inode
*inode
)
191 ret
= drop_verity_items(inode
, BTRFS_VERITY_DESC_ITEM_KEY
);
194 ret
= drop_verity_items(inode
, BTRFS_VERITY_MERKLE_ITEM_KEY
);
202 * Insert and write inode items with a given key type and offset.
204 * @inode: inode to insert for
205 * @key_type: key type to insert
206 * @offset: item offset to insert at
207 * @src: source data to write
208 * @len: length of source data to write
210 * Write len bytes from src into items of up to 2K length.
211 * The inserted items will have key (ino, key_type, offset + off) where off is
212 * consecutively increasing from 0 up to the last item ending at offset + len.
214 * Returns 0 on success and a negative error code on failure.
216 static int write_key_bytes(struct btrfs_inode
*inode
, u8 key_type
, u64 offset
,
217 const char *src
, u64 len
)
219 struct btrfs_trans_handle
*trans
;
220 struct btrfs_path
*path
;
221 struct btrfs_root
*root
= inode
->root
;
222 struct extent_buffer
*leaf
;
223 struct btrfs_key key
;
224 unsigned long copy_bytes
;
225 unsigned long src_offset
= 0;
229 path
= btrfs_alloc_path();
234 /* 1 for the new item being inserted */
235 trans
= btrfs_start_transaction(root
, 1);
237 ret
= PTR_ERR(trans
);
241 key
.objectid
= btrfs_ino(inode
);
246 * Insert 2K at a time mostly to be friendly for smaller leaf sizes.
249 copy_bytes
= min_t(u64
, len
, 2048);
251 ret
= btrfs_insert_empty_item(trans
, root
, path
, &key
, copy_bytes
);
253 btrfs_end_transaction(trans
);
257 leaf
= path
->nodes
[0];
259 data
= btrfs_item_ptr(leaf
, path
->slots
[0], void);
260 write_extent_buffer(leaf
, src
+ src_offset
,
261 (unsigned long)data
, copy_bytes
);
262 offset
+= copy_bytes
;
263 src_offset
+= copy_bytes
;
266 btrfs_release_path(path
);
267 btrfs_end_transaction(trans
);
270 btrfs_free_path(path
);
275 * Read inode items of the given key type and offset from the btree.
277 * @inode: inode to read items of
278 * @key_type: key type to read
279 * @offset: item offset to read from
280 * @dest: Buffer to read into. This parameter has slightly tricky
281 * semantics. If it is NULL, the function will not do any copying
282 * and will just return the size of all the items up to len bytes.
283 * If dest_folio is passed, then the function will kmap_local the
284 * folio and ignore dest, but it must still be non-NULL to avoid the
285 * counting-only behavior.
286 * @len: length in bytes to read
287 * @dest_folio: copy into this folio instead of the dest buffer
289 * Helper function to read items from the btree. This returns the number of
290 * bytes read or < 0 for errors. We can return short reads if the items don't
291 * exist on disk or aren't big enough to fill the desired length. Supports
292 * reading into a provided buffer (dest) or into the page cache
294 * Returns number of bytes read or a negative error code on failure.
296 static int read_key_bytes(struct btrfs_inode
*inode
, u8 key_type
, u64 offset
,
297 char *dest
, u64 len
, struct folio
*dest_folio
)
299 struct btrfs_path
*path
;
300 struct btrfs_root
*root
= inode
->root
;
301 struct extent_buffer
*leaf
;
302 struct btrfs_key key
;
307 unsigned long copy_bytes
;
308 unsigned long dest_offset
= 0;
313 path
= btrfs_alloc_path();
318 path
->reada
= READA_FORWARD
;
320 key
.objectid
= btrfs_ino(inode
);
324 ret
= btrfs_search_slot(NULL
, root
, &key
, path
, 0, 0);
327 } else if (ret
> 0) {
329 if (path
->slots
[0] == 0)
335 leaf
= path
->nodes
[0];
336 btrfs_item_key_to_cpu(leaf
, &key
, path
->slots
[0]);
338 if (key
.objectid
!= btrfs_ino(inode
) || key
.type
!= key_type
)
341 item_end
= btrfs_item_size(leaf
, path
->slots
[0]) + key
.offset
;
345 * Once we've copied something, we want all of the items to be contiguous.
348 if (key
.offset
!= offset
)
352 * Our initial offset might be in the middle of an
353 * item. Make sure it all makes sense.
355 if (key
.offset
> offset
)
357 if (item_end
<= offset
)
361 /* dest = NULL to just sum all the item lengths */
365 copy_end
= min(offset
+ len
, item_end
);
367 /* Number of bytes in this item we want to copy */
368 copy_bytes
= copy_end
- offset
;
370 /* Offset from the start of item for copying */
371 copy_offset
= offset
- key
.offset
;
375 kaddr
= kmap_local_folio(dest_folio
, 0);
377 data
= btrfs_item_ptr(leaf
, path
->slots
[0], void);
378 read_extent_buffer(leaf
, kaddr
+ dest_offset
,
379 (unsigned long)data
+ copy_offset
,
386 offset
+= copy_bytes
;
387 dest_offset
+= copy_bytes
;
389 copied
+= copy_bytes
;
392 if (path
->slots
[0] >= btrfs_header_nritems(path
->nodes
[0])) {
394 * We've reached the last slot in this leaf and we need
395 * to go to the next leaf.
397 ret
= btrfs_next_leaf(root
, path
);
400 } else if (ret
> 0) {
407 btrfs_free_path(path
);
414 * Delete an fsverity orphan
416 * @trans: transaction to do the delete in
417 * @inode: inode to orphan
419 * Capture verity orphan specific logic that is repeated in the couple places
420 * we delete verity orphans. Specifically, handling ENOENT and ignoring inodes with no links.
423 * Returns zero on success or a negative error code on failure.
425 static int del_orphan(struct btrfs_trans_handle
*trans
, struct btrfs_inode
*inode
)
427 struct btrfs_root
*root
= inode
->root
;
431 * If the inode has no links, it is either already unlinked, or was
432 * created with O_TMPFILE. In either case, it should have an orphan from
433 * that other operation. Rather than reference count the orphans, we
434 * simply ignore them here, because we only invoke the verity path in
435 * the orphan logic when i_nlink is 1.
437 if (!inode
->vfs_inode
.i_nlink
)
440 ret
= btrfs_del_orphan_item(trans
, root
, btrfs_ino(inode
));
447 * Rollback in-progress verity if we encounter an error.
449 * @inode: inode verity had an error for
451 * We try to handle recoverable errors while enabling verity by rolling it back
452 * and just failing the operation, rather than having an fs level error no
453 * matter what. However, any error in rollback is unrecoverable.
455 * Returns 0 on success, negative error code on failure.
457 static int rollback_verity(struct btrfs_inode
*inode
)
459 struct btrfs_trans_handle
*trans
= NULL
;
460 struct btrfs_root
*root
= inode
->root
;
463 btrfs_assert_inode_locked(inode
);
464 truncate_inode_pages(inode
->vfs_inode
.i_mapping
, inode
->vfs_inode
.i_size
);
465 clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS
, &inode
->runtime_flags
);
466 ret
= btrfs_drop_verity_items(inode
);
468 btrfs_handle_fs_error(root
->fs_info
, ret
,
469 "failed to drop verity items in rollback %llu",
470 (u64
)inode
->vfs_inode
.i_ino
);
475 * 1 for updating the inode flag
476 * 1 for deleting the orphan
478 trans
= btrfs_start_transaction(root
, 2);
480 ret
= PTR_ERR(trans
);
482 btrfs_handle_fs_error(root
->fs_info
, ret
,
483 "failed to start transaction in verity rollback %llu",
484 (u64
)inode
->vfs_inode
.i_ino
);
487 inode
->ro_flags
&= ~BTRFS_INODE_RO_VERITY
;
488 btrfs_sync_inode_flags_to_i_flags(&inode
->vfs_inode
);
489 ret
= btrfs_update_inode(trans
, inode
);
491 btrfs_abort_transaction(trans
, ret
);
494 ret
= del_orphan(trans
, inode
);
496 btrfs_abort_transaction(trans
, ret
);
501 btrfs_end_transaction(trans
);
506 * Finalize making the file a valid verity file
508 * @inode: inode to be marked as verity
509 * @desc: contents of the verity descriptor to write (not NULL)
510 * @desc_size: size of the verity descriptor
512 * Do the actual work of finalizing verity after successfully writing the Merkle tree:
515 * - write out the descriptor items
516 * - mark the inode with the verity flag
517 * - delete the orphan item
518 * - mark the ro compat bit
519 * - clear the in progress bit
521 * Returns 0 on success, negative error code on failure.
523 static int finish_verity(struct btrfs_inode
*inode
, const void *desc
,
526 struct btrfs_trans_handle
*trans
= NULL
;
527 struct btrfs_root
*root
= inode
->root
;
528 struct btrfs_verity_descriptor_item item
;
531 /* Write out the descriptor item */
532 memset(&item
, 0, sizeof(item
));
533 btrfs_set_stack_verity_descriptor_size(&item
, desc_size
);
534 ret
= write_key_bytes(inode
, BTRFS_VERITY_DESC_ITEM_KEY
, 0,
535 (const char *)&item
, sizeof(item
));
539 /* Write out the descriptor itself */
540 ret
= write_key_bytes(inode
, BTRFS_VERITY_DESC_ITEM_KEY
, 1,
546 * 1 for updating the inode flag
547 * 1 for deleting the orphan
549 trans
= btrfs_start_transaction(root
, 2);
551 ret
= PTR_ERR(trans
);
554 inode
->ro_flags
|= BTRFS_INODE_RO_VERITY
;
555 btrfs_sync_inode_flags_to_i_flags(&inode
->vfs_inode
);
556 ret
= btrfs_update_inode(trans
, inode
);
559 ret
= del_orphan(trans
, inode
);
562 clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS
, &inode
->runtime_flags
);
563 btrfs_set_fs_compat_ro(root
->fs_info
, VERITY
);
565 btrfs_end_transaction(trans
);
572 * fsverity op that begins enabling verity.
574 * @filp: file to enable verity on
576 * Begin enabling fsverity for the file. We drop any existing verity items, add
577 * an orphan and set the in progress bit.
579 * Returns 0 on success, negative error code on failure.
581 static int btrfs_begin_enable_verity(struct file
*filp
)
583 struct btrfs_inode
*inode
= BTRFS_I(file_inode(filp
));
584 struct btrfs_root
*root
= inode
->root
;
585 struct btrfs_trans_handle
*trans
;
588 btrfs_assert_inode_locked(inode
);
590 if (test_bit(BTRFS_INODE_VERITY_IN_PROGRESS
, &inode
->runtime_flags
))
594 * This should almost never do anything, but theoretically, it's
595 * possible that we failed to enable verity on a file, then were
596 * interrupted or failed while rolling back, failed to cleanup the
597 * orphan, and finally attempt to enable verity again.
599 ret
= btrfs_drop_verity_items(inode
);
603 /* 1 for the orphan item */
604 trans
= btrfs_start_transaction(root
, 1);
606 return PTR_ERR(trans
);
608 ret
= btrfs_orphan_add(trans
, inode
);
610 set_bit(BTRFS_INODE_VERITY_IN_PROGRESS
, &inode
->runtime_flags
);
611 btrfs_end_transaction(trans
);
617 * fsverity op that ends enabling verity.
619 * @filp: file we are finishing enabling verity on
620 * @desc: verity descriptor to write out (NULL in error conditions)
621 * @desc_size: size of the verity descriptor (variable with signatures)
622 * @merkle_tree_size: size of the merkle tree in bytes
624 * If desc is null, then VFS is signaling an error occurred during verity
625 * enable, and we should try to rollback. Otherwise, attempt to finish verity.
627 * Returns 0 on success, negative error code on error.
629 static int btrfs_end_enable_verity(struct file
*filp
, const void *desc
,
630 size_t desc_size
, u64 merkle_tree_size
)
632 struct btrfs_inode
*inode
= BTRFS_I(file_inode(filp
));
636 btrfs_assert_inode_locked(inode
);
641 ret
= finish_verity(inode
, desc
, desc_size
);
647 rollback_ret
= rollback_verity(inode
);
649 btrfs_err(inode
->root
->fs_info
,
650 "failed to rollback verity items: %d", rollback_ret
);
655 * fsverity op that gets the struct fsverity_descriptor.
657 * @inode: inode to get the descriptor of
658 * @buf: output buffer for the descriptor contents
659 * @buf_size: size of the output buffer. 0 to query the size
661 * fsverity does a two pass setup for reading the descriptor, in the first pass
662 * it calls with buf_size = 0 to query the size of the descriptor, and then in
663 * the second pass it actually reads the descriptor off disk.
665 * Returns the size on success or a negative error code on failure.
667 int btrfs_get_verity_descriptor(struct inode
*inode
, void *buf
, size_t buf_size
)
671 struct btrfs_verity_descriptor_item item
;
673 memset(&item
, 0, sizeof(item
));
674 ret
= read_key_bytes(BTRFS_I(inode
), BTRFS_VERITY_DESC_ITEM_KEY
, 0,
675 (char *)&item
, sizeof(item
), NULL
);
679 if (item
.reserved
[0] != 0 || item
.reserved
[1] != 0)
682 true_size
= btrfs_stack_verity_descriptor_size(&item
);
683 if (true_size
> INT_MAX
)
688 if (buf_size
< true_size
)
691 ret
= read_key_bytes(BTRFS_I(inode
), BTRFS_VERITY_DESC_ITEM_KEY
, 1,
692 buf
, buf_size
, NULL
);
695 if (ret
!= true_size
)
702 * fsverity op that reads and caches a merkle tree page.
704 * @inode: inode to read a merkle tree page for
705 * @index: page index relative to the start of the merkle tree
706 * @num_ra_pages: number of pages to readahead. Optional, we ignore it
708 * The Merkle tree is stored in the filesystem btree, but its pages are cached
709 * with a logical position past EOF in the inode's mapping.
711 * Returns the page we read, or an ERR_PTR on error.
713 static struct page
*btrfs_read_merkle_tree_page(struct inode
*inode
,
715 unsigned long num_ra_pages
)
718 u64 off
= (u64
)index
<< PAGE_SHIFT
;
719 loff_t merkle_pos
= merkle_file_pos(inode
);
723 return ERR_PTR(merkle_pos
);
724 if (merkle_pos
> inode
->i_sb
->s_maxbytes
- off
- PAGE_SIZE
)
725 return ERR_PTR(-EFBIG
);
726 index
+= merkle_pos
>> PAGE_SHIFT
;
728 folio
= __filemap_get_folio(inode
->i_mapping
, index
, FGP_ACCESSED
, 0);
729 if (!IS_ERR(folio
)) {
730 if (folio_test_uptodate(folio
))
734 /* If it's not uptodate after we have the lock, we got a read error. */
735 if (!folio_test_uptodate(folio
)) {
738 return ERR_PTR(-EIO
);
744 folio
= filemap_alloc_folio(mapping_gfp_constraint(inode
->i_mapping
, ~__GFP_FS
),
747 return ERR_PTR(-ENOMEM
);
749 ret
= filemap_add_folio(inode
->i_mapping
, folio
, index
, GFP_NOFS
);
752 /* Did someone else insert a folio here? */
759 * Merkle item keys are indexed from byte 0 in the merkle tree.
760 * They have the form:
762 * [ inode objectid, BTRFS_VERITY_MERKLE_ITEM_KEY, offset in bytes ]
764 ret
= read_key_bytes(BTRFS_I(inode
), BTRFS_VERITY_MERKLE_ITEM_KEY
, off
,
765 folio_address(folio
), PAGE_SIZE
, folio
);
771 folio_zero_segment(folio
, ret
, PAGE_SIZE
);
773 folio_mark_uptodate(folio
);
777 return folio_file_page(folio
, index
);
781 * fsverity op that writes a Merkle tree block into the btree.
783 * @inode: inode to write a Merkle tree block for
784 * @buf: Merkle tree block to write
785 * @pos: the position of the block in the Merkle tree (in bytes)
786 * @size: the Merkle tree block size (in bytes)
788 * Returns 0 on success or negative error code on failure
790 static int btrfs_write_merkle_tree_block(struct inode
*inode
, const void *buf
,
791 u64 pos
, unsigned int size
)
793 loff_t merkle_pos
= merkle_file_pos(inode
);
797 if (merkle_pos
> inode
->i_sb
->s_maxbytes
- pos
- size
)
800 return write_key_bytes(BTRFS_I(inode
), BTRFS_VERITY_MERKLE_ITEM_KEY
,
804 const struct fsverity_operations btrfs_verityops
= {
805 .begin_enable_verity
= btrfs_begin_enable_verity
,
806 .end_enable_verity
= btrfs_end_enable_verity
,
807 .get_verity_descriptor
= btrfs_get_verity_descriptor
,
808 .read_merkle_tree_page
= btrfs_read_merkle_tree_page
,
809 .write_merkle_tree_block
= btrfs_write_merkle_tree_block
,