2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public
4 * License v2 as published by the Free Software Foundation.
6 * This program is distributed in the hope that it will be useful,
7 * but WITHOUT ANY WARRANTY; without even the implied warranty of
8 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
9 * General Public License for more details.
11 * You should have received a copy of the GNU General Public
12 * License along with this program; if not, write to the
13 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
14 * Boston, MA 021110-1307, USA.
21 #include "transaction.h"
24 #include "check/mode-common.h"
27 * Check if the inode referenced by the given data reference uses the extent
28 * at disk_bytenr as a non-prealloc extent.
30 * Returns 1 if true, 0 if false and < 0 on error.
32 static int check_prealloc_data_ref(struct btrfs_fs_info
*fs_info
,
34 struct btrfs_extent_data_ref
*dref
,
35 struct extent_buffer
*eb
)
37 u64 rootid
= btrfs_extent_data_ref_root(eb
, dref
);
38 u64 objectid
= btrfs_extent_data_ref_objectid(eb
, dref
);
39 u64 offset
= btrfs_extent_data_ref_offset(eb
, dref
);
40 struct btrfs_root
*root
;
42 struct btrfs_path path
;
45 btrfs_init_path(&path
);
46 key
.objectid
= rootid
;
47 key
.type
= BTRFS_ROOT_ITEM_KEY
;
49 root
= btrfs_read_fs_root(fs_info
, &key
);
55 key
.objectid
= objectid
;
56 key
.type
= BTRFS_EXTENT_DATA_KEY
;
58 ret
= btrfs_search_slot(NULL
, root
, &key
, &path
, 0, 0);
61 "Missing file extent item for inode %llu, root %llu, offset %llu",
62 objectid
, rootid
, offset
);
69 struct btrfs_file_extent_item
*fi
;
72 if (path
.slots
[0] >= btrfs_header_nritems(path
.nodes
[0])) {
73 ret
= btrfs_next_leaf(root
, &path
);
80 btrfs_item_key_to_cpu(path
.nodes
[0], &key
, path
.slots
[0]);
81 if (key
.objectid
!= objectid
||
82 key
.type
!= BTRFS_EXTENT_DATA_KEY
)
85 fi
= btrfs_item_ptr(path
.nodes
[0], path
.slots
[0],
86 struct btrfs_file_extent_item
);
87 extent_type
= btrfs_file_extent_type(path
.nodes
[0], fi
);
88 if (extent_type
!= BTRFS_FILE_EXTENT_REG
&&
89 extent_type
!= BTRFS_FILE_EXTENT_PREALLOC
)
92 if (btrfs_file_extent_disk_bytenr(path
.nodes
[0], fi
) !=
96 if (extent_type
== BTRFS_FILE_EXTENT_REG
) {
105 btrfs_release_path(&path
);
110 * Check if a shared data reference points to a node that has a file extent item
111 * pointing to the extent at @disk_bytenr that is not of type prealloc.
113 * Returns 1 if true, 0 if false and < 0 on error.
115 static int check_prealloc_shared_data_ref(struct btrfs_fs_info
*fs_info
,
116 u64 parent
, u64 disk_bytenr
)
118 struct extent_buffer
*eb
;
123 eb
= read_tree_block(fs_info
, parent
, 0);
124 if (!extent_buffer_uptodate(eb
)) {
129 nr
= btrfs_header_nritems(eb
);
130 for (i
= 0; i
< nr
; i
++) {
131 struct btrfs_key key
;
132 struct btrfs_file_extent_item
*fi
;
135 btrfs_item_key_to_cpu(eb
, &key
, i
);
136 if (key
.type
!= BTRFS_EXTENT_DATA_KEY
)
139 fi
= btrfs_item_ptr(eb
, i
, struct btrfs_file_extent_item
);
140 extent_type
= btrfs_file_extent_type(eb
, fi
);
141 if (extent_type
!= BTRFS_FILE_EXTENT_REG
&&
142 extent_type
!= BTRFS_FILE_EXTENT_PREALLOC
)
145 if (btrfs_file_extent_disk_bytenr(eb
, fi
) == disk_bytenr
&&
146 extent_type
== BTRFS_FILE_EXTENT_REG
) {
152 free_extent_buffer(eb
);
157 * Check if a prealloc extent is shared by multiple inodes and if any inode has
158 * already written to that extent. This is to avoid emitting invalid warnings
159 * about odd csum items (a inode has an extent entirely marked as prealloc
160 * but another inode shares it and has already written to it).
162 * Note: right now it does not check if the number of checksum items in the
163 * csum tree matches the number of bytes written into the ex-prealloc extent.
164 * It's complex to deal with that because the prealloc extent might have been
165 * partially written through multiple inodes and we would have to keep track of
166 * ranges, merging them and notice ranges that fully or partially overlap, to
167 * avoid false reports of csum items missing for areas of the prealloc extent
168 * that were not written to - for example if we have a 1M prealloc extent, we
169 * can have only the first half of it written, but 2 different inodes refer to
170 * the its first half (through reflinks/cloning), so keeping a counter of bytes
171 * covered by checksum items is not enough, as the correct value would be 512K
172 * and not 1M (whence the need to track ranges).
174 * Returns 0 if the prealloc extent was not written yet by any inode, 1 if
175 * at least one other inode has written to it, and < 0 on error.
177 int check_prealloc_extent_written(struct btrfs_fs_info
*fs_info
,
178 u64 disk_bytenr
, u64 num_bytes
)
180 struct btrfs_path path
;
181 struct btrfs_key key
;
183 struct btrfs_extent_item
*ei
;
188 key
.objectid
= disk_bytenr
;
189 key
.type
= BTRFS_EXTENT_ITEM_KEY
;
190 key
.offset
= num_bytes
;
192 btrfs_init_path(&path
);
193 ret
= btrfs_search_slot(NULL
, fs_info
->extent_root
, &key
, &path
, 0, 0);
196 "Missing extent item in extent tree for disk_bytenr %llu, num_bytes %llu\n",
197 disk_bytenr
, num_bytes
);
203 /* First check all inline refs. */
204 ei
= btrfs_item_ptr(path
.nodes
[0], path
.slots
[0],
205 struct btrfs_extent_item
);
206 item_size
= btrfs_item_size_nr(path
.nodes
[0], path
.slots
[0]);
207 ptr
= (unsigned long)(ei
+ 1);
208 end
= (unsigned long)ei
+ item_size
;
210 struct btrfs_extent_inline_ref
*iref
;
213 iref
= (struct btrfs_extent_inline_ref
*)ptr
;
214 type
= btrfs_extent_inline_ref_type(path
.nodes
[0], iref
);
215 ASSERT(type
== BTRFS_EXTENT_DATA_REF_KEY
||
216 type
== BTRFS_SHARED_DATA_REF_KEY
);
218 if (type
== BTRFS_EXTENT_DATA_REF_KEY
) {
219 struct btrfs_extent_data_ref
*dref
;
221 dref
= (struct btrfs_extent_data_ref
*)(&iref
->offset
);
222 ret
= check_prealloc_data_ref(fs_info
, disk_bytenr
,
223 dref
, path
.nodes
[0]);
226 } else if (type
== BTRFS_SHARED_DATA_REF_KEY
) {
229 parent
= btrfs_extent_inline_ref_offset(path
.nodes
[0],
231 ret
= check_prealloc_shared_data_ref(fs_info
,
238 ptr
+= btrfs_extent_inline_ref_size(type
);
241 /* Now check if there are any non-inlined refs. */
244 if (path
.slots
[0] >= btrfs_header_nritems(path
.nodes
[0])) {
245 ret
= btrfs_next_leaf(fs_info
->extent_root
, &path
);
254 btrfs_item_key_to_cpu(path
.nodes
[0], &key
, path
.slots
[0]);
255 if (key
.objectid
!= disk_bytenr
)
258 if (key
.type
== BTRFS_EXTENT_DATA_REF_KEY
) {
259 struct btrfs_extent_data_ref
*dref
;
261 dref
= btrfs_item_ptr(path
.nodes
[0], path
.slots
[0],
262 struct btrfs_extent_data_ref
);
263 ret
= check_prealloc_data_ref(fs_info
, disk_bytenr
,
264 dref
, path
.nodes
[0]);
267 } else if (key
.type
== BTRFS_SHARED_DATA_REF_KEY
) {
268 ret
= check_prealloc_shared_data_ref(fs_info
,
278 btrfs_release_path(&path
);
283 * Search in csum tree to find how many bytes of range [@start, @start + @len)
284 * has the corresponding csum item.
286 * @start: range start
288 * @found: return value of found csum bytes
291 int count_csum_range(struct btrfs_fs_info
*fs_info
, u64 start
,
294 struct btrfs_key key
;
295 struct btrfs_path path
;
296 struct extent_buffer
*leaf
;
301 u16 csum_size
= btrfs_super_csum_size(fs_info
->super_copy
);
303 btrfs_init_path(&path
);
305 key
.objectid
= BTRFS_EXTENT_CSUM_OBJECTID
;
307 key
.type
= BTRFS_EXTENT_CSUM_KEY
;
309 ret
= btrfs_search_slot(NULL
, fs_info
->csum_root
,
313 if (ret
> 0 && path
.slots
[0] > 0) {
314 leaf
= path
.nodes
[0];
315 btrfs_item_key_to_cpu(leaf
, &key
, path
.slots
[0] - 1);
316 if (key
.objectid
== BTRFS_EXTENT_CSUM_OBJECTID
&&
317 key
.type
== BTRFS_EXTENT_CSUM_KEY
)
322 leaf
= path
.nodes
[0];
323 if (path
.slots
[0] >= btrfs_header_nritems(leaf
)) {
324 ret
= btrfs_next_leaf(fs_info
->csum_root
, &path
);
329 leaf
= path
.nodes
[0];
332 btrfs_item_key_to_cpu(leaf
, &key
, path
.slots
[0]);
333 if (key
.objectid
!= BTRFS_EXTENT_CSUM_OBJECTID
||
334 key
.type
!= BTRFS_EXTENT_CSUM_KEY
)
337 btrfs_item_key_to_cpu(leaf
, &key
, path
.slots
[0]);
338 if (key
.offset
>= start
+ len
)
341 if (key
.offset
> start
)
344 size
= btrfs_item_size_nr(leaf
, path
.slots
[0]);
345 csum_end
= key
.offset
+ (size
/ csum_size
) *
347 if (csum_end
> start
) {
348 size
= min(csum_end
- start
, len
);
357 btrfs_release_path(&path
);
364 * Wrapper to insert one inode item into given @root
365 * Timestamp will be set to current time.
367 * @root: the root to insert inode item into
370 * @nbytes: nbytes (real used size, without hole)
371 * @nlink: number of links
372 * @mode: file mode, including S_IF* bits
374 int insert_inode_item(struct btrfs_trans_handle
*trans
,
375 struct btrfs_root
*root
, u64 ino
, u64 size
,
376 u64 nbytes
, u64 nlink
, u32 mode
)
378 struct btrfs_inode_item ii
;
379 time_t now
= time(NULL
);
382 btrfs_set_stack_inode_size(&ii
, size
);
383 btrfs_set_stack_inode_nbytes(&ii
, nbytes
);
384 btrfs_set_stack_inode_nlink(&ii
, nlink
);
385 btrfs_set_stack_inode_mode(&ii
, mode
);
386 btrfs_set_stack_inode_generation(&ii
, trans
->transid
);
387 btrfs_set_stack_timespec_nsec(&ii
.atime
, 0);
388 btrfs_set_stack_timespec_sec(&ii
.ctime
, now
);
389 btrfs_set_stack_timespec_nsec(&ii
.ctime
, 0);
390 btrfs_set_stack_timespec_sec(&ii
.mtime
, now
);
391 btrfs_set_stack_timespec_nsec(&ii
.mtime
, 0);
392 btrfs_set_stack_timespec_sec(&ii
.otime
, 0);
393 btrfs_set_stack_timespec_nsec(&ii
.otime
, 0);
395 ret
= btrfs_insert_inode(trans
, root
, ino
, &ii
);
398 warning("root %llu inode %llu recreating inode item, this may "
399 "be incomplete, please check permissions and content after "
400 "the fsck completes.\n", (unsigned long long)root
->objectid
,
401 (unsigned long long)ino
);
406 static int get_highest_inode(struct btrfs_trans_handle
*trans
,
407 struct btrfs_root
*root
, struct btrfs_path
*path
,
410 struct btrfs_key key
, found_key
;
413 btrfs_init_path(path
);
414 key
.objectid
= BTRFS_LAST_FREE_OBJECTID
;
416 key
.type
= BTRFS_INODE_ITEM_KEY
;
417 ret
= btrfs_search_slot(trans
, root
, &key
, path
, -1, 1);
419 btrfs_item_key_to_cpu(path
->nodes
[0], &found_key
,
421 *highest_ino
= found_key
.objectid
;
424 if (*highest_ino
>= BTRFS_LAST_FREE_OBJECTID
)
426 btrfs_release_path(path
);
431 * Link inode to dir 'lost+found'. Increase @ref_count.
433 * Returns 0 means success.
434 * Returns <0 means failure.
436 int link_inode_to_lostfound(struct btrfs_trans_handle
*trans
,
437 struct btrfs_root
*root
,
438 struct btrfs_path
*path
,
439 u64 ino
, char *namebuf
, u32 name_len
,
440 u8 filetype
, u64
*ref_count
)
442 char *dir_name
= "lost+found";
447 btrfs_release_path(path
);
448 ret
= get_highest_inode(trans
, root
, path
, &lost_found_ino
);
453 ret
= btrfs_mkdir(trans
, root
, dir_name
, strlen(dir_name
),
454 BTRFS_FIRST_FREE_OBJECTID
, &lost_found_ino
,
457 error("failed to create '%s' dir: %s", dir_name
, strerror(-ret
));
460 ret
= btrfs_add_link(trans
, root
, ino
, lost_found_ino
,
461 namebuf
, name_len
, filetype
, NULL
, 1, 0);
463 * Add ".INO" suffix several times to handle case where
464 * "FILENAME.INO" is already taken by another file.
466 while (ret
== -EEXIST
) {
468 * Conflicting file name, add ".INO" as suffix * +1 for '.'
470 if (name_len
+ count_digits(ino
) + 1 > BTRFS_NAME_LEN
) {
474 snprintf(namebuf
+ name_len
, BTRFS_NAME_LEN
- name_len
,
476 name_len
+= count_digits(ino
) + 1;
477 ret
= btrfs_add_link(trans
, root
, ino
, lost_found_ino
, namebuf
,
478 name_len
, filetype
, NULL
, 1, 0);
481 error("failed to link the inode %llu to %s dir: %s",
482 ino
, dir_name
, strerror(-ret
));
487 printf("Moving file '%.*s' to '%s' dir since it has no valid backref\n",
488 name_len
, namebuf
, dir_name
);
490 btrfs_release_path(path
);
492 error("failed to move file '%.*s' to '%s' dir", name_len
,
498 * Extra (optional) check for dev_item size to report possbile problem on a new
501 void check_dev_size_alignment(u64 devid
, u64 total_bytes
, u32 sectorsize
)
503 if (!IS_ALIGNED(total_bytes
, sectorsize
)) {
505 "unaligned total_bytes detected for devid %llu, have %llu should be aligned to %u",
506 devid
, total_bytes
, sectorsize
);
508 "this is OK for older kernel, but may cause kernel warning for newer kernels");
509 warning("this can be fixed by 'btrfs rescue fix-device-size'");
513 void reada_walk_down(struct btrfs_root
*root
, struct extent_buffer
*node
,
516 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
523 level
= btrfs_header_level(node
);
527 nritems
= btrfs_header_nritems(node
);
528 for (i
= slot
; i
< nritems
; i
++) {
529 bytenr
= btrfs_node_blockptr(node
, i
);
530 ptr_gen
= btrfs_node_ptr_generation(node
, i
);
531 readahead_tree_block(fs_info
, bytenr
, ptr_gen
);
536 * Check the child node/leaf by the following condition:
537 * 1. the first item key of the node/leaf should be the same with the one
539 * 2. block in parent node should match the child node/leaf.
540 * 3. generation of parent node and child's header should be consistent.
542 * Or the child node/leaf pointed by the key in parent is not valid.
544 * We hope to check leaf owner too, but since subvol may share leaves,
545 * which makes leaf owner check not so strong, key check should be
546 * sufficient enough for that case.
548 int check_child_node(struct extent_buffer
*parent
, int slot
,
549 struct extent_buffer
*child
)
551 struct btrfs_key parent_key
;
552 struct btrfs_key child_key
;
555 btrfs_node_key_to_cpu(parent
, &parent_key
, slot
);
556 if (btrfs_header_level(child
) == 0)
557 btrfs_item_key_to_cpu(child
, &child_key
, 0);
559 btrfs_node_key_to_cpu(child
, &child_key
, 0);
561 if (memcmp(&parent_key
, &child_key
, sizeof(parent_key
))) {
564 "Wrong key of child node/leaf, wanted: (%llu, %u, %llu), have: (%llu, %u, %llu)\n",
565 parent_key
.objectid
, parent_key
.type
, parent_key
.offset
,
566 child_key
.objectid
, child_key
.type
, child_key
.offset
);
568 if (btrfs_header_bytenr(child
) != btrfs_node_blockptr(parent
, slot
)) {
570 fprintf(stderr
, "Wrong block of child node/leaf, wanted: %llu, have: %llu\n",
571 btrfs_node_blockptr(parent
, slot
),
572 btrfs_header_bytenr(child
));
574 if (btrfs_node_ptr_generation(parent
, slot
) !=
575 btrfs_header_generation(child
)) {
577 fprintf(stderr
, "Wrong generation of child node/leaf, wanted: %llu, have: %llu\n",
578 btrfs_header_generation(child
),
579 btrfs_node_ptr_generation(parent
, slot
));
584 void reset_cached_block_groups(struct btrfs_fs_info
*fs_info
)
586 struct btrfs_block_group_cache
*cache
;
591 ret
= find_first_extent_bit(&fs_info
->free_space_cache
, 0,
592 &start
, &end
, EXTENT_DIRTY
);
595 clear_extent_dirty(&fs_info
->free_space_cache
, start
, end
);
600 cache
= btrfs_lookup_first_block_group(fs_info
, start
);
605 start
= cache
->key
.objectid
+ cache
->key
.offset
;
609 static int traverse_tree_blocks(struct btrfs_fs_info
*fs_info
,
610 struct extent_buffer
*eb
, int tree_root
,
613 struct extent_buffer
*tmp
;
614 struct btrfs_root_item
*ri
;
615 struct btrfs_key key
;
616 struct extent_io_tree
*tree
;
618 int level
= btrfs_header_level(eb
);
622 u64 end
= eb
->start
+ eb
->len
;
625 tree
= &fs_info
->pinned_extents
;
627 tree
= fs_info
->excluded_extents
;
629 * If we have pinned/excluded this block before, don't do it again.
630 * This can not only avoid forever loop with broken filesystem
631 * but also give us some speedups.
633 if (test_range_bit(tree
, eb
->start
, end
- 1, EXTENT_DIRTY
, 0))
637 btrfs_pin_extent(fs_info
, eb
->start
, eb
->len
);
639 set_extent_dirty(tree
, eb
->start
, end
- 1);
641 nritems
= btrfs_header_nritems(eb
);
642 for (i
= 0; i
< nritems
; i
++) {
645 btrfs_item_key_to_cpu(eb
, &key
, i
);
646 if (key
.type
!= BTRFS_ROOT_ITEM_KEY
)
648 /* Skip the extent root and reloc roots */
649 if (key
.objectid
== BTRFS_TREE_RELOC_OBJECTID
||
650 key
.objectid
== BTRFS_DATA_RELOC_TREE_OBJECTID
)
653 key
.objectid
== BTRFS_EXTENT_TREE_OBJECTID
;
654 /* If pin, skip the extent root */
655 if (pin
&& is_extent_root
)
657 ri
= btrfs_item_ptr(eb
, i
, struct btrfs_root_item
);
658 bytenr
= btrfs_disk_root_bytenr(eb
, ri
);
661 * If at any point we start needing the real root we
662 * will have to build a stump root for the root we are
663 * in, but for now this doesn't actually use the root so
664 * just pass in extent_root.
666 tmp
= read_tree_block(fs_info
, bytenr
, 0);
667 if (!extent_buffer_uptodate(tmp
)) {
668 fprintf(stderr
, "Error reading root block\n");
671 ret
= traverse_tree_blocks(fs_info
, tmp
, 0, pin
);
672 free_extent_buffer(tmp
);
676 bytenr
= btrfs_node_blockptr(eb
, i
);
678 /* If we aren't the tree root don't read the block */
679 if (level
== 1 && !tree_root
) {
680 btrfs_pin_extent(fs_info
, bytenr
,
685 tmp
= read_tree_block(fs_info
, bytenr
, 0);
686 if (!extent_buffer_uptodate(tmp
)) {
687 fprintf(stderr
, "Error reading tree block\n");
690 ret
= traverse_tree_blocks(fs_info
, tmp
, tree_root
,
692 free_extent_buffer(tmp
);
701 static int pin_down_tree_blocks(struct btrfs_fs_info
*fs_info
,
702 struct extent_buffer
*eb
, int tree_root
)
704 return traverse_tree_blocks(fs_info
, eb
, tree_root
, 1);
707 int pin_metadata_blocks(struct btrfs_fs_info
*fs_info
)
711 ret
= pin_down_tree_blocks(fs_info
, fs_info
->chunk_root
->node
, 0);
715 return pin_down_tree_blocks(fs_info
, fs_info
->tree_root
->node
, 1);
718 static int exclude_tree_blocks(struct btrfs_fs_info
*fs_info
,
719 struct extent_buffer
*eb
, int tree_root
)
721 return traverse_tree_blocks(fs_info
, eb
, tree_root
, 0);
724 int exclude_metadata_blocks(struct btrfs_fs_info
*fs_info
)
727 struct extent_io_tree
*excluded_extents
;
729 excluded_extents
= malloc(sizeof(*excluded_extents
));
730 if (!excluded_extents
)
732 extent_io_tree_init(excluded_extents
);
733 fs_info
->excluded_extents
= excluded_extents
;
735 ret
= exclude_tree_blocks(fs_info
, fs_info
->chunk_root
->node
, 0);
738 return exclude_tree_blocks(fs_info
, fs_info
->tree_root
->node
, 1);
741 void cleanup_excluded_extents(struct btrfs_fs_info
*fs_info
)
743 if (fs_info
->excluded_extents
) {
744 extent_io_tree_cleanup(fs_info
->excluded_extents
);
745 free(fs_info
->excluded_extents
);
747 fs_info
->excluded_extents
= NULL
;