2 * Copyright (C) 2008 Oracle. All rights reserved.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include <linux/list_sort.h>
23 #include "transaction.h"
26 #include "print-tree.h"
/*
 * Magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 */
#define LOG_INODE_ALL 0
#define LOG_INODE_EXISTS 1
42 * directory trouble cases
44 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
45 * log, we must force a full commit before doing an fsync of the directory
46 * where the unlink was done.
47 * ---> record transid of last unlink/rename per directory
51 * rename foo/some_dir foo2/some_dir
53 * fsync foo/some_dir/some_file
55 * The fsync above will unlink the original some_dir without recording
56 * it in its new location (foo2). After a crash, some_dir will be gone
57 * unless the fsync of some_file forces a full commit
59 * 2) we must log any new names for any file or dir that is in the fsync
60 * log. ---> check inode while renaming/linking.
62 * 2a) we must log any new names for any file or dir during rename
63 * when the directory they are being removed from was logged.
64 * ---> check inode and old parent dir during rename
66 * 2a is actually the more important variant. With the extra logging
67 * a crash might unlink the old name without recreating the new one
69 * 3) after a crash, we must go through any directories with a link count
70 * of zero and redo the rm -rf
77 * The directory f1 was fully removed from the FS, but fsync was never
78 * called on f1, only its parent dir. After a crash the rm -rf must
79 * be replayed. This must be able to recurse down the entire
80 * directory tree. The inode link count fixup code takes care of the
/*
 * Stages for the tree walking.
 *
 * The first stage (0) is to only pin down the blocks we find.
 * The second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics.
 */
#define LOG_WALK_PIN_ONLY 0
#define LOG_WALK_REPLAY_INODES 1
#define LOG_WALK_REPLAY_ALL 2
97 static int btrfs_log_inode(struct btrfs_trans_handle
*trans
,
98 struct btrfs_root
*root
, struct inode
*inode
,
100 static int link_to_fixup_dir(struct btrfs_trans_handle
*trans
,
101 struct btrfs_root
*root
,
102 struct btrfs_path
*path
, u64 objectid
);
103 static noinline
int replay_dir_deletes(struct btrfs_trans_handle
*trans
,
104 struct btrfs_root
*root
,
105 struct btrfs_root
*log
,
106 struct btrfs_path
*path
,
107 u64 dirid
, int del_all
);
/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree,
 * and once to do all the other items.
 */
133 * start a sub transaction and setup the log tree
134 * this increments the log tree writer count to make the people
135 * syncing the tree wait for us to finish
137 static int start_log_trans(struct btrfs_trans_handle
*trans
,
138 struct btrfs_root
*root
)
143 mutex_lock(&root
->log_mutex
);
144 if (root
->log_root
) {
145 if (!root
->log_start_pid
) {
146 root
->log_start_pid
= current
->pid
;
147 root
->log_multiple_pids
= false;
148 } else if (root
->log_start_pid
!= current
->pid
) {
149 root
->log_multiple_pids
= true;
152 atomic_inc(&root
->log_batch
);
153 atomic_inc(&root
->log_writers
);
154 mutex_unlock(&root
->log_mutex
);
157 root
->log_multiple_pids
= false;
158 root
->log_start_pid
= current
->pid
;
159 mutex_lock(&root
->fs_info
->tree_log_mutex
);
160 if (!root
->fs_info
->log_root_tree
) {
161 ret
= btrfs_init_log_root_tree(trans
, root
->fs_info
);
165 if (err
== 0 && !root
->log_root
) {
166 ret
= btrfs_add_log_tree(trans
, root
);
170 mutex_unlock(&root
->fs_info
->tree_log_mutex
);
171 atomic_inc(&root
->log_batch
);
172 atomic_inc(&root
->log_writers
);
173 mutex_unlock(&root
->log_mutex
);
/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there were no transactions
 */
182 static int join_running_log_trans(struct btrfs_root
*root
)
190 mutex_lock(&root
->log_mutex
);
191 if (root
->log_root
) {
193 atomic_inc(&root
->log_writers
);
195 mutex_unlock(&root
->log_mutex
);
200 * This either makes the current running log transaction wait
201 * until you call btrfs_end_log_trans() or it makes any future
202 * log transactions wait until you call btrfs_end_log_trans()
204 int btrfs_pin_log_trans(struct btrfs_root
*root
)
208 mutex_lock(&root
->log_mutex
);
209 atomic_inc(&root
->log_writers
);
210 mutex_unlock(&root
->log_mutex
);
215 * indicate we're done making changes to the log tree
216 * and wake up anyone waiting to do a sync
218 void btrfs_end_log_trans(struct btrfs_root
*root
)
220 if (atomic_dec_and_test(&root
->log_writers
)) {
222 if (waitqueue_active(&root
->log_writer_wait
))
223 wake_up(&root
->log_writer_wait
);
229 * the walk control struct is used to pass state down the chain when
230 * processing the log tree. The stage field tells us which part
231 * of the log tree processing we are currently doing. The others
232 * are state fields used for that specific part
234 struct walk_control
{
235 /* should we free the extent on disk when done? This is used
236 * at transaction commit time while freeing a log tree
240 /* should we write out the extent buffer? This is used
241 * while flushing the log tree to disk during a sync
245 /* should we wait for the extent buffer io to finish? Also used
246 * while flushing the log tree to disk for a sync
250 /* pin only walk, we record which extents on disk belong to the
255 /* what stage of the replay code we're currently in */
258 /* the root we are currently replaying */
259 struct btrfs_root
*replay_dest
;
261 /* the trans handle for the current replay */
262 struct btrfs_trans_handle
*trans
;
264 /* the function that gets used to process blocks we find in the
265 * tree. Note the extent_buffer might not be up to date when it is
266 * passed in, and it must be checked or read if you need the data
269 int (*process_func
)(struct btrfs_root
*log
, struct extent_buffer
*eb
,
270 struct walk_control
*wc
, u64 gen
);
274 * process_func used to pin down extents, write them or wait on them
276 static int process_one_buffer(struct btrfs_root
*log
,
277 struct extent_buffer
*eb
,
278 struct walk_control
*wc
, u64 gen
)
281 btrfs_pin_extent_for_log_replay(wc
->trans
,
282 log
->fs_info
->extent_root
,
285 if (btrfs_buffer_uptodate(eb
, gen
, 0)) {
287 btrfs_write_tree_block(eb
);
289 btrfs_wait_tree_block_writeback(eb
);
295 * Item overwrite used by replay and tree logging. eb, slot and key all refer
296 * to the src data we are copying out.
298 * root is the tree we are copying into, and path is a scratch
299 * path for use in this function (it should be released on entry and
300 * will be released on exit).
302 * If the key is already in the destination tree the existing item is
303 * overwritten. If the existing item isn't big enough, it is extended.
304 * If it is too large, it is truncated.
306 * If the key isn't in the destination yet, a new item is inserted.
308 static noinline
int overwrite_item(struct btrfs_trans_handle
*trans
,
309 struct btrfs_root
*root
,
310 struct btrfs_path
*path
,
311 struct extent_buffer
*eb
, int slot
,
312 struct btrfs_key
*key
)
316 u64 saved_i_size
= 0;
317 int save_old_i_size
= 0;
318 unsigned long src_ptr
;
319 unsigned long dst_ptr
;
320 int overwrite_root
= 0;
322 if (root
->root_key
.objectid
!= BTRFS_TREE_LOG_OBJECTID
)
325 item_size
= btrfs_item_size_nr(eb
, slot
);
326 src_ptr
= btrfs_item_ptr_offset(eb
, slot
);
328 /* look for the key in the destination tree */
329 ret
= btrfs_search_slot(NULL
, root
, key
, path
, 0, 0);
333 u32 dst_size
= btrfs_item_size_nr(path
->nodes
[0],
335 if (dst_size
!= item_size
)
338 if (item_size
== 0) {
339 btrfs_release_path(path
);
342 dst_copy
= kmalloc(item_size
, GFP_NOFS
);
343 src_copy
= kmalloc(item_size
, GFP_NOFS
);
344 if (!dst_copy
|| !src_copy
) {
345 btrfs_release_path(path
);
351 read_extent_buffer(eb
, src_copy
, src_ptr
, item_size
);
353 dst_ptr
= btrfs_item_ptr_offset(path
->nodes
[0], path
->slots
[0]);
354 read_extent_buffer(path
->nodes
[0], dst_copy
, dst_ptr
,
356 ret
= memcmp(dst_copy
, src_copy
, item_size
);
361 * they have the same contents, just return, this saves
362 * us from cowing blocks in the destination tree and doing
363 * extra writes that may not have been done by a previous
367 btrfs_release_path(path
);
373 btrfs_release_path(path
);
374 /* try to insert the key into the destination tree */
375 ret
= btrfs_insert_empty_item(trans
, root
, path
,
378 /* make sure any existing item is the correct size */
379 if (ret
== -EEXIST
) {
381 found_size
= btrfs_item_size_nr(path
->nodes
[0],
383 if (found_size
> item_size
)
384 btrfs_truncate_item(trans
, root
, path
, item_size
, 1);
385 else if (found_size
< item_size
)
386 btrfs_extend_item(trans
, root
, path
,
387 item_size
- found_size
);
391 dst_ptr
= btrfs_item_ptr_offset(path
->nodes
[0],
394 /* don't overwrite an existing inode if the generation number
395 * was logged as zero. This is done when the tree logging code
396 * is just logging an inode to make sure it exists after recovery.
398 * Also, don't overwrite i_size on directories during replay.
399 * log replay inserts and removes directory items based on the
400 * state of the tree found in the subvolume, and i_size is modified
403 if (key
->type
== BTRFS_INODE_ITEM_KEY
&& ret
== -EEXIST
) {
404 struct btrfs_inode_item
*src_item
;
405 struct btrfs_inode_item
*dst_item
;
407 src_item
= (struct btrfs_inode_item
*)src_ptr
;
408 dst_item
= (struct btrfs_inode_item
*)dst_ptr
;
410 if (btrfs_inode_generation(eb
, src_item
) == 0)
413 if (overwrite_root
&&
414 S_ISDIR(btrfs_inode_mode(eb
, src_item
)) &&
415 S_ISDIR(btrfs_inode_mode(path
->nodes
[0], dst_item
))) {
417 saved_i_size
= btrfs_inode_size(path
->nodes
[0],
422 copy_extent_buffer(path
->nodes
[0], eb
, dst_ptr
,
425 if (save_old_i_size
) {
426 struct btrfs_inode_item
*dst_item
;
427 dst_item
= (struct btrfs_inode_item
*)dst_ptr
;
428 btrfs_set_inode_size(path
->nodes
[0], dst_item
, saved_i_size
);
431 /* make sure the generation is filled in */
432 if (key
->type
== BTRFS_INODE_ITEM_KEY
) {
433 struct btrfs_inode_item
*dst_item
;
434 dst_item
= (struct btrfs_inode_item
*)dst_ptr
;
435 if (btrfs_inode_generation(path
->nodes
[0], dst_item
) == 0) {
436 btrfs_set_inode_generation(path
->nodes
[0], dst_item
,
441 btrfs_mark_buffer_dirty(path
->nodes
[0]);
442 btrfs_release_path(path
);
447 * simple helper to read an inode off the disk from a given root
448 * This can only be called for subvolume roots and not for the log
450 static noinline
struct inode
*read_one_inode(struct btrfs_root
*root
,
453 struct btrfs_key key
;
456 key
.objectid
= objectid
;
457 key
.type
= BTRFS_INODE_ITEM_KEY
;
459 inode
= btrfs_iget(root
->fs_info
->sb
, &key
, root
, NULL
);
462 } else if (is_bad_inode(inode
)) {
469 /* replays a single extent in 'eb' at 'slot' with 'key' into the
470 * subvolume 'root'. path is released on entry and should be released
473 * extents in the log tree have not been allocated out of the extent
474 * tree yet. So, this completes the allocation, taking a reference
475 * as required if the extent already exists or creating a new extent
476 * if it isn't in the extent allocation tree yet.
478 * The extent is inserted into the file, dropping any existing extents
479 * from the file that overlap the new one.
481 static noinline
int replay_one_extent(struct btrfs_trans_handle
*trans
,
482 struct btrfs_root
*root
,
483 struct btrfs_path
*path
,
484 struct extent_buffer
*eb
, int slot
,
485 struct btrfs_key
*key
)
488 u64 mask
= root
->sectorsize
- 1;
490 u64 start
= key
->offset
;
492 struct btrfs_file_extent_item
*item
;
493 struct inode
*inode
= NULL
;
497 item
= btrfs_item_ptr(eb
, slot
, struct btrfs_file_extent_item
);
498 found_type
= btrfs_file_extent_type(eb
, item
);
500 if (found_type
== BTRFS_FILE_EXTENT_REG
||
501 found_type
== BTRFS_FILE_EXTENT_PREALLOC
)
502 extent_end
= start
+ btrfs_file_extent_num_bytes(eb
, item
);
503 else if (found_type
== BTRFS_FILE_EXTENT_INLINE
) {
504 size
= btrfs_file_extent_inline_len(eb
, item
);
505 extent_end
= (start
+ size
+ mask
) & ~mask
;
511 inode
= read_one_inode(root
, key
->objectid
);
518 * first check to see if we already have this extent in the
519 * file. This must be done before the btrfs_drop_extents run
520 * so we don't try to drop this extent.
522 ret
= btrfs_lookup_file_extent(trans
, root
, path
, btrfs_ino(inode
),
526 (found_type
== BTRFS_FILE_EXTENT_REG
||
527 found_type
== BTRFS_FILE_EXTENT_PREALLOC
)) {
528 struct btrfs_file_extent_item cmp1
;
529 struct btrfs_file_extent_item cmp2
;
530 struct btrfs_file_extent_item
*existing
;
531 struct extent_buffer
*leaf
;
533 leaf
= path
->nodes
[0];
534 existing
= btrfs_item_ptr(leaf
, path
->slots
[0],
535 struct btrfs_file_extent_item
);
537 read_extent_buffer(eb
, &cmp1
, (unsigned long)item
,
539 read_extent_buffer(leaf
, &cmp2
, (unsigned long)existing
,
543 * we already have a pointer to this exact extent,
544 * we don't have to do anything
546 if (memcmp(&cmp1
, &cmp2
, sizeof(cmp1
)) == 0) {
547 btrfs_release_path(path
);
551 btrfs_release_path(path
);
553 saved_nbytes
= inode_get_bytes(inode
);
554 /* drop any overlapping extents */
555 ret
= btrfs_drop_extents(trans
, root
, inode
, start
, extent_end
, 1);
558 if (found_type
== BTRFS_FILE_EXTENT_REG
||
559 found_type
== BTRFS_FILE_EXTENT_PREALLOC
) {
561 unsigned long dest_offset
;
562 struct btrfs_key ins
;
564 ret
= btrfs_insert_empty_item(trans
, root
, path
, key
,
567 dest_offset
= btrfs_item_ptr_offset(path
->nodes
[0],
569 copy_extent_buffer(path
->nodes
[0], eb
, dest_offset
,
570 (unsigned long)item
, sizeof(*item
));
572 ins
.objectid
= btrfs_file_extent_disk_bytenr(eb
, item
);
573 ins
.offset
= btrfs_file_extent_disk_num_bytes(eb
, item
);
574 ins
.type
= BTRFS_EXTENT_ITEM_KEY
;
575 offset
= key
->offset
- btrfs_file_extent_offset(eb
, item
);
577 if (ins
.objectid
> 0) {
580 LIST_HEAD(ordered_sums
);
582 * is this extent already allocated in the extent
583 * allocation tree? If so, just add a reference
585 ret
= btrfs_lookup_extent(root
, ins
.objectid
,
588 ret
= btrfs_inc_extent_ref(trans
, root
,
589 ins
.objectid
, ins
.offset
,
590 0, root
->root_key
.objectid
,
591 key
->objectid
, offset
, 0);
595 * insert the extent pointer in the extent
598 ret
= btrfs_alloc_logged_file_extent(trans
,
599 root
, root
->root_key
.objectid
,
600 key
->objectid
, offset
, &ins
);
603 btrfs_release_path(path
);
605 if (btrfs_file_extent_compression(eb
, item
)) {
606 csum_start
= ins
.objectid
;
607 csum_end
= csum_start
+ ins
.offset
;
609 csum_start
= ins
.objectid
+
610 btrfs_file_extent_offset(eb
, item
);
611 csum_end
= csum_start
+
612 btrfs_file_extent_num_bytes(eb
, item
);
615 ret
= btrfs_lookup_csums_range(root
->log_root
,
616 csum_start
, csum_end
- 1,
619 while (!list_empty(&ordered_sums
)) {
620 struct btrfs_ordered_sum
*sums
;
621 sums
= list_entry(ordered_sums
.next
,
622 struct btrfs_ordered_sum
,
624 ret
= btrfs_csum_file_blocks(trans
,
625 root
->fs_info
->csum_root
,
628 list_del(&sums
->list
);
632 btrfs_release_path(path
);
634 } else if (found_type
== BTRFS_FILE_EXTENT_INLINE
) {
635 /* inline extents are easy, we just overwrite them */
636 ret
= overwrite_item(trans
, root
, path
, eb
, slot
, key
);
640 inode_set_bytes(inode
, saved_nbytes
);
641 ret
= btrfs_update_inode(trans
, root
, inode
);
649 * when cleaning up conflicts between the directory names in the
650 * subvolume, directory names in the log and directory names in the
651 * inode back references, we may have to unlink inodes from directories.
653 * This is a helper function to do the unlink of a specific directory
656 static noinline
int drop_one_dir_item(struct btrfs_trans_handle
*trans
,
657 struct btrfs_root
*root
,
658 struct btrfs_path
*path
,
660 struct btrfs_dir_item
*di
)
665 struct extent_buffer
*leaf
;
666 struct btrfs_key location
;
669 leaf
= path
->nodes
[0];
671 btrfs_dir_item_key_to_cpu(leaf
, di
, &location
);
672 name_len
= btrfs_dir_name_len(leaf
, di
);
673 name
= kmalloc(name_len
, GFP_NOFS
);
677 read_extent_buffer(leaf
, name
, (unsigned long)(di
+ 1), name_len
);
678 btrfs_release_path(path
);
680 inode
= read_one_inode(root
, location
.objectid
);
686 ret
= link_to_fixup_dir(trans
, root
, path
, location
.objectid
);
689 ret
= btrfs_unlink_inode(trans
, root
, dir
, inode
, name
, name_len
);
695 btrfs_run_delayed_items(trans
, root
);
700 * helper function to see if a given name and sequence number found
701 * in an inode back reference are already in a directory and correctly
702 * point to this inode
704 static noinline
int inode_in_dir(struct btrfs_root
*root
,
705 struct btrfs_path
*path
,
706 u64 dirid
, u64 objectid
, u64 index
,
707 const char *name
, int name_len
)
709 struct btrfs_dir_item
*di
;
710 struct btrfs_key location
;
713 di
= btrfs_lookup_dir_index_item(NULL
, root
, path
, dirid
,
714 index
, name
, name_len
, 0);
715 if (di
&& !IS_ERR(di
)) {
716 btrfs_dir_item_key_to_cpu(path
->nodes
[0], di
, &location
);
717 if (location
.objectid
!= objectid
)
721 btrfs_release_path(path
);
723 di
= btrfs_lookup_dir_item(NULL
, root
, path
, dirid
, name
, name_len
, 0);
724 if (di
&& !IS_ERR(di
)) {
725 btrfs_dir_item_key_to_cpu(path
->nodes
[0], di
, &location
);
726 if (location
.objectid
!= objectid
)
732 btrfs_release_path(path
);
737 * helper function to check a log tree for a named back reference in
738 * an inode. This is used to decide if a back reference that is
739 * found in the subvolume conflicts with what we find in the log.
741 * inode backreferences may have multiple refs in a single item,
742 * during replay we process one reference at a time, and we don't
743 * want to delete valid links to a file from the subvolume if that
744 * link is also in the log.
746 static noinline
int backref_in_log(struct btrfs_root
*log
,
747 struct btrfs_key
*key
,
749 char *name
, int namelen
)
751 struct btrfs_path
*path
;
752 struct btrfs_inode_ref
*ref
;
754 unsigned long ptr_end
;
755 unsigned long name_ptr
;
761 path
= btrfs_alloc_path();
765 ret
= btrfs_search_slot(NULL
, log
, key
, path
, 0, 0);
769 ptr
= btrfs_item_ptr_offset(path
->nodes
[0], path
->slots
[0]);
771 if (key
->type
== BTRFS_INODE_EXTREF_KEY
) {
772 if (btrfs_find_name_in_ext_backref(path
, ref_objectid
,
773 name
, namelen
, NULL
))
779 item_size
= btrfs_item_size_nr(path
->nodes
[0], path
->slots
[0]);
780 ptr_end
= ptr
+ item_size
;
781 while (ptr
< ptr_end
) {
782 ref
= (struct btrfs_inode_ref
*)ptr
;
783 found_name_len
= btrfs_inode_ref_name_len(path
->nodes
[0], ref
);
784 if (found_name_len
== namelen
) {
785 name_ptr
= (unsigned long)(ref
+ 1);
786 ret
= memcmp_extent_buffer(path
->nodes
[0], name
,
793 ptr
= (unsigned long)(ref
+ 1) + found_name_len
;
796 btrfs_free_path(path
);
800 static inline int __add_inode_ref(struct btrfs_trans_handle
*trans
,
801 struct btrfs_root
*root
,
802 struct btrfs_path
*path
,
803 struct btrfs_root
*log_root
,
804 struct inode
*dir
, struct inode
*inode
,
805 struct extent_buffer
*eb
,
806 u64 inode_objectid
, u64 parent_objectid
,
807 u64 ref_index
, char *name
, int namelen
,
813 struct extent_buffer
*leaf
;
814 struct btrfs_dir_item
*di
;
815 struct btrfs_key search_key
;
816 struct btrfs_inode_extref
*extref
;
819 /* Search old style refs */
820 search_key
.objectid
= inode_objectid
;
821 search_key
.type
= BTRFS_INODE_REF_KEY
;
822 search_key
.offset
= parent_objectid
;
823 ret
= btrfs_search_slot(NULL
, root
, &search_key
, path
, 0, 0);
825 struct btrfs_inode_ref
*victim_ref
;
827 unsigned long ptr_end
;
829 leaf
= path
->nodes
[0];
831 /* are we trying to overwrite a back ref for the root directory
832 * if so, just jump out, we're done
834 if (search_key
.objectid
== search_key
.offset
)
837 /* check all the names in this back reference to see
838 * if they are in the log. if so, we allow them to stay
839 * otherwise they must be unlinked as a conflict
841 ptr
= btrfs_item_ptr_offset(leaf
, path
->slots
[0]);
842 ptr_end
= ptr
+ btrfs_item_size_nr(leaf
, path
->slots
[0]);
843 while (ptr
< ptr_end
) {
844 victim_ref
= (struct btrfs_inode_ref
*)ptr
;
845 victim_name_len
= btrfs_inode_ref_name_len(leaf
,
847 victim_name
= kmalloc(victim_name_len
, GFP_NOFS
);
848 BUG_ON(!victim_name
);
850 read_extent_buffer(leaf
, victim_name
,
851 (unsigned long)(victim_ref
+ 1),
854 if (!backref_in_log(log_root
, &search_key
,
858 btrfs_inc_nlink(inode
);
859 btrfs_release_path(path
);
861 ret
= btrfs_unlink_inode(trans
, root
, dir
,
865 btrfs_run_delayed_items(trans
, root
);
872 ptr
= (unsigned long)(victim_ref
+ 1) + victim_name_len
;
	/*
	 * NOTE: we have already searched the root tree and checked the
	 * corresponding ref, so it does not need to be checked again.
	 */
882 btrfs_release_path(path
);
884 /* Same search but for extended refs */
885 extref
= btrfs_lookup_inode_extref(NULL
, root
, path
, name
, namelen
,
886 inode_objectid
, parent_objectid
, 0,
888 if (!IS_ERR_OR_NULL(extref
)) {
892 struct inode
*victim_parent
;
894 leaf
= path
->nodes
[0];
896 item_size
= btrfs_item_size_nr(leaf
, path
->slots
[0]);
897 base
= btrfs_item_ptr_offset(leaf
, path
->slots
[0]);
899 while (cur_offset
< item_size
) {
900 extref
= (struct btrfs_inode_extref
*)base
+ cur_offset
;
902 victim_name_len
= btrfs_inode_extref_name_len(leaf
, extref
);
904 if (btrfs_inode_extref_parent(leaf
, extref
) != parent_objectid
)
907 victim_name
= kmalloc(victim_name_len
, GFP_NOFS
);
908 read_extent_buffer(leaf
, victim_name
, (unsigned long)&extref
->name
,
911 search_key
.objectid
= inode_objectid
;
912 search_key
.type
= BTRFS_INODE_EXTREF_KEY
;
913 search_key
.offset
= btrfs_extref_hash(parent_objectid
,
917 if (!backref_in_log(log_root
, &search_key
,
918 parent_objectid
, victim_name
,
921 victim_parent
= read_one_inode(root
,
924 btrfs_inc_nlink(inode
);
925 btrfs_release_path(path
);
927 ret
= btrfs_unlink_inode(trans
, root
,
932 btrfs_run_delayed_items(trans
, root
);
943 cur_offset
+= victim_name_len
+ sizeof(*extref
);
947 btrfs_release_path(path
);
949 /* look for a conflicting sequence number */
950 di
= btrfs_lookup_dir_index_item(trans
, root
, path
, btrfs_ino(dir
),
951 ref_index
, name
, namelen
, 0);
952 if (di
&& !IS_ERR(di
)) {
953 ret
= drop_one_dir_item(trans
, root
, path
, dir
, di
);
956 btrfs_release_path(path
);
	/* look for a conflicting name */
959 di
= btrfs_lookup_dir_item(trans
, root
, path
, btrfs_ino(dir
),
961 if (di
&& !IS_ERR(di
)) {
962 ret
= drop_one_dir_item(trans
, root
, path
, dir
, di
);
965 btrfs_release_path(path
);
970 static int extref_get_fields(struct extent_buffer
*eb
, unsigned long ref_ptr
,
971 u32
*namelen
, char **name
, u64
*index
,
972 u64
*parent_objectid
)
974 struct btrfs_inode_extref
*extref
;
976 extref
= (struct btrfs_inode_extref
*)ref_ptr
;
978 *namelen
= btrfs_inode_extref_name_len(eb
, extref
);
979 *name
= kmalloc(*namelen
, GFP_NOFS
);
983 read_extent_buffer(eb
, *name
, (unsigned long)&extref
->name
,
986 *index
= btrfs_inode_extref_index(eb
, extref
);
988 *parent_objectid
= btrfs_inode_extref_parent(eb
, extref
);
993 static int ref_get_fields(struct extent_buffer
*eb
, unsigned long ref_ptr
,
994 u32
*namelen
, char **name
, u64
*index
)
996 struct btrfs_inode_ref
*ref
;
998 ref
= (struct btrfs_inode_ref
*)ref_ptr
;
1000 *namelen
= btrfs_inode_ref_name_len(eb
, ref
);
1001 *name
= kmalloc(*namelen
, GFP_NOFS
);
1005 read_extent_buffer(eb
, *name
, (unsigned long)(ref
+ 1), *namelen
);
1007 *index
= btrfs_inode_ref_index(eb
, ref
);
1013 * replay one inode back reference item found in the log tree.
1014 * eb, slot and key refer to the buffer and key found in the log tree.
1015 * root is the destination we are replaying into, and path is for temp
1016 * use by this function. (it should be released on return).
1018 static noinline
int add_inode_ref(struct btrfs_trans_handle
*trans
,
1019 struct btrfs_root
*root
,
1020 struct btrfs_root
*log
,
1021 struct btrfs_path
*path
,
1022 struct extent_buffer
*eb
, int slot
,
1023 struct btrfs_key
*key
)
1026 struct inode
*inode
;
1027 unsigned long ref_ptr
;
1028 unsigned long ref_end
;
1032 int search_done
= 0;
1033 int log_ref_ver
= 0;
1034 u64 parent_objectid
;
1037 int ref_struct_size
;
1039 ref_ptr
= btrfs_item_ptr_offset(eb
, slot
);
1040 ref_end
= ref_ptr
+ btrfs_item_size_nr(eb
, slot
);
1042 if (key
->type
== BTRFS_INODE_EXTREF_KEY
) {
1043 struct btrfs_inode_extref
*r
;
1045 ref_struct_size
= sizeof(struct btrfs_inode_extref
);
1047 r
= (struct btrfs_inode_extref
*)ref_ptr
;
1048 parent_objectid
= btrfs_inode_extref_parent(eb
, r
);
1050 ref_struct_size
= sizeof(struct btrfs_inode_ref
);
1051 parent_objectid
= key
->offset
;
1053 inode_objectid
= key
->objectid
;
1056 * it is possible that we didn't log all the parent directories
1057 * for a given inode. If we don't find the dir, just don't
1058 * copy the back ref in. The link count fixup code will take
1061 dir
= read_one_inode(root
, parent_objectid
);
1065 inode
= read_one_inode(root
, inode_objectid
);
1071 while (ref_ptr
< ref_end
) {
1073 ret
= extref_get_fields(eb
, ref_ptr
, &namelen
, &name
,
1074 &ref_index
, &parent_objectid
);
1076 * parent object can change from one array
1080 dir
= read_one_inode(root
, parent_objectid
);
1084 ret
= ref_get_fields(eb
, ref_ptr
, &namelen
, &name
,
1090 /* if we already have a perfect match, we're done */
1091 if (!inode_in_dir(root
, path
, btrfs_ino(dir
), btrfs_ino(inode
),
1092 ref_index
, name
, namelen
)) {
1094 * look for a conflicting back reference in the
1095 * metadata. if we find one we have to unlink that name
1096 * of the file before we add our new link. Later on, we
1097 * overwrite any existing back reference, and we don't
1098 * want to create dangling pointers in the directory.
1102 ret
= __add_inode_ref(trans
, root
, path
, log
,
1106 ref_index
, name
, namelen
,
1113 /* insert our name */
1114 ret
= btrfs_add_link(trans
, dir
, inode
, name
, namelen
,
1118 btrfs_update_inode(trans
, root
, inode
);
1121 ref_ptr
= (unsigned long)(ref_ptr
+ ref_struct_size
) + namelen
;
1129 /* finally write the back reference in the inode */
1130 ret
= overwrite_item(trans
, root
, path
, eb
, slot
, key
);
1134 btrfs_release_path(path
);
1140 static int insert_orphan_item(struct btrfs_trans_handle
*trans
,
1141 struct btrfs_root
*root
, u64 offset
)
1144 ret
= btrfs_find_orphan_item(root
, offset
);
1146 ret
= btrfs_insert_orphan_item(trans
, root
, offset
);
1150 static int count_inode_extrefs(struct btrfs_root
*root
,
1151 struct inode
*inode
, struct btrfs_path
*path
)
1155 unsigned int nlink
= 0;
1158 u64 inode_objectid
= btrfs_ino(inode
);
1161 struct btrfs_inode_extref
*extref
;
1162 struct extent_buffer
*leaf
;
1165 ret
= btrfs_find_one_extref(root
, inode_objectid
, offset
, path
,
1170 leaf
= path
->nodes
[0];
1171 item_size
= btrfs_item_size_nr(leaf
, path
->slots
[0]);
1172 ptr
= btrfs_item_ptr_offset(leaf
, path
->slots
[0]);
1174 while (cur_offset
< item_size
) {
1175 extref
= (struct btrfs_inode_extref
*) (ptr
+ cur_offset
);
1176 name_len
= btrfs_inode_extref_name_len(leaf
, extref
);
1180 cur_offset
+= name_len
+ sizeof(*extref
);
1184 btrfs_release_path(path
);
1186 btrfs_release_path(path
);
1193 static int count_inode_refs(struct btrfs_root
*root
,
1194 struct inode
*inode
, struct btrfs_path
*path
)
1197 struct btrfs_key key
;
1198 unsigned int nlink
= 0;
1200 unsigned long ptr_end
;
1202 u64 ino
= btrfs_ino(inode
);
1205 key
.type
= BTRFS_INODE_REF_KEY
;
1206 key
.offset
= (u64
)-1;
1209 ret
= btrfs_search_slot(NULL
, root
, &key
, path
, 0, 0);
1213 if (path
->slots
[0] == 0)
1217 btrfs_item_key_to_cpu(path
->nodes
[0], &key
,
1219 if (key
.objectid
!= ino
||
1220 key
.type
!= BTRFS_INODE_REF_KEY
)
1222 ptr
= btrfs_item_ptr_offset(path
->nodes
[0], path
->slots
[0]);
1223 ptr_end
= ptr
+ btrfs_item_size_nr(path
->nodes
[0],
1225 while (ptr
< ptr_end
) {
1226 struct btrfs_inode_ref
*ref
;
1228 ref
= (struct btrfs_inode_ref
*)ptr
;
1229 name_len
= btrfs_inode_ref_name_len(path
->nodes
[0],
1231 ptr
= (unsigned long)(ref
+ 1) + name_len
;
1235 if (key
.offset
== 0)
1238 btrfs_release_path(path
);
1240 btrfs_release_path(path
);
1246 * There are a few corners where the link count of the file can't
1247 * be properly maintained during replay. So, instead of adding
1248 * lots of complexity to the log code, we just scan the backrefs
1249 * for any file that has been through replay.
1251 * The scan will update the link count on the inode to reflect the
1252 * number of back refs found. If it goes down to zero, the iput
1253 * will free the inode.
1255 static noinline
int fixup_inode_link_count(struct btrfs_trans_handle
*trans
,
1256 struct btrfs_root
*root
,
1257 struct inode
*inode
)
1259 struct btrfs_path
*path
;
1262 u64 ino
= btrfs_ino(inode
);
1264 path
= btrfs_alloc_path();
1268 ret
= count_inode_refs(root
, inode
, path
);
1274 ret
= count_inode_extrefs(root
, inode
, path
);
1285 if (nlink
!= inode
->i_nlink
) {
1286 set_nlink(inode
, nlink
);
1287 btrfs_update_inode(trans
, root
, inode
);
1289 BTRFS_I(inode
)->index_cnt
= (u64
)-1;
1291 if (inode
->i_nlink
== 0) {
1292 if (S_ISDIR(inode
->i_mode
)) {
1293 ret
= replay_dir_deletes(trans
, root
, NULL
, path
,
1297 ret
= insert_orphan_item(trans
, root
, ino
);
1302 btrfs_free_path(path
);
1306 static noinline
int fixup_inode_link_counts(struct btrfs_trans_handle
*trans
,
1307 struct btrfs_root
*root
,
1308 struct btrfs_path
*path
)
1311 struct btrfs_key key
;
1312 struct inode
*inode
;
1314 key
.objectid
= BTRFS_TREE_LOG_FIXUP_OBJECTID
;
1315 key
.type
= BTRFS_ORPHAN_ITEM_KEY
;
1316 key
.offset
= (u64
)-1;
1318 ret
= btrfs_search_slot(trans
, root
, &key
, path
, -1, 1);
1323 if (path
->slots
[0] == 0)
1328 btrfs_item_key_to_cpu(path
->nodes
[0], &key
, path
->slots
[0]);
1329 if (key
.objectid
!= BTRFS_TREE_LOG_FIXUP_OBJECTID
||
1330 key
.type
!= BTRFS_ORPHAN_ITEM_KEY
)
1333 ret
= btrfs_del_item(trans
, root
, path
);
1337 btrfs_release_path(path
);
1338 inode
= read_one_inode(root
, key
.offset
);
1342 ret
= fixup_inode_link_count(trans
, root
, inode
);
		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
1352 key
.offset
= (u64
)-1;
1356 btrfs_release_path(path
);
1362 * record a given inode in the fixup dir so we can check its link
1363 * count when replay is done. The link count is incremented here
1364 * so the inode won't go away until we check it
/*
 * Add a fixup item for inode number objectid so that its link count is
 * re-checked later.
 *
 * Visible steps: read the inode with read_one_inode(), build the key
 * (BTRFS_TREE_LOG_FIXUP_OBJECTID, BTRFS_ORPHAN_ITEM_KEY, objectid), insert
 * an empty item for it (data size 0) and release the path.  One branch
 * then bumps the link count with btrfs_inc_nlink() and writes the inode
 * back with btrfs_update_inode(); a separate branch handles ret == -EEXIST.
 *
 * NOTE(review): the declaration of the objectid parameter, the condition
 * guarding the nlink increment, the body of the -EEXIST branch, the inode
 * reference drop and the return statement are missing from this copy;
 * confirm against the complete source.
 */
1366 static noinline
int link_to_fixup_dir(struct btrfs_trans_handle
*trans
,
1367 struct btrfs_root
*root
,
1368 struct btrfs_path
*path
,
1371 struct btrfs_key key
;
1373 struct inode
*inode
;
1375 inode
= read_one_inode(root
, objectid
);
1379 key
.objectid
= BTRFS_TREE_LOG_FIXUP_OBJECTID
;
1380 btrfs_set_key_type(&key
, BTRFS_ORPHAN_ITEM_KEY
);
1381 key
.offset
= objectid
;
1383 ret
= btrfs_insert_empty_item(trans
, root
, path
, &key
, 0);
1385 btrfs_release_path(path
);
1387 btrfs_inc_nlink(inode
);
1388 ret
= btrfs_update_inode(trans
, root
, inode
);
1389 } else if (ret
== -EEXIST
) {
1400 * when replaying the log for a directory, we only insert names
1401 * for inodes that actually exist. This means an fsync on a directory
1402 * does not implicitly fsync all the new files in it
/*
 * Link one name into directory dirid during log replay.
 *
 * Visible steps: read the target inode (location->objectid) and the
 * directory inode (dirid) with read_one_inode(), then call
 * btrfs_add_link(trans, dir, inode, name, name_len, 1, index).
 *
 * The type and path parameters are not used by any statement visible
 * here.  The FIXME at the end is original: the inode is not placed on the
 * fixup list by this function.
 *
 * NOTE(review): the checks after each read_one_inode(), the inode
 * reference drops and the return statement are missing from this copy;
 * confirm against the complete source.
 */
1404 static noinline
int insert_one_name(struct btrfs_trans_handle
*trans
,
1405 struct btrfs_root
*root
,
1406 struct btrfs_path
*path
,
1407 u64 dirid
, u64 index
,
1408 char *name
, int name_len
, u8 type
,
1409 struct btrfs_key
*location
)
1411 struct inode
*inode
;
1415 inode
= read_one_inode(root
, location
->objectid
);
1419 dir
= read_one_inode(root
, dirid
);
1424 ret
= btrfs_add_link(trans
, dir
, inode
, name
, name_len
, 1, index
);
1426 /* FIXME, put inode into FIXUP list */
1434 * take a single entry in a log directory item and replay it into the subvolume.
1437 * if a conflicting item exists in the subdirectory already,
1438 * the inode it points to is unlinked and put into the link count fixup tree.
1441 * If a name from the log points to a file or directory that does
1442 * not exist in the FS, it is skipped. fsyncs on directories
1443 * do not force down inodes inside that directory, just changes to the
1444 * names or unlinks in a directory.
/*
 * Replay one logged directory entry (@di inside @eb, keyed by @key) into
 * @root.
 *
 * Visible steps:
 *  - read the directory inode key->objectid;
 *  - kmalloc() a buffer of name_len bytes (GFP_NOFS) and copy the name,
 *    which starts right after the fixed dir item header (di + 1), out of
 *    the extent buffer; read the logged entry type and target key;
 *  - store the result of btrfs_lookup_inode() on the target key in
 *    'exists' and release the path;
 *  - look the name up in @root with btrfs_lookup_dir_item() for
 *    BTRFS_DIR_ITEM_KEY or btrfs_lookup_dir_index_item() for
 *    BTRFS_DIR_INDEX_KEY;
 *  - when the lookup returns NULL or an error pointer, test whether the
 *    key type is BTRFS_DIR_INDEX_KEY (the original comment says inserts
 *    need a sequence number);
 *  - otherwise compare the found entry's key and type with the logged
 *    ones; an entry that does not match is removed with
 *    drop_one_dir_item();
 *  - insert the logged name with insert_one_name(); any error other than
 *    -ENOENT triggers BUG_ON().
 *
 * NOTE(review): the declarations, the check of the kmalloc() result, the
 * remaining lookup arguments, the jump targets taken by each test, the
 * use of 'exists', the kfree()/iput() cleanup and the return statement are
 * missing from this copy; confirm against the complete source.
 */
1446 static noinline
int replay_one_name(struct btrfs_trans_handle
*trans
,
1447 struct btrfs_root
*root
,
1448 struct btrfs_path
*path
,
1449 struct extent_buffer
*eb
,
1450 struct btrfs_dir_item
*di
,
1451 struct btrfs_key
*key
)
1455 struct btrfs_dir_item
*dst_di
;
1456 struct btrfs_key found_key
;
1457 struct btrfs_key log_key
;
1463 dir
= read_one_inode(root
, key
->objectid
);
1467 name_len
= btrfs_dir_name_len(eb
, di
);
1468 name
= kmalloc(name_len
, GFP_NOFS
);
1472 log_type
= btrfs_dir_type(eb
, di
);
1473 read_extent_buffer(eb
, name
, (unsigned long)(di
+ 1),
1476 btrfs_dir_item_key_to_cpu(eb
, di
, &log_key
);
1477 exists
= btrfs_lookup_inode(trans
, root
, path
, &log_key
, 0);
1482 btrfs_release_path(path
);
1484 if (key
->type
== BTRFS_DIR_ITEM_KEY
) {
1485 dst_di
= btrfs_lookup_dir_item(trans
, root
, path
, key
->objectid
,
1487 } else if (key
->type
== BTRFS_DIR_INDEX_KEY
) {
1488 dst_di
= btrfs_lookup_dir_index_item(trans
, root
, path
,
1495 if (IS_ERR_OR_NULL(dst_di
)) {
1496 /* we need a sequence number to insert, so we only
1497 * do inserts for the BTRFS_DIR_INDEX_KEY types
1499 if (key
->type
!= BTRFS_DIR_INDEX_KEY
)
1504 btrfs_dir_item_key_to_cpu(path
->nodes
[0], dst_di
, &found_key
);
1505 /* the existing item matches the logged item */
1506 if (found_key
.objectid
== log_key
.objectid
&&
1507 found_key
.type
== log_key
.type
&&
1508 found_key
.offset
== log_key
.offset
&&
1509 btrfs_dir_type(path
->nodes
[0], dst_di
) == log_type
) {
1514 * don't drop the conflicting directory entry if the inode
1515 * for the new entry doesn't exist
1520 ret
= drop_one_dir_item(trans
, root
, path
, dir
, dst_di
);
1523 if (key
->type
== BTRFS_DIR_INDEX_KEY
)
1526 btrfs_release_path(path
);
1532 btrfs_release_path(path
);
1533 ret
= insert_one_name(trans
, root
, path
, key
->objectid
, key
->offset
,
1534 name
, name_len
, log_type
, &log_key
);
1536 BUG_ON(ret
&& ret
!= -ENOENT
);
1541 * find all the names in a directory item and reconcile them into
1542 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
1543 * one name in a directory item, but the same code gets used for
1544 * both directory index types
/*
 * Replay every name packed into the directory item at @slot of @eb.
 *
 * Visible steps: take the item size and the item's offset inside the
 * extent buffer, then walk from ptr up to ptr + item_size.  Each position
 * is treated as a struct btrfs_dir_item, checked with verify_dir_item(),
 * its name length is read and the entry is passed to replay_one_name().
 * ptr is then set to the address right after the fixed dir item header
 * (di + 1).
 *
 * NOTE(review): the action taken when verify_dir_item() fails, the check
 * of the replay_one_name() result, the statement that also skips the name
 * bytes and the return statement are missing from this copy; confirm
 * against the complete source.
 */
1546 static noinline
int replay_one_dir_item(struct btrfs_trans_handle
*trans
,
1547 struct btrfs_root
*root
,
1548 struct btrfs_path
*path
,
1549 struct extent_buffer
*eb
, int slot
,
1550 struct btrfs_key
*key
)
1553 u32 item_size
= btrfs_item_size_nr(eb
, slot
);
1554 struct btrfs_dir_item
*di
;
1557 unsigned long ptr_end
;
1559 ptr
= btrfs_item_ptr_offset(eb
, slot
);
1560 ptr_end
= ptr
+ item_size
;
1561 while (ptr
< ptr_end
) {
1562 di
= (struct btrfs_dir_item
*)ptr
;
1563 if (verify_dir_item(root
, eb
, di
))
1565 name_len
= btrfs_dir_name_len(eb
, di
);
1566 ret
= replay_one_name(trans
, root
, path
, eb
, di
, key
);
1568 ptr
= (unsigned long)(di
+ 1);
1575 * directory replay has two parts. There are the standard directory
1576 * items in the log copied from the subvolume, and range items
1577 * created in the log while the subvolume was logged.
1579 * The range items tell us which parts of the key space the log
1580 * is authoritative for. During replay, if a key in the subvolume
1581 * directory is in a logged range item, but not actually in the log
1582 * that means it was deleted from the directory before the fsync
1583 * and should be removed.
/*
 * Find a logged key range of type @key_type for directory @dirid.
 *
 * In:  *start_ret is the offset the search starts from; the value (u64)-1
 *      is tested first.
 * Out: *start_ret receives the key offset of the range item that was
 *      found and *end_ret the end value stored in that item, read with
 *      btrfs_dir_log_end().
 *
 * Visible steps: search @root for (dirid, key_type, *start_ret) with a
 * NULL transaction handle and 0, 0 as the last two arguments.  The key at
 * the resulting slot is read back; when it has the right type and
 * directory and *start_ret lies within [key.offset, found_end], that range
 * is reported.  Otherwise the next slot is examined, moving to the next
 * leaf with btrfs_next_leaf() when the slot is past the last item, and the
 * range of that item is reported if it has the right type and directory.
 * The path is released at the end.
 *
 * NOTE(review): the handling of the search result, the slot adjustments,
 * the branches taken when a test fails and the return values are missing
 * from this copy; confirm against the complete source.
 */
1585 static noinline
int find_dir_range(struct btrfs_root
*root
,
1586 struct btrfs_path
*path
,
1587 u64 dirid
, int key_type
,
1588 u64
*start_ret
, u64
*end_ret
)
1590 struct btrfs_key key
;
1592 struct btrfs_dir_log_item
*item
;
1596 if (*start_ret
== (u64
)-1)
1599 key
.objectid
= dirid
;
1600 key
.type
= key_type
;
1601 key
.offset
= *start_ret
;
1603 ret
= btrfs_search_slot(NULL
, root
, &key
, path
, 0, 0);
1607 if (path
->slots
[0] == 0)
1612 btrfs_item_key_to_cpu(path
->nodes
[0], &key
, path
->slots
[0]);
1614 if (key
.type
!= key_type
|| key
.objectid
!= dirid
) {
1618 item
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
1619 struct btrfs_dir_log_item
);
1620 found_end
= btrfs_dir_log_end(path
->nodes
[0], item
);
1622 if (*start_ret
>= key
.offset
&& *start_ret
<= found_end
) {
1624 *start_ret
= key
.offset
;
1625 *end_ret
= found_end
;
1630 /* check the next slot in the tree to see if it is a valid item */
1631 nritems
= btrfs_header_nritems(path
->nodes
[0]);
1632 if (path
->slots
[0] >= nritems
) {
1633 ret
= btrfs_next_leaf(root
, path
);
1640 btrfs_item_key_to_cpu(path
->nodes
[0], &key
, path
->slots
[0]);
1642 if (key
.type
!= key_type
|| key
.objectid
!= dirid
) {
1646 item
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
1647 struct btrfs_dir_log_item
);
1648 found_end
= btrfs_dir_log_end(path
->nodes
[0], item
);
1649 *start_ret
= key
.offset
;
1650 *end_ret
= found_end
;
1653 btrfs_release_path(path
);
1658 * this looks for a given directory item in the log. If the directory
1659 * item is not in the log, the item is removed and the inode it points to is unlinked
/*
 * For every name in the directory item that @path points at, check
 * whether the same name is present in @log.
 *
 * Visible steps per name: verify the entry with verify_dir_item(), copy
 * the name into a kmalloc()ed buffer, and, when @log is non-NULL, look it
 * up in the log with btrfs_lookup_dir_item() or
 * btrfs_lookup_dir_index_item() depending on dir_key->type.
 * When the lookup result is NULL or an error pointer: read the entry's
 * target key, release both paths, read the target inode, add it to the
 * fixup dir with link_to_fixup_dir(), bump its link count, unlink it from
 * the directory with btrfs_unlink_inode(), run the delayed items and
 * search @root for dir_key again (the original comment notes more names
 * may remain under the same key).  Otherwise log_path is released and ptr
 * moves past the fixed dir item header.  Both paths are released at the
 * end.
 *
 * NOTE(review): the return value of btrfs_run_delayed_items() is not
 * captured by the visible call.
 * NOTE(review): the dir parameter declaration, the kmalloc() result check,
 * the remaining lookup arguments, the kfree()/iput() calls, the result of
 * the repeated search, the statement that skips the name bytes and the
 * return statement are missing from this copy; confirm against the
 * complete source.
 */
1662 static noinline
int check_item_in_log(struct btrfs_trans_handle
*trans
,
1663 struct btrfs_root
*root
,
1664 struct btrfs_root
*log
,
1665 struct btrfs_path
*path
,
1666 struct btrfs_path
*log_path
,
1668 struct btrfs_key
*dir_key
)
1671 struct extent_buffer
*eb
;
1674 struct btrfs_dir_item
*di
;
1675 struct btrfs_dir_item
*log_di
;
1678 unsigned long ptr_end
;
1680 struct inode
*inode
;
1681 struct btrfs_key location
;
1684 eb
= path
->nodes
[0];
1685 slot
= path
->slots
[0];
1686 item_size
= btrfs_item_size_nr(eb
, slot
);
1687 ptr
= btrfs_item_ptr_offset(eb
, slot
);
1688 ptr_end
= ptr
+ item_size
;
1689 while (ptr
< ptr_end
) {
1690 di
= (struct btrfs_dir_item
*)ptr
;
1691 if (verify_dir_item(root
, eb
, di
)) {
1696 name_len
= btrfs_dir_name_len(eb
, di
);
1697 name
= kmalloc(name_len
, GFP_NOFS
);
1702 read_extent_buffer(eb
, name
, (unsigned long)(di
+ 1),
1705 if (log
&& dir_key
->type
== BTRFS_DIR_ITEM_KEY
) {
1706 log_di
= btrfs_lookup_dir_item(trans
, log
, log_path
,
1709 } else if (log
&& dir_key
->type
== BTRFS_DIR_INDEX_KEY
) {
1710 log_di
= btrfs_lookup_dir_index_item(trans
, log
,
1716 if (IS_ERR_OR_NULL(log_di
)) {
1717 btrfs_dir_item_key_to_cpu(eb
, di
, &location
);
1718 btrfs_release_path(path
);
1719 btrfs_release_path(log_path
);
1720 inode
= read_one_inode(root
, location
.objectid
);
1726 ret
= link_to_fixup_dir(trans
, root
,
1727 path
, location
.objectid
);
1729 btrfs_inc_nlink(inode
);
1730 ret
= btrfs_unlink_inode(trans
, root
, dir
, inode
,
1734 btrfs_run_delayed_items(trans
, root
);
1739 /* there might still be more names under this key
1740 * check and repeat if required
1742 ret
= btrfs_search_slot(NULL
, root
, dir_key
, path
,
1749 btrfs_release_path(log_path
);
1752 ptr
= (unsigned long)(di
+ 1);
1757 btrfs_release_path(path
);
1758 btrfs_release_path(log_path
);
1763 * deletion replay happens before we copy any new directory items
1764 * out of the log or out of backreferences from inodes. It
1765 * scans the log to find ranges of keys that log is authoritative for,
1766 * and then scans the directory to find items in those ranges that are
1767 * not present in the log.
1769 * Anything we don't find in the log is unlinked and removed from the directory.
/*
 * Replay deletions for directory @dirid: scan the key ranges recorded in
 * @log and check the directory's entries in @root against the log.
 *
 * Visible steps: start with key_type BTRFS_DIR_LOG_ITEM_KEY and
 * dir_key.type BTRFS_DIR_ITEM_KEY, allocate log_path and read the
 * directory inode (the original comment says a missing inode is not an
 * error).  find_dir_range() yields [range_start, range_end] from the log;
 * the directory's keys in @root are then walked from range_start, moving
 * to the next leaf when the slot is past the last item, and each one is
 * passed to check_item_in_log().  The walk tests for a key that leaves the
 * directory or key type, for an offset above range_end and for an offset
 * of (u64)-1; otherwise dir_key.offset advances to found_key.offset + 1.
 * After a range, range_end is tested against (u64)-1 and range_start
 * becomes range_end + 1.  Once the first key type is finished the same
 * pass runs again with BTRFS_DIR_LOG_INDEX_KEY / BTRFS_DIR_INDEX_KEY.
 * Both paths are cleaned up at the end.
 *
 * NOTE(review): how del_all is used (only range_end = (u64)-1 is visible),
 * the loop constructs and labels, the error checks, the inode reference
 * drop and the return statement are missing from this copy; confirm
 * against the complete source.
 */
1772 static noinline
int replay_dir_deletes(struct btrfs_trans_handle
*trans
,
1773 struct btrfs_root
*root
,
1774 struct btrfs_root
*log
,
1775 struct btrfs_path
*path
,
1776 u64 dirid
, int del_all
)
1780 int key_type
= BTRFS_DIR_LOG_ITEM_KEY
;
1782 struct btrfs_key dir_key
;
1783 struct btrfs_key found_key
;
1784 struct btrfs_path
*log_path
;
1787 dir_key
.objectid
= dirid
;
1788 dir_key
.type
= BTRFS_DIR_ITEM_KEY
;
1789 log_path
= btrfs_alloc_path();
1793 dir
= read_one_inode(root
, dirid
);
1794 /* it isn't an error if the inode isn't there, that can happen
1795 * because we replay the deletes before we copy in the inode item
1799 btrfs_free_path(log_path
);
1807 range_end
= (u64
)-1;
1809 ret
= find_dir_range(log
, path
, dirid
, key_type
,
1810 &range_start
, &range_end
);
1815 dir_key
.offset
= range_start
;
1818 ret
= btrfs_search_slot(NULL
, root
, &dir_key
, path
,
1823 nritems
= btrfs_header_nritems(path
->nodes
[0]);
1824 if (path
->slots
[0] >= nritems
) {
1825 ret
= btrfs_next_leaf(root
, path
);
1829 btrfs_item_key_to_cpu(path
->nodes
[0], &found_key
,
1831 if (found_key
.objectid
!= dirid
||
1832 found_key
.type
!= dir_key
.type
)
1835 if (found_key
.offset
> range_end
)
1838 ret
= check_item_in_log(trans
, root
, log
, path
,
1842 if (found_key
.offset
== (u64
)-1)
1844 dir_key
.offset
= found_key
.offset
+ 1;
1846 btrfs_release_path(path
);
1847 if (range_end
== (u64
)-1)
1849 range_start
= range_end
+ 1;
1854 if (key_type
== BTRFS_DIR_LOG_ITEM_KEY
) {
1855 key_type
= BTRFS_DIR_LOG_INDEX_KEY
;
1856 dir_key
.type
= BTRFS_DIR_INDEX_KEY
;
1857 btrfs_release_path(path
);
1861 btrfs_release_path(path
);
1862 btrfs_free_path(log_path
);
1868 * the process_func used to replay items from the log tree. This
1869 * gets called in two different stages. The first stage just looks
1870 * for inodes and makes sure they are all copied into the subvolume.
1872 * The second stage copies all the other item types from the log into
1873 * the subvolume. The two stage approach is slower, but gets rid of
1874 * lots of complexity around inodes referencing other inodes that exist
1875 * only in the log (references come from either directory items or inode back refs).
/*
 * Replay the items of one log tree block @eb into wc->replay_dest.
 *
 * Visible steps: read the block with btrfs_read_buffer(eb, gen), fetch its
 * level, allocate a path and iterate over all items in the block:
 *  - a BTRFS_INODE_ITEM_KEY seen while wc->stage is
 *    LOG_WALK_REPLAY_INODES: for a directory mode run
 *    replay_dir_deletes() with del_all 0, copy the item with
 *    overwrite_item(), for a regular file insert an orphan item (the
 *    original comment says orphan cleanup later truncates extents past
 *    the new EOF), then add the inode to the fixup dir;
 *  - a test on wc->stage < LOG_WALK_REPLAY_ALL precedes the remaining
 *    cases;
 *  - BTRFS_XATTR_ITEM_KEY goes through overwrite_item();
 *  - BTRFS_INODE_REF_KEY and BTRFS_INODE_EXTREF_KEY go through
 *    add_inode_ref(), with BUG_ON() on any error other than -ENOENT;
 *  - BTRFS_EXTENT_DATA_KEY goes through replay_one_extent();
 *  - BTRFS_DIR_ITEM_KEY and BTRFS_DIR_INDEX_KEY go through
 *    replay_one_dir_item().
 * The path is freed at the end.
 *
 * NOTE(review): what is done with the level, the action behind the stage
 * test, the error checks after most calls, the remaining call arguments
 * and the return statement are missing from this copy; confirm against
 * the complete source.
 */
1878 static int replay_one_buffer(struct btrfs_root
*log
, struct extent_buffer
*eb
,
1879 struct walk_control
*wc
, u64 gen
)
1882 struct btrfs_path
*path
;
1883 struct btrfs_root
*root
= wc
->replay_dest
;
1884 struct btrfs_key key
;
1889 ret
= btrfs_read_buffer(eb
, gen
);
1893 level
= btrfs_header_level(eb
);
1898 path
= btrfs_alloc_path();
1902 nritems
= btrfs_header_nritems(eb
);
1903 for (i
= 0; i
< nritems
; i
++) {
1904 btrfs_item_key_to_cpu(eb
, &key
, i
);
1906 /* inode keys are done during the first stage */
1907 if (key
.type
== BTRFS_INODE_ITEM_KEY
&&
1908 wc
->stage
== LOG_WALK_REPLAY_INODES
) {
1909 struct btrfs_inode_item
*inode_item
;
1912 inode_item
= btrfs_item_ptr(eb
, i
,
1913 struct btrfs_inode_item
);
1914 mode
= btrfs_inode_mode(eb
, inode_item
);
1915 if (S_ISDIR(mode
)) {
1916 ret
= replay_dir_deletes(wc
->trans
,
1917 root
, log
, path
, key
.objectid
, 0);
1920 ret
= overwrite_item(wc
->trans
, root
, path
,
1924 /* for regular files, make sure corresponding
1925 * orhpan item exist. extents past the new EOF
1926 * will be truncated later by orphan cleanup.
1928 if (S_ISREG(mode
)) {
1929 ret
= insert_orphan_item(wc
->trans
, root
,
1934 ret
= link_to_fixup_dir(wc
->trans
, root
,
1935 path
, key
.objectid
);
1938 if (wc
->stage
< LOG_WALK_REPLAY_ALL
)
1941 /* these keys are simply copied */
1942 if (key
.type
== BTRFS_XATTR_ITEM_KEY
) {
1943 ret
= overwrite_item(wc
->trans
, root
, path
,
1946 } else if (key
.type
== BTRFS_INODE_REF_KEY
) {
1947 ret
= add_inode_ref(wc
->trans
, root
, log
, path
,
1949 BUG_ON(ret
&& ret
!= -ENOENT
);
1950 } else if (key
.type
== BTRFS_INODE_EXTREF_KEY
) {
1951 ret
= add_inode_ref(wc
->trans
, root
, log
, path
,
1953 BUG_ON(ret
&& ret
!= -ENOENT
);
1954 } else if (key
.type
== BTRFS_EXTENT_DATA_KEY
) {
1955 ret
= replay_one_extent(wc
->trans
, root
, path
,
1958 } else if (key
.type
== BTRFS_DIR_ITEM_KEY
||
1959 key
.type
== BTRFS_DIR_INDEX_KEY
) {
1960 ret
= replay_one_dir_item(wc
->trans
, root
, path
,
1965 btrfs_free_path(path
);
/*
 * Walk down the log tree from path->nodes[*level], handing child blocks
 * to wc->process_func.
 *
 * Visible steps while *level > 0: take the current node, compare its
 * header level with *level, compare the current slot with the number of
 * items, then read the child's block number and generation from the
 * slot, compute the block size for level *level - 1, record the owner of
 * the parent node and get the child with btrfs_find_create_tree_block().
 * One sequence calls wc->process_func(root, next, wc, ptr_gen), advances
 * the slot, reads the buffer, then locks it, calls clean_tree_block(),
 * waits for writeback, unlocks it, warns if the owner is not
 * BTRFS_TREE_LOG_OBJECTID and calls btrfs_free_and_pin_reserved_extent()
 * with BUG_ON() on failure, and finally drops the buffer reference.
 * Another sequence reads the buffer, stores it in
 * path->nodes[*level - 1] (freeing any previous buffer there), sets
 * *level to the child's header level and resets that level's slot to 0.
 * After the loop the slot at *level is set to the node's item count.
 *
 * NOTE(review): the conditions that choose between the two sequences, the
 * error checks, the exits taken by the level and slot tests and the
 * return statement are missing from this copy; confirm against the
 * complete source.
 */
1969 static noinline
int walk_down_log_tree(struct btrfs_trans_handle
*trans
,
1970 struct btrfs_root
*root
,
1971 struct btrfs_path
*path
, int *level
,
1972 struct walk_control
*wc
)
1977 struct extent_buffer
*next
;
1978 struct extent_buffer
*cur
;
1979 struct extent_buffer
*parent
;
1983 WARN_ON(*level
< 0);
1984 WARN_ON(*level
>= BTRFS_MAX_LEVEL
);
1986 while (*level
> 0) {
1987 WARN_ON(*level
< 0);
1988 WARN_ON(*level
>= BTRFS_MAX_LEVEL
);
1989 cur
= path
->nodes
[*level
];
1991 if (btrfs_header_level(cur
) != *level
)
1994 if (path
->slots
[*level
] >=
1995 btrfs_header_nritems(cur
))
1998 bytenr
= btrfs_node_blockptr(cur
, path
->slots
[*level
]);
1999 ptr_gen
= btrfs_node_ptr_generation(cur
, path
->slots
[*level
]);
2000 blocksize
= btrfs_level_size(root
, *level
- 1);
2002 parent
= path
->nodes
[*level
];
2003 root_owner
= btrfs_header_owner(parent
);
2005 next
= btrfs_find_create_tree_block(root
, bytenr
, blocksize
);
2010 ret
= wc
->process_func(root
, next
, wc
, ptr_gen
);
2014 path
->slots
[*level
]++;
2016 ret
= btrfs_read_buffer(next
, ptr_gen
);
2018 free_extent_buffer(next
);
2022 btrfs_tree_lock(next
);
2023 btrfs_set_lock_blocking(next
);
2024 clean_tree_block(trans
, root
, next
);
2025 btrfs_wait_tree_block_writeback(next
);
2026 btrfs_tree_unlock(next
);
2028 WARN_ON(root_owner
!=
2029 BTRFS_TREE_LOG_OBJECTID
);
2030 ret
= btrfs_free_and_pin_reserved_extent(root
,
2032 BUG_ON(ret
); /* -ENOMEM or logic errors */
2034 free_extent_buffer(next
);
2037 ret
= btrfs_read_buffer(next
, ptr_gen
);
2039 free_extent_buffer(next
);
2043 WARN_ON(*level
<= 0);
2044 if (path
->nodes
[*level
-1])
2045 free_extent_buffer(path
->nodes
[*level
-1]);
2046 path
->nodes
[*level
-1] = next
;
2047 *level
= btrfs_header_level(next
);
2048 path
->slots
[*level
] = 0;
2051 WARN_ON(*level
< 0);
2052 WARN_ON(*level
>= BTRFS_MAX_LEVEL
);
2054 path
->slots
[*level
] = btrfs_header_nritems(path
->nodes
[*level
]);
/*
 * Walk back up the log tree after walk_down_log_tree() has run out of
 * slots at *level.
 *
 * Visible steps: iterate i from *level upwards while i is below
 * BTRFS_MAX_LEVEL - 1 and path->nodes[i] is set.  When the slot at level i
 * still has a following item (slot + 1 < nritems) a separate branch is
 * taken, which contains WARN_ON(*level == 0).  Otherwise the node at
 * *level is finished: the parent is path->nodes[*level] itself when that
 * node is root->node, else path->nodes[*level + 1]; the parent's owner is
 * recorded and wc->process_func() is called on the node with its header
 * generation.  A further sequence locks the node, calls
 * clean_tree_block(), waits for writeback, unlocks it, warns if the owner
 * is not BTRFS_TREE_LOG_OBJECTID and calls
 * btrfs_free_and_pin_reserved_extent() on the node's start/len.  The
 * node's buffer reference is then dropped and the path entry cleared.
 *
 * NOTE(review): the body of the "more slots" branch, the condition guarding
 * the free-and-pin sequence, the error checks, the *level update and the
 * return values are missing from this copy; confirm against the complete
 * source.
 */
2060 static noinline
int walk_up_log_tree(struct btrfs_trans_handle
*trans
,
2061 struct btrfs_root
*root
,
2062 struct btrfs_path
*path
, int *level
,
2063 struct walk_control
*wc
)
2070 for (i
= *level
; i
< BTRFS_MAX_LEVEL
- 1 && path
->nodes
[i
]; i
++) {
2071 slot
= path
->slots
[i
];
2072 if (slot
+ 1 < btrfs_header_nritems(path
->nodes
[i
])) {
2075 WARN_ON(*level
== 0);
2078 struct extent_buffer
*parent
;
2079 if (path
->nodes
[*level
] == root
->node
)
2080 parent
= path
->nodes
[*level
];
2082 parent
= path
->nodes
[*level
+ 1];
2084 root_owner
= btrfs_header_owner(parent
);
2085 ret
= wc
->process_func(root
, path
->nodes
[*level
], wc
,
2086 btrfs_header_generation(path
->nodes
[*level
]));
2091 struct extent_buffer
*next
;
2093 next
= path
->nodes
[*level
];
2095 btrfs_tree_lock(next
);
2096 btrfs_set_lock_blocking(next
);
2097 clean_tree_block(trans
, root
, next
);
2098 btrfs_wait_tree_block_writeback(next
);
2099 btrfs_tree_unlock(next
);
2101 WARN_ON(root_owner
!= BTRFS_TREE_LOG_OBJECTID
);
2102 ret
= btrfs_free_and_pin_reserved_extent(root
,
2103 path
->nodes
[*level
]->start
,
2104 path
->nodes
[*level
]->len
);
2107 free_extent_buffer(path
->nodes
[*level
]);
2108 path
->nodes
[*level
] = NULL
;
2116 * drop the reference count on the tree rooted at 'snap'. This traverses
2117 * the tree freeing any blocks that have a ref count of zero after being decremented.
/*
 * Walk the whole tree rooted at log->node, calling wc->process_func on its
 * blocks.
 *
 * Visible steps: allocate a path, place log->node at its header level in
 * the path with an extra reference (extent_buffer_get()) and slot 0, then
 * alternate walk_down_log_tree() and walk_up_log_tree().  Afterwards, if
 * path->nodes[orig_level] is still set, the root node is passed to
 * wc->process_func() with its header generation; a further sequence locks
 * it, calls clean_tree_block(), waits for writeback, unlocks it, warns if
 * the root key objectid is not BTRFS_TREE_LOG_OBJECTID and calls
 * btrfs_free_and_pin_reserved_extent() with BUG_ON() on failure.  Finally
 * every buffer still held in levels 0..orig_level is released and the
 * path is freed.
 *
 * NOTE(review): the assignment of orig_level, the loop construct and its
 * exit tests on wret, the condition guarding the free-and-pin sequence,
 * the error handling and the return statement are missing from this copy;
 * confirm against the complete source.
 */
2120 static int walk_log_tree(struct btrfs_trans_handle
*trans
,
2121 struct btrfs_root
*log
, struct walk_control
*wc
)
2126 struct btrfs_path
*path
;
2130 path
= btrfs_alloc_path();
2134 level
= btrfs_header_level(log
->node
);
2136 path
->nodes
[level
] = log
->node
;
2137 extent_buffer_get(log
->node
);
2138 path
->slots
[level
] = 0;
2141 wret
= walk_down_log_tree(trans
, log
, path
, &level
, wc
);
2149 wret
= walk_up_log_tree(trans
, log
, path
, &level
, wc
);
2158 /* was the root node processed? if not, catch it here */
2159 if (path
->nodes
[orig_level
]) {
2160 ret
= wc
->process_func(log
, path
->nodes
[orig_level
], wc
,
2161 btrfs_header_generation(path
->nodes
[orig_level
]));
2165 struct extent_buffer
*next
;
2167 next
= path
->nodes
[orig_level
];
2169 btrfs_tree_lock(next
);
2170 btrfs_set_lock_blocking(next
);
2171 clean_tree_block(trans
, log
, next
);
2172 btrfs_wait_tree_block_writeback(next
);
2173 btrfs_tree_unlock(next
);
2175 WARN_ON(log
->root_key
.objectid
!=
2176 BTRFS_TREE_LOG_OBJECTID
);
2177 ret
= btrfs_free_and_pin_reserved_extent(log
, next
->start
,
2179 BUG_ON(ret
); /* -ENOMEM or logic errors */
2184 for (i
= 0; i
<= orig_level
; i
++) {
2185 if (path
->nodes
[i
]) {
2186 free_extent_buffer(path
->nodes
[i
]);
2187 path
->nodes
[i
] = NULL
;
2190 btrfs_free_path(path
);
2195 * helper function to update the item for a given subvolumes log root
2196 * in the tree of log roots
/*
 * Write the root item of @log into log->fs_info->log_root_tree.
 *
 * When log->log_transid is 1 (first sync, per the original comment) the
 * root item is inserted with btrfs_insert_root(); the other visible call
 * updates the existing item with btrfs_update_root().  Both use
 * log->root_key and log->root_item.
 *
 * NOTE(review): the declaration of ret, the else keyword and the return
 * statement are missing from this copy; confirm against the complete
 * source.
 */
2198 static int update_log_root(struct btrfs_trans_handle
*trans
,
2199 struct btrfs_root
*log
)
2203 if (log
->log_transid
== 1) {
2204 /* insert root item on the first sync */
2205 ret
= btrfs_insert_root(trans
, log
->fs_info
->log_root_tree
,
2206 &log
->root_key
, &log
->root_item
);
2208 ret
= btrfs_update_root(trans
, log
->fs_info
->log_root_tree
,
2209 &log
->root_key
, &log
->root_item
);
/*
 * Wait for the log commit identified by @transid on @root.
 *
 * The wait queue used is root->log_commit_wait[transid % 2].  Each round
 * calls prepare_to_wait() in TASK_UNINTERRUPTIBLE state, drops
 * root->log_mutex, re-tests the wait condition, calls finish_wait() and
 * takes root->log_mutex again.  The condition, used both inside the round
 * and as the loop test, is: last_trans_log_full_commit differs from
 * trans->transid, root->log_transid < transid + 2, and
 * root->log_commit[index] is non-zero.
 *
 * Because the mutex is unlocked first and re-locked last, callers
 * presumably hold root->log_mutex on entry -- verify against the callers.
 *
 * NOTE(review): the wait entry definition, the "do {" opener, the
 * statement executed when the inner test is true and the return statement
 * are missing from this copy; confirm against the complete source.
 */
2214 static int wait_log_commit(struct btrfs_trans_handle
*trans
,
2215 struct btrfs_root
*root
, unsigned long transid
)
2218 int index
= transid
% 2;
2221 * we only allow two pending log transactions at a time,
2222 * so we know that if ours is more than 2 older than the
2223 * current transaction, we're done
2226 prepare_to_wait(&root
->log_commit_wait
[index
],
2227 &wait
, TASK_UNINTERRUPTIBLE
);
2228 mutex_unlock(&root
->log_mutex
);
2230 if (root
->fs_info
->last_trans_log_full_commit
!=
2231 trans
->transid
&& root
->log_transid
< transid
+ 2 &&
2232 atomic_read(&root
->log_commit
[index
]))
2235 finish_wait(&root
->log_commit_wait
[index
], &wait
);
2236 mutex_lock(&root
->log_mutex
);
2237 } while (root
->fs_info
->last_trans_log_full_commit
!=
2238 trans
->transid
&& root
->log_transid
< transid
+ 2 &&
2239 atomic_read(&root
->log_commit
[index
]));
/*
 * Wait until @root has no active log writers.
 *
 * Loops while last_trans_log_full_commit differs from trans->transid and
 * root->log_writers is non-zero.  Each round calls prepare_to_wait() on
 * root->log_writer_wait in TASK_UNINTERRUPTIBLE state, drops
 * root->log_mutex, re-tests the same condition, takes root->log_mutex
 * again and calls finish_wait().
 *
 * Because the mutex is unlocked first and re-locked afterwards, callers
 * presumably hold root->log_mutex on entry -- verify against the callers.
 *
 * NOTE(review): the wait entry definition and the statement executed when
 * the inner test is true are missing from this copy; confirm against the
 * complete source.
 */
2243 static void wait_for_writer(struct btrfs_trans_handle
*trans
,
2244 struct btrfs_root
*root
)
2247 while (root
->fs_info
->last_trans_log_full_commit
!=
2248 trans
->transid
&& atomic_read(&root
->log_writers
)) {
2249 prepare_to_wait(&root
->log_writer_wait
,
2250 &wait
, TASK_UNINTERRUPTIBLE
);
2251 mutex_unlock(&root
->log_mutex
);
2252 if (root
->fs_info
->last_trans_log_full_commit
!=
2253 trans
->transid
&& atomic_read(&root
->log_writers
))
2255 mutex_lock(&root
->log_mutex
);
2256 finish_wait(&root
->log_writer_wait
, &wait
);
2261 * btrfs_sync_log sends a given tree log down to the disk and
2262 * updates the super blocks to record it. When this call is done,
2263 * you know that any inodes previously logged are safely on disk only if it returns 0.
2266 * Any other return value means you need to call btrfs_commit_transaction.
2267 * Some of the edge cases for fsyncing directories that have had unlinks
2268 * or renames done in the past mean that sometimes the only safe
2269 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
2270 * that has happened.
/*
 * Write the log tree of @root to disk and record it in the super block.
 *
 * Visible sequence:
 *  1. Under root->log_mutex, index1 = root->log_transid % 2.  If
 *     log_commit[index1] is already set, wait for that commit with
 *     wait_log_commit() and unlock; otherwise set log_commit[index1] to 1.
 *  2. If the other slot, log_commit[(index1 + 1) % 2], is set, wait for
 *     the previous log sync.
 *  3. Sample root->log_batch; when the SSD option is off and
 *     root->log_multiple_pids is set, sleep one tick with the mutex
 *     dropped; call wait_for_writer(); compare the sample with the
 *     current log_batch.
 *  4. Bail out if last_trans_log_full_commit equals trans->transid.
 *  5. Choose the extent mark from log_transid parity (EXTENT_DIRTY for
 *     even values) and start IO with btrfs_write_marked_extents(); a
 *     failure path aborts the transaction and unlocks.
 *  6. Store log->node in log->root_item, increment root->log_transid, copy
 *     it to log->log_transid, clear log_start_pid and drop
 *     root->log_mutex (the original comment explains why writers may
 *     proceed from here).
 *  7. On log_root_tree: bump log_batch and log_writers under its mutex,
 *     call update_log_root(), then retake the mutex, drop the writer
 *     count and wake log_writer_wait.  In the failure path, errors other
 *     than -ENOSPC abort the transaction; last_trans_log_full_commit is
 *     set to trans->transid and the log's marked extents are waited for.
 *  8. index2 = log_root_tree->log_transid % 2, handled like index1: wait
 *     and unlock if a commit is in flight, else mark it, wait for the
 *     previous one and for writers.
 *  9. Re-check the full commit flag; if set, wait for the marked extents,
 *     unlock and jump to out_wake_log_root.
 * 10. Write and wait for the log root tree's EXTENT_DIRTY | EXTENT_NEW
 *     extents, wait for the log's marked extents, store the log root
 *     tree's start and level in super_for_commit, increment
 *     log_root_tree->log_transid and unlock.
 * 11. Write the super block with write_ctree_super(trans, tree_root, 1),
 *     bracketed by btrfs_scrub_pause_super() /
 *     btrfs_scrub_continue_super(); a failure path aborts the
 *     transaction.
 * 12. Under root->log_mutex raise root->last_log_commit to log_transid.
 * 13. Clear log_commit[index2] and log_commit[index1], waking the
 *     matching log_commit_wait queues.
 *
 * NOTE(review): the declarations, the loop around step 3, the else branch
 * of the mark selection, the error tests guarding each failure path, the
 * labels and gotos other than the visible ones, any statements between
 * the atomic_set() and waitqueue_active() calls, and the return values
 * are missing from this copy; confirm against the complete source.
 */
2272 int btrfs_sync_log(struct btrfs_trans_handle
*trans
,
2273 struct btrfs_root
*root
)
2279 struct btrfs_root
*log
= root
->log_root
;
2280 struct btrfs_root
*log_root_tree
= root
->fs_info
->log_root_tree
;
2281 unsigned long log_transid
= 0;
2283 mutex_lock(&root
->log_mutex
);
2284 index1
= root
->log_transid
% 2;
2285 if (atomic_read(&root
->log_commit
[index1
])) {
2286 wait_log_commit(trans
, root
, root
->log_transid
);
2287 mutex_unlock(&root
->log_mutex
);
2290 atomic_set(&root
->log_commit
[index1
], 1);
2292 /* wait for previous tree log sync to complete */
2293 if (atomic_read(&root
->log_commit
[(index1
+ 1) % 2]))
2294 wait_log_commit(trans
, root
, root
->log_transid
- 1);
2296 int batch
= atomic_read(&root
->log_batch
);
2297 /* when we're on an ssd, just kick the log commit out */
2298 if (!btrfs_test_opt(root
, SSD
) && root
->log_multiple_pids
) {
2299 mutex_unlock(&root
->log_mutex
);
2300 schedule_timeout_uninterruptible(1);
2301 mutex_lock(&root
->log_mutex
);
2303 wait_for_writer(trans
, root
);
2304 if (batch
== atomic_read(&root
->log_batch
))
2308 /* bail out if we need to do a full commit */
2309 if (root
->fs_info
->last_trans_log_full_commit
== trans
->transid
) {
2311 mutex_unlock(&root
->log_mutex
);
2315 log_transid
= root
->log_transid
;
2316 if (log_transid
% 2 == 0)
2317 mark
= EXTENT_DIRTY
;
2321 /* we start IO on all the marked extents here, but we don't actually
2322 * wait for them until later.
2324 ret
= btrfs_write_marked_extents(log
, &log
->dirty_log_pages
, mark
);
2326 btrfs_abort_transaction(trans
, root
, ret
);
2327 mutex_unlock(&root
->log_mutex
);
2331 btrfs_set_root_node(&log
->root_item
, log
->node
);
2333 root
->log_transid
++;
2334 log
->log_transid
= root
->log_transid
;
2335 root
->log_start_pid
= 0;
2338 * IO has been started, blocks of the log tree have WRITTEN flag set
2339 * in their headers. new modifications of the log will be written to
2340 * new positions. so it's safe to allow log writers to go in.
2342 mutex_unlock(&root
->log_mutex
);
2344 mutex_lock(&log_root_tree
->log_mutex
);
2345 atomic_inc(&log_root_tree
->log_batch
);
2346 atomic_inc(&log_root_tree
->log_writers
);
2347 mutex_unlock(&log_root_tree
->log_mutex
);
2349 ret
= update_log_root(trans
, log
);
2351 mutex_lock(&log_root_tree
->log_mutex
);
2352 if (atomic_dec_and_test(&log_root_tree
->log_writers
)) {
2354 if (waitqueue_active(&log_root_tree
->log_writer_wait
))
2355 wake_up(&log_root_tree
->log_writer_wait
);
2359 if (ret
!= -ENOSPC
) {
2360 btrfs_abort_transaction(trans
, root
, ret
);
2361 mutex_unlock(&log_root_tree
->log_mutex
);
2364 root
->fs_info
->last_trans_log_full_commit
= trans
->transid
;
2365 btrfs_wait_marked_extents(log
, &log
->dirty_log_pages
, mark
);
2366 mutex_unlock(&log_root_tree
->log_mutex
);
2371 index2
= log_root_tree
->log_transid
% 2;
2372 if (atomic_read(&log_root_tree
->log_commit
[index2
])) {
2373 btrfs_wait_marked_extents(log
, &log
->dirty_log_pages
, mark
);
2374 wait_log_commit(trans
, log_root_tree
,
2375 log_root_tree
->log_transid
);
2376 mutex_unlock(&log_root_tree
->log_mutex
);
2380 atomic_set(&log_root_tree
->log_commit
[index2
], 1);
2382 if (atomic_read(&log_root_tree
->log_commit
[(index2
+ 1) % 2])) {
2383 wait_log_commit(trans
, log_root_tree
,
2384 log_root_tree
->log_transid
- 1);
2387 wait_for_writer(trans
, log_root_tree
);
2390 * now that we've moved on to the tree of log tree roots,
2391 * check the full commit flag again
2393 if (root
->fs_info
->last_trans_log_full_commit
== trans
->transid
) {
2394 btrfs_wait_marked_extents(log
, &log
->dirty_log_pages
, mark
);
2395 mutex_unlock(&log_root_tree
->log_mutex
);
2397 goto out_wake_log_root
;
2400 ret
= btrfs_write_and_wait_marked_extents(log_root_tree
,
2401 &log_root_tree
->dirty_log_pages
,
2402 EXTENT_DIRTY
| EXTENT_NEW
);
2404 btrfs_abort_transaction(trans
, root
, ret
);
2405 mutex_unlock(&log_root_tree
->log_mutex
);
2406 goto out_wake_log_root
;
2408 btrfs_wait_marked_extents(log
, &log
->dirty_log_pages
, mark
);
2410 btrfs_set_super_log_root(root
->fs_info
->super_for_commit
,
2411 log_root_tree
->node
->start
);
2412 btrfs_set_super_log_root_level(root
->fs_info
->super_for_commit
,
2413 btrfs_header_level(log_root_tree
->node
));
2415 log_root_tree
->log_transid
++;
2418 mutex_unlock(&log_root_tree
->log_mutex
);
2421 * nobody else is going to jump in and write the the ctree
2422 * super here because the log_commit atomic below is protecting
2423 * us. We must be called with a transaction handle pinning
2424 * the running transaction open, so a full commit can't hop
2425 * in and cause problems either.
2427 btrfs_scrub_pause_super(root
);
2428 ret
= write_ctree_super(trans
, root
->fs_info
->tree_root
, 1);
2429 btrfs_scrub_continue_super(root
);
2431 btrfs_abort_transaction(trans
, root
, ret
);
2432 goto out_wake_log_root
;
2435 mutex_lock(&root
->log_mutex
);
2436 if (root
->last_log_commit
< log_transid
)
2437 root
->last_log_commit
= log_transid
;
2438 mutex_unlock(&root
->log_mutex
);
2441 atomic_set(&log_root_tree
->log_commit
[index2
], 0);
2443 if (waitqueue_active(&log_root_tree
->log_commit_wait
[index2
]))
2444 wake_up(&log_root_tree
->log_commit_wait
[index2
]);
2446 atomic_set(&root
->log_commit
[index1
], 0);
2448 if (waitqueue_active(&root
->log_commit_wait
[index1
]))
2449 wake_up(&root
->log_commit_wait
[index1
]);
/*
 * Release the blocks and dirty-range state of the log tree @log.
 *
 * Visible steps: set up a walk_control whose process_func is
 * process_one_buffer and run walk_log_tree() with it; then look up ranges
 * of log->dirty_log_pages carrying EXTENT_DIRTY | EXTENT_NEW with
 * find_first_extent_bit() (searching from 0) and clear those bits with
 * clear_extent_bits(); finally drop the reference on log->node.
 *
 * NOTE(review): the remaining walk_control fields, the handling of the
 * walk_log_tree() result, the loop around the find/clear pair and any
 * freeing of @log itself are missing from this copy; confirm against the
 * complete source.
 */
2453 static void free_log_tree(struct btrfs_trans_handle
*trans
,
2454 struct btrfs_root
*log
)
2459 struct walk_control wc
= {
2461 .process_func
= process_one_buffer
2464 ret
= walk_log_tree(trans
, log
, &wc
);
2468 ret
= find_first_extent_bit(&log
->dirty_log_pages
,
2469 0, &start
, &end
, EXTENT_DIRTY
| EXTENT_NEW
,
2474 clear_extent_bits(&log
->dirty_log_pages
, start
, end
,
2475 EXTENT_DIRTY
| EXTENT_NEW
, GFP_NOFS
);
2478 free_extent_buffer(log
->node
);
2483 * free all the extents used by the tree log. This should be called
2484 * at commit time of the full transaction
/*
 * Free the log tree attached to @root, if there is one: call
 * free_log_tree() on root->log_root and then clear the pointer.
 *
 * NOTE(review): the return statement is missing from this copy; confirm
 * the return value against the complete source.
 */
2486 int btrfs_free_log(struct btrfs_trans_handle
*trans
, struct btrfs_root
*root
)
2488 if (root
->log_root
) {
2489 free_log_tree(trans
, root
->log_root
);
2490 root
->log_root
= NULL
;
/*
 * Free the tree of log roots, if there is one: call free_log_tree() on
 * fs_info->log_root_tree and then clear the pointer.
 *
 * NOTE(review): the return statement is missing from this copy; confirm
 * the return value against the complete source.
 */
2495 int btrfs_free_log_root_tree(struct btrfs_trans_handle
*trans
,
2496 struct btrfs_fs_info
*fs_info
)
2498 if (fs_info
->log_root_tree
) {
2499 free_log_tree(trans
, fs_info
->log_root_tree
);
2500 fs_info
->log_root_tree
= NULL
;
2506 * If both a file and directory are logged, and unlinks or renames are
2507 * mixed in, we have a few interesting corners:
2509 * create file X in dir Y
2510 * link file X to X.link in dir Y
2512 * unlink file X but leave X.link
2515 * After a crash we would expect only X.link to exist. But file X
2516 * didn't get fsync'd again so the log has back refs for X and X.link.
2518 * We solve this by removing directory entries and inode backrefs from the
2519 * log when a file that was logged in the current transaction is
2520 * unlinked. Any later fsync will include the updated log entries, and
2521 * we'll be able to reconstruct the proper directory items from backrefs.
2523 * This optimization allows us to avoid relogging the entire inode
2524 * or the entire directory.
/*
 * Remove the directory entries for @name (dir item and dir index @index)
 * of directory @dir from the log tree of @root.
 *
 * Visible steps: test BTRFS_I(dir)->logged_trans against trans->transid;
 * join the running log transaction; take the directory's log_mutex;
 * allocate a path.  The dir item for the name is looked up in the log
 * with -1 as the last argument and removed with
 * btrfs_delete_one_dir_name(), adding name_len to bytes_del; the same is
 * done for the dir index item.  The directory's inode item in the log,
 * key (dir_ino, BTRFS_INODE_ITEM_KEY), is then searched for and its size
 * is reduced by bytes_del when the current size is larger, after which
 * the leaf is marked dirty.  Finally the path is freed, the log_mutex is
 * released, a result of -ENOSPC sets last_trans_log_full_commit to
 * trans->transid, btrfs_abort_transaction() is called on another path and
 * the log transaction is ended with btrfs_end_log_trans().
 *
 * NOTE(review): the action behind the logged_trans test, the checks on
 * each lookup result, the key.offset assignment, the branch taken when
 * i_size is not larger than bytes_del, the condition guarding the abort
 * and the return statement are missing from this copy; confirm against
 * the complete source.
 */
2526 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle
*trans
,
2527 struct btrfs_root
*root
,
2528 const char *name
, int name_len
,
2529 struct inode
*dir
, u64 index
)
2531 struct btrfs_root
*log
;
2532 struct btrfs_dir_item
*di
;
2533 struct btrfs_path
*path
;
2537 u64 dir_ino
= btrfs_ino(dir
);
2539 if (BTRFS_I(dir
)->logged_trans
< trans
->transid
)
2542 ret
= join_running_log_trans(root
);
2546 mutex_lock(&BTRFS_I(dir
)->log_mutex
);
2548 log
= root
->log_root
;
2549 path
= btrfs_alloc_path();
2555 di
= btrfs_lookup_dir_item(trans
, log
, path
, dir_ino
,
2556 name
, name_len
, -1);
2562 ret
= btrfs_delete_one_dir_name(trans
, log
, path
, di
);
2563 bytes_del
+= name_len
;
2566 btrfs_release_path(path
);
2567 di
= btrfs_lookup_dir_index_item(trans
, log
, path
, dir_ino
,
2568 index
, name
, name_len
, -1);
2574 ret
= btrfs_delete_one_dir_name(trans
, log
, path
, di
);
2575 bytes_del
+= name_len
;
2579 /* update the directory size in the log to reflect the names
2583 struct btrfs_key key
;
2585 key
.objectid
= dir_ino
;
2587 key
.type
= BTRFS_INODE_ITEM_KEY
;
2588 btrfs_release_path(path
);
2590 ret
= btrfs_search_slot(trans
, log
, &key
, path
, 0, 1);
2596 struct btrfs_inode_item
*item
;
2599 item
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
2600 struct btrfs_inode_item
);
2601 i_size
= btrfs_inode_size(path
->nodes
[0], item
);
2602 if (i_size
> bytes_del
)
2603 i_size
-= bytes_del
;
2606 btrfs_set_inode_size(path
->nodes
[0], item
, i_size
);
2607 btrfs_mark_buffer_dirty(path
->nodes
[0]);
2610 btrfs_release_path(path
);
2613 btrfs_free_path(path
);
2615 mutex_unlock(&BTRFS_I(dir
)->log_mutex
);
2616 if (ret
== -ENOSPC
) {
2617 root
->fs_info
->last_trans_log_full_commit
= trans
->transid
;
2620 btrfs_abort_transaction(trans
, root
, ret
);
2622 btrfs_end_log_trans(root
);
2627 /* see comments for btrfs_del_dir_entries_in_log */
/*
 * Remove the back reference for @name of @inode from the log tree of
 * @root.
 *
 * Visible steps: test BTRFS_I(inode)->logged_trans against
 * trans->transid; join the running log transaction; take the inode's
 * log_mutex and call btrfs_del_inode_ref() on root->log_root; release the
 * mutex.  A result of -ENOSPC sets last_trans_log_full_commit to
 * trans->transid; any other negative result except -ENOENT aborts the
 * transaction.  The log transaction is then ended with
 * btrfs_end_log_trans().
 *
 * NOTE(review): the action behind the logged_trans test, the check of the
 * join result, the remaining btrfs_del_inode_ref() arguments and the
 * return statement are missing from this copy; confirm against the
 * complete source.
 */
2628 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle
*trans
,
2629 struct btrfs_root
*root
,
2630 const char *name
, int name_len
,
2631 struct inode
*inode
, u64 dirid
)
2633 struct btrfs_root
*log
;
2637 if (BTRFS_I(inode
)->logged_trans
< trans
->transid
)
2640 ret
= join_running_log_trans(root
);
2643 log
= root
->log_root
;
2644 mutex_lock(&BTRFS_I(inode
)->log_mutex
);
2646 ret
= btrfs_del_inode_ref(trans
, log
, name
, name_len
, btrfs_ino(inode
),
2648 mutex_unlock(&BTRFS_I(inode
)->log_mutex
);
2649 if (ret
== -ENOSPC
) {
2650 root
->fs_info
->last_trans_log_full_commit
= trans
->transid
;
2652 } else if (ret
< 0 && ret
!= -ENOENT
)
2653 btrfs_abort_transaction(trans
, root
, ret
);
2654 btrfs_end_log_trans(root
);
2660 * creates a range item in the log for 'dirid'. first_offset and
2661 * last_offset tell us which parts of the key space the log should
2662 * be considered authoritative for.
/*
 * Insert a range item for directory @dirid into @log.
 *
 * The key is (dirid, type, first_offset), where the type is
 * BTRFS_DIR_LOG_ITEM_KEY when @key_type is BTRFS_DIR_ITEM_KEY and
 * BTRFS_DIR_LOG_INDEX_KEY on the other visible assignment.  An empty item
 * of sizeof(struct btrfs_dir_log_item) bytes is inserted, its end field
 * is set to @last_offset with btrfs_set_dir_log_end(), the leaf is marked
 * dirty and the path is released.
 *
 * NOTE(review): the declaration of ret, the else keyword, the check of
 * the insert result and the return statement are missing from this copy;
 * confirm against the complete source.
 */
2664 static noinline
int insert_dir_log_key(struct btrfs_trans_handle
*trans
,
2665 struct btrfs_root
*log
,
2666 struct btrfs_path
*path
,
2667 int key_type
, u64 dirid
,
2668 u64 first_offset
, u64 last_offset
)
2671 struct btrfs_key key
;
2672 struct btrfs_dir_log_item
*item
;
2674 key
.objectid
= dirid
;
2675 key
.offset
= first_offset
;
2676 if (key_type
== BTRFS_DIR_ITEM_KEY
)
2677 key
.type
= BTRFS_DIR_LOG_ITEM_KEY
;
2679 key
.type
= BTRFS_DIR_LOG_INDEX_KEY
;
2680 ret
= btrfs_insert_empty_item(trans
, log
, path
, &key
, sizeof(*item
));
2684 item
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
2685 struct btrfs_dir_log_item
);
2686 btrfs_set_dir_log_end(path
->nodes
[0], item
, last_offset
);
2687 btrfs_mark_buffer_dirty(path
->nodes
[0]);
2688 btrfs_release_path(path
);
2693 * log all the items included in the current transaction for a given
2694 * directory. This also creates the range items in the log tree required
2695 * to replay anything deleted before the fsync
/*
 * log_dir_items - log the directory items of one key type for 'inode'.
 *
 * Visible flow:
 *  - btrfs_search_forward() looks in 'root' between (ino, key_type,
 *    min_offset) and (ino, key_type, (u64)-1), passing trans->transid as
 *    its last argument.
 *  - When the result is not a key of this inode and type, the last key of
 *    the type is looked up instead (btrfs_search_slot() at offset (u64)-1,
 *    then btrfs_previous_item()) and first_offset may be moved to
 *    max(min_offset, tmp.offset) + 1.
 *  - Otherwise the item in front of the first match is copied with
 *    overwrite_item() and first_offset takes its offset; the items of this
 *    inode/type in the current leaf are copied one by one, and
 *    btrfs_next_leaf() is used to look at the following leaf.
 *  - 'path' and 'dst_path' are released, last_offset is stored through
 *    'last_offset_ret', and insert_dir_log_key() records the range
 *    first_offset..last_offset in the log tree.
 *
 * NOTE(review): the embedded source line numbers are not contiguous, so
 * statements (labels, gotos, error checks, trailing call arguments) are
 * presumably missing from this listing; verify against the complete file.
 */
2697 static noinline
int log_dir_items(struct btrfs_trans_handle
*trans
,
2698 struct btrfs_root
*root
, struct inode
*inode
,
2699 struct btrfs_path
*path
,
2700 struct btrfs_path
*dst_path
, int key_type
,
2701 u64 min_offset
, u64
*last_offset_ret
)
2703 struct btrfs_key min_key
;
2704 struct btrfs_key max_key
;
2705 struct btrfs_root
*log
= root
->log_root
;
2706 struct extent_buffer
*src
;
2711 u64 first_offset
= min_offset
;
2712 u64 last_offset
= (u64
)-1;
2713 u64 ino
= btrfs_ino(inode
);
2715 log
= root
->log_root
;
2716 max_key
.objectid
= ino
;
2717 max_key
.offset
= (u64
)-1;
2718 max_key
.type
= key_type
;
2720 min_key
.objectid
= ino
;
2721 min_key
.type
= key_type
;
2722 min_key
.offset
= min_offset
;
2724 path
->keep_locks
= 1;
2726 ret
= btrfs_search_forward(root
, &min_key
, &max_key
,
2727 path
, 0, trans
->transid
);
2730 * we didn't find anything from this transaction, see if there
2731 * is anything at all
2733 if (ret
!= 0 || min_key
.objectid
!= ino
|| min_key
.type
!= key_type
) {
2734 min_key
.objectid
= ino
;
2735 min_key
.type
= key_type
;
2736 min_key
.offset
= (u64
)-1;
2737 btrfs_release_path(path
);
2738 ret
= btrfs_search_slot(NULL
, root
, &min_key
, path
, 0, 0);
2740 btrfs_release_path(path
);
2743 ret
= btrfs_previous_item(root
, path
, ino
, key_type
);
2745 /* if ret == 0 there are items for this type,
2746 * create a range to tell us the last key of this type.
2747 * otherwise, there are no items in this directory after
2748 * *min_offset, and we create a range to indicate that.
2751 struct btrfs_key tmp
;
2752 btrfs_item_key_to_cpu(path
->nodes
[0], &tmp
,
2754 if (key_type
== tmp
.type
)
2755 first_offset
= max(min_offset
, tmp
.offset
) + 1;
2760 /* go backward to find any previous key */
2761 ret
= btrfs_previous_item(root
, path
, ino
, key_type
);
2763 struct btrfs_key tmp
;
2764 btrfs_item_key_to_cpu(path
->nodes
[0], &tmp
, path
->slots
[0]);
2765 if (key_type
== tmp
.type
) {
2766 first_offset
= tmp
.offset
;
2767 ret
= overwrite_item(trans
, log
, dst_path
,
2768 path
->nodes
[0], path
->slots
[0],
2776 btrfs_release_path(path
);
2778 /* find the first key from this transaction again */
2779 ret
= btrfs_search_slot(NULL
, root
, &min_key
, path
, 0, 0);
2786 * we have a block from this transaction, log every item in it
2787 * from our directory
2790 struct btrfs_key tmp
;
2791 src
= path
->nodes
[0];
2792 nritems
= btrfs_header_nritems(src
);
2793 for (i
= path
->slots
[0]; i
< nritems
; i
++) {
2794 btrfs_item_key_to_cpu(src
, &min_key
, i
);
2796 if (min_key
.objectid
!= ino
|| min_key
.type
!= key_type
)
2798 ret
= overwrite_item(trans
, log
, dst_path
, src
, i
,
2805 path
->slots
[0] = nritems
;
2808 * look ahead to the next item and see if it is also
2809 * from this directory and from this transaction
2811 ret
= btrfs_next_leaf(root
, path
);
2813 last_offset
= (u64
)-1;
2816 btrfs_item_key_to_cpu(path
->nodes
[0], &tmp
, path
->slots
[0]);
2817 if (tmp
.objectid
!= ino
|| tmp
.type
!= key_type
) {
2818 last_offset
= (u64
)-1;
2821 if (btrfs_header_generation(path
->nodes
[0]) != trans
->transid
) {
2822 ret
= overwrite_item(trans
, log
, dst_path
,
2823 path
->nodes
[0], path
->slots
[0],
2828 last_offset
= tmp
.offset
;
2833 btrfs_release_path(path
);
2834 btrfs_release_path(dst_path
);
2837 *last_offset_ret
= last_offset
;
2839 * insert the log range keys to indicate where the log
2842 ret
= insert_dir_log_key(trans
, log
, path
, key_type
,
2843 ino
, first_offset
, last_offset
);
2851 * logging directories is very similar to logging inodes. We find all the items
2852 * from the current transaction and write them to the log.
2854 * The recovery code scans the directory in the subvolume, and if it finds a
2855 * key in the range logged that is not present in the log tree, then it means
2856 * that dir entry was unlinked during the transaction.
2858 * In order for that scan to work, we must include one key smaller than
2859 * the smallest logged by this transaction and one key larger than the largest
2860 * key logged by this transaction.
/*
 * log_directory_changes - log the directory entries of 'inode'.
 *
 * Calls log_dir_items() with the current key_type and min_key, tests
 * max_key against (u64)-1 and advances min_key to max_key + 1.  key_type
 * starts as BTRFS_DIR_ITEM_KEY and is switched to BTRFS_DIR_INDEX_KEY
 * afterwards, so both key types are handled by the same code.
 *
 * 'path' and 'dst_path' are passed straight through to log_dir_items().
 *
 * NOTE(review): the embedded source line numbers are not contiguous, so the
 * loop construct, the declarations of min_key/max_key and the return
 * statements are presumably missing from this listing; verify against the
 * complete file.
 */
2862 static noinline
int log_directory_changes(struct btrfs_trans_handle
*trans
,
2863 struct btrfs_root
*root
, struct inode
*inode
,
2864 struct btrfs_path
*path
,
2865 struct btrfs_path
*dst_path
)
2870 int key_type
= BTRFS_DIR_ITEM_KEY
;
2876 ret
= log_dir_items(trans
, root
, inode
, path
,
2877 dst_path
, key_type
, min_key
,
2881 if (max_key
== (u64
)-1)
2883 min_key
= max_key
+ 1;
2886 if (key_type
== BTRFS_DIR_ITEM_KEY
) {
2887 key_type
= BTRFS_DIR_INDEX_KEY
;
2894 * a helper function to drop items from the log before we relog an
2895 * inode. max_key_type indicates the highest item type to remove.
2896 * This cannot be run for file data extents because it does not
2897 * free the extents they point to.
/*
 * drop_objectid_items - delete items of 'objectid' from the log tree.
 *
 * Visible flow: btrfs_search_slot() is run on 'log' for the key
 * (objectid, max_key_type, (u64)-1) with arguments -1 and 1, the key at
 * path->slots[0] is read into found_key and its objectid is compared with
 * 'objectid'.  found_key.offset is zeroed before btrfs_bin_search() is run
 * on the same leaf, and btrfs_del_items() then removes
 * path->slots[0] - start_slot + 1 items beginning at start_slot.  The
 * condition "ret || start_slot != 0" is tested afterwards; per the inline
 * comment, a non-zero start slot means no further search is needed.
 *
 * NOTE(review): the embedded source line numbers are not contiguous, so the
 * loop construct, break/return statements and some call arguments are
 * presumably missing from this listing; verify against the complete file.
 */
2899 static int drop_objectid_items(struct btrfs_trans_handle
*trans
,
2900 struct btrfs_root
*log
,
2901 struct btrfs_path
*path
,
2902 u64 objectid
, int max_key_type
)
2905 struct btrfs_key key
;
2906 struct btrfs_key found_key
;
2909 key
.objectid
= objectid
;
2910 key
.type
= max_key_type
;
2911 key
.offset
= (u64
)-1;
2914 ret
= btrfs_search_slot(trans
, log
, &key
, path
, -1, 1);
2919 if (path
->slots
[0] == 0)
2923 btrfs_item_key_to_cpu(path
->nodes
[0], &found_key
,
2926 if (found_key
.objectid
!= objectid
)
2929 found_key
.offset
= 0;
2931 ret
= btrfs_bin_search(path
->nodes
[0], &found_key
, 0,
2934 ret
= btrfs_del_items(trans
, log
, path
, start_slot
,
2935 path
->slots
[0] - start_slot
+ 1);
2937 * If start slot isn't 0 then we don't need to re-search, we've
2938 * found the last guy with the objectid in this tree.
2940 if (ret
|| start_slot
!= 0)
2942 btrfs_release_path(path
);
2944 btrfs_release_path(path
);
2950 static void fill_inode_item(struct btrfs_trans_handle
*trans
,
2951 struct extent_buffer
*leaf
,
2952 struct btrfs_inode_item
*item
,
2953 struct inode
*inode
, int log_inode_only
)
2955 struct btrfs_map_token token
;
2957 btrfs_init_map_token(&token
);
2959 if (log_inode_only
) {
2960 /* set the generation to zero so the recover code
2961 * can tell the difference between an logging
2962 * just to say 'this inode exists' and a logging
2963 * to say 'update this inode with these values'
2965 btrfs_set_token_inode_generation(leaf
, item
, 0, &token
);
2966 btrfs_set_token_inode_size(leaf
, item
, 0, &token
);
2968 btrfs_set_token_inode_generation(leaf
, item
,
2969 BTRFS_I(inode
)->generation
,
2971 btrfs_set_token_inode_size(leaf
, item
, inode
->i_size
, &token
);
2974 btrfs_set_token_inode_uid(leaf
, item
, i_uid_read(inode
), &token
);
2975 btrfs_set_token_inode_gid(leaf
, item
, i_gid_read(inode
), &token
);
2976 btrfs_set_token_inode_mode(leaf
, item
, inode
->i_mode
, &token
);
2977 btrfs_set_token_inode_nlink(leaf
, item
, inode
->i_nlink
, &token
);
2979 btrfs_set_token_timespec_sec(leaf
, btrfs_inode_atime(item
),
2980 inode
->i_atime
.tv_sec
, &token
);
2981 btrfs_set_token_timespec_nsec(leaf
, btrfs_inode_atime(item
),
2982 inode
->i_atime
.tv_nsec
, &token
);
2984 btrfs_set_token_timespec_sec(leaf
, btrfs_inode_mtime(item
),
2985 inode
->i_mtime
.tv_sec
, &token
);
2986 btrfs_set_token_timespec_nsec(leaf
, btrfs_inode_mtime(item
),
2987 inode
->i_mtime
.tv_nsec
, &token
);
2989 btrfs_set_token_timespec_sec(leaf
, btrfs_inode_ctime(item
),
2990 inode
->i_ctime
.tv_sec
, &token
);
2991 btrfs_set_token_timespec_nsec(leaf
, btrfs_inode_ctime(item
),
2992 inode
->i_ctime
.tv_nsec
, &token
);
2994 btrfs_set_token_inode_nbytes(leaf
, item
, inode_get_bytes(inode
),
2997 btrfs_set_token_inode_sequence(leaf
, item
, inode
->i_version
, &token
);
2998 btrfs_set_token_inode_transid(leaf
, item
, trans
->transid
, &token
);
2999 btrfs_set_token_inode_rdev(leaf
, item
, inode
->i_rdev
, &token
);
3000 btrfs_set_token_inode_flags(leaf
, item
, BTRFS_I(inode
)->flags
, &token
);
3001 btrfs_set_token_inode_block_group(leaf
, item
, 0, &token
);
3004 static int log_inode_item(struct btrfs_trans_handle
*trans
,
3005 struct btrfs_root
*log
, struct btrfs_path
*path
,
3006 struct inode
*inode
)
3008 struct btrfs_inode_item
*inode_item
;
3009 struct btrfs_key key
;
3012 memcpy(&key
, &BTRFS_I(inode
)->location
, sizeof(key
));
3013 ret
= btrfs_insert_empty_item(trans
, log
, path
, &key
,
3014 sizeof(*inode_item
));
3015 if (ret
&& ret
!= -EEXIST
)
3017 inode_item
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
3018 struct btrfs_inode_item
);
3019 fill_inode_item(trans
, path
->nodes
[0], inode_item
, inode
, 0);
3020 btrfs_release_path(path
);
/*
 * copy_items - copy 'nr' items, starting at 'start_slot' of leaf 'src', into
 * the log tree of the inode's root (BTRFS_I(inode)->root->log_root).
 *
 * Visible flow:
 *  - One kmalloc(GFP_NOFS) buffer holds nr u32 item sizes followed by nr
 *    keys (ins_sizes points at its start, ins_keys nr * sizeof(u32) bytes
 *    in).  Both arrays are filled from 'src'.
 *  - btrfs_insert_empty_items() reserves the items in the log through
 *    'dst_path'; the second loop advances dst_path->slots[0] per item.
 *  - BTRFS_INODE_ITEM_KEY items are produced with fill_inode_item(), with
 *    log_inode_only set when inode_only == LOG_INODE_EXISTS;
 *    copy_extent_buffer() copies ins_sizes[i] bytes from src to the
 *    destination leaf (presumably for every other item type -- the
 *    branch keyword is not visible here).
 *  - For BTRFS_EXTENT_DATA_KEY items of type BTRFS_FILE_EXTENT_REG the
 *    checksum range is looked up with btrfs_lookup_csums_range() on
 *    log->fs_info->csum_root.
 *  - After the destination leaf is marked dirty and dst_path released,
 *    each entry on ordered_sums is passed to btrfs_csum_file_blocks() and
 *    removed from the list.
 *
 * skip_csum is derived from BTRFS_INODE_NODATASUM in the inode flags.
 *
 * NOTE(review): the embedded source line numbers are not contiguous, so
 * statements (error checks, frees, continue/else keywords, trailing call
 * arguments) are presumably missing from this listing; verify against the
 * complete file.
 */
3024 static noinline
int copy_items(struct btrfs_trans_handle
*trans
,
3025 struct inode
*inode
,
3026 struct btrfs_path
*dst_path
,
3027 struct extent_buffer
*src
,
3028 int start_slot
, int nr
, int inode_only
)
3030 unsigned long src_offset
;
3031 unsigned long dst_offset
;
3032 struct btrfs_root
*log
= BTRFS_I(inode
)->root
->log_root
;
3033 struct btrfs_file_extent_item
*extent
;
3034 struct btrfs_inode_item
*inode_item
;
3036 struct btrfs_key
*ins_keys
;
3040 struct list_head ordered_sums
;
3041 int skip_csum
= BTRFS_I(inode
)->flags
& BTRFS_INODE_NODATASUM
;
3043 INIT_LIST_HEAD(&ordered_sums
);
3045 ins_data
= kmalloc(nr
* sizeof(struct btrfs_key
) +
3046 nr
* sizeof(u32
), GFP_NOFS
);
3050 ins_sizes
= (u32
*)ins_data
;
3051 ins_keys
= (struct btrfs_key
*)(ins_data
+ nr
* sizeof(u32
));
3053 for (i
= 0; i
< nr
; i
++) {
3054 ins_sizes
[i
] = btrfs_item_size_nr(src
, i
+ start_slot
);
3055 btrfs_item_key_to_cpu(src
, ins_keys
+ i
, i
+ start_slot
);
3057 ret
= btrfs_insert_empty_items(trans
, log
, dst_path
,
3058 ins_keys
, ins_sizes
, nr
);
3064 for (i
= 0; i
< nr
; i
++, dst_path
->slots
[0]++) {
3065 dst_offset
= btrfs_item_ptr_offset(dst_path
->nodes
[0],
3066 dst_path
->slots
[0]);
3068 src_offset
= btrfs_item_ptr_offset(src
, start_slot
+ i
);
3070 if (ins_keys
[i
].type
== BTRFS_INODE_ITEM_KEY
) {
3071 inode_item
= btrfs_item_ptr(dst_path
->nodes
[0],
3073 struct btrfs_inode_item
);
3074 fill_inode_item(trans
, dst_path
->nodes
[0], inode_item
,
3075 inode
, inode_only
== LOG_INODE_EXISTS
);
3077 copy_extent_buffer(dst_path
->nodes
[0], src
, dst_offset
,
3078 src_offset
, ins_sizes
[i
]);
3081 /* take a reference on file data extents so that truncates
3082 * or deletes of this inode don't have to relog the inode
3085 if (btrfs_key_type(ins_keys
+ i
) == BTRFS_EXTENT_DATA_KEY
&&
3088 extent
= btrfs_item_ptr(src
, start_slot
+ i
,
3089 struct btrfs_file_extent_item
);
3091 if (btrfs_file_extent_generation(src
, extent
) < trans
->transid
)
3094 found_type
= btrfs_file_extent_type(src
, extent
);
3095 if (found_type
== BTRFS_FILE_EXTENT_REG
) {
3097 ds
= btrfs_file_extent_disk_bytenr(src
,
3099 /* ds == 0 is a hole */
3103 dl
= btrfs_file_extent_disk_num_bytes(src
,
3105 cs
= btrfs_file_extent_offset(src
, extent
);
3106 cl
= btrfs_file_extent_num_bytes(src
,
3108 if (btrfs_file_extent_compression(src
,
3114 ret
= btrfs_lookup_csums_range(
3115 log
->fs_info
->csum_root
,
3116 ds
+ cs
, ds
+ cs
+ cl
- 1,
3123 btrfs_mark_buffer_dirty(dst_path
->nodes
[0]);
3124 btrfs_release_path(dst_path
);
3128 * we have to do this after the loop above to avoid changing the
3129 * log tree while trying to change the log tree.
3132 while (!list_empty(&ordered_sums
)) {
3133 struct btrfs_ordered_sum
*sums
= list_entry(ordered_sums
.next
,
3134 struct btrfs_ordered_sum
,
3137 ret
= btrfs_csum_file_blocks(trans
, log
, sums
);
3138 list_del(&sums
->list
);
3144 static int extent_cmp(void *priv
, struct list_head
*a
, struct list_head
*b
)
3146 struct extent_map
*em1
, *em2
;
3148 em1
= list_entry(a
, struct extent_map
, list
);
3149 em2
= list_entry(b
, struct extent_map
, list
);
3151 if (em1
->start
< em2
->start
)
3153 else if (em1
->start
> em2
->start
)
/*
 * drop_adjacent_extents - remove or trim file extent items in 'root' that
 * overlap the range of extent map 'em', starting from the position held in
 * 'path'.
 *
 * Visible flow:
 *  - When path->slots[0] is at or past the end of the leaf,
 *    btrfs_del_items() and btrfs_next_leaf_write() are called and the leaf
 *    pointer is refreshed.
 *  - The key at the current slot is tested: a different objectid, a type
 *    other than BTRFS_EXTENT_DATA_KEY, or an offset at or beyond
 *    em->start + em->len is checked for explicitly.
 *  - extent_end is computed from the item (num_bytes for REG/PREALLOC,
 *    inline length for INLINE extents).  If extent_end lies within
 *    em->start + em->len the slot is recorded in del_slot.
 *  - Before adjusting an item that extends past the range, the next key in
 *    the leaf is examined; per the inline comment this avoids a panic in
 *    set_item_key_safe caused by overlapping extents.
 *  - Otherwise the item key is moved to em->start + em->len with
 *    btrfs_set_item_key_safe() and its offset/num_bytes are adjusted to
 *    match.  Inline extents hit BUG_ON() on this path.
 *  - A final btrfs_del_items(del_slot, del_nr) removes recorded slots.
 *
 * NOTE(review): the embedded source line numbers are not contiguous, so the
 * loop construct, break/continue/return statements and several call
 * arguments are presumably missing from this listing; verify against the
 * complete file.
 */
3158 static int drop_adjacent_extents(struct btrfs_trans_handle
*trans
,
3159 struct btrfs_root
*root
, struct inode
*inode
,
3160 struct extent_map
*em
,
3161 struct btrfs_path
*path
)
3163 struct btrfs_file_extent_item
*fi
;
3164 struct extent_buffer
*leaf
;
3165 struct btrfs_key key
, new_key
;
3166 struct btrfs_map_token token
;
3168 u64 extent_offset
= 0;
3175 btrfs_init_map_token(&token
);
3176 leaf
= path
->nodes
[0];
3178 if (path
->slots
[0] >= btrfs_header_nritems(leaf
)) {
3180 ret
= btrfs_del_items(trans
, root
, path
,
3187 ret
= btrfs_next_leaf_write(trans
, root
, path
, 1);
3192 leaf
= path
->nodes
[0];
3195 btrfs_item_key_to_cpu(leaf
, &key
, path
->slots
[0]);
3196 if (key
.objectid
!= btrfs_ino(inode
) ||
3197 key
.type
!= BTRFS_EXTENT_DATA_KEY
||
3198 key
.offset
>= em
->start
+ em
->len
)
3201 fi
= btrfs_item_ptr(leaf
, path
->slots
[0],
3202 struct btrfs_file_extent_item
);
3203 extent_type
= btrfs_token_file_extent_type(leaf
, fi
, &token
);
3204 if (extent_type
== BTRFS_FILE_EXTENT_REG
||
3205 extent_type
== BTRFS_FILE_EXTENT_PREALLOC
) {
3206 extent_offset
= btrfs_token_file_extent_offset(leaf
,
3208 extent_end
= key
.offset
+
3209 btrfs_token_file_extent_num_bytes(leaf
, fi
,
3211 } else if (extent_type
== BTRFS_FILE_EXTENT_INLINE
) {
3212 extent_end
= key
.offset
+
3213 btrfs_file_extent_inline_len(leaf
, fi
);
3218 if (extent_end
<= em
->len
+ em
->start
) {
3220 del_slot
= path
->slots
[0];
3227 * Ok so we'll ignore previous items if we log a new extent,
3228 * which can lead to overlapping extents, so if we have an
3229 * existing extent we want to adjust we _have_ to check the next
3230 * guy to make sure we even need this extent anymore, this keeps
3231 * us from panicing in set_item_key_safe.
3233 if (path
->slots
[0] < btrfs_header_nritems(leaf
) - 1) {
3234 struct btrfs_key tmp_key
;
3236 btrfs_item_key_to_cpu(leaf
, &tmp_key
,
3237 path
->slots
[0] + 1);
3238 if (tmp_key
.objectid
== btrfs_ino(inode
) &&
3239 tmp_key
.type
== BTRFS_EXTENT_DATA_KEY
&&
3240 tmp_key
.offset
<= em
->start
+ em
->len
) {
3242 del_slot
= path
->slots
[0];
3248 BUG_ON(extent_type
== BTRFS_FILE_EXTENT_INLINE
);
3249 memcpy(&new_key
, &key
, sizeof(new_key
));
3250 new_key
.offset
= em
->start
+ em
->len
;
3251 btrfs_set_item_key_safe(trans
, root
, path
, &new_key
);
3252 extent_offset
+= em
->start
+ em
->len
- key
.offset
;
3253 btrfs_set_token_file_extent_offset(leaf
, fi
, extent_offset
,
3255 btrfs_set_token_file_extent_num_bytes(leaf
, fi
, extent_end
-
3256 (em
->start
+ em
->len
),
3258 btrfs_mark_buffer_dirty(leaf
);
3262 ret
= btrfs_del_items(trans
, root
, path
, del_slot
, del_nr
);
/*
 * log_one_extent - write a file extent item for extent map 'em' of 'inode'
 * into root->log_root, together with its checksums.
 *
 * Visible flow:
 *  - The item key is (btrfs_ino(inode), BTRFS_EXTENT_DATA_KEY, em->start).
 *    path->really_keep_locks is set to 1 before the insert and reset to 0
 *    on the insert-error path and after drop_adjacent_extents().
 *  - -EEXIST from btrfs_insert_empty_item() is not treated as a failure.
 *  - The item is filled from the extent map: generation, type (PREALLOC
 *    when EXTENT_FLAG_PREALLOC is set, otherwise REG), disk bytenr and
 *    disk num bytes (depending on compress_type and block_start), file
 *    offset em->start - em->orig_start, num_bytes and ram_bytes em->len,
 *    compression type, and zero for encryption / other encoding.
 *  - drop_adjacent_extents() is then run on the log tree; per the inline
 *    comment only the extent to the right needs checking because replay
 *    runs in key order.
 *  - Checksums are looked up with btrfs_lookup_csums_range() on
 *    log->fs_info->csum_root starting at em->block_start + csum_offset
 *    (csum_len becomes block_len for compressed extents) and written with
 *    btrfs_csum_file_blocks().
 *
 * skip_csum is derived from BTRFS_INODE_NODATASUM in the inode flags.
 *
 * NOTE(review): the embedded source line numbers are not contiguous, so
 * statements (returns, else keywords, frees, trailing call arguments) are
 * presumably missing from this listing; verify against the complete file.
 */
3267 static int log_one_extent(struct btrfs_trans_handle
*trans
,
3268 struct inode
*inode
, struct btrfs_root
*root
,
3269 struct extent_map
*em
, struct btrfs_path
*path
)
3271 struct btrfs_root
*log
= root
->log_root
;
3272 struct btrfs_file_extent_item
*fi
;
3273 struct extent_buffer
*leaf
;
3274 struct list_head ordered_sums
;
3275 struct btrfs_map_token token
;
3276 struct btrfs_key key
;
3277 u64 csum_offset
= em
->mod_start
- em
->start
;
3278 u64 csum_len
= em
->mod_len
;
3279 u64 extent_offset
= em
->start
- em
->orig_start
;
3282 bool skip_csum
= BTRFS_I(inode
)->flags
& BTRFS_INODE_NODATASUM
;
3284 INIT_LIST_HEAD(&ordered_sums
);
3285 btrfs_init_map_token(&token
);
3286 key
.objectid
= btrfs_ino(inode
);
3287 key
.type
= BTRFS_EXTENT_DATA_KEY
;
3288 key
.offset
= em
->start
;
3289 path
->really_keep_locks
= 1;
3291 ret
= btrfs_insert_empty_item(trans
, log
, path
, &key
, sizeof(*fi
));
3292 if (ret
&& ret
!= -EEXIST
) {
3293 path
->really_keep_locks
= 0;
3296 leaf
= path
->nodes
[0];
3297 fi
= btrfs_item_ptr(leaf
, path
->slots
[0],
3298 struct btrfs_file_extent_item
);
3299 btrfs_set_token_file_extent_generation(leaf
, fi
, em
->generation
,
3301 if (test_bit(EXTENT_FLAG_PREALLOC
, &em
->flags
)) {
3303 btrfs_set_token_file_extent_type(leaf
, fi
,
3304 BTRFS_FILE_EXTENT_PREALLOC
,
3307 btrfs_set_token_file_extent_type(leaf
, fi
,
3308 BTRFS_FILE_EXTENT_REG
,
3310 if (em
->block_start
== 0)
3314 block_len
= max(em
->block_len
, em
->orig_block_len
);
3315 if (em
->compress_type
!= BTRFS_COMPRESS_NONE
) {
3316 btrfs_set_token_file_extent_disk_bytenr(leaf
, fi
,
3319 btrfs_set_token_file_extent_disk_num_bytes(leaf
, fi
, block_len
,
3321 } else if (em
->block_start
< EXTENT_MAP_LAST_BYTE
) {
3322 btrfs_set_token_file_extent_disk_bytenr(leaf
, fi
,
3324 extent_offset
, &token
);
3325 btrfs_set_token_file_extent_disk_num_bytes(leaf
, fi
, block_len
,
3328 btrfs_set_token_file_extent_disk_bytenr(leaf
, fi
, 0, &token
);
3329 btrfs_set_token_file_extent_disk_num_bytes(leaf
, fi
, 0,
3333 btrfs_set_token_file_extent_offset(leaf
, fi
,
3334 em
->start
- em
->orig_start
,
3336 btrfs_set_token_file_extent_num_bytes(leaf
, fi
, em
->len
, &token
);
3337 btrfs_set_token_file_extent_ram_bytes(leaf
, fi
, em
->len
, &token
);
3338 btrfs_set_token_file_extent_compression(leaf
, fi
, em
->compress_type
,
3340 btrfs_set_token_file_extent_encryption(leaf
, fi
, 0, &token
);
3341 btrfs_set_token_file_extent_other_encoding(leaf
, fi
, 0, &token
);
3342 btrfs_mark_buffer_dirty(leaf
);
3345 * Have to check the extent to the right of us to make sure it doesn't
3346 * fall in our current range. We're ok if the previous extent is in our
3347 * range since the recovery stuff will run us in key order and thus just
3348 * drop the part we overwrote.
3350 ret
= drop_adjacent_extents(trans
, log
, inode
, em
, path
);
3351 btrfs_release_path(path
);
3352 path
->really_keep_locks
= 0;
3360 if (em
->compress_type
) {
3362 csum_len
= block_len
;
3365 /* block start is already adjusted for the file extent offset. */
3366 ret
= btrfs_lookup_csums_range(log
->fs_info
->csum_root
,
3367 em
->block_start
+ csum_offset
,
3368 em
->block_start
+ csum_offset
+
3369 csum_len
- 1, &ordered_sums
, 0);
3373 while (!list_empty(&ordered_sums
)) {
3374 struct btrfs_ordered_sum
*sums
= list_entry(ordered_sums
.next
,
3375 struct btrfs_ordered_sum
,
3378 ret
= btrfs_csum_file_blocks(trans
, log
, sums
);
3379 list_del(&sums
->list
);
/*
 * btrfs_log_changed_extents - log the extent maps on the inode's
 * modified_extents list.
 *
 * Visible flow:
 *  - With the extent tree's lock write-held, every entry is unlinked from
 *    tree->modified_extents.  Its generation is compared against
 *    fs_info->last_trans_committed; entries that are kept get an extra
 *    reference (so they cannot be evicted from the cache, per the inline
 *    comment), the EXTENT_FLAG_LOGGING bit, and are appended to a local
 *    list.
 *  - The local list is sorted with list_sort() using extent_cmp.
 *  - Each entry is removed from the local list; the lock is dropped around
 *    log_one_extent() and re-taken before clear_em_logging() and
 *    free_extent_map().  A second clear_em_logging()/free_extent_map()
 *    pair exists for the error case described by the inline comment.
 *  - The list is expected to be empty at the end (WARN_ON), the lock is
 *    released and 'path' is released.
 *
 * NOTE(review): the embedded source line numbers are not contiguous, so
 * statements (continue, error checks, returns) are presumably missing from
 * this listing; verify against the complete file.
 */
3386 static int btrfs_log_changed_extents(struct btrfs_trans_handle
*trans
,
3387 struct btrfs_root
*root
,
3388 struct inode
*inode
,
3389 struct btrfs_path
*path
)
3391 struct extent_map
*em
, *n
;
3392 struct list_head extents
;
3393 struct extent_map_tree
*tree
= &BTRFS_I(inode
)->extent_tree
;
3397 INIT_LIST_HEAD(&extents
);
3399 write_lock(&tree
->lock
);
3400 test_gen
= root
->fs_info
->last_trans_committed
;
3402 list_for_each_entry_safe(em
, n
, &tree
->modified_extents
, list
) {
3403 list_del_init(&em
->list
);
3404 if (em
->generation
<= test_gen
)
3406 /* Need a ref to keep it from getting evicted from cache */
3407 atomic_inc(&em
->refs
);
3408 set_bit(EXTENT_FLAG_LOGGING
, &em
->flags
);
3409 list_add_tail(&em
->list
, &extents
);
3412 list_sort(NULL
, &extents
, extent_cmp
);
3414 while (!list_empty(&extents
)) {
3415 em
= list_entry(extents
.next
, struct extent_map
, list
);
3417 list_del_init(&em
->list
);
3420 * If we had an error we just need to delete everybody from our
3424 clear_em_logging(tree
, em
);
3425 free_extent_map(em
);
3429 write_unlock(&tree
->lock
);
3431 ret
= log_one_extent(trans
, inode
, root
, em
, path
);
3432 write_lock(&tree
->lock
);
3433 clear_em_logging(tree
, em
);
3434 free_extent_map(em
);
3436 WARN_ON(!list_empty(&extents
));
3437 write_unlock(&tree
->lock
);
3439 btrfs_release_path(path
);
3443 /* log a single inode in the tree log.
3444 * At least one parent directory for this inode must exist in the tree
3445 * or be logged already.
3447 * Any items from this inode changed by the current transaction are copied
3448 * to the log tree. An extra reference is taken on any extents in this
3449 * file, allowing us to avoid a whole pile of corner cases around logging
3450 * blocks that have been removed from the tree.
3452 * See LOG_INODE_ALL and related defines for a description of what inode_only
3455 * This handles both files and directories.
/*
 * btrfs_log_inode - copy the items of 'inode' into the log tree of 'root'.
 *
 * Visible flow:
 *  - Two paths are allocated ('path' for the subvolume tree, 'dst_path'
 *    for the log); 'path' is freed again if the second allocation fails.
 *  - The key range starts at (ino, BTRFS_INODE_ITEM_KEY).  max_key.type is
 *    BTRFS_XATTR_ITEM_KEY for directories, and for LOG_INODE_EXISTS when
 *    BTRFS_INODE_NEEDS_FULL_SYNC is not set; (u8)-1 is the other value
 *    assigned.
 *  - Delayed items are committed for directories and for inodes whose
 *    generation is newer than fs_info->last_trans_committed.
 *  - Under the inode's log_mutex, existing log items are dropped first:
 *    drop_objectid_items() for directories, btrfs_truncate_inode_items()
 *    when NEEDS_FULL_SYNC was set, drop_objectid_items() when
 *    COPY_EVERYTHING was set; log_inode_item() is also called on one of
 *    the remaining paths.
 *  - btrfs_search_forward() walks the subvolume tree with trans->transid as
 *    its last argument; consecutive slots are batched (ins_start_slot /
 *    ins_nr) and written with copy_items().
 *  - btrfs_log_changed_extents() is called afterwards, and a separate
 *    branch empties tree->modified_extents under the tree lock.
 *  - For LOG_INODE_ALL on a directory, log_directory_changes() is run.
 *  - logged_trans and last_log_commit of the inode are updated, the mutex
 *    released and both paths freed.
 *
 * NOTE(review): the embedded source line numbers are not contiguous, so the
 * last parameter, loop constructs, labels, gotos and error checks are
 * presumably missing from this listing; verify against the complete file.
 */
3457 static int btrfs_log_inode(struct btrfs_trans_handle
*trans
,
3458 struct btrfs_root
*root
, struct inode
*inode
,
3461 struct btrfs_path
*path
;
3462 struct btrfs_path
*dst_path
;
3463 struct btrfs_key min_key
;
3464 struct btrfs_key max_key
;
3465 struct btrfs_root
*log
= root
->log_root
;
3466 struct extent_buffer
*src
= NULL
;
3470 int ins_start_slot
= 0;
3472 bool fast_search
= false;
3473 u64 ino
= btrfs_ino(inode
);
3475 log
= root
->log_root
;
3477 path
= btrfs_alloc_path();
3480 dst_path
= btrfs_alloc_path();
3482 btrfs_free_path(path
);
3486 min_key
.objectid
= ino
;
3487 min_key
.type
= BTRFS_INODE_ITEM_KEY
;
3490 max_key
.objectid
= ino
;
3493 /* today the code can only do partial logging of directories */
3494 if (S_ISDIR(inode
->i_mode
) ||
3495 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC
,
3496 &BTRFS_I(inode
)->runtime_flags
) &&
3497 inode_only
== LOG_INODE_EXISTS
))
3498 max_key
.type
= BTRFS_XATTR_ITEM_KEY
;
3500 max_key
.type
= (u8
)-1;
3501 max_key
.offset
= (u64
)-1;
3503 /* Only run delayed items if we are a dir or a new file */
3504 if (S_ISDIR(inode
->i_mode
) ||
3505 BTRFS_I(inode
)->generation
> root
->fs_info
->last_trans_committed
) {
3506 ret
= btrfs_commit_inode_delayed_items(trans
, inode
);
3508 btrfs_free_path(path
);
3509 btrfs_free_path(dst_path
);
3514 mutex_lock(&BTRFS_I(inode
)->log_mutex
);
3517 * a brute force approach to making sure we get the most uptodate
3518 * copies of everything.
3520 if (S_ISDIR(inode
->i_mode
)) {
3521 int max_key_type
= BTRFS_DIR_LOG_INDEX_KEY
;
3523 if (inode_only
== LOG_INODE_EXISTS
)
3524 max_key_type
= BTRFS_XATTR_ITEM_KEY
;
3525 ret
= drop_objectid_items(trans
, log
, path
, ino
, max_key_type
);
3527 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC
,
3528 &BTRFS_I(inode
)->runtime_flags
)) {
3529 clear_bit(BTRFS_INODE_COPY_EVERYTHING
,
3530 &BTRFS_I(inode
)->runtime_flags
);
3531 ret
= btrfs_truncate_inode_items(trans
, log
,
3533 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING
,
3534 &BTRFS_I(inode
)->runtime_flags
)) {
3535 if (inode_only
== LOG_INODE_ALL
)
3537 max_key
.type
= BTRFS_XATTR_ITEM_KEY
;
3538 ret
= drop_objectid_items(trans
, log
, path
, ino
,
3541 if (inode_only
== LOG_INODE_ALL
)
3543 ret
= log_inode_item(trans
, log
, dst_path
, inode
);
3556 path
->keep_locks
= 1;
3560 ret
= btrfs_search_forward(root
, &min_key
, &max_key
,
3561 path
, 0, trans
->transid
);
3565 /* note, ins_nr might be > 0 here, cleanup outside the loop */
3566 if (min_key
.objectid
!= ino
)
3568 if (min_key
.type
> max_key
.type
)
3571 src
= path
->nodes
[0];
3572 if (ins_nr
&& ins_start_slot
+ ins_nr
== path
->slots
[0]) {
3575 } else if (!ins_nr
) {
3576 ins_start_slot
= path
->slots
[0];
3581 ret
= copy_items(trans
, inode
, dst_path
, src
, ins_start_slot
,
3582 ins_nr
, inode_only
);
3588 ins_start_slot
= path
->slots
[0];
3591 nritems
= btrfs_header_nritems(path
->nodes
[0]);
3593 if (path
->slots
[0] < nritems
) {
3594 btrfs_item_key_to_cpu(path
->nodes
[0], &min_key
,
3599 ret
= copy_items(trans
, inode
, dst_path
, src
,
3601 ins_nr
, inode_only
);
3608 btrfs_release_path(path
);
3610 if (min_key
.offset
< (u64
)-1)
3612 else if (min_key
.type
< (u8
)-1)
3614 else if (min_key
.objectid
< (u64
)-1)
3620 ret
= copy_items(trans
, inode
, dst_path
, src
, ins_start_slot
,
3621 ins_nr
, inode_only
);
3631 btrfs_release_path(dst_path
);
3632 ret
= btrfs_log_changed_extents(trans
, root
, inode
, dst_path
);
3638 struct extent_map_tree
*tree
= &BTRFS_I(inode
)->extent_tree
;
3639 struct extent_map
*em
, *n
;
3641 write_lock(&tree
->lock
);
3642 list_for_each_entry_safe(em
, n
, &tree
->modified_extents
, list
)
3643 list_del_init(&em
->list
);
3644 write_unlock(&tree
->lock
);
3647 if (inode_only
== LOG_INODE_ALL
&& S_ISDIR(inode
->i_mode
)) {
3648 btrfs_release_path(path
);
3649 btrfs_release_path(dst_path
);
3650 ret
= log_directory_changes(trans
, root
, inode
, path
, dst_path
);
3656 BTRFS_I(inode
)->logged_trans
= trans
->transid
;
3657 BTRFS_I(inode
)->last_log_commit
= BTRFS_I(inode
)->last_sub_trans
;
3659 mutex_unlock(&BTRFS_I(inode
)->log_mutex
);
3661 btrfs_free_path(path
);
3662 btrfs_free_path(dst_path
);
3667 * follow the dentry parent pointers up the chain and see if any
3668 * of the directories in it require a full commit before they can
3669 * be logged. Returns zero if nothing special needs to be done or 1 if
3670 * a full commit is required.
/*
 * check_parent_dirs_for_sync - walk up the dentry chain from 'parent' and
 * look for directories that require a full commit.
 *
 * Visible flow:
 *  - A regular file whose generation and last_unlink_trans are both at or
 *    below 'last_committed' is tested for first; per the inline comment
 *    such files need no parent checks.
 *  - For a non-directory the walk starts at parent->d_inode, provided the
 *    parent exists, has an inode and lives on super block 'sb'.
 *  - For each inode visited, logged_trans is set to trans->transid.  When
 *    its last_unlink_trans is newer than 'last_committed',
 *    fs_info->last_trans_log_full_commit of the inode's root is written so
 *    that log commits are forced to be full commits.
 *  - The walk moves on with dget_parent(parent) and checks IS_ROOT(parent)
 *    and the super block test at each step.
 *
 * NOTE(review): the embedded source line numbers are not contiguous, so the
 * last parameter, the loop construct, dput calls and return statements are
 * presumably missing from this listing; verify against the complete file.
 */
3672 static noinline
int check_parent_dirs_for_sync(struct btrfs_trans_handle
*trans
,
3673 struct inode
*inode
,
3674 struct dentry
*parent
,
3675 struct super_block
*sb
,
3679 struct btrfs_root
*root
;
3680 struct dentry
*old_parent
= NULL
;
3683 * for regular files, if its inode is already on disk, we don't
3684 * have to worry about the parents at all. This is because
3685 * we can use the last_unlink_trans field to record renames
3686 * and other fun in this file.
3688 if (S_ISREG(inode
->i_mode
) &&
3689 BTRFS_I(inode
)->generation
<= last_committed
&&
3690 BTRFS_I(inode
)->last_unlink_trans
<= last_committed
)
3693 if (!S_ISDIR(inode
->i_mode
)) {
3694 if (!parent
|| !parent
->d_inode
|| sb
!= parent
->d_inode
->i_sb
)
3696 inode
= parent
->d_inode
;
3700 BTRFS_I(inode
)->logged_trans
= trans
->transid
;
3703 if (BTRFS_I(inode
)->last_unlink_trans
> last_committed
) {
3704 root
= BTRFS_I(inode
)->root
;
3707 * make sure any commits to the log are forced
3708 * to be full commits
3710 root
->fs_info
->last_trans_log_full_commit
=
3716 if (!parent
|| !parent
->d_inode
|| sb
!= parent
->d_inode
->i_sb
)
3719 if (IS_ROOT(parent
))
3722 parent
= dget_parent(parent
);
3724 old_parent
= parent
;
3725 inode
= parent
->d_inode
;
3734 * helper function around btrfs_log_inode to make sure newly created
3735 * parent directories also end up in the log. A minimal inode and backref
3736 * only logging is done of any parent directories that are older than
3737 * the last committed transaction
/*
 * btrfs_log_inode_parent - log 'inode' and, where needed, its parent
 * directories.
 *
 * exists_only selects LOG_INODE_EXISTS (non-zero) or LOG_INODE_ALL (zero)
 * for the inode itself.
 *
 * Visible flow:
 *  - Several conditions are tested up front: the NOTREELOG mount option,
 *    fs_info->last_trans_log_full_commit being newer than
 *    last_trans_committed, 'root' differing from the inode's root or having
 *    zero root refs, the result of check_parent_dirs_for_sync(), and
 *    btrfs_inode_in_log() (which yields BTRFS_NO_LOG_SYNC).
 *  - start_log_trans() is called, then btrfs_log_inode() for the inode.
 *  - A regular file whose generation and last_unlink_trans are at or below
 *    last_committed is tested for; per the inline comment its parents do
 *    not need to be walked.
 *  - Otherwise inode_only becomes LOG_INODE_EXISTS and the dentry chain is
 *    followed through dget_parent(); each parent inode on the same super
 *    block and root whose generation is newer than last_trans_committed is
 *    logged with btrfs_log_inode().
 *  - On the failure path a WARN_ON fires unless ret is -ENOSPC, and
 *    trans->transid is stored in last_trans_log_full_commit.
 *    btrfs_end_log_trans() closes the log transaction.
 *
 * NOTE(review): the embedded source line numbers are not contiguous, so
 * loop constructs, labels, gotos, dput calls and return statements are
 * presumably missing from this listing; verify against the complete file.
 */
3739 int btrfs_log_inode_parent(struct btrfs_trans_handle
*trans
,
3740 struct btrfs_root
*root
, struct inode
*inode
,
3741 struct dentry
*parent
, int exists_only
)
3743 int inode_only
= exists_only
? LOG_INODE_EXISTS
: LOG_INODE_ALL
;
3744 struct super_block
*sb
;
3745 struct dentry
*old_parent
= NULL
;
3747 u64 last_committed
= root
->fs_info
->last_trans_committed
;
3751 if (btrfs_test_opt(root
, NOTREELOG
)) {
3756 if (root
->fs_info
->last_trans_log_full_commit
>
3757 root
->fs_info
->last_trans_committed
) {
3762 if (root
!= BTRFS_I(inode
)->root
||
3763 btrfs_root_refs(&root
->root_item
) == 0) {
3768 ret
= check_parent_dirs_for_sync(trans
, inode
, parent
,
3769 sb
, last_committed
);
3773 if (btrfs_inode_in_log(inode
, trans
->transid
)) {
3774 ret
= BTRFS_NO_LOG_SYNC
;
3778 ret
= start_log_trans(trans
, root
);
3782 ret
= btrfs_log_inode(trans
, root
, inode
, inode_only
);
3787 * for regular files, if its inode is already on disk, we don't
3788 * have to worry about the parents at all. This is because
3789 * we can use the last_unlink_trans field to record renames
3790 * and other fun in this file.
3792 if (S_ISREG(inode
->i_mode
) &&
3793 BTRFS_I(inode
)->generation
<= last_committed
&&
3794 BTRFS_I(inode
)->last_unlink_trans
<= last_committed
) {
3799 inode_only
= LOG_INODE_EXISTS
;
3801 if (!parent
|| !parent
->d_inode
|| sb
!= parent
->d_inode
->i_sb
)
3804 inode
= parent
->d_inode
;
3805 if (root
!= BTRFS_I(inode
)->root
)
3808 if (BTRFS_I(inode
)->generation
>
3809 root
->fs_info
->last_trans_committed
) {
3810 ret
= btrfs_log_inode(trans
, root
, inode
, inode_only
);
3814 if (IS_ROOT(parent
))
3817 parent
= dget_parent(parent
);
3819 old_parent
= parent
;
3825 WARN_ON(ret
!= -ENOSPC
);
3826 root
->fs_info
->last_trans_log_full_commit
= trans
->transid
;
3829 btrfs_end_log_trans(root
);
3835 * it is not safe to log dentry if the chunk root has added new
3836 * chunks. This returns 0 if the dentry was logged, and 1 otherwise.
3837 * If this returns 1, you must commit the transaction to safely get your
/*
 * btrfs_log_dentry_safe - log the inode behind 'dentry'.
 *
 * Takes a reference on the dentry's parent with dget_parent() and calls
 * btrfs_log_inode_parent() for dentry->d_inode with exists_only == 0.
 *
 * NOTE(review): the embedded source line numbers are not contiguous, so
 * statements after the call (presumably the release of the parent reference
 * and the return) are missing from this listing; verify against the complete
 * file.
 */
3840 int btrfs_log_dentry_safe(struct btrfs_trans_handle
*trans
,
3841 struct btrfs_root
*root
, struct dentry
*dentry
)
3843 struct dentry
*parent
= dget_parent(dentry
);
3846 ret
= btrfs_log_inode_parent(trans
, root
, dentry
->d_inode
, parent
, 0);
3853 * should be called during mount to recover and replay any log trees
/*
 * btrfs_recover_log_trees - replay the log trees found under
 * 'log_root_tree'.
 *
 * Visible flow:
 *  - fs_info->log_root_recovering is set to 1 for the duration and cleared
 *    before the final commit.
 *  - A transaction is started on fs_info->tree_root.
 *  - walk_log_tree() is first run on log_root_tree itself with
 *    process_one_buffer as the callback (the "pin it all" step named in the
 *    inline comments).
 *  - Log roots are then looked up from the highest offset downwards, using
 *    the key (BTRFS_TREE_LOG_OBJECTID, BTRFS_ROOT_ITEM_KEY, offset).  For
 *    each one the log root is read, the target root is read from
 *    found_key.offset, the log is attached to it, walk_log_tree() is run,
 *    and fixup_inode_link_counts() is called in the LOG_WALK_REPLAY_ALL
 *    stage.  The log's node and commit_root buffers are freed afterwards.
 *  - The callback is switched to replay_one_buffer with stage
 *    LOG_WALK_REPLAY_INODES, and a later stage replays everything.
 *  - Finally the path and log_root_tree->node are freed, the transaction is
 *    committed (which, per the inline comment, also unpins the blocks) and
 *    log_root_tree itself is kfree()d.
 *  - Failures are reported through btrfs_error() with the messages shown.
 *
 * NOTE(review): the embedded source line numbers are not contiguous, so
 * loop constructs, labels, gotos, error checks and return statements are
 * presumably missing from this listing; verify against the complete file.
 */
3856 int btrfs_recover_log_trees(struct btrfs_root
*log_root_tree
)
3859 struct btrfs_path
*path
;
3860 struct btrfs_trans_handle
*trans
;
3861 struct btrfs_key key
;
3862 struct btrfs_key found_key
;
3863 struct btrfs_key tmp_key
;
3864 struct btrfs_root
*log
;
3865 struct btrfs_fs_info
*fs_info
= log_root_tree
->fs_info
;
3866 struct walk_control wc
= {
3867 .process_func
= process_one_buffer
,
3871 path
= btrfs_alloc_path();
3875 fs_info
->log_root_recovering
= 1;
3877 trans
= btrfs_start_transaction(fs_info
->tree_root
, 0);
3878 if (IS_ERR(trans
)) {
3879 ret
= PTR_ERR(trans
);
3886 ret
= walk_log_tree(trans
, log_root_tree
, &wc
);
3888 btrfs_error(fs_info
, ret
, "Failed to pin buffers while "
3889 "recovering log root tree.");
3894 key
.objectid
= BTRFS_TREE_LOG_OBJECTID
;
3895 key
.offset
= (u64
)-1;
3896 btrfs_set_key_type(&key
, BTRFS_ROOT_ITEM_KEY
);
3899 ret
= btrfs_search_slot(NULL
, log_root_tree
, &key
, path
, 0, 0);
3902 btrfs_error(fs_info
, ret
,
3903 "Couldn't find tree log root.");
3907 if (path
->slots
[0] == 0)
3911 btrfs_item_key_to_cpu(path
->nodes
[0], &found_key
,
3913 btrfs_release_path(path
);
3914 if (found_key
.objectid
!= BTRFS_TREE_LOG_OBJECTID
)
3917 log
= btrfs_read_fs_root_no_radix(log_root_tree
,
3921 btrfs_error(fs_info
, ret
,
3922 "Couldn't read tree log root.");
3926 tmp_key
.objectid
= found_key
.offset
;
3927 tmp_key
.type
= BTRFS_ROOT_ITEM_KEY
;
3928 tmp_key
.offset
= (u64
)-1;
3930 wc
.replay_dest
= btrfs_read_fs_root_no_name(fs_info
, &tmp_key
);
3931 if (IS_ERR(wc
.replay_dest
)) {
3932 ret
= PTR_ERR(wc
.replay_dest
);
3933 btrfs_error(fs_info
, ret
, "Couldn't read target root "
3934 "for tree log recovery.");
3938 wc
.replay_dest
->log_root
= log
;
3939 btrfs_record_root_in_trans(trans
, wc
.replay_dest
);
3940 ret
= walk_log_tree(trans
, log
, &wc
);
3943 if (wc
.stage
== LOG_WALK_REPLAY_ALL
) {
3944 ret
= fixup_inode_link_counts(trans
, wc
.replay_dest
,
3949 key
.offset
= found_key
.offset
- 1;
3950 wc
.replay_dest
->log_root
= NULL
;
3951 free_extent_buffer(log
->node
);
3952 free_extent_buffer(log
->commit_root
);
3955 if (found_key
.offset
== 0)
3958 btrfs_release_path(path
);
3960 /* step one is to pin it all, step two is to replay just inodes */
3963 wc
.process_func
= replay_one_buffer
;
3964 wc
.stage
= LOG_WALK_REPLAY_INODES
;
3967 /* step three is to replay everything */
3968 if (wc
.stage
< LOG_WALK_REPLAY_ALL
) {
3973 btrfs_free_path(path
);
3975 free_extent_buffer(log_root_tree
->node
);
3976 log_root_tree
->log_root
= NULL
;
3977 fs_info
->log_root_recovering
= 0;
3979 /* step 4: commit the transaction, which also unpins the blocks */
3980 btrfs_commit_transaction(trans
, fs_info
->tree_root
);
3982 kfree(log_root_tree
);
3986 btrfs_free_path(path
);
3991 * there are some corner cases where we want to force a full
3992 * commit instead of allowing a directory to be logged.
3994 * They revolve around files that were unlinked from the directory, and
3995 * this function updates the parent directory so that a full commit is
3996 * properly done if it is fsync'd later after the unlinks are done.
/*
 * btrfs_record_unlink_dir - note an unlink/rename of 'inode' out of
 * directory 'dir' so that later fsyncs can decide whether a full commit is
 * needed.
 *
 * Visible flow:
 *  - For a regular file, inode->last_unlink_trans is set to trans->transid.
 *  - Two conditions are then tested: the directory having been logged in
 *    this transaction (logged_trans == trans->transid) and the inode having
 *    been logged in this transaction.  Per the inline comments the log is
 *    kept up to date in those cases.
 *  - On the remaining path, dir->last_unlink_trans is set to
 *    trans->transid.
 *
 * NOTE(review): the embedded source line numbers are not contiguous, so the
 * last parameter, return statements, a label and the rename-specific test
 * described by the inline comment are presumably missing from this listing;
 * verify against the complete file.
 */
3998 void btrfs_record_unlink_dir(struct btrfs_trans_handle
*trans
,
3999 struct inode
*dir
, struct inode
*inode
,
4003 * when we're logging a file, if it hasn't been renamed
4004 * or unlinked, and its inode is fully committed on disk,
4005 * we don't have to worry about walking up the directory chain
4006 * to log its parents.
4008 * So, we use the last_unlink_trans field to put this transid
4009 * into the file. When the file is logged we check it and
4010 * don't log the parents if the file is fully on disk.
4012 if (S_ISREG(inode
->i_mode
))
4013 BTRFS_I(inode
)->last_unlink_trans
= trans
->transid
;
4016 * if this directory was already logged any new
4017 * names for this file/dir will get recorded
4020 if (BTRFS_I(dir
)->logged_trans
== trans
->transid
)
4024 * if the inode we're about to unlink was logged,
4025 * the log will be properly updated for any new names
4027 if (BTRFS_I(inode
)->logged_trans
== trans
->transid
)
4031 * when renaming files across directories, if the directory
4032 * there we're unlinking from gets fsync'd later on, there's
4033 * no way to find the destination directory later and fsync it
4034 * properly. So, we have to be conservative and force commits
4035 * so the new name gets discovered.
4040 /* we can safely do the unlink without any special recording */
4044 BTRFS_I(dir
)->last_unlink_trans
= trans
->transid
;
4048 * Call this after adding a new name for a file and it will properly
4049 * update the log to reflect the new name.
4051 * It will return zero if all goes well, and it will return 1 if a
4052 * full transaction commit is required.
/*
 * btrfs_log_new_name - update the log after a new name was added for
 * 'inode'.
 *
 * Visible flow:
 *  - For a regular file, inode->last_unlink_trans is set to trans->transid;
 *    per the inline comment this forces the logging code to walk the dentry
 *    chain.
 *  - If the inode's logged_trans is at or below
 *    fs_info->last_trans_committed, and 'old_dir' is NULL or its
 *    logged_trans is likewise at or below last_trans_committed, the inline
 *    comment says nothing needs to be logged.
 *  - Otherwise the result of btrfs_log_inode_parent(trans, root, inode,
 *    parent, 1) is returned, i.e. the inode is logged with exists_only set.
 *
 * 'root' is taken from BTRFS_I(inode)->root.
 *
 * NOTE(review): the embedded source line numbers are not contiguous, so
 * statements (the early return, braces) are presumably missing from this
 * listing; verify against the complete file.
 */
4054 int btrfs_log_new_name(struct btrfs_trans_handle
*trans
,
4055 struct inode
*inode
, struct inode
*old_dir
,
4056 struct dentry
*parent
)
4058 struct btrfs_root
* root
= BTRFS_I(inode
)->root
;
4061 * this will force the logging code to walk the dentry chain
4064 if (S_ISREG(inode
->i_mode
))
4065 BTRFS_I(inode
)->last_unlink_trans
= trans
->transid
;
4068 * if this inode hasn't been logged and directory we're renaming it
4069 * from hasn't been logged, we don't need to log it
4071 if (BTRFS_I(inode
)->logged_trans
<=
4072 root
->fs_info
->last_trans_committed
&&
4073 (!old_dir
|| BTRFS_I(old_dir
)->logged_trans
<=
4074 root
->fs_info
->last_trans_committed
))
4077 return btrfs_log_inode_parent(trans
, root
, inode
, parent
, 1);