// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commit routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record deltas in one of the following three categories
 * (the TLV encoding itself is sketched after this list).
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
 * - EXT4_FC_TAG_LINK		- records directory entry link
 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
 *				  during recovery. Note that iblocks field is
 *				  not replayed and instead derived during
 *				  replay.
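 *
 * As a rough sketch of the encoding (the authoritative definition of the
 * header lives in fast_commit.h), every log entry starts with:
 *
 *	struct ext4_fc_tl {
 *		__le16 fc_tag;	// one of the EXT4_FC_TAG_* values above
 *		__le16 fc_len;	// length of the value that follows
 *	};
 *
 * so a raw fast commit block is simply a stream of
 * [tag | len | value][tag | len | value]... records.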
 *
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in memory queue of
 * inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures the atomicity, please read the following
 *     section for more details).
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update(). A sketch of this calling pattern follows.
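 *
 * As an illustrative sketch (not a real call site), a VFS-facing update is
 * expected to be bracketed like this:
 *
 *	ext4_fc_start_update(inode);	// blocks if the inode is committing
 *	// ... modify the inode and/or its data ...
 *	ext4_fc_stop_update(inode);	// wakes up a waiting fast commit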
 *
 * Fast Commit Ineligibility
 * -------------------------
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling one of the
 * two following functions:
 *
 * - ext4_fc_mark_ineligible(): This makes the next fast commit operation
 *   fall back to a full commit. This is useful in case of transient errors.
 *
 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
 *   the fast commits happening between ext4_fc_start_ineligible() and
 *   ext4_fc_stop_ineligible(), and one fast commit after the call to
 *   ext4_fc_stop_ineligible(), fall back to full commits. It is important to
 *   make one more fast commit fall back to a full commit after the stop call
 *   so that it is guaranteed that the fast commit ineligible operation
 *   contained within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
 *   is followed by at least 1 full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
 * tag contains CRC of the contents and TID of the transaction after which
 * this fast commit should be applied. Recovery code replays fast commit
 * logs only if there's at least 1 valid tail present. For every fast commit
 * operation, there is 1 tail. This means, we may end up with multiple tails
 * in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---  Fast Commit 1  --->|<---      Fast Commit 2     ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commit tags are idempotent in nature provided the recovery code follows
 * certain rules. The guiding principle that the commit path follows while
 * committing is that it stores the result of a particular operation instead of
 * storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as a procedure "rename a to b", we store the resulting file system
 * state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 * system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how fast
 * commits make it idempotent. Consider the following sequence of operations:
 *
 *     rm A;    mv B A;    read A
 *  (x)     (y)        (z)
 *
 * (x), (y) and (z) are the points at which we can crash. If we store this
 * sequence of operations as is then the replay is not idempotent. Let's say
 * while in replay, we crash at (z). During the second replay, file A (which was
 * actually created as a result of the "mv B A" operation) would get deleted.
 * Thus, file named A would be absent when we try to read A. So, this sequence
 * of operations is not idempotent. However, as mentioned above, instead of
 * storing the procedure fast commits store the outcome of each procedure. Thus
 * the fast commit log for above procedure would be as follows:
 *
 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 * inode 11 before the replay)
 *
 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 * (w)          (x)                    (y)          (z)
 *
 * If we crash at (z), we will have file A linked to inode 11. During the second
 * replay, we will remove file A (inode 11). But we will create it back and make
 * it point to inode 11. We won't find B, so we'll just skip that step. At this
 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 * replay of the last inode 11 tag. Crashes at points (w), (x) and (y) get
 * handled similarly. Thus, by converting a non-idempotent procedure into a
 * series of idempotent outcomes, fast commits ensure idempotence during replay.
 *
 * TODOs
 * -----
 *
 * 0) Fast commit replay path hardening: Fast commit replay code should use
 *    journal handles to make sure all the updates it does during the replay
 *    path are atomic. With that if we crash during fast commit replay, after
 *    trying to do recovery again, we will find a file system where fast commit
 *    area is invalid (because new full commit would be found). In order to deal
 *    with that, fast commit replay code should ensure that the "FC_REPLAY"
 *    superblock state is persisted before starting the replay, so that after
 *    the crash, fast commit recovery code can look at that flag and perform
 *    fast commit recovery even if that area is invalidated by later full
 *    commits.
 *
 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
 *    eligible update must be protected within ext4_fc_start_update() and
 *    ext4_fc_stop_update(). These routines are called at a much higher level
 *    than necessary. This can be made more fine grained by combining with
 *    ext4_journal_start().
 *
 * 2) Same as above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
 *
 * 3) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate) {
		ext4_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	} else {
		ext4_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	init_waitqueue_head(&ei->i_fc_wait);
	atomic_set(&ei->i_fc_updates, 0);
}

/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
			   EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
			   EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}

/*
 * Inform Ext4's fast commit subsystem about the start of an inode update.
 *
 * This function is called by the high level VFS callbacks before
 * performing any inode update. This function blocks if there's an ongoing
 * fast commit on the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	if (atomic_dec_and_test(&ei->i_fc_updates))
		wake_up_all(&ei->i_fc_wait);
}

/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
	list_del_init(&ei->i_fc_list);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Mark file system as fast commit ineligible. This means that next commit
 * operation would result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Start a fast commit ineligible update. Any commits that happen while
 * such an operation is in progress fall back to full commits.
 */
void ext4_fc_start_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
	atomic_inc(&sbi->s_fc_ineligible_updates);
}

/*
 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
 * to ensure that after stopping the ineligible update, at least one full
 * commit takes place.
 */
void ext4_fc_stop_ineligible(struct super_block *sb)
{
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
}

static inline int ext4_fc_is_ineligible(struct super_block *sb)
{
	return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
		atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in fast commit list.
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return -EOPNOTSUPP;

	if (ext4_fc_is_ineligible(inode->i_sb))
		return -EINVAL;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		update = true;
	} else {
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

struct __track_dentry_update_args {
	struct dentry *dentry;
	int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
						EXT4_FC_REASON_NOMEM);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
			dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
			dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;

	spin_lock(&sbi->s_fc_lock);
	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}

void __ext4_fc_track_unlink(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_UNLINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_unlink(inode, dentry, ret);
}

void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_link(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_LINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_link(inode, dentry, ret);
}

void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
}

void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	struct inode *inode = d_inode(dentry);
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_CREAT;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_create(inode, dentry, ret);
}

/* __track_fn for inode tracking */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
	if (update)
		return -EEXIST;

	EXT4_I(inode)->i_fc_lblk_len = 0;

	return 0;
}

void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_should_journal_data(inode)) {
		ext4_fc_mark_ineligible(inode->i_sb,
					EXT4_FC_REASON_INODE_JOURNAL_DATA);
		return;
	}

	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(inode, ret);
}

struct __track_range_args {
	ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
static int __track_range(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	oldstart = ei->i_fc_lblk_start;

	if (update && ei->i_fc_lblk_len > 0) {
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}
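
/*
 * Worked example for __track_range() above: if blocks [10, 20] are already
 * being tracked (i_fc_lblk_start == 10, i_fc_lblk_len == 11) and an update
 * for [15, 30] arrives with update == true, the tracked range widens to
 * [10, 30]: start becomes min(10, 15) == 10 and len becomes
 * max(10 + 11 - 1, 30) - 10 + 1 == 21.
 */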

void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
			 ext4_lblk_t end)
{
	struct __track_range_args args;
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	args.start = start;
	args.end = end;

	ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

	trace_ext4_fc_track_range(inode, start, end, ret);
}

static void ext4_fc_submit_bh(struct super_block *sb)
{
	int write_flags = REQ_SYNC;
	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

	/* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */
	if (test_opt(sb, BARRIER))
		write_flags |= REQ_FUA | REQ_PREFLUSH;
	lock_buffer(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE, write_flags, bh);
	EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/* memzero and update CRC */
static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
				u32 *crc)
{
	void *ret;

	ret = memset(dst, 0, len);
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
	return ret;
}

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case, a
 * new block is requested from jbd2 and the CRC is updated to reflect
 * the padding we added.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + sizeof(struct ext4_fc_tl) > bsize)
		return NULL;

	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
		/*
		 * Only allocate from current buffer if we have enough space for
		 * this request AND we have space to add a zero byte padding.
		 */
		if (!sbi->s_fc_bh) {
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/* Need to add PAD tag */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
	tl->fc_len = cpu_to_le16(pad_len);
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	ext4_fc_submit_bh(sb);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}
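
/*
 * Illustration of the padding logic above (numbers are only an example):
 * with a 4096-byte journal block, off == 4000 and a request for len == 100,
 * 4096 - 4000 - 1 == 95 is not greater than 100 + sizeof(struct ext4_fc_tl),
 * so the remaining 4096 - 4000 - 1 - sizeof(struct ext4_fc_tl) bytes are
 * consumed by an EXT4_FC_TAG_PAD entry and the reservation is satisfied
 * from a fresh jbd2 buffer instead.
 */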

/* memcpy to fc reserved space and update CRC */
static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
				int len, u32 *crc)
{
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
	return memcpy(dst, src, len);
}

/*
 * Complete a fast commit by writing tail tag.
 *
 * Writing tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing the tail tag, even if there's space remaining
 * in the block, the next commit shouldn't use it. That's why the tail tag
 * has its length set to that of the remaining space on the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block for accommodating this tail.
	 */
	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
	dst += sizeof(tl);
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
	dst += sizeof(tail.fc_tid);
	tail.fc_crc = cpu_to_le32(crc);
	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

	ext4_fc_submit_bh(sb);

	return 0;
}
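
/*
 * Note on the tail sizing above: tl.fc_len is set to cover the tail body
 * plus all remaining space in the block (bsize - off - 1 +
 * sizeof(struct ext4_fc_tail)), which is what forces the next fast commit
 * to start on a fresh journal block.
 */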

/*
 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 * Returns false if there's not enough space.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
			    u32 *crc)
{
	struct ext4_fc_tl tl;
	u8 *dst;

	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
	if (!dst)
		return false;

	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(len);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);

	return true;
}

/* Same as above, but adds dentry tlv. */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
					int parent_ino, int ino, int dlen,
					const unsigned char *dname,
					u32 *crc)
{
	struct ext4_fc_dentry_info fcd;
	struct ext4_fc_tl tl;
	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
					crc);

	if (!dst)
		return false;

	fcd.fc_parent_ino = cpu_to_le32(parent_ino);
	fcd.fc_ino = cpu_to_le32(ino);
	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	dst += sizeof(tl);
	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
	dst += sizeof(fcd);
	ext4_fc_memcpy(sb, dst, dname, dlen, crc);

	return true;
}

/*
 * Writes the inode in the fast commit space as an EXT4_FC_TAG_INODE TLV.
 * Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	dst = ext4_fc_reserve_space(inode->i_sb,
			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		return -ECANCELED;

	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
		return -ECANCELED;
	dst += sizeof(tl);
	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
		return -ECANCELED;
	dst += sizeof(fc_inode);
	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
					inode_len, crc))
		return -ECANCELED;

	return 0;
}

/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);

	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}
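
/*
 * Example of what the loop above emits: for a tracked range [0, 9] where
 * blocks 0-4 are mapped and blocks 5-9 are a hole, the first
 * ext4_map_blocks() call returns the mapped chunk (logged as one
 * EXT4_FC_TAG_ADD_RANGE entry for blocks 0-4) and the second call returns
 * 0 (logged as one EXT4_FC_TAG_DEL_RANGE entry for blocks 5-9).
 */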

/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	struct list_head *pos;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
		ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
						TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}

/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry;
	struct inode *inode;
	struct list_head *pos, *n, *fcd_pos, *fcd_n;
	struct ext4_inode_info *ei;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
		fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
					fcd_list);
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(
				sb, fc_dentry->fcd_op,
				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
				fc_dentry->fcd_name.len,
				fc_dentry->fcd_name.name, crc)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		inode = NULL;
		list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
			ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find the inode in our list, then it was deleted,
		 * in which case, we don't need to record its create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(
			sb, fc_dentry->fcd_op,
			fc_dentry->fcd_parent, fc_dentry->fcd_ino,
			fc_dentry->fcd_name.len,
			fc_dentry->fcd_name.name, crc)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	spin_lock(&sbi->s_fc_lock);
	return ret;
}

static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_head head;
	struct list_head *pos;
	struct inode *inode;
	struct blk_plug plug;
	int ret = 0;
	u32 crc = 0;

	ret = ext4_fc_submit_inode_data_all(journal);
	if (ret)
		return ret;

	ret = ext4_fc_wait_inode_data_all(journal);
	if (ret)
		return ret;

	/*
	 * If file system device is different from journal device, issue a cache
	 * flush before we start writing fast commit blocks.
	 */
	if (journal->j_fs_dev != journal->j_dev)
		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);

	blk_start_plug(&plug);
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Add a head tag only if this is the first fast commit
		 * in this TID.
		 */
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
			(u8 *)&head, &crc))
			goto out;
	}

	spin_lock(&sbi->s_fc_lock);
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret) {
		spin_unlock(&sbi->s_fc_lock);
		goto out;
	}

	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
		inode = &iter->vfs_inode;
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			continue;

		spin_unlock(&sbi->s_fc_lock);
		ret = ext4_fc_write_inode_data(inode, &crc);
		if (ret)
			goto out;
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto out;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	ret = ext4_fc_write_tail(sb, crc);

out:
	blk_finish_plug(&plug);
	return ret;
}

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	trace_ext4_fc_commit_start(sb);

	start_time = ktime_get();

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(ext4_fc_is_ineligible(sb))) {
		reason = EXT4_FC_REASON_INELIGIBLE;
		goto out;
	}

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
			commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
		goto out;
	} else if (ret) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_START_FAILED;
		goto out;
	}

	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	atomic_inc(&sbi->s_fc_subtid);
	jbd2_fc_end_commit(journal);
out:
	/* Has any ineligible update happened since we started? */
	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_INELIGIBLE;
	}

	spin_lock(&sbi->s_fc_lock);
	if (reason != EXT4_FC_REASON_OK &&
		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
		sbi->s_fc_stats.fc_ineligible_commits++;
	} else {
		sbi->s_fc_stats.fc_num_commits++;
		sbi->s_fc_stats.fc_numblks += nblks;
	}
	spin_unlock(&sbi->s_fc_lock);
	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
	trace_ext4_fc_commit_stop(sb, nblks, reason);
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time
	 */
	if (likely(sbi->s_fc_avg_commit_time))
		sbi->s_fc_avg_commit_time = (commit_time +
				sbi->s_fc_avg_commit_time * 3) / 4;
	else
		sbi->s_fc_avg_commit_time = commit_time;
	jbd_debug(1,
		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
		nblks, reason, subtid);
	if (reason == EXT4_FC_REASON_FC_FAILED)
		return jbd2_fc_end_commit_fallback(journal);
	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
		reason == EXT4_FC_REASON_INELIGIBLE)
		return jbd2_complete_transaction(journal, commit_tid);
	return 0;
}

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_dentry_update *fc_dentry;
	struct list_head *pos, *n;

	if (full && sbi->s_fc_bh)
		sbi->s_fc_bh = NULL;

	jbd2_fc_release_bufs(journal);

	spin_lock(&sbi->s_fc_lock);
	list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
		list_del_init(&iter->i_fc_list);
		ext4_clear_inode_state(&iter->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		ext4_fc_reset_inode(&iter->vfs_inode);
		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
		smp_mb();
#if (BITS_PER_LONG < 64)
		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
	}

	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
					     struct ext4_fc_dentry_update,
					     fcd_list);
		list_del_init(&fc_dentry->fcd_list);
		spin_unlock(&sbi->s_fc_lock);

		if (fc_dentry->fcd_name.name &&
			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
			kfree(fc_dentry->fcd_name.name);
		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
		spin_lock(&sbi->s_fc_lock);
	}

	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
				&sbi->s_fc_q[FC_Q_MAIN]);

	ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);

	if (full)
		sbi->s_fc_bytes = 0;
	spin_unlock(&sbi->s_fc_lock);
	trace_ext4_fc_stats(sb);
}

/* Ext4 Replay Path Routines */

/* Helper struct for dentry replay routines */
struct dentry_info_args {
	int parent_ino, dname_len, ino, inode_len;
	char *dname;
};

static inline void tl_to_darg(struct dentry_info_args *darg,
				struct ext4_fc_tl *tl)
{
	struct ext4_fc_dentry_info *fcd;

	fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);

	darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
	darg->ino = le32_to_cpu(fcd->fc_ino);
	darg->dname = fcd->fc_dname;
	darg->dname_len = ext4_fc_tag_len(tl) -
			sizeof(struct ext4_fc_dentry_info);
}
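
/*
 * For reference, the value decoded by tl_to_darg() above is laid out as
 * struct ext4_fc_dentry_info (see fast_commit.h):
 *
 *	__le32 fc_parent_ino;	// inode number of the parent directory
 *	__le32 fc_ino;		// inode number the dentry points to
 *	u8 fc_dname[0];		// dentry name, not NUL terminated
 *
 * which is why dname_len is the tag length minus the fixed header size.
 */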

/* Unlink replay function */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct inode *inode, *old_parent;
	struct qstr entry;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	entry.name = darg.dname;
	entry.len = darg.dname_len;
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode %d not found", darg.ino);
		return 0;
	}

	old_parent = ext4_iget(sb, darg.parent_ino,
				EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(old_parent)) {
		jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
		iput(inode);
		return 0;
	}

	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
	/* -ENOENT ok coz it might not exist anymore. */
	if (ret == -ENOENT)
		ret = 0;
	iput(old_parent);
	iput(inode);
	return ret;
}

static int ext4_fc_replay_link_internal(struct super_block *sb,
				struct dentry_info_args *darg,
				struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		jbd_debug(1, "Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		jbd_debug(1, "Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that link already existed since data blocks
	 * for the dir in question got persisted before we crashed OR
	 * we replayed this tag and crashed before the entire replay
	 * could complete.
	 */
	if (ret && ret != -EEXIST) {
		jbd_debug(1, "Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}

/* Link replay function */
static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct inode *inode;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl);
	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	iput(inode);
	return ret;
}

/*
 * Record all the modified inodes during replay. We use this later to setup
 * block bitmaps correctly.
 */
static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
{
	struct ext4_fc_replay_state *state;
	int i;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++)
		if (state->fc_modified_inodes[i] == ino)
			return 0;
	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
		state->fc_modified_inodes_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
		state->fc_modified_inodes = krealloc(
				state->fc_modified_inodes, sizeof(int) *
				state->fc_modified_inodes_size,
				GFP_KERNEL);
		if (!state->fc_modified_inodes)
			return -ENOMEM;
	}
	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
	return 0;
}

/*
 * Inode replay function
 */
static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct ext4_fc_inode *fc_inode;
	struct ext4_inode *raw_inode;
	struct ext4_inode *raw_fc_inode;
	struct inode *inode = NULL;
	struct ext4_iloc iloc;
	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
	struct ext4_extent_header *eh;

	fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);

	ino = le32_to_cpu(fc_inode->fc_ino);
	trace_ext4_fc_replay(sb, tag, ino, 0, 0);

	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (!IS_ERR_OR_NULL(inode)) {
		ext4_ext_clear_bb(inode);
		iput(inode);
	}
	inode = NULL;

	ext4_fc_record_modified_inode(sb, ino);

	raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
	if (ret)
		goto out;

	inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
	raw_inode = ext4_raw_inode(&iloc);

	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
		inode_len - offsetof(struct ext4_inode, i_generation));
	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
		if (eh->eh_magic != EXT4_EXT_MAGIC) {
			memset(eh, 0, sizeof(*eh));
			eh->eh_magic = EXT4_EXT_MAGIC;
			eh->eh_max = cpu_to_le16(
				(sizeof(raw_inode->i_block) -
				 sizeof(struct ext4_extent_header))
				 / sizeof(struct ext4_extent));
		}
	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
			sizeof(raw_inode->i_block));
	}

	/* Immediately update the inode on disk. */
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	if (ret)
		goto out;
	ret = sync_dirty_buffer(iloc.bh);
	if (ret)
		goto out;
	ret = ext4_mark_inode_used(sb, ino);
	if (ret)
		goto out;

	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode not found.");
		return -EFSCORRUPTED;
	}

	/*
	 * Our allocator could have made different decisions than before
	 * crashing. This should be fixed but until then, we recalculate
	 * the number of blocks used by the inode.
	 */
	ext4_ext_replay_set_iblocks(inode);

	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
	ext4_reset_inode_seed(inode);

	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	sync_dirty_buffer(iloc.bh);
	brelse(iloc.bh);
out:
	iput(inode);
	if (!ret)
		blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);

	return 0;
}

/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
 * inode for which we are trying to create a dentry here, should already have
 * been replayed before we start here.
 */
static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
{
	int ret = 0;
	struct inode *inode = NULL;
	struct inode *dir = NULL;
	struct dentry_info_args darg;

	tl_to_darg(&darg, tl);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
			darg.parent_ino, darg.dname_len);

	/* This takes care of updating the group descriptor and other metadata */
	ret = ext4_mark_inode_used(sb, darg.ino);
	if (ret)
		goto out;

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "inode %d not found.", darg.ino);
		inode = NULL;
		ret = -EINVAL;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that the
		 * dot and dot dot dirents are setup properly.
		 */
		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
		if (IS_ERR_OR_NULL(dir)) {
			jbd_debug(1, "Dir %d not found.", darg.ino);
			goto out;
		}
		ret = ext4_init_new_dir(NULL, dir, inode);
		iput(dir);
		if (ret) {
			ret = 0;
			goto out;
		}
	}
	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	if (ret)
		goto out;
	set_nlink(inode, 1);
	ext4_mark_inode_dirty(NULL, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * Record physical disk regions which are in use as per fast commit area. Our
 * simple replay phase allocator excludes these regions from allocation.
 */
static int ext4_fc_record_regions(struct super_block *sb, int ino,
		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
{
	struct ext4_fc_replay_state *state;
	struct ext4_fc_alloc_region *region;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	if (state->fc_regions_used == state->fc_regions_size) {
		state->fc_regions_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
		state->fc_regions = krealloc(
					state->fc_regions,
					state->fc_regions_size *
					sizeof(struct ext4_fc_alloc_region),
					GFP_KERNEL);
		if (!state->fc_regions)
			return -ENOMEM;
	}
	region = &state->fc_regions[state->fc_regions_used++];
	region->ino = ino;
	region->lblk = lblk;
	region->pblk = pblk;
	region->len = len;

	return 0;
}

/* Replay add range tag */
static int ext4_fc_replay_add_range(struct super_block *sb,
				struct ext4_fc_tl *tl)
{
	struct ext4_fc_add_range *fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
	ex = (struct ext4_extent *)&fc_add_ex->fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
				EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0) {
			iput(inode);
			return 0;
		}

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, NULL, 0);
			if (IS_ERR(path)) {
				iput(inode);
				return 0;
			}
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			ret = ext4_ext_insert_extent(
				NULL, inode, &path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			ext4_ext_drop_refs(path);
			kfree(path);
			if (ret) {
				iput(inode);
				return 0;
			}
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed. This can happen
			 * if this range was removed and then reallocated to
			 * map to new physical blocks during a fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret) {
				iput(inode);
				return 0;
			}
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change */
		jbd_debug(1, "Converting from %d to %d %lld",
				map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret) {
			iput(inode);
			return 0;
		}
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
					sb->s_blocksize_bits);
	iput(inode);
	return 0;
}

/* Replay DEL_RANGE tag */
static int
ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct inode *inode;
	struct ext4_fc_del_range *lrange;
	struct ext4_map_blocks map;
	ext4_lblk_t cur, remaining;
	int ret;

	lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
	cur = le32_to_cpu(lrange->fc_lblk);
	remaining = le32_to_cpu(lrange->fc_len);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
		le32_to_cpu(lrange->fc_ino), cur, remaining);

	inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);

	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
			inode->i_ino, le32_to_cpu(lrange->fc_lblk),
			le32_to_cpu(lrange->fc_len));
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;

		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0) {
			iput(inode);
			return 0;
		}
		if (ret > 0) {
			remaining -= ret;
			cur += ret;
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
		} else {
			remaining -= map.m_len;
			cur += map.m_len;
		}
	}

	ret = ext4_punch_hole(inode,
		le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
		le32_to_cpu(lrange->fc_len) << sb->s_blocksize_bits);
	if (ret)
		jbd_debug(1, "ext4_punch_hole returned %d", ret);
	ext4_ext_replay_shrink_inode(inode,
		i_size_read(inode) >> sb->s_blocksize_bits);
	ext4_mark_inode_dirty(NULL, inode);
	iput(inode);

	return 0;
}

static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
			EXT4_IGET_NORMAL);
		if (IS_ERR_OR_NULL(inode)) {
			jbd_debug(1, "Inode %d not found.",
				state->fc_modified_inodes[i]);
			continue;
		}
		cur = 0;
		end = EXT_MAX_BLOCKS;
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				if (!IS_ERR_OR_NULL(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_ext_drop_refs(path);
					kfree(path);
				}
				cur += ret;
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
							map.m_len, 1);
			} else {
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}

/*
 * Check if block is in excluded regions for block allocation. The simple
 * allocator that runs during replay phase calls this function to see
 * if it is okay to use a block.
 */
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
{
	int i;
	struct ext4_fc_replay_state *state;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_regions_valid; i++) {
		if (state->fc_regions[i].ino == 0 ||
			state->fc_regions[i].len == 0)
			continue;
		if (blk >= state->fc_regions[i].pblk &&
		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
			return true;
	}
	return false;
}

/* Cleanup function called after replay */
void ext4_fc_replay_cleanup(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
	kfree(sbi->s_fc_replay_state.fc_regions);
	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
}

/*
 * Recovery Scan phase handler
 *
 * This function is called during the scan phase and is responsible
 * for doing following things:
 * - Make sure the fast commit area has valid tags for replay
 * - Count number of tags that need to be replayed by the replay handler
 * - Verify the CRC
 * - Create a list of excluded blocks for allocation during replay phase
 *
 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that scan has finished and JBD2 can now start replay phase.
 * It returns a negative error to indicate that there was an error. At the end
 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
 * to indicate the number of tags that need to be replayed during the replay
 * phase.
 */
static int ext4_fc_replay_scan(journal_t *journal,
				struct buffer_head *bh, int off,
				tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range *ext;
	struct ext4_fc_tl *tl;
	struct ext4_fc_tail *tail;
	__u8 *start, *end;
	struct ext4_fc_head *head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	if (state->fc_replay_expected_off == 0) {
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
	fc_for_each_tl(start, end, tl) {
		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
			  tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
		switch (le16_to_cpu(tl->fc_tag)) {
		case EXT4_FC_TAG_ADD_RANGE:
			ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
			ex = (struct ext4_extent *)&ext->fc_ex;
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext->fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex));
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
					sizeof(*tl) + ext4_fc_tag_len(tl));
			break;
		case EXT4_FC_TAG_TAIL:
			state->fc_cur_tag++;
			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
						sizeof(*tl) +
						offsetof(struct ext4_fc_tail,
						fc_crc));
			if (le32_to_cpu(tail->fc_tid) == expected_tid &&
				le32_to_cpu(tail->fc_crc) == state->fc_crc) {
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
			if (le32_to_cpu(head->fc_features) &
				~EXT4_FC_SUPPORTED_FEATURES) {
				ret = -EOPNOTSUPP;
				break;
			}
			if (le32_to_cpu(head->fc_tid) != expected_tid) {
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
					sizeof(*tl) + ext4_fc_tag_len(tl));
			break;
		default:
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}

/*
 * Main recovery path entry point.
 * The meaning of return codes is similar as above.
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
				enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl *tl;
	__u8 *start, *end;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail *tail;

	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		jbd_debug(1, "Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

#ifdef CONFIG_EXT4_DEBUG
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	fc_for_each_tl(start, end, tl) {
		if (state->fc_replay_num_tags == 0) {
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}
		jbd_debug(3, "Replay phase, tag:%s\n",
				tag2str(le16_to_cpu(tl->fc_tag)));
		state->fc_replay_num_tags--;
		switch (le16_to_cpu(tl->fc_tag)) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, tl);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, tl);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, tl);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, tl);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, tl);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, tl);
			break;
		case EXT4_FC_TAG_PAD:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
					     ext4_fc_tag_len(tl), 0);
			break;
		case EXT4_FC_TAG_TAIL:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
					     ext4_fc_tag_len(tl), 0);
			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
			WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
					     ext4_fc_tag_len(tl), 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}

void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
	/*
	 * We set the replay callback even if fast commit is disabled because
	 * we could still have fast commit blocks that need to be replayed
	 * even if fast commit has now been turned off.
	 */
	journal->j_fc_replay_callback = ext4_fc_replay;
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return;
	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}

static const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed",
};

int ext4_fc_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
	int i;

	if (v != SEQ_START_TOKEN)
		return 0;

	seq_printf(seq,
		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
		   stats->fc_num_commits, stats->fc_ineligible_commits,
		   stats->fc_numblks,
		   div_u64(sbi->s_fc_avg_commit_time, 1000));
	seq_puts(seq, "Ineligible reasons:\n");
	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
			stats->fc_ineligible_reason_count[i]);

	return 0;
}

int __init ext4_fc_init_dentry_cache(void)
{
	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
					   SLAB_RECLAIM_ACCOUNT);

	if (ext4_fc_dentry_cachep == NULL)
		return -ENOMEM;

	return 0;
}