// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commits routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 fast commits implement fine grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record deltas in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
 * - EXT4_FC_TAG_LINK		- records directory entry link
 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
 *				  during recovery. Note that iblocks field is
 *				  not replayed and instead derived during
 *				  replay.
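 *
 * As a rough sketch of the encoding (struct ext4_fc_tl below is the
 * authoritative definition, in fs/ext4/fast_commit.h), every log entry
 * starts with a 4-byte tag-length header:
 *
 *	struct ext4_fc_tl {
 *		__le16 fc_tag;	// one of the EXT4_FC_TAG_* values above
 *		__le16 fc_len;	// length of the value that follows
 *	};
 *
 * followed by fc_len bytes of tag-specific payload (for example a
 * struct ext4_fc_dentry_info for the directory entry tags).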
 *
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in memory queue of
 * inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures the atomicity, please read the
 *     following section for more details).
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
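 *
 * As a minimal usage sketch (illustration only; the real callers are ext4's
 * VFS entry points), an inode update is bracketed like this:
 *
 *	ext4_fc_start_update(inode);	// waits if a fast commit is
 *					// committing this inode
 *	// ... apply the inode update ...
 *	ext4_fc_stop_update(inode);	// last updater wakes up fast commit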
 *
 * Fast Commit Ineligibility
 * -------------------------
 *
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling
 * ext4_fc_mark_ineligible(): this makes the next fast commit operation fall
 * back to a full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
 * tag contains CRC of the contents and TID of the transaction after which
 * this fast commit should be applied. Recovery code replays fast commit
 * logs only if there's at least 1 valid tail present. For every fast commit
 * operation, there is 1 tail. This means, we may end up with multiple tails
 * in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---  Fast Commit 1  --->|<---      Fast Commit 2     ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commits tags are idempotent in nature provided the recovery code
 * follows certain rules. The guiding principle that the commit path follows
 * while committing is that it stores the result of a particular operation
 * instead of storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as a procedure "rename a to b", we store the resulting file system
 * state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 * system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how fast
 * commits make it idempotent. Consider the following sequence of operations:
 *
 *     rm A;    mv B A;    read A
 *      (x)      (y)        (z)
 *
 * (x), (y) and (z) are the points at which we can crash. If we store this
 * sequence of operations as is then the replay is not idempotent. Let's say
 * while in replay, we crash at (z). During the second replay, file A (which
 * was actually created as a result of the "mv B A" operation) would get
 * deleted. Thus, file named A would be absent when we try to read A. So, this
 * sequence of operations is not idempotent. However, as mentioned above,
 * instead of storing the procedure fast commits store the outcome of each
 * procedure. Thus the fast commit log for above procedure would be as follows:
 *
 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 * inode 11 before the replay)
 *
 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 *        (w)               (x)               (y)          (z)
 *
 * If we crash at (z), we will have file A linked to inode 11. During the
 * second replay, we will remove file A (inode 11). But we will create it back
 * and make it point to inode 11. We won't find B, so we'll just skip that
 * step. At this point, the refcount for inode 11 is not reliable, but that
 * gets fixed by the replay of the last inode 11 tag. Crashes at points (w),
 * (x) and (y) get handled similarly. Thus, by converting a non-idempotent
 * procedure into a series of idempotent outcomes, fast commits ensure
 * idempotence during the replay.
 *
 * TODOs
 * -----
 *
 * 0) Fast commit replay path hardening: Fast commit replay code should use
 *    journal handles to make sure all the updates it does during the replay
 *    path are atomic. With that if we crash during fast commit replay, after
 *    trying to do recovery again, we will find a file system where fast commit
 *    area is invalid (because new full commit would be found). In order to
 *    deal with that, fast commit replay code should ensure that the
 *    "FC_REPLAY" superblock state is persisted before starting the replay, so
 *    that after the crash, fast commit recovery code can look at that flag and
 *    perform fast commit recovery even if that area is invalidated by later
 *    full commits.
 *
 * 1) Fast commit's commit path locks the entire file system during fast
 *    commit. This has significant performance penalty. Instead of that, we
 *    should use ext4_fc_start/stop_update functions to start inode level
 *    updates from ext4_journal_start/stop. Once we do that we can drop file
 *    system locking during commit path.
 *
 * 2) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate) {
		ext4_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	} else {
		ext4_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	INIT_LIST_HEAD(&ei->i_fc_dilist);
	init_waitqueue_head(&ei->i_fc_wait);
	atomic_set(&ei->i_fc_updates, 0);
}

/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
			   EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
			   EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}

static bool ext4_fc_disabled(struct super_block *sb)
{
	return (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY));
}

/*
 * Inform Ext4's fast commit subsystem about the start of an inode update.
 *
 * This function is called by the high level VFS callbacks before
 * performing any inode update. This function blocks if there's an ongoing
 * fast commit on the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (ext4_fc_disabled(inode->i_sb))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (atomic_dec_and_test(&ei->i_fc_updates))
		wake_up_all(&ei->i_fc_wait);
}

/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_fc_dentry_update *fc_dentry;

	if (ext4_fc_disabled(inode->i_sb))
		return;

restart:
	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
		spin_unlock(&sbi->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}

	if (!list_empty(&ei->i_fc_list))
		list_del_init(&ei->i_fc_list);

	/*
	 * Since this inode is getting removed, let's also remove all FC
	 * dentry create references, since there is no need to log them anyway.
	 */
	if (list_empty(&ei->i_fc_dilist)) {
		spin_unlock(&sbi->s_fc_lock);
		return;
	}

	fc_dentry = list_first_entry(&ei->i_fc_dilist,
			struct ext4_fc_dentry_update, fcd_dilist);
	WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
	list_del_init(&fc_dentry->fcd_list);
	list_del_init(&fc_dentry->fcd_dilist);

	WARN_ON(!list_empty(&ei->i_fc_dilist));
	spin_unlock(&sbi->s_fc_lock);

	if (fc_dentry->fcd_name.name &&
	    fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
		kfree(fc_dentry->fcd_name.name);
	kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
}

/*
 * Mark file system as fast commit ineligible, and record latest
 * ineligible transaction tid. This means until the recorded
 * transaction, commit operation would result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	tid_t tid;
	bool has_transaction = true;
	bool is_ineligible;

	if (ext4_fc_disabled(sb))
		return;

	if (handle && !IS_ERR(handle))
		tid = handle->h_transaction->t_tid;
	else {
		read_lock(&sbi->s_journal->j_state_lock);
		if (sbi->s_journal->j_running_transaction)
			tid = sbi->s_journal->j_running_transaction->t_tid;
		else
			has_transaction = false;
		read_unlock(&sbi->s_journal->j_state_lock);
	}
	spin_lock(&sbi->s_fc_lock);
	is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	if (has_transaction &&
	    (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid)))
		sbi->s_fc_ineligible_tid = tid;
	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	spin_unlock(&sbi->s_fc_lock);
	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in fast commit list.
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(handle_t *handle, struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		update = true;
	} else {
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(handle, inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
				 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

struct __track_dentry_update_args {
	struct dentry *dentry;
	int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
static int __track_dentry_update(handle_t *handle, struct inode *inode,
				 void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct inode *dir = dentry->d_parent->d_inode;
	struct super_block *sb = inode->i_sb;
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	mutex_unlock(&ei->i_fc_lock);

	if (IS_ENCRYPTED(dir)) {
		ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME,
					handle);
		mutex_lock(&ei->i_fc_lock);
		return -EOPNOTSUPP;
	}

	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dir->i_ino;
	node->fcd_ino = inode->i_ino;
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
		       dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
		       dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;
	INIT_LIST_HEAD(&node->fcd_dilist);
	spin_lock(&sbi->s_fc_lock);
	if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
	    sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
		list_add_tail(&node->fcd_list,
			      &sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);

	/*
	 * This helps us keep track of all fc_dentry updates that are part of
	 * this ext4 inode. So in case the inode is getting unlinked before we
	 * even get a chance to fsync, we can remove all fc_dentry references
	 * while evicting the inode in ext4_fc_del().
	 * Also with this, we don't need to loop over all the inodes in
	 * sbi->s_fc_q to get the corresponding inode in
	 * ext4_fc_commit_dentry_updates().
	 */
	if (dentry_update->op == EXT4_FC_TAG_CREAT) {
		WARN_ON(!list_empty(&ei->i_fc_dilist));
		list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
	}
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}

void __ext4_fc_track_unlink(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_UNLINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
}

void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	__ext4_fc_track_unlink(handle, inode, dentry);
}

void __ext4_fc_track_link(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_LINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_link(handle, inode, dentry, ret);
}

void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	__ext4_fc_track_link(handle, inode, dentry);
}

void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
			  struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_CREAT;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_create(handle, inode, dentry, ret);
}

void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	__ext4_fc_track_create(handle, inode, dentry);
}

/* __track_fn for inode tracking */
static int __track_inode(handle_t *handle, struct inode *inode, void *arg,
			 bool update)
{
	if (update)
		return -EEXIST;

	EXT4_I(inode)->i_fc_lblk_len = 0;

	return 0;
}

void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_should_journal_data(inode)) {
		ext4_fc_mark_ineligible(inode->i_sb,
					EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
		return;
	}

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(handle, inode, ret);
}

struct __track_range_args {
	ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
static int __track_range(handle_t *handle, struct inode *inode, void *arg,
			 bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	oldstart = ei->i_fc_lblk_start;

	if (update && ei->i_fc_lblk_len > 0) {
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}

void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
			 ext4_lblk_t end)
{
	struct __track_range_args args;
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	if (ext4_has_inline_data(inode)) {
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR,
					handle);
		return;
	}

	args.start = start;
	args.end = end;

	ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

	trace_ext4_fc_track_range(handle, inode, start, end, ret);
}

static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
{
	blk_opf_t write_flags = REQ_SYNC;
	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

	/* Add REQ_FUA | REQ_PREFLUSH only if it's the tail */
	if (test_opt(sb, BARRIER) && is_tail)
		write_flags |= REQ_FUA | REQ_PREFLUSH;
	lock_buffer(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE | write_flags, bh);
	EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case, a
 * new block is requested from jbd2 and the CRC is updated to reflect
 * the padding we added.
 */
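
/*
 * A hypothetical worked example, assuming a 4096-byte journal block and the
 * 4-byte TLV header: if the current block is filled up to offset 4088, only
 * 4 value bytes remain after a PAD header would be written, so a reservation
 * of, say, 100 bytes cannot be satisfied in place. The remainder is consumed
 * by an EXT4_FC_TAG_PAD tlv with a zeroed 4-byte value, the block's checksum
 * is folded into *crc, the block is submitted, and the 100 bytes are handed
 * out at the start of a fresh block obtained from jbd2.
 */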
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int remaining;
	u8 *dst;

	/*
	 * If 'len' is too long to fit in any block alongside a PAD tlv, then we
	 * cannot fulfill the request.
	 */
	if (len > bsize - EXT4_FC_TAG_BASE_LEN)
		return NULL;

	if (!sbi->s_fc_bh) {
		ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
		if (ret)
			return NULL;
		sbi->s_fc_bh = bh;
	}
	dst = sbi->s_fc_bh->b_data + off;

	/*
	 * Allocate the bytes in the current block if we can do so while still
	 * leaving enough space for a PAD tlv.
	 */
	remaining = bsize - EXT4_FC_TAG_BASE_LEN - off;
	if (len <= remaining) {
		sbi->s_fc_bytes += len;
		return dst;
	}

	/*
	 * Else, terminate the current block with a PAD tlv, then allocate a new
	 * block and allocate the bytes at the start of that new block.
	 */

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	tl.fc_len = cpu_to_le16(remaining);
	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
	memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining);
	*crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize);

	ext4_fc_submit_bh(sb, false);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	sbi->s_fc_bytes += bsize - off + len;
	return sbi->s_fc_bh->b_data;
}

/*
 * Complete a fast commit by writing tail tag.
 *
 * Writing tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing tail tag, even if there's space remaining
 * in the block, next commit shouldn't use it. That's why tail tag
 * has the length as that of the remaining space on the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block to accommodate this tail.
	 */
	dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
	dst += EXT4_FC_TAG_BASE_LEN;
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid));
	dst += sizeof(tail.fc_tid);
	crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data,
			  dst - (u8 *)sbi->s_fc_bh->b_data);
	tail.fc_crc = cpu_to_le32(crc);
	memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc));
	dst += sizeof(tail.fc_crc);
	memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */

	ext4_fc_submit_bh(sb, true);

	return 0;
}

/*
 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 * Returns false if there's not enough space.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
			    u32 *crc)
{
	struct ext4_fc_tl tl;
	u8 *dst;

	dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc);
	if (!dst)
		return false;

	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(len);

	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
	memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len);

	return true;
}

/* Same as above, but adds dentry tlv. */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
				   struct ext4_fc_dentry_update *fc_dentry)
{
	struct ext4_fc_dentry_info fcd;
	struct ext4_fc_tl tl;
	int dlen = fc_dentry->fcd_name.len;
	u8 *dst = ext4_fc_reserve_space(sb,
			EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc);

	if (!dst)
		return false;

	fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
	fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
	tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
	dst += EXT4_FC_TAG_BASE_LEN;
	memcpy(dst, &fcd, sizeof(fcd));
	dst += sizeof(fcd);
	memcpy(dst, fc_dentry->fcd_name.name, dlen);

	return true;
}

/*
 * Writes inode in the fast commit space under TLV with tag @tag.
 * Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
		inode_len = EXT4_INODE_SIZE(inode->i_sb);
	else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	ret = -ECANCELED;
	dst = ext4_fc_reserve_space(inode->i_sb,
		EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		goto err;

	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
	dst += EXT4_FC_TAG_BASE_LEN;
	memcpy(dst, &fc_inode, sizeof(fc_inode));
	dst += sizeof(fc_inode);
	memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len);
	ret = 0;
err:
	brelse(iloc.bh);
	return ret;
}

/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	ext4_debug("will try writing %d to %d for inode %ld\n",
		   cur_lblk_off, new_blk_size, inode->i_ino);

	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;

			/* Limit the number of blocks in one extent */
			map.m_len = min(max, map.m_len);

			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}

/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
					TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(journal, ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}

/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
	struct inode *inode;
	struct ext4_inode_info *ei;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}
		/*
		 * With fcd_dilist we need not loop in sbi->s_fc_q to get the
		 * corresponding inode pointer
		 */
		WARN_ON(list_empty(&fc_dentry->fcd_dilist));
		ei = list_first_entry(&fc_dentry->fcd_dilist,
				struct ext4_inode_info, i_fc_dilist);
		inode = &ei->vfs_inode;
		WARN_ON(inode->i_ino != fc_dentry->fcd_ino);

		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	spin_lock(&sbi->s_fc_lock);
	return ret;
}

static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_head head;
	struct inode *inode;
	struct blk_plug plug;
	int ret = 0;
	u32 crc = 0;

	ret = ext4_fc_submit_inode_data_all(journal);
	if (ret)
		return ret;

	ret = ext4_fc_wait_inode_data_all(journal);
	if (ret)
		return ret;

	/*
	 * If file system device is different from journal device, issue a cache
	 * flush before we start writing fast commit blocks.
	 */
	if (journal->j_fs_dev != journal->j_dev)
		blkdev_issue_flush(journal->j_fs_dev);

	blk_start_plug(&plug);
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Add a head tag only if this is the first fast commit
		 * in this TID.
		 */
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
			(u8 *)&head, &crc)) {
			ret = -ENOSPC;
			goto out;
		}
	}

	spin_lock(&sbi->s_fc_lock);
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret) {
		spin_unlock(&sbi->s_fc_lock);
		goto out;
	}

	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		inode = &iter->vfs_inode;
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			continue;

		spin_unlock(&sbi->s_fc_lock);
		ret = ext4_fc_write_inode_data(inode, &crc);
		if (ret)
			goto out;
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto out;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	ret = ext4_fc_write_tail(sb, crc);

out:
	blk_finish_plug(&plug);
	return ret;
}

static void ext4_fc_update_stats(struct super_block *sb, int status,
				 u64 commit_time, int nblks, tid_t commit_tid)
{
	struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;

	ext4_debug("Fast commit ended with status = %d for tid %u",
			status, commit_tid);
	if (status == EXT4_FC_STATUS_OK) {
		stats->fc_num_commits++;
		stats->fc_numblks += nblks;
		if (likely(stats->s_fc_avg_commit_time))
			stats->s_fc_avg_commit_time =
				(commit_time +
				 stats->s_fc_avg_commit_time * 3) / 4;
		else
			stats->s_fc_avg_commit_time = commit_time;
	} else if (status == EXT4_FC_STATUS_FAILED ||
		   status == EXT4_FC_STATUS_INELIGIBLE) {
		if (status == EXT4_FC_STATUS_FAILED)
			stats->fc_failed_commits++;
		stats->fc_ineligible_commits++;
	} else {
		stats->fc_skipped_commits++;
	}
	trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
}

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return jbd2_complete_transaction(journal, commit_tid);

	trace_ext4_fc_commit_start(sb, commit_tid);

	start_time = ktime_get();

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
		    tid_gt(commit_tid, journal->j_commit_sequence))
			goto restart_fc;
		ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
				commit_tid);
		return 0;
	} else if (ret) {
		/*
		 * Commit couldn't start. Just update stats and perform a
		 * full commit.
		 */
		ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
				commit_tid);
		return jbd2_complete_transaction(journal, commit_tid);
	}

	/*
	 * After establishing journal barrier via jbd2_fc_begin_commit(), check
	 * if we are fast commit ineligible.
	 */
	if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
		status = EXT4_FC_STATUS_INELIGIBLE;
		goto fallback;
	}

	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		status = EXT4_FC_STATUS_FAILED;
		goto fallback;
	}
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		status = EXT4_FC_STATUS_FAILED;
		goto fallback;
	}
	atomic_inc(&sbi->s_fc_subtid);
	ret = jbd2_fc_end_commit(journal);
	/*
	 * weight the commit time higher than the average time so we
	 * don't react too strongly to vast changes in the commit time
	 */
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
	return ret;

fallback:
	ret = jbd2_fc_end_commit_fallback(journal);
	ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
	return ret;
}

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter, *iter_n;
	struct ext4_fc_dentry_update *fc_dentry;

	if (full && sbi->s_fc_bh)
		sbi->s_fc_bh = NULL;

	trace_ext4_fc_cleanup(journal, full, tid);
	jbd2_fc_release_bufs(journal);

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
				 i_fc_list) {
		list_del_init(&iter->i_fc_list);
		ext4_clear_inode_state(&iter->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		if (tid_geq(tid, iter->i_sync_tid)) {
			ext4_fc_reset_inode(&iter->vfs_inode);
		} else if (full) {
			/*
			 * We are called after a full commit, inode has been
			 * modified while the commit was running. Re-enqueue
			 * the inode into STAGING, which will then be spliced
			 * back into MAIN. This cannot happen during
			 * fastcommit because the journal is locked all the
			 * time in that case (and tid doesn't increase so
			 * tid check above isn't reliable).
			 */
			list_add_tail(&EXT4_I(&iter->vfs_inode)->i_fc_list,
				      &sbi->s_fc_q[FC_Q_STAGING]);
		}
		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
		smp_mb();
#if (BITS_PER_LONG < 64)
		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
	}

	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
					     struct ext4_fc_dentry_update,
					     fcd_list);
		list_del_init(&fc_dentry->fcd_list);
		list_del_init(&fc_dentry->fcd_dilist);
		spin_unlock(&sbi->s_fc_lock);

		if (fc_dentry->fcd_name.name &&
		    fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
			kfree(fc_dentry->fcd_name.name);
		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
		spin_lock(&sbi->s_fc_lock);
	}

	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
				&sbi->s_fc_q[FC_Q_MAIN]);

	if (tid_geq(tid, sbi->s_fc_ineligible_tid)) {
		sbi->s_fc_ineligible_tid = 0;
		ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	}

	if (full)
		sbi->s_fc_bytes = 0;
	spin_unlock(&sbi->s_fc_lock);
	trace_ext4_fc_stats(sb);
}

/* Ext4 Replay Path Routines */

/* Helper struct for dentry replay routines */
struct dentry_info_args {
	int parent_ino, dname_len, ino, inode_len;
	char *dname;
};

/* Same as struct ext4_fc_tl, but uses native endianness fields */
struct ext4_fc_tl_mem {
	u16 fc_tag;
	u16 fc_len;
};

static inline void tl_to_darg(struct dentry_info_args *darg,
			      struct ext4_fc_tl_mem *tl, u8 *val)
{
	struct ext4_fc_dentry_info fcd;

	memcpy(&fcd, val, sizeof(fcd));

	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
	darg->ino = le32_to_cpu(fcd.fc_ino);
	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
	darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info);
}

static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val)
{
	struct ext4_fc_tl tl_disk;

	memcpy(&tl_disk, val, EXT4_FC_TAG_BASE_LEN);
	tl->fc_len = le16_to_cpu(tl_disk.fc_len);
	tl->fc_tag = le16_to_cpu(tl_disk.fc_tag);
}

/* Unlink replay function */
static int ext4_fc_replay_unlink(struct super_block *sb,
				 struct ext4_fc_tl_mem *tl, u8 *val)
{
	struct inode *inode, *old_parent;
	struct qstr entry;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	entry.name = darg.dname;
	entry.len = darg.dname_len;
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

	if (IS_ERR(inode)) {
		ext4_debug("Inode %d not found", darg.ino);
		return 0;
	}

	old_parent = ext4_iget(sb, darg.parent_ino,
			       EXT4_IGET_NORMAL);
	if (IS_ERR(old_parent)) {
		ext4_debug("Dir with inode %d not found", darg.parent_ino);
		iput(inode);
		return 0;
	}

	ret = __ext4_unlink(old_parent, &entry, inode, NULL);
	/* -ENOENT is ok because the dirent might not exist anymore. */
	if (ret == -ENOENT)
		ret = 0;
	iput(old_parent);
	iput(inode);
	return ret;
}

static int ext4_fc_replay_link_internal(struct super_block *sb,
				struct dentry_info_args *darg,
				struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		ext4_debug("Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		ext4_debug("Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		ext4_debug("Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that link already existed since data blocks
	 * for the dir in question got persisted before we crashed OR
	 * we replayed this tag and crashed before the entire replay
	 * could complete.
	 */
	if (ret && ret != -EEXIST) {
		ext4_debug("Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}

/* Link replay function */
static int ext4_fc_replay_link(struct super_block *sb,
			       struct ext4_fc_tl_mem *tl, u8 *val)
{
	struct inode *inode;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);
	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		ext4_debug("Inode not found.");
		return 0;
	}

	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	iput(inode);
	return ret;
}

/*
 * Record all the modified inodes during replay. We use this later to setup
 * block bitmaps correctly.
 */
static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
{
	struct ext4_fc_replay_state *state;
	int i;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++)
		if (state->fc_modified_inodes[i] == ino)
			return 0;
	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
		int *fc_modified_inodes;

		fc_modified_inodes = krealloc(state->fc_modified_inodes,
				sizeof(int) * (state->fc_modified_inodes_size +
				EXT4_FC_REPLAY_REALLOC_INCREMENT),
				GFP_KERNEL);
		if (!fc_modified_inodes)
			return -ENOMEM;
		state->fc_modified_inodes = fc_modified_inodes;
		state->fc_modified_inodes_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
	}
	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
	return 0;
}

/*
 * Inode replay function
 */
static int ext4_fc_replay_inode(struct super_block *sb,
				struct ext4_fc_tl_mem *tl, u8 *val)
{
	struct ext4_fc_inode fc_inode;
	struct ext4_inode *raw_inode;
	struct ext4_inode *raw_fc_inode;
	struct inode *inode = NULL;
	struct ext4_iloc iloc;
	int inode_len, ino, ret, tag = tl->fc_tag;
	struct ext4_extent_header *eh;
	size_t off_gen = offsetof(struct ext4_inode, i_generation);

	memcpy(&fc_inode, val, sizeof(fc_inode));

	ino = le32_to_cpu(fc_inode.fc_ino);
	trace_ext4_fc_replay(sb, tag, ino, 0, 0);

	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (!IS_ERR(inode)) {
		ext4_ext_clear_bb(inode);
		iput(inode);
	}
	inode = NULL;

	ret = ext4_fc_record_modified_inode(sb, ino);
	if (ret)
		goto out;

	raw_fc_inode = (struct ext4_inode *)
		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
	if (ret)
		goto out;

	inode_len = tl->fc_len - sizeof(struct ext4_fc_inode);
	raw_inode = ext4_raw_inode(&iloc);

	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
	memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen,
	       inode_len - off_gen);
	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
		if (eh->eh_magic != EXT4_EXT_MAGIC) {
			memset(eh, 0, sizeof(*eh));
			eh->eh_magic = EXT4_EXT_MAGIC;
			eh->eh_max = cpu_to_le16(
				(sizeof(raw_inode->i_block) -
				 sizeof(struct ext4_extent_header))
				 / sizeof(struct ext4_extent));
		}
	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
			sizeof(raw_inode->i_block));
	}

	/* Immediately update the inode on disk. */
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	if (ret)
		goto out;
	ret = sync_dirty_buffer(iloc.bh);
	if (ret)
		goto out;
	ret = ext4_mark_inode_used(sb, ino);
	if (ret)
		goto out;

	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		ext4_debug("Inode not found.");
		return -EFSCORRUPTED;
	}

	/*
	 * Our allocator could have made different decisions than before
	 * crashing. This should be fixed but until then, we calculate
	 * the number of blocks the inode occupies.
	 */
	if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
		ext4_ext_replay_set_iblocks(inode);

	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
	ext4_reset_inode_seed(inode);

	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	sync_dirty_buffer(iloc.bh);
	brelse(iloc.bh);
out:
	iput(inode);
	if (!ret)
		blkdev_issue_flush(sb->s_bdev);

	return 0;
}

/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
 * inode for which we are trying to create a dentry here, should already have
 * been replayed before we start here.
 */
static int ext4_fc_replay_create(struct super_block *sb,
				 struct ext4_fc_tl_mem *tl, u8 *val)
{
	int ret = 0;
	struct inode *inode = NULL;
	struct inode *dir = NULL;
	struct dentry_info_args darg;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
			darg.parent_ino, darg.dname_len);

	/* This takes care of updating the group descriptor and other metadata */
	ret = ext4_mark_inode_used(sb, darg.ino);
	if (ret)
		goto out;

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		ext4_debug("inode %d not found.", darg.ino);
		inode = NULL;
		ret = -EINVAL;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that the
		 * dot and dot dot dirents are setup properly.
		 */
		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
		if (IS_ERR(dir)) {
			ext4_debug("Dir %d not found.", darg.ino);
			goto out;
		}
		ret = ext4_init_new_dir(NULL, dir, inode);
		iput(dir);
		if (ret) {
			ret = 0;
			goto out;
		}
	}
	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	if (ret)
		goto out;
	set_nlink(inode, 1);
	ext4_mark_inode_dirty(NULL, inode);
out:
	iput(inode);
	return ret;
}

/*
 * Record physical disk regions which are in use as per fast commit area,
 * and used by inodes during replay phase. Our simple replay phase
 * allocator excludes these regions from allocation.
 */
int ext4_fc_record_regions(struct super_block *sb, int ino,
		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
{
	struct ext4_fc_replay_state *state;
	struct ext4_fc_alloc_region *region;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	/*
	 * During replay phase, fc_regions_valid may not be the same as
	 * fc_regions_used; update it when recording new additions.
	 */
	if (replay && state->fc_regions_used != state->fc_regions_valid)
		state->fc_regions_used = state->fc_regions_valid;
	if (state->fc_regions_used == state->fc_regions_size) {
		struct ext4_fc_alloc_region *fc_regions;

		fc_regions = krealloc(state->fc_regions,
				      sizeof(struct ext4_fc_alloc_region) *
				      (state->fc_regions_size +
				       EXT4_FC_REPLAY_REALLOC_INCREMENT),
				      GFP_KERNEL);
		if (!fc_regions)
			return -ENOMEM;
		state->fc_regions_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
		state->fc_regions = fc_regions;
	}
	region = &state->fc_regions[state->fc_regions_used++];
	region->ino = ino;
	region->lblk = lblk;
	region->pblk = pblk;
	region->len = len;

	if (replay)
		state->fc_regions_valid++;

	return 0;
}

/* Replay add range tag */
static int ext4_fc_replay_add_range(struct super_block *sb,
				    struct ext4_fc_tl_mem *tl, u8 *val)
{
	struct ext4_fc_add_range fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		ext4_debug("Inode not found.");
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
	if (ret)
		goto out;

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			goto out;

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, path, 0);
			if (IS_ERR(path))
				goto out;
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			path = ext4_ext_insert_extent(NULL, inode,
						      path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			if (IS_ERR(path))
				goto out;
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed. This can happen
			 * if this range was removed and then reallocated to
			 * map to new physical blocks during a fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret)
				goto out;
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
			goto next;
		}

		/* Range is mapped and needs a state change */
		ext4_debug("Converting from %ld to %d %lld",
			map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret)
			goto out;
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
					sb->s_blocksize_bits);
out:
	ext4_free_ext_path(path);
	iput(inode);
	return 0;
}

/* Replay DEL_RANGE tag */
static int
ext4_fc_replay_del_range(struct super_block *sb,
			 struct ext4_fc_tl_mem *tl, u8 *val)
{
	struct inode *inode;
	struct ext4_fc_del_range lrange;
	struct ext4_map_blocks map;
	ext4_lblk_t cur, remaining;
	int ret;

	memcpy(&lrange, val, sizeof(lrange));
	cur = le32_to_cpu(lrange.fc_lblk);
	remaining = le32_to_cpu(lrange.fc_len);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
		le32_to_cpu(lrange.fc_ino), cur, remaining);

	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino));
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
	if (ret)
		goto out;

	ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n",
			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
			le32_to_cpu(lrange.fc_len));
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;

		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			remaining -= ret;
			cur += ret;
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
		} else {
			remaining -= map.m_len;
			cur += map.m_len;
		}
	}

	down_write(&EXT4_I(inode)->i_data_sem);
	ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
				le32_to_cpu(lrange.fc_lblk) +
				le32_to_cpu(lrange.fc_len) - 1);
	up_write(&EXT4_I(inode)->i_data_sem);
	if (ret)
		goto out;
	ext4_ext_replay_shrink_inode(inode,
		i_size_read(inode) >> sb->s_blocksize_bits);
	ext4_mark_inode_dirty(NULL, inode);
out:
	iput(inode);
	return 0;
}

static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
			EXT4_IGET_NORMAL);
		if (IS_ERR(inode)) {
			ext4_debug("Inode %d not found.",
				state->fc_modified_inodes[i]);
			continue;
		}
		cur = 0;
		end = EXT_MAX_BLOCKS;
		if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
			iput(inode);
			continue;
		}
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				path = ext4_find_extent(inode, map.m_lblk, path, 0);
				if (!IS_ERR(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, true);
				} else {
					path = NULL;
				}
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
						map.m_len, true);
			}
			cur = cur + (map.m_len ? map.m_len : 1);
		}
		iput(inode);
	}
	ext4_free_ext_path(path);
}

/*
 * Check if block is in excluded regions for block allocation. The simple
 * allocator that runs during replay phase calls this function to see
 * if it is okay to use a block.
 */
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
{
	int i;
	struct ext4_fc_replay_state *state;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_regions_valid; i++) {
		if (state->fc_regions[i].ino == 0 ||
		    state->fc_regions[i].len == 0)
			continue;
		if (in_range(blk, state->fc_regions[i].pblk,
			     state->fc_regions[i].len))
			return true;
	}
	return false;
}

/* Cleanup function called after replay */
void ext4_fc_replay_cleanup(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
	kfree(sbi->s_fc_replay_state.fc_regions);
	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
}

static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi,
				      int tag, int len)
{
	switch (tag) {
	case EXT4_FC_TAG_ADD_RANGE:
		return len == sizeof(struct ext4_fc_add_range);
	case EXT4_FC_TAG_DEL_RANGE:
		return len == sizeof(struct ext4_fc_del_range);
	case EXT4_FC_TAG_CREAT:
	case EXT4_FC_TAG_LINK:
	case EXT4_FC_TAG_UNLINK:
		len -= sizeof(struct ext4_fc_dentry_info);
		return len >= 1 && len <= EXT4_NAME_LEN;
	case EXT4_FC_TAG_INODE:
		len -= sizeof(struct ext4_fc_inode);
		return len >= EXT4_GOOD_OLD_INODE_SIZE &&
			len <= sbi->s_inode_size;
	case EXT4_FC_TAG_PAD:
		return true; /* padding can have any length */
	case EXT4_FC_TAG_TAIL:
		return len >= sizeof(struct ext4_fc_tail);
	case EXT4_FC_TAG_HEAD:
		return len == sizeof(struct ext4_fc_head);
	}
	return false;
}
/*
 * Recovery Scan phase handler
 *
 * This function is called during the scan phase and is responsible for
 * doing the following things:
 * - Make sure the fast commit area has valid tags for replay
 * - Count the number of tags that need to be replayed by the replay handler
 * - Create a list of excluded blocks for allocation during the replay phase
 *
 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that the scan is
 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that the scan has finished and JBD2 can now start the replay
 * phase. It returns a negative error to indicate that there was an error. At
 * the end of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags
 * is set to the number of tags that need to be replayed during the replay
 * phase. A sketch of how a caller drives this contract follows the function.
 */
static int ext4_fc_replay_scan(journal_t *journal,
				struct buffer_head *bh, int off,
				tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range ext;
	struct ext4_fc_tl_mem tl;
	struct ext4_fc_tail tail;
	__u8 *start, *end, *cur, *val;
	struct ext4_fc_head head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	start = (u8 *)bh->b_data;
	end = start + journal->j_blocksize;

	if (state->fc_replay_expected_off == 0) {
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
	for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
	     cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
		ext4_fc_get_tl(&tl, cur);
		val = cur + EXT4_FC_TAG_BASE_LEN;
		if (tl.fc_len > end - val ||
		    !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) {
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
			goto out_err;
		}
		ext4_debug("Scan phase, tag:%s, blk %lld\n",
			   tag2str(tl.fc_tag), bh->b_blocknr);
		switch (tl.fc_tag) {
		case EXT4_FC_TAG_ADD_RANGE:
			memcpy(&ext, val, sizeof(ext));
			ex = (struct ext4_extent *)&ext.fc_ex;
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext.fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex), 0);
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
				EXT4_FC_TAG_BASE_LEN + tl.fc_len);
			break;
		case EXT4_FC_TAG_TAIL:
			state->fc_cur_tag++;
			memcpy(&tail, val, sizeof(tail));
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
						EXT4_FC_TAG_BASE_LEN +
						offsetof(struct ext4_fc_tail,
						fc_crc));
			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			memcpy(&head, val, sizeof(head));
			if (le32_to_cpu(head.fc_features) &
				~EXT4_FC_SUPPORTED_FEATURES) {
				ret = -EOPNOTSUPP;
				break;
			}
			if (le32_to_cpu(head.fc_tid) != expected_tid) {
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
				EXT4_FC_TAG_BASE_LEN + tl.fc_len);
			break;
		default:
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}
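/*
 * Illustrative sketch (not the real jbd2 recovery loop): how a caller is
 * expected to drive the scan contract documented above. The loop bounds and
 * next_fc_block() are hypothetical stand-ins; in reality jbd2 invokes
 * journal->j_fc_replay_callback once per fast commit block and pass.
 *
 *	for (off = 0; off < num_fc_blocks; off++) {
 *		bh = next_fc_block(journal, off);	// hypothetical
 *		ret = ext4_fc_replay_scan(journal, bh, off, tid);
 *		if (ret == JBD2_FC_REPLAY_CONTINUE)
 *			continue;	// scan needs more blocks
 *		if (ret == JBD2_FC_REPLAY_STOP)
 *			break;		// scan done, move on to replay pass
 *		return ret;		// negative: corrupted or unsupported
 *	}
 */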
/*
 * Main recovery path entry point.
 * The meaning of the return codes is the same as for the scan handler above.
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
				enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl_mem tl;
	__u8 *start, *end, *cur, *val;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail tail;

	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		ext4_debug("Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

#ifdef CONFIG_EXT4_DEBUG
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (u8 *)bh->b_data;
	end = start + journal->j_blocksize;

	for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
	     cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
		ext4_fc_get_tl(&tl, cur);
		val = cur + EXT4_FC_TAG_BASE_LEN;

		if (state->fc_replay_num_tags == 0) {
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}

		ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag));
		state->fc_replay_num_tags--;
		switch (tl.fc_tag) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, &tl, val);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, &tl, val);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, &tl, val);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, &tl, val);
			break;
		case EXT4_FC_TAG_PAD:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
					     tl.fc_len, 0);
			break;
		case EXT4_FC_TAG_TAIL:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL,
					     0, tl.fc_len, 0);
			memcpy(&tail, val, sizeof(tail));
			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}
void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
	/*
	 * We set the replay callback even if fast commit is disabled because
	 * we may still have fast commit blocks that need to be replayed even
	 * if fast commit has now been turned off.
	 */
	journal->j_fc_replay_callback = ext4_fc_replay;
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return;
	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}
static const char * const fc_ineligible_reasons[] = {
	[EXT4_FC_REASON_XATTR] = "Extended attributes changed",
	[EXT4_FC_REASON_CROSS_RENAME] = "Cross rename",
	[EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed",
	[EXT4_FC_REASON_NOMEM] = "Insufficient memory",
	[EXT4_FC_REASON_SWAP_BOOT] = "Swap boot",
	[EXT4_FC_REASON_RESIZE] = "Resize",
	[EXT4_FC_REASON_RENAME_DIR] = "Dir renamed",
	[EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op",
	[EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling",
	[EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename",
};
int ext4_fc_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
	int i;

	if (v != SEQ_START_TOKEN)
		return 0;

	seq_printf(seq,
		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
		   stats->fc_num_commits, stats->fc_ineligible_commits,
		   stats->fc_numblks,
		   div_u64(stats->s_fc_avg_commit_time, 1000));
	seq_puts(seq, "Ineligible reasons:\n");
	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
			stats->fc_ineligible_reason_count[i]);

	return 0;
}
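/*
 * Example of the resulting output, with made-up numbers (readable via the
 * per-device procfs entry, typically /proc/fs/ext4/<dev>/fc_info):
 *
 *	fc stats:
 *	120 commits
 *	3 ineligible
 *	460 numblks
 *	1250us avg_commit_time
 *	Ineligible reasons:
 *	"Extended attributes changed":	2
 *	"Cross rename":	0
 *	...
 */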
int __init ext4_fc_init_dentry_cache(void)
{
	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
					   SLAB_RECLAIM_ACCOUNT);

	if (ext4_fc_dentry_cachep == NULL)
		return -ENOMEM;

	return 0;
}
void ext4_fc_destroy_dentry_cache(void)
{
	kmem_cache_destroy(ext4_fc_dentry_cachep);
}