Merge tag 'regmap-fix-v5.11-rc2' of git://git.kernel.org/pub/scm/linux/kernel/git...
[linux/fpc-iii.git] / fs / ext4 / fast_commit.c
blob4fcc21c25e79399c8b3ae9069d93d063e78d50d2
1 // SPDX-License-Identifier: GPL-2.0
3 /*
4 * fs/ext4/fast_commit.c
6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
8 * Ext4 fast commits routines.
9 */
10 #include "ext4.h"
11 #include "ext4_jbd2.h"
12 #include "ext4_extents.h"
13 #include "mballoc.h"
16 * Ext4 Fast Commits
17 * -----------------
19 * Ext4 fast commits implement fine grained journalling for Ext4.
21 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
22 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
23 * TLV during the recovery phase. For the scenarios for which we currently
24 * don't have replay code, fast commit falls back to full commits.
25 * Fast commits record delta in one of the following three categories.
27 * (A) Directory entry updates:
29 * - EXT4_FC_TAG_UNLINK - records directory entry unlink
30 * - EXT4_FC_TAG_LINK - records directory entry link
31 * - EXT4_FC_TAG_CREAT - records inode and directory entry creation
33 * (B) File specific data range updates:
35 * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode
36 * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode
38 * (C) Inode metadata (mtime / ctime etc):
40 * - EXT4_FC_TAG_INODE - record the inode that should be replayed
41 * during recovery. Note that iblocks field is
42 * not replayed and instead derived during
43 * replay.
44 * Commit Operation
45 * ----------------
46 * With fast commits, we maintain all the directory entry operations in the
47 * order in which they are issued in an in-memory queue. This queue is flushed
48 * to disk during the commit operation. We also maintain a list of inodes
49 * that need to be committed during a fast commit in another in memory queue of
50 * inodes. During the commit operation, we commit in the following order:
52 * [1] Lock inodes for any further data updates by setting COMMITTING state
53 * [2] Submit data buffers of all the inodes
54 * [3] Wait for [2] to complete
55 * [4] Commit all the directory entry updates in the fast commit space
56 * [5] Commit all the changed inode structures
57 * [6] Write tail tag (this tag ensures the atomicity, please read the following
58 * section for more details).
59 * [7] Wait for [4], [5] and [6] to complete.
61 * All the inode updates must call ext4_fc_start_update() before starting an
62 * update. If such an ongoing update is present, fast commit waits for it to
63 * complete. The completion of such an update is marked by
64 * ext4_fc_stop_update().
66 * Fast Commit Ineligibility
67 * -------------------------
68 * Not all operations are supported by fast commits today (e.g. extended
69 * attributes). Fast commit ineligibility is marked by calling one of the
70 * two following functions:
72 * - ext4_fc_mark_ineligible(): This makes the next fast commit operation fall
73 * back to a full commit. This is useful in case of transient errors.
75 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
76 * the fast commits happening between ext4_fc_start_ineligible() and
77 * ext4_fc_stop_ineligible() and one fast commit after the call to
78 * ext4_fc_stop_ineligible() to fall back to full commits. It is important to
79 * make one more fast commit to fall back to full commit after stop call so
80 * that it is guaranteed that the fast commit ineligible operation contained
81 * within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
82 * followed by at least 1 full commit.
84 * Atomicity of commits
85 * --------------------
86 * In order to guarantee atomicity during the commit operation, fast commit
87 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
88 * tag contains CRC of the contents and TID of the transaction after which
89 * this fast commit should be applied. Recovery code replays fast commit
90 * logs only if there's at least 1 valid tail present. For every fast commit
91 * operation, there is 1 tail. This means, we may end up with multiple tails
92 * in the fast commit space. Here's an example:
94 * - Create a new file A and remove existing file B
95 * - fsync()
96 * - Append contents to file A
97 * - Truncate file A
98 * - fsync()
100 * The fast commit space at the end of above operations would look like this:
101 * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
102 * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->|
104 * Replay code should thus check for all the valid tails in the FC area.
106 * Fast Commit Replay Idempotence
107 * ------------------------------
109 * Fast commit tags are idempotent in nature provided the recovery code follows
110 * certain rules. The guiding principle that the commit path follows while
111 * committing is that it stores the result of a particular operation instead of
112 * storing the procedure.
114 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
115 * was associated with inode 10. During fast commit, instead of storing this
116 * operation as a procedure "rename a to b", we store the resulting file system
117 * state as a "series" of outcomes:
119 * - Link dirent b to inode 10
120 * - Unlink dirent a
121 * - Inode <10> with valid refcount
123 * Now when recovery code runs, it needs to "enforce" this state on the file
124 * system. This is what guarantees idempotence of fast commit replay.
126 * Let's take an example of a procedure that is not idempotent and see how fast
127 * commits make it idempotent. Consider following sequence of operations:
129 * rm A; mv B A; read A
130 * (x) (y) (z)
132 * (x), (y) and (z) are the points at which we can crash. If we store this
133 * sequence of operations as is then the replay is not idempotent. Let's say
134 * while in replay, we crash at (z). During the second replay, file A (which was
135 * actually created as a result of "mv B A" operation) would get deleted. Thus,
136 * file named A would be absent when we try to read A. So, this sequence of
137 * operations is not idempotent. However, as mentioned above, instead of storing
138 * the procedure fast commits store the outcome of each procedure. Thus the fast
139 * commit log for above procedure would be as follows:
141 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
142 * inode 11 before the replay)
144 * [Unlink A] [Link A to inode 11] [Unlink B] [Inode 11]
145 * (w) (x) (y) (z)
147 * If we crash at (z), we will have file A linked to inode 11. During the second
148 * replay, we will remove file A (inode 11). But we will create it back and make
149 * it point to inode 11. We won't find B, so we'll just skip that step. At this
150 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
151 * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
152 * similarly. Thus, by converting a non-idempotent procedure into a series of
153 * idempotent outcomes, fast commits ensured idempotence during the replay.
155 * TODOs
156 * -----
158 * 0) Fast commit replay path hardening: Fast commit replay code should use
159 * journal handles to make sure all the updates it does during the replay
160 * path are atomic. With that if we crash during fast commit replay, after
161 * trying to do recovery again, we will find a file system where fast commit
162 * area is invalid (because new full commit would be found). In order to deal
163 * with that, fast commit replay code should ensure that the "FC_REPLAY"
164 * superblock state is persisted before starting the replay, so that after
165 * the crash, fast commit recovery code can look at that flag and perform
166 * fast commit recovery even if that area is invalidated by later full
167 * commits.
169 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
170 * eligible update must be protected within ext4_fc_start_update() and
171 * ext4_fc_stop_update(). These routines are called at much higher
172 * routines. This can be made more fine grained by combining with
173 * ext4_journal_start().
175 * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
177 * 3) Handle more ineligible cases.
180 #include <trace/events/ext4.h>
/* Slab cache that backs the struct ext4_fc_dentry_update nodes queued by __track_dentry_update(). */
181 static struct kmem_cache *ext4_fc_dentry_cachep;
183 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
185 BUFFER_TRACE(bh, "");
186 if (uptodate) {
187 ext4_debug("%s: Block %lld up-to-date",
188 __func__, bh->b_blocknr);
189 set_buffer_uptodate(bh);
190 } else {
191 ext4_debug("%s: Block %lld not up-to-date",
192 __func__, bh->b_blocknr);
193 clear_buffer_uptodate(bh);
196 unlock_buffer(bh);
199 static inline void ext4_fc_reset_inode(struct inode *inode)
201 struct ext4_inode_info *ei = EXT4_I(inode);
203 ei->i_fc_lblk_start = 0;
204 ei->i_fc_lblk_len = 0;
207 void ext4_fc_init_inode(struct inode *inode)
209 struct ext4_inode_info *ei = EXT4_I(inode);
211 ext4_fc_reset_inode(inode);
212 ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
213 INIT_LIST_HEAD(&ei->i_fc_list);
214 init_waitqueue_head(&ei->i_fc_wait);
215 atomic_set(&ei->i_fc_updates, 0);
218 /* This function must be called with sbi->s_fc_lock held. */
219 static void ext4_fc_wait_committing_inode(struct inode *inode)
220 __releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
222 wait_queue_head_t *wq;
223 struct ext4_inode_info *ei = EXT4_I(inode);
225 #if (BITS_PER_LONG < 64)
226 DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
227 EXT4_STATE_FC_COMMITTING);
228 wq = bit_waitqueue(&ei->i_state_flags,
229 EXT4_STATE_FC_COMMITTING);
230 #else
231 DEFINE_WAIT_BIT(wait, &ei->i_flags,
232 EXT4_STATE_FC_COMMITTING);
233 wq = bit_waitqueue(&ei->i_flags,
234 EXT4_STATE_FC_COMMITTING);
235 #endif
236 lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
237 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
238 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
239 schedule();
240 finish_wait(wq, &wait.wq_entry);
244 * Inform Ext4's fast about start of an inode update
246 * This function is called by the high level call VFS callbacks before
247 * performing any inode update. This function blocks if there's an ongoing
248 * fast commit on the inode in question.
250 void ext4_fc_start_update(struct inode *inode)
252 struct ext4_inode_info *ei = EXT4_I(inode);
254 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
255 (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
256 return;
258 restart:
259 spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
260 if (list_empty(&ei->i_fc_list))
261 goto out;
263 if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
264 ext4_fc_wait_committing_inode(inode);
265 goto restart;
267 out:
268 atomic_inc(&ei->i_fc_updates);
269 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
273 * Stop inode update and wake up waiting fast commits if any.
275 void ext4_fc_stop_update(struct inode *inode)
277 struct ext4_inode_info *ei = EXT4_I(inode);
279 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
280 (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
281 return;
283 if (atomic_dec_and_test(&ei->i_fc_updates))
284 wake_up_all(&ei->i_fc_wait);
288 * Remove inode from fast commit list. If the inode is being committed
289 * we wait until inode commit is done.
291 void ext4_fc_del(struct inode *inode)
293 struct ext4_inode_info *ei = EXT4_I(inode);
295 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
296 (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
297 return;
299 restart:
300 spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
301 if (list_empty(&ei->i_fc_list)) {
302 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
303 return;
306 if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
307 ext4_fc_wait_committing_inode(inode);
308 goto restart;
310 list_del_init(&ei->i_fc_list);
311 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
315 * Mark file system as fast commit ineligible. This means that next commit
316 * operation would result in a full jbd2 commit.
318 void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
320 struct ext4_sb_info *sbi = EXT4_SB(sb);
322 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
323 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
324 return;
326 ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
327 WARN_ON(reason >= EXT4_FC_REASON_MAX);
328 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
332 * Start a fast commit ineligible update. Any commits that happen while
333 * such an operation is in progress fall back to full commits.
335 void ext4_fc_start_ineligible(struct super_block *sb, int reason)
337 struct ext4_sb_info *sbi = EXT4_SB(sb);
339 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
340 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
341 return;
343 WARN_ON(reason >= EXT4_FC_REASON_MAX);
344 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
345 atomic_inc(&sbi->s_fc_ineligible_updates);
349 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
350 * to ensure that after stopping the ineligible update, at least one full
351 * commit takes place.
353 void ext4_fc_stop_ineligible(struct super_block *sb)
355 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
356 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
357 return;
359 ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
360 atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
363 static inline int ext4_fc_is_ineligible(struct super_block *sb)
365 return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
366 atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
370 * Generic fast commit tracking function. If this is the first time this we are
371 * called after a full commit, we initialize fast commit fields and then call
372 * __fc_track_fn() with update = 0. If we have already been called after a full
373 * commit, we pass update = 1. Based on that, the track function can determine
374 * if it needs to track a field for the first time or if it needs to just
375 * update the previously tracked value.
377 * If enqueue is set, this function enqueues the inode in fast commit list.
379 static int ext4_fc_track_template(
380 handle_t *handle, struct inode *inode,
381 int (*__fc_track_fn)(struct inode *, void *, bool),
382 void *args, int enqueue)
384 bool update = false;
385 struct ext4_inode_info *ei = EXT4_I(inode);
386 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
387 tid_t tid = 0;
388 int ret;
390 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
391 (sbi->s_mount_state & EXT4_FC_REPLAY))
392 return -EOPNOTSUPP;
394 if (ext4_fc_is_ineligible(inode->i_sb))
395 return -EINVAL;
397 tid = handle->h_transaction->t_tid;
398 mutex_lock(&ei->i_fc_lock);
399 if (tid == ei->i_sync_tid) {
400 update = true;
401 } else {
402 ext4_fc_reset_inode(inode);
403 ei->i_sync_tid = tid;
405 ret = __fc_track_fn(inode, args, update);
406 mutex_unlock(&ei->i_fc_lock);
408 if (!enqueue)
409 return ret;
411 spin_lock(&sbi->s_fc_lock);
412 if (list_empty(&EXT4_I(inode)->i_fc_list))
413 list_add_tail(&EXT4_I(inode)->i_fc_list,
414 (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
415 &sbi->s_fc_q[FC_Q_STAGING] :
416 &sbi->s_fc_q[FC_Q_MAIN]);
417 spin_unlock(&sbi->s_fc_lock);
419 return ret;
/* Argument bundle passed to __track_dentry_update() via its void *arg. */
struct __track_dentry_update_args {
	struct dentry *dentry;	/* directory entry being updated */
	int op;			/* EXT4_FC_TAG_* operation to record */
};
427 /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
428 static int __track_dentry_update(struct inode *inode, void *arg, bool update)
430 struct ext4_fc_dentry_update *node;
431 struct ext4_inode_info *ei = EXT4_I(inode);
432 struct __track_dentry_update_args *dentry_update =
433 (struct __track_dentry_update_args *)arg;
434 struct dentry *dentry = dentry_update->dentry;
435 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
437 mutex_unlock(&ei->i_fc_lock);
438 node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
439 if (!node) {
440 ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
441 mutex_lock(&ei->i_fc_lock);
442 return -ENOMEM;
445 node->fcd_op = dentry_update->op;
446 node->fcd_parent = dentry->d_parent->d_inode->i_ino;
447 node->fcd_ino = inode->i_ino;
448 if (dentry->d_name.len > DNAME_INLINE_LEN) {
449 node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
450 if (!node->fcd_name.name) {
451 kmem_cache_free(ext4_fc_dentry_cachep, node);
452 ext4_fc_mark_ineligible(inode->i_sb,
453 EXT4_FC_REASON_NOMEM);
454 mutex_lock(&ei->i_fc_lock);
455 return -ENOMEM;
457 memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
458 dentry->d_name.len);
459 } else {
460 memcpy(node->fcd_iname, dentry->d_name.name,
461 dentry->d_name.len);
462 node->fcd_name.name = node->fcd_iname;
464 node->fcd_name.len = dentry->d_name.len;
466 spin_lock(&sbi->s_fc_lock);
467 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
468 list_add_tail(&node->fcd_list,
469 &sbi->s_fc_dentry_q[FC_Q_STAGING]);
470 else
471 list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
472 spin_unlock(&sbi->s_fc_lock);
473 mutex_lock(&ei->i_fc_lock);
475 return 0;
478 void __ext4_fc_track_unlink(handle_t *handle,
479 struct inode *inode, struct dentry *dentry)
481 struct __track_dentry_update_args args;
482 int ret;
484 args.dentry = dentry;
485 args.op = EXT4_FC_TAG_UNLINK;
487 ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
488 (void *)&args, 0);
489 trace_ext4_fc_track_unlink(inode, dentry, ret);
492 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
494 __ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
497 void __ext4_fc_track_link(handle_t *handle,
498 struct inode *inode, struct dentry *dentry)
500 struct __track_dentry_update_args args;
501 int ret;
503 args.dentry = dentry;
504 args.op = EXT4_FC_TAG_LINK;
506 ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
507 (void *)&args, 0);
508 trace_ext4_fc_track_link(inode, dentry, ret);
511 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
513 __ext4_fc_track_link(handle, d_inode(dentry), dentry);
516 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
518 struct __track_dentry_update_args args;
519 struct inode *inode = d_inode(dentry);
520 int ret;
522 args.dentry = dentry;
523 args.op = EXT4_FC_TAG_CREAT;
525 ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
526 (void *)&args, 0);
527 trace_ext4_fc_track_create(inode, dentry, ret);
530 /* __track_fn for inode tracking */
531 static int __track_inode(struct inode *inode, void *arg, bool update)
533 if (update)
534 return -EEXIST;
536 EXT4_I(inode)->i_fc_lblk_len = 0;
538 return 0;
541 void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
543 int ret;
545 if (S_ISDIR(inode->i_mode))
546 return;
548 if (ext4_should_journal_data(inode)) {
549 ext4_fc_mark_ineligible(inode->i_sb,
550 EXT4_FC_REASON_INODE_JOURNAL_DATA);
551 return;
554 ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
555 trace_ext4_fc_track_inode(inode, ret);
558 struct __track_range_args {
559 ext4_lblk_t start, end;
562 /* __track_fn for tracking data updates */
563 static int __track_range(struct inode *inode, void *arg, bool update)
565 struct ext4_inode_info *ei = EXT4_I(inode);
566 ext4_lblk_t oldstart;
567 struct __track_range_args *__arg =
568 (struct __track_range_args *)arg;
570 if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
571 ext4_debug("Special inode %ld being modified\n", inode->i_ino);
572 return -ECANCELED;
575 oldstart = ei->i_fc_lblk_start;
577 if (update && ei->i_fc_lblk_len > 0) {
578 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
579 ei->i_fc_lblk_len =
580 max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
581 ei->i_fc_lblk_start + 1;
582 } else {
583 ei->i_fc_lblk_start = __arg->start;
584 ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
587 return 0;
590 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
591 ext4_lblk_t end)
593 struct __track_range_args args;
594 int ret;
596 if (S_ISDIR(inode->i_mode))
597 return;
599 args.start = start;
600 args.end = end;
602 ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);
604 trace_ext4_fc_track_range(inode, start, end, ret);
607 static void ext4_fc_submit_bh(struct super_block *sb)
609 int write_flags = REQ_SYNC;
610 struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
612 /* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */
613 if (test_opt(sb, BARRIER))
614 write_flags |= REQ_FUA | REQ_PREFLUSH;
615 lock_buffer(bh);
616 set_buffer_dirty(bh);
617 set_buffer_uptodate(bh);
618 bh->b_end_io = ext4_end_buffer_io_sync;
619 submit_bh(REQ_OP_WRITE, write_flags, bh);
620 EXT4_SB(sb)->s_fc_bh = NULL;
623 /* Ext4 commit path routines */
625 /* memzero and update CRC */
626 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
627 u32 *crc)
629 void *ret;
631 ret = memset(dst, 0, len);
632 if (crc)
633 *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
634 return ret;
638 * Allocate len bytes on a fast commit buffer.
640 * During the commit time this function is used to manage fast commit
641 * block space. We don't split a fast commit log onto different
642 * blocks. So this function makes sure that if there's not enough space
643 * on the current block, the remaining space in the current block is
644 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
645 * new block is from jbd2 and CRC is updated to reflect the padding
646 * we added.
648 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
650 struct ext4_fc_tl *tl;
651 struct ext4_sb_info *sbi = EXT4_SB(sb);
652 struct buffer_head *bh;
653 int bsize = sbi->s_journal->j_blocksize;
654 int ret, off = sbi->s_fc_bytes % bsize;
655 int pad_len;
658 * After allocating len, we should have space at least for a 0 byte
659 * padding.
661 if (len + sizeof(struct ext4_fc_tl) > bsize)
662 return NULL;
664 if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
666 * Only allocate from current buffer if we have enough space for
667 * this request AND we have space to add a zero byte padding.
669 if (!sbi->s_fc_bh) {
670 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
671 if (ret)
672 return NULL;
673 sbi->s_fc_bh = bh;
675 sbi->s_fc_bytes += len;
676 return sbi->s_fc_bh->b_data + off;
678 /* Need to add PAD tag */
679 tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
680 tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
681 pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
682 tl->fc_len = cpu_to_le16(pad_len);
683 if (crc)
684 *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
685 if (pad_len > 0)
686 ext4_fc_memzero(sb, tl + 1, pad_len, crc);
687 ext4_fc_submit_bh(sb);
689 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
690 if (ret)
691 return NULL;
692 sbi->s_fc_bh = bh;
693 sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
694 return sbi->s_fc_bh->b_data;
697 /* memcpy to fc reserved space and update CRC */
698 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
699 int len, u32 *crc)
701 if (crc)
702 *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
703 return memcpy(dst, src, len);
707 * Complete a fast commit by writing tail tag.
709 * Writing tail tag marks the end of a fast commit. In order to guarantee
710 * atomicity, after writing tail tag, even if there's space remaining
711 * in the block, next commit shouldn't use it. That's why tail tag
712 * has the length as that of the remaining space on the block.
714 static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
716 struct ext4_sb_info *sbi = EXT4_SB(sb);
717 struct ext4_fc_tl tl;
718 struct ext4_fc_tail tail;
719 int off, bsize = sbi->s_journal->j_blocksize;
720 u8 *dst;
723 * ext4_fc_reserve_space takes care of allocating an extra block if
724 * there's no enough space on this block for accommodating this tail.
726 dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
727 if (!dst)
728 return -ENOSPC;
730 off = sbi->s_fc_bytes % bsize;
732 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
733 tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
734 sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
736 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
737 dst += sizeof(tl);
738 tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
739 ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
740 dst += sizeof(tail.fc_tid);
741 tail.fc_crc = cpu_to_le32(crc);
742 ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
744 ext4_fc_submit_bh(sb);
746 return 0;
750 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
751 * Returns false if there's not enough space.
753 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
754 u32 *crc)
756 struct ext4_fc_tl tl;
757 u8 *dst;
759 dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
760 if (!dst)
761 return false;
763 tl.fc_tag = cpu_to_le16(tag);
764 tl.fc_len = cpu_to_le16(len);
766 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
767 ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
769 return true;
772 /* Same as above, but adds dentry tlv. */
773 static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
774 int parent_ino, int ino, int dlen,
775 const unsigned char *dname,
776 u32 *crc)
778 struct ext4_fc_dentry_info fcd;
779 struct ext4_fc_tl tl;
780 u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
781 crc);
783 if (!dst)
784 return false;
786 fcd.fc_parent_ino = cpu_to_le32(parent_ino);
787 fcd.fc_ino = cpu_to_le32(ino);
788 tl.fc_tag = cpu_to_le16(tag);
789 tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
790 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
791 dst += sizeof(tl);
792 ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
793 dst += sizeof(fcd);
794 ext4_fc_memcpy(sb, dst, dname, dlen, crc);
795 dst += dlen;
797 return true;
801 * Writes inode in the fast commit space under TLV with tag @tag.
802 * Returns 0 on success, error on failure.
804 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
806 struct ext4_inode_info *ei = EXT4_I(inode);
807 int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
808 int ret;
809 struct ext4_iloc iloc;
810 struct ext4_fc_inode fc_inode;
811 struct ext4_fc_tl tl;
812 u8 *dst;
814 ret = ext4_get_inode_loc(inode, &iloc);
815 if (ret)
816 return ret;
818 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
819 inode_len += ei->i_extra_isize;
821 fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
822 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
823 tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
825 dst = ext4_fc_reserve_space(inode->i_sb,
826 sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
827 if (!dst)
828 return -ECANCELED;
830 if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
831 return -ECANCELED;
832 dst += sizeof(tl);
833 if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
834 return -ECANCELED;
835 dst += sizeof(fc_inode);
836 if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
837 inode_len, crc))
838 return -ECANCELED;
840 return 0;
844 * Writes updated data ranges for the inode in question. Updates CRC.
845 * Returns 0 on success, error otherwise.
847 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
849 ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
850 struct ext4_inode_info *ei = EXT4_I(inode);
851 struct ext4_map_blocks map;
852 struct ext4_fc_add_range fc_ext;
853 struct ext4_fc_del_range lrange;
854 struct ext4_extent *ex;
855 int ret;
857 mutex_lock(&ei->i_fc_lock);
858 if (ei->i_fc_lblk_len == 0) {
859 mutex_unlock(&ei->i_fc_lock);
860 return 0;
862 old_blk_size = ei->i_fc_lblk_start;
863 new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
864 ei->i_fc_lblk_len = 0;
865 mutex_unlock(&ei->i_fc_lock);
867 cur_lblk_off = old_blk_size;
868 jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
869 __func__, cur_lblk_off, new_blk_size, inode->i_ino);
871 while (cur_lblk_off <= new_blk_size) {
872 map.m_lblk = cur_lblk_off;
873 map.m_len = new_blk_size - cur_lblk_off + 1;
874 ret = ext4_map_blocks(NULL, inode, &map, 0);
875 if (ret < 0)
876 return -ECANCELED;
878 if (map.m_len == 0) {
879 cur_lblk_off++;
880 continue;
883 if (ret == 0) {
884 lrange.fc_ino = cpu_to_le32(inode->i_ino);
885 lrange.fc_lblk = cpu_to_le32(map.m_lblk);
886 lrange.fc_len = cpu_to_le32(map.m_len);
887 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
888 sizeof(lrange), (u8 *)&lrange, crc))
889 return -ENOSPC;
890 } else {
891 fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
892 ex = (struct ext4_extent *)&fc_ext.fc_ex;
893 ex->ee_block = cpu_to_le32(map.m_lblk);
894 ex->ee_len = cpu_to_le16(map.m_len);
895 ext4_ext_store_pblock(ex, map.m_pblk);
896 if (map.m_flags & EXT4_MAP_UNWRITTEN)
897 ext4_ext_mark_unwritten(ex);
898 else
899 ext4_ext_mark_initialized(ex);
900 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
901 sizeof(fc_ext), (u8 *)&fc_ext, crc))
902 return -ENOSPC;
905 cur_lblk_off += map.m_len;
908 return 0;
912 /* Submit data for all the fast commit inodes */
913 static int ext4_fc_submit_inode_data_all(journal_t *journal)
915 struct super_block *sb = (struct super_block *)(journal->j_private);
916 struct ext4_sb_info *sbi = EXT4_SB(sb);
917 struct ext4_inode_info *ei;
918 struct list_head *pos;
919 int ret = 0;
921 spin_lock(&sbi->s_fc_lock);
922 ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
923 list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
924 ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
925 ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
926 while (atomic_read(&ei->i_fc_updates)) {
927 DEFINE_WAIT(wait);
929 prepare_to_wait(&ei->i_fc_wait, &wait,
930 TASK_UNINTERRUPTIBLE);
931 if (atomic_read(&ei->i_fc_updates)) {
932 spin_unlock(&sbi->s_fc_lock);
933 schedule();
934 spin_lock(&sbi->s_fc_lock);
936 finish_wait(&ei->i_fc_wait, &wait);
938 spin_unlock(&sbi->s_fc_lock);
939 ret = jbd2_submit_inode_data(ei->jinode);
940 if (ret)
941 return ret;
942 spin_lock(&sbi->s_fc_lock);
944 spin_unlock(&sbi->s_fc_lock);
946 return ret;
949 /* Wait for completion of data for all the fast commit inodes */
950 static int ext4_fc_wait_inode_data_all(journal_t *journal)
952 struct super_block *sb = (struct super_block *)(journal->j_private);
953 struct ext4_sb_info *sbi = EXT4_SB(sb);
954 struct ext4_inode_info *pos, *n;
955 int ret = 0;
957 spin_lock(&sbi->s_fc_lock);
958 list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
959 if (!ext4_test_inode_state(&pos->vfs_inode,
960 EXT4_STATE_FC_COMMITTING))
961 continue;
962 spin_unlock(&sbi->s_fc_lock);
964 ret = jbd2_wait_inode_data(journal, pos->jinode);
965 if (ret)
966 return ret;
967 spin_lock(&sbi->s_fc_lock);
969 spin_unlock(&sbi->s_fc_lock);
971 return 0;
974 /* Commit all the directory entry updates */
975 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
976 __acquires(&sbi->s_fc_lock)
977 __releases(&sbi->s_fc_lock)
979 struct super_block *sb = (struct super_block *)(journal->j_private);
980 struct ext4_sb_info *sbi = EXT4_SB(sb);
981 struct ext4_fc_dentry_update *fc_dentry;
982 struct inode *inode;
983 struct list_head *pos, *n, *fcd_pos, *fcd_n;
984 struct ext4_inode_info *ei;
985 int ret;
987 if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
988 return 0;
989 list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
990 fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
991 fcd_list);
992 if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
993 spin_unlock(&sbi->s_fc_lock);
994 if (!ext4_fc_add_dentry_tlv(
995 sb, fc_dentry->fcd_op,
996 fc_dentry->fcd_parent, fc_dentry->fcd_ino,
997 fc_dentry->fcd_name.len,
998 fc_dentry->fcd_name.name, crc)) {
999 ret = -ENOSPC;
1000 goto lock_and_exit;
1002 spin_lock(&sbi->s_fc_lock);
1003 continue;
1006 inode = NULL;
1007 list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
1008 ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
1009 if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
1010 inode = &ei->vfs_inode;
1011 break;
1015 * If we don't find inode in our list, then it was deleted,
1016 * in which case, we don't need to record it's create tag.
1018 if (!inode)
1019 continue;
1020 spin_unlock(&sbi->s_fc_lock);
1023 * We first write the inode and then the create dirent. This
1024 * allows the recovery code to create an unnamed inode first
1025 * and then link it to a directory entry. This allows us
1026 * to use namei.c routines almost as is and simplifies
1027 * the recovery code.
1029 ret = ext4_fc_write_inode(inode, crc);
1030 if (ret)
1031 goto lock_and_exit;
1033 ret = ext4_fc_write_inode_data(inode, crc);
1034 if (ret)
1035 goto lock_and_exit;
1037 if (!ext4_fc_add_dentry_tlv(
1038 sb, fc_dentry->fcd_op,
1039 fc_dentry->fcd_parent, fc_dentry->fcd_ino,
1040 fc_dentry->fcd_name.len,
1041 fc_dentry->fcd_name.name, crc)) {
1042 ret = -ENOSPC;
1043 goto lock_and_exit;
1046 spin_lock(&sbi->s_fc_lock);
1048 return 0;
1049 lock_and_exit:
1050 spin_lock(&sbi->s_fc_lock);
1051 return ret;
1054 static int ext4_fc_perform_commit(journal_t *journal)
1056 struct super_block *sb = (struct super_block *)(journal->j_private);
1057 struct ext4_sb_info *sbi = EXT4_SB(sb);
1058 struct ext4_inode_info *iter;
1059 struct ext4_fc_head head;
1060 struct list_head *pos;
1061 struct inode *inode;
1062 struct blk_plug plug;
1063 int ret = 0;
1064 u32 crc = 0;
1066 ret = ext4_fc_submit_inode_data_all(journal);
1067 if (ret)
1068 return ret;
1070 ret = ext4_fc_wait_inode_data_all(journal);
1071 if (ret)
1072 return ret;
1075 * If file system device is different from journal device, issue a cache
1076 * flush before we start writing fast commit blocks.
1078 if (journal->j_fs_dev != journal->j_dev)
1079 blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);
1081 blk_start_plug(&plug);
1082 if (sbi->s_fc_bytes == 0) {
1084 * Add a head tag only if this is the first fast commit
1085 * in this TID.
1087 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1088 head.fc_tid = cpu_to_le32(
1089 sbi->s_journal->j_running_transaction->t_tid);
1090 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1091 (u8 *)&head, &crc))
1092 goto out;
1095 spin_lock(&sbi->s_fc_lock);
1096 ret = ext4_fc_commit_dentry_updates(journal, &crc);
1097 if (ret) {
1098 spin_unlock(&sbi->s_fc_lock);
1099 goto out;
1102 list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
1103 iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1104 inode = &iter->vfs_inode;
1105 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1106 continue;
1108 spin_unlock(&sbi->s_fc_lock);
1109 ret = ext4_fc_write_inode_data(inode, &crc);
1110 if (ret)
1111 goto out;
1112 ret = ext4_fc_write_inode(inode, &crc);
1113 if (ret)
1114 goto out;
1115 spin_lock(&sbi->s_fc_lock);
1117 spin_unlock(&sbi->s_fc_lock);
1119 ret = ext4_fc_write_tail(sb, crc);
1121 out:
1122 blk_finish_plug(&plug);
1123 return ret;
1127 * The main commit entry point. Performs a fast commit for transaction
1128 * commit_tid if needed. If it's not possible to perform a fast commit
1129 * due to various reasons, we fall back to full commit. Returns 0
1130 * on success, error otherwise.
1132 int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1134 struct super_block *sb = (struct super_block *)(journal->j_private);
1135 struct ext4_sb_info *sbi = EXT4_SB(sb);
1136 int nblks = 0, ret, bsize = journal->j_blocksize;
1137 int subtid = atomic_read(&sbi->s_fc_subtid);
1138 int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
1139 ktime_t start_time, commit_time;
1141 trace_ext4_fc_commit_start(sb);
1143 start_time = ktime_get();
1145 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
1146 (ext4_fc_is_ineligible(sb))) {
1147 reason = EXT4_FC_REASON_INELIGIBLE;
1148 goto out;
1151 restart_fc:
1152 ret = jbd2_fc_begin_commit(journal, commit_tid);
1153 if (ret == -EALREADY) {
1154 /* There was an ongoing commit, check if we need to restart */
1155 if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1156 commit_tid > journal->j_commit_sequence)
1157 goto restart_fc;
1158 reason = EXT4_FC_REASON_ALREADY_COMMITTED;
1159 goto out;
1160 } else if (ret) {
1161 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1162 reason = EXT4_FC_REASON_FC_START_FAILED;
1163 goto out;
1166 fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1167 ret = ext4_fc_perform_commit(journal);
1168 if (ret < 0) {
1169 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1170 reason = EXT4_FC_REASON_FC_FAILED;
1171 goto out;
1173 nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1174 ret = jbd2_fc_wait_bufs(journal, nblks);
1175 if (ret < 0) {
1176 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1177 reason = EXT4_FC_REASON_FC_FAILED;
1178 goto out;
1180 atomic_inc(&sbi->s_fc_subtid);
1181 jbd2_fc_end_commit(journal);
1182 out:
1183 /* Has any ineligible update happened since we started? */
1184 if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
1185 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1186 reason = EXT4_FC_REASON_INELIGIBLE;
1189 spin_lock(&sbi->s_fc_lock);
1190 if (reason != EXT4_FC_REASON_OK &&
1191 reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
1192 sbi->s_fc_stats.fc_ineligible_commits++;
1193 } else {
1194 sbi->s_fc_stats.fc_num_commits++;
1195 sbi->s_fc_stats.fc_numblks += nblks;
1197 spin_unlock(&sbi->s_fc_lock);
1198 nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
1199 trace_ext4_fc_commit_stop(sb, nblks, reason);
1200 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1202 * weight the commit time higher than the average time so we don't
1203 * react too strongly to vast changes in the commit time
1205 if (likely(sbi->s_fc_avg_commit_time))
1206 sbi->s_fc_avg_commit_time = (commit_time +
1207 sbi->s_fc_avg_commit_time * 3) / 4;
1208 else
1209 sbi->s_fc_avg_commit_time = commit_time;
1210 jbd_debug(1,
1211 "Fast commit ended with blks = %d, reason = %d, subtid - %d",
1212 nblks, reason, subtid);
1213 if (reason == EXT4_FC_REASON_FC_FAILED)
1214 return jbd2_fc_end_commit_fallback(journal);
1215 if (reason == EXT4_FC_REASON_FC_START_FAILED ||
1216 reason == EXT4_FC_REASON_INELIGIBLE)
1217 return jbd2_complete_transaction(journal, commit_tid);
1218 return 0;
1222 * Fast commit cleanup routine. This is called after every fast commit and
1223 * full commit. full is true if we are called after a full commit.
1225 static void ext4_fc_cleanup(journal_t *journal, int full)
1227 struct super_block *sb = journal->j_private;
1228 struct ext4_sb_info *sbi = EXT4_SB(sb);
1229 struct ext4_inode_info *iter;
1230 struct ext4_fc_dentry_update *fc_dentry;
1231 struct list_head *pos, *n;
1233 if (full && sbi->s_fc_bh)
1234 sbi->s_fc_bh = NULL;
1236 jbd2_fc_release_bufs(journal);
1238 spin_lock(&sbi->s_fc_lock);
1239 list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
1240 iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1241 list_del_init(&iter->i_fc_list);
1242 ext4_clear_inode_state(&iter->vfs_inode,
1243 EXT4_STATE_FC_COMMITTING);
1244 ext4_fc_reset_inode(&iter->vfs_inode);
1245 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1246 smp_mb();
1247 #if (BITS_PER_LONG < 64)
1248 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1249 #else
1250 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1251 #endif
1254 while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1255 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1256 struct ext4_fc_dentry_update,
1257 fcd_list);
1258 list_del_init(&fc_dentry->fcd_list);
1259 spin_unlock(&sbi->s_fc_lock);
1261 if (fc_dentry->fcd_name.name &&
1262 fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1263 kfree(fc_dentry->fcd_name.name);
1264 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1265 spin_lock(&sbi->s_fc_lock);
1268 list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1269 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
1270 list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1271 &sbi->s_fc_q[FC_Q_STAGING]);
1273 ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
1274 ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1276 if (full)
1277 sbi->s_fc_bytes = 0;
1278 spin_unlock(&sbi->s_fc_lock);
1279 trace_ext4_fc_stats(sb);
/* Ext4 Replay Path Routines */

/*
 * Helper struct for dentry replay routines: the fields of a dentry TLV
 * decoded into CPU byte order (see tl_to_darg()).
 */
struct dentry_info_args {
	int parent_ino;
	int dname_len;
	int ino;
	int inode_len;
	char *dname;
};
1290 static inline void tl_to_darg(struct dentry_info_args *darg,
1291 struct ext4_fc_tl *tl)
1293 struct ext4_fc_dentry_info *fcd;
1295 fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);
1297 darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
1298 darg->ino = le32_to_cpu(fcd->fc_ino);
1299 darg->dname = fcd->fc_dname;
1300 darg->dname_len = ext4_fc_tag_len(tl) -
1301 sizeof(struct ext4_fc_dentry_info);
1304 /* Unlink replay function */
1305 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
1307 struct inode *inode, *old_parent;
1308 struct qstr entry;
1309 struct dentry_info_args darg;
1310 int ret = 0;
1312 tl_to_darg(&darg, tl);
1314 trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1315 darg.parent_ino, darg.dname_len);
1317 entry.name = darg.dname;
1318 entry.len = darg.dname_len;
1319 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1321 if (IS_ERR_OR_NULL(inode)) {
1322 jbd_debug(1, "Inode %d not found", darg.ino);
1323 return 0;
1326 old_parent = ext4_iget(sb, darg.parent_ino,
1327 EXT4_IGET_NORMAL);
1328 if (IS_ERR_OR_NULL(old_parent)) {
1329 jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
1330 iput(inode);
1331 return 0;
1334 ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1335 /* -ENOENT ok coz it might not exist anymore. */
1336 if (ret == -ENOENT)
1337 ret = 0;
1338 iput(old_parent);
1339 iput(inode);
1340 return ret;
1343 static int ext4_fc_replay_link_internal(struct super_block *sb,
1344 struct dentry_info_args *darg,
1345 struct inode *inode)
1347 struct inode *dir = NULL;
1348 struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1349 struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1350 int ret = 0;
1352 dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1353 if (IS_ERR(dir)) {
1354 jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1355 dir = NULL;
1356 goto out;
1359 dentry_dir = d_obtain_alias(dir);
1360 if (IS_ERR(dentry_dir)) {
1361 jbd_debug(1, "Failed to obtain dentry");
1362 dentry_dir = NULL;
1363 goto out;
1366 dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1367 if (!dentry_inode) {
1368 jbd_debug(1, "Inode dentry not created.");
1369 ret = -ENOMEM;
1370 goto out;
1373 ret = __ext4_link(dir, inode, dentry_inode);
1375 * It's possible that link already existed since data blocks
1376 * for the dir in question got persisted before we crashed OR
1377 * we replayed this tag and crashed before the entire replay
1378 * could complete.
1380 if (ret && ret != -EEXIST) {
1381 jbd_debug(1, "Failed to link\n");
1382 goto out;
1385 ret = 0;
1386 out:
1387 if (dentry_dir) {
1388 d_drop(dentry_dir);
1389 dput(dentry_dir);
1390 } else if (dir) {
1391 iput(dir);
1393 if (dentry_inode) {
1394 d_drop(dentry_inode);
1395 dput(dentry_inode);
1398 return ret;
1401 /* Link replay function */
1402 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
1404 struct inode *inode;
1405 struct dentry_info_args darg;
1406 int ret = 0;
1408 tl_to_darg(&darg, tl);
1409 trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1410 darg.parent_ino, darg.dname_len);
1412 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1413 if (IS_ERR_OR_NULL(inode)) {
1414 jbd_debug(1, "Inode not found.");
1415 return 0;
1418 ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1419 iput(inode);
1420 return ret;
1424 * Record all the modified inodes during replay. We use this later to setup
1425 * block bitmaps correctly.
1427 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1429 struct ext4_fc_replay_state *state;
1430 int i;
1432 state = &EXT4_SB(sb)->s_fc_replay_state;
1433 for (i = 0; i < state->fc_modified_inodes_used; i++)
1434 if (state->fc_modified_inodes[i] == ino)
1435 return 0;
1436 if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1437 state->fc_modified_inodes_size +=
1438 EXT4_FC_REPLAY_REALLOC_INCREMENT;
1439 state->fc_modified_inodes = krealloc(
1440 state->fc_modified_inodes, sizeof(int) *
1441 state->fc_modified_inodes_size,
1442 GFP_KERNEL);
1443 if (!state->fc_modified_inodes)
1444 return -ENOMEM;
1446 state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1447 return 0;
1451 * Inode replay function
1453 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
1455 struct ext4_fc_inode *fc_inode;
1456 struct ext4_inode *raw_inode;
1457 struct ext4_inode *raw_fc_inode;
1458 struct inode *inode = NULL;
1459 struct ext4_iloc iloc;
1460 int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1461 struct ext4_extent_header *eh;
1463 fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);
1465 ino = le32_to_cpu(fc_inode->fc_ino);
1466 trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1468 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1469 if (!IS_ERR_OR_NULL(inode)) {
1470 ext4_ext_clear_bb(inode);
1471 iput(inode);
1474 ext4_fc_record_modified_inode(sb, ino);
1476 raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
1477 ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1478 if (ret)
1479 goto out;
1481 inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
1482 raw_inode = ext4_raw_inode(&iloc);
1484 memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1485 memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1486 inode_len - offsetof(struct ext4_inode, i_generation));
1487 if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1488 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1489 if (eh->eh_magic != EXT4_EXT_MAGIC) {
1490 memset(eh, 0, sizeof(*eh));
1491 eh->eh_magic = EXT4_EXT_MAGIC;
1492 eh->eh_max = cpu_to_le16(
1493 (sizeof(raw_inode->i_block) -
1494 sizeof(struct ext4_extent_header))
1495 / sizeof(struct ext4_extent));
1497 } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1498 memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1499 sizeof(raw_inode->i_block));
1502 /* Immediately update the inode on disk. */
1503 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1504 if (ret)
1505 goto out;
1506 ret = sync_dirty_buffer(iloc.bh);
1507 if (ret)
1508 goto out;
1509 ret = ext4_mark_inode_used(sb, ino);
1510 if (ret)
1511 goto out;
1513 /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1514 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1515 if (IS_ERR_OR_NULL(inode)) {
1516 jbd_debug(1, "Inode not found.");
1517 return -EFSCORRUPTED;
1521 * Our allocator could have made different decisions than before
1522 * crashing. This should be fixed but until then, we calculate
1523 * the number of blocks the inode.
1525 ext4_ext_replay_set_iblocks(inode);
1527 inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1528 ext4_reset_inode_seed(inode);
1530 ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1531 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1532 sync_dirty_buffer(iloc.bh);
1533 brelse(iloc.bh);
1534 out:
1535 iput(inode);
1536 if (!ret)
1537 blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
1539 return 0;
1543 * Dentry create replay function.
1545 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1546 * inode for which we are trying to create a dentry here, should already have
1547 * been replayed before we start here.
1549 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
1551 int ret = 0;
1552 struct inode *inode = NULL;
1553 struct inode *dir = NULL;
1554 struct dentry_info_args darg;
1556 tl_to_darg(&darg, tl);
1558 trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1559 darg.parent_ino, darg.dname_len);
1561 /* This takes care of update group descriptor and other metadata */
1562 ret = ext4_mark_inode_used(sb, darg.ino);
1563 if (ret)
1564 goto out;
1566 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1567 if (IS_ERR_OR_NULL(inode)) {
1568 jbd_debug(1, "inode %d not found.", darg.ino);
1569 inode = NULL;
1570 ret = -EINVAL;
1571 goto out;
1574 if (S_ISDIR(inode->i_mode)) {
1576 * If we are creating a directory, we need to make sure that the
1577 * dot and dot dot dirents are setup properly.
1579 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1580 if (IS_ERR_OR_NULL(dir)) {
1581 jbd_debug(1, "Dir %d not found.", darg.ino);
1582 goto out;
1584 ret = ext4_init_new_dir(NULL, dir, inode);
1585 iput(dir);
1586 if (ret) {
1587 ret = 0;
1588 goto out;
1591 ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1592 if (ret)
1593 goto out;
1594 set_nlink(inode, 1);
1595 ext4_mark_inode_dirty(NULL, inode);
1596 out:
1597 if (inode)
1598 iput(inode);
1599 return ret;
1603 * Record physical disk regions which are in use as per fast commit area. Our
1604 * simple replay phase allocator excludes these regions from allocation.
1606 static int ext4_fc_record_regions(struct super_block *sb, int ino,
1607 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1609 struct ext4_fc_replay_state *state;
1610 struct ext4_fc_alloc_region *region;
1612 state = &EXT4_SB(sb)->s_fc_replay_state;
1613 if (state->fc_regions_used == state->fc_regions_size) {
1614 state->fc_regions_size +=
1615 EXT4_FC_REPLAY_REALLOC_INCREMENT;
1616 state->fc_regions = krealloc(
1617 state->fc_regions,
1618 state->fc_regions_size *
1619 sizeof(struct ext4_fc_alloc_region),
1620 GFP_KERNEL);
1621 if (!state->fc_regions)
1622 return -ENOMEM;
1624 region = &state->fc_regions[state->fc_regions_used++];
1625 region->ino = ino;
1626 region->lblk = lblk;
1627 region->pblk = pblk;
1628 region->len = len;
1630 return 0;
1633 /* Replay add range tag */
1634 static int ext4_fc_replay_add_range(struct super_block *sb,
1635 struct ext4_fc_tl *tl)
1637 struct ext4_fc_add_range *fc_add_ex;
1638 struct ext4_extent newex, *ex;
1639 struct inode *inode;
1640 ext4_lblk_t start, cur;
1641 int remaining, len;
1642 ext4_fsblk_t start_pblk;
1643 struct ext4_map_blocks map;
1644 struct ext4_ext_path *path = NULL;
1645 int ret;
1647 fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
1648 ex = (struct ext4_extent *)&fc_add_ex->fc_ex;
1650 trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1651 le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
1652 ext4_ext_get_actual_len(ex));
1654 inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
1655 EXT4_IGET_NORMAL);
1656 if (IS_ERR_OR_NULL(inode)) {
1657 jbd_debug(1, "Inode not found.");
1658 return 0;
1661 ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1663 start = le32_to_cpu(ex->ee_block);
1664 start_pblk = ext4_ext_pblock(ex);
1665 len = ext4_ext_get_actual_len(ex);
1667 cur = start;
1668 remaining = len;
1669 jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1670 start, start_pblk, len, ext4_ext_is_unwritten(ex),
1671 inode->i_ino);
1673 while (remaining > 0) {
1674 map.m_lblk = cur;
1675 map.m_len = remaining;
1676 map.m_pblk = 0;
1677 ret = ext4_map_blocks(NULL, inode, &map, 0);
1679 if (ret < 0) {
1680 iput(inode);
1681 return 0;
1684 if (ret == 0) {
1685 /* Range is not mapped */
1686 path = ext4_find_extent(inode, cur, NULL, 0);
1687 if (IS_ERR(path)) {
1688 iput(inode);
1689 return 0;
1691 memset(&newex, 0, sizeof(newex));
1692 newex.ee_block = cpu_to_le32(cur);
1693 ext4_ext_store_pblock(
1694 &newex, start_pblk + cur - start);
1695 newex.ee_len = cpu_to_le16(map.m_len);
1696 if (ext4_ext_is_unwritten(ex))
1697 ext4_ext_mark_unwritten(&newex);
1698 down_write(&EXT4_I(inode)->i_data_sem);
1699 ret = ext4_ext_insert_extent(
1700 NULL, inode, &path, &newex, 0);
1701 up_write((&EXT4_I(inode)->i_data_sem));
1702 ext4_ext_drop_refs(path);
1703 kfree(path);
1704 if (ret) {
1705 iput(inode);
1706 return 0;
1708 goto next;
1711 if (start_pblk + cur - start != map.m_pblk) {
1713 * Logical to physical mapping changed. This can happen
1714 * if this range was removed and then reallocated to
1715 * map to new physical blocks during a fast commit.
1717 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1718 ext4_ext_is_unwritten(ex),
1719 start_pblk + cur - start);
1720 if (ret) {
1721 iput(inode);
1722 return 0;
1725 * Mark the old blocks as free since they aren't used
1726 * anymore. We maintain an array of all the modified
1727 * inodes. In case these blocks are still used at either
1728 * a different logical range in the same inode or in
1729 * some different inode, we will mark them as allocated
1730 * at the end of the FC replay using our array of
1731 * modified inodes.
1733 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1734 goto next;
1737 /* Range is mapped and needs a state change */
1738 jbd_debug(1, "Converting from %d to %d %lld",
1739 map.m_flags & EXT4_MAP_UNWRITTEN,
1740 ext4_ext_is_unwritten(ex), map.m_pblk);
1741 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1742 ext4_ext_is_unwritten(ex), map.m_pblk);
1743 if (ret) {
1744 iput(inode);
1745 return 0;
1748 * We may have split the extent tree while toggling the state.
1749 * Try to shrink the extent tree now.
1751 ext4_ext_replay_shrink_inode(inode, start + len);
1752 next:
1753 cur += map.m_len;
1754 remaining -= map.m_len;
1756 ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1757 sb->s_blocksize_bits);
1758 iput(inode);
1759 return 0;
1762 /* Replay DEL_RANGE tag */
1763 static int
1764 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
1766 struct inode *inode;
1767 struct ext4_fc_del_range *lrange;
1768 struct ext4_map_blocks map;
1769 ext4_lblk_t cur, remaining;
1770 int ret;
1772 lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
1773 cur = le32_to_cpu(lrange->fc_lblk);
1774 remaining = le32_to_cpu(lrange->fc_len);
1776 trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1777 le32_to_cpu(lrange->fc_ino), cur, remaining);
1779 inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
1780 if (IS_ERR_OR_NULL(inode)) {
1781 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
1782 return 0;
1785 ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1787 jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1788 inode->i_ino, le32_to_cpu(lrange->fc_lblk),
1789 le32_to_cpu(lrange->fc_len));
1790 while (remaining > 0) {
1791 map.m_lblk = cur;
1792 map.m_len = remaining;
1794 ret = ext4_map_blocks(NULL, inode, &map, 0);
1795 if (ret < 0) {
1796 iput(inode);
1797 return 0;
1799 if (ret > 0) {
1800 remaining -= ret;
1801 cur += ret;
1802 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1803 } else {
1804 remaining -= map.m_len;
1805 cur += map.m_len;
1809 ret = ext4_punch_hole(inode,
1810 le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
1811 le32_to_cpu(lrange->fc_len) << sb->s_blocksize_bits);
1812 if (ret)
1813 jbd_debug(1, "ext4_punch_hole returned %d", ret);
1814 ext4_ext_replay_shrink_inode(inode,
1815 i_size_read(inode) >> sb->s_blocksize_bits);
1816 ext4_mark_inode_dirty(NULL, inode);
1817 iput(inode);
1819 return 0;
1822 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1824 struct ext4_fc_replay_state *state;
1825 struct inode *inode;
1826 struct ext4_ext_path *path = NULL;
1827 struct ext4_map_blocks map;
1828 int i, ret, j;
1829 ext4_lblk_t cur, end;
1831 state = &EXT4_SB(sb)->s_fc_replay_state;
1832 for (i = 0; i < state->fc_modified_inodes_used; i++) {
1833 inode = ext4_iget(sb, state->fc_modified_inodes[i],
1834 EXT4_IGET_NORMAL);
1835 if (IS_ERR_OR_NULL(inode)) {
1836 jbd_debug(1, "Inode %d not found.",
1837 state->fc_modified_inodes[i]);
1838 continue;
1840 cur = 0;
1841 end = EXT_MAX_BLOCKS;
1842 while (cur < end) {
1843 map.m_lblk = cur;
1844 map.m_len = end - cur;
1846 ret = ext4_map_blocks(NULL, inode, &map, 0);
1847 if (ret < 0)
1848 break;
1850 if (ret > 0) {
1851 path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1852 if (!IS_ERR_OR_NULL(path)) {
1853 for (j = 0; j < path->p_depth; j++)
1854 ext4_mb_mark_bb(inode->i_sb,
1855 path[j].p_block, 1, 1);
1856 ext4_ext_drop_refs(path);
1857 kfree(path);
1859 cur += ret;
1860 ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1861 map.m_len, 1);
1862 } else {
1863 cur = cur + (map.m_len ? map.m_len : 1);
1866 iput(inode);
1871 * Check if block is in excluded regions for block allocation. The simple
1872 * allocator that runs during replay phase is calls this function to see
1873 * if it is okay to use a block.
1875 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1877 int i;
1878 struct ext4_fc_replay_state *state;
1880 state = &EXT4_SB(sb)->s_fc_replay_state;
1881 for (i = 0; i < state->fc_regions_valid; i++) {
1882 if (state->fc_regions[i].ino == 0 ||
1883 state->fc_regions[i].len == 0)
1884 continue;
1885 if (blk >= state->fc_regions[i].pblk &&
1886 blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1887 return true;
1889 return false;
1892 /* Cleanup function called after replay */
1893 void ext4_fc_replay_cleanup(struct super_block *sb)
1895 struct ext4_sb_info *sbi = EXT4_SB(sb);
1897 sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1898 kfree(sbi->s_fc_replay_state.fc_regions);
1899 kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1903 * Recovery Scan phase handler
1905 * This function is called during the scan phase and is responsible
1906 * for doing following things:
1907 * - Make sure the fast commit area has valid tags for replay
1908 * - Count number of tags that need to be replayed by the replay handler
1909 * - Verify CRC
1910 * - Create a list of excluded blocks for allocation during replay phase
1912 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1913 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1914 * to indicate that scan has finished and JBD2 can now start replay phase.
1915 * It returns a negative error to indicate that there was an error. At the end
1916 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1917 * to indicate the number of tags that need to replayed during the replay phase.
1919 static int ext4_fc_replay_scan(journal_t *journal,
1920 struct buffer_head *bh, int off,
1921 tid_t expected_tid)
1923 struct super_block *sb = journal->j_private;
1924 struct ext4_sb_info *sbi = EXT4_SB(sb);
1925 struct ext4_fc_replay_state *state;
1926 int ret = JBD2_FC_REPLAY_CONTINUE;
1927 struct ext4_fc_add_range *ext;
1928 struct ext4_fc_tl *tl;
1929 struct ext4_fc_tail *tail;
1930 __u8 *start, *end;
1931 struct ext4_fc_head *head;
1932 struct ext4_extent *ex;
1934 state = &sbi->s_fc_replay_state;
1936 start = (u8 *)bh->b_data;
1937 end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1939 if (state->fc_replay_expected_off == 0) {
1940 state->fc_cur_tag = 0;
1941 state->fc_replay_num_tags = 0;
1942 state->fc_crc = 0;
1943 state->fc_regions = NULL;
1944 state->fc_regions_valid = state->fc_regions_used =
1945 state->fc_regions_size = 0;
1946 /* Check if we can stop early */
1947 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1948 != EXT4_FC_TAG_HEAD)
1949 return 0;
1952 if (off != state->fc_replay_expected_off) {
1953 ret = -EFSCORRUPTED;
1954 goto out_err;
1957 state->fc_replay_expected_off++;
1958 fc_for_each_tl(start, end, tl) {
1959 jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1960 tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
1961 switch (le16_to_cpu(tl->fc_tag)) {
1962 case EXT4_FC_TAG_ADD_RANGE:
1963 ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
1964 ex = (struct ext4_extent *)&ext->fc_ex;
1965 ret = ext4_fc_record_regions(sb,
1966 le32_to_cpu(ext->fc_ino),
1967 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1968 ext4_ext_get_actual_len(ex));
1969 if (ret < 0)
1970 break;
1971 ret = JBD2_FC_REPLAY_CONTINUE;
1972 fallthrough;
1973 case EXT4_FC_TAG_DEL_RANGE:
1974 case EXT4_FC_TAG_LINK:
1975 case EXT4_FC_TAG_UNLINK:
1976 case EXT4_FC_TAG_CREAT:
1977 case EXT4_FC_TAG_INODE:
1978 case EXT4_FC_TAG_PAD:
1979 state->fc_cur_tag++;
1980 state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1981 sizeof(*tl) + ext4_fc_tag_len(tl));
1982 break;
1983 case EXT4_FC_TAG_TAIL:
1984 state->fc_cur_tag++;
1985 tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
1986 state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1987 sizeof(*tl) +
1988 offsetof(struct ext4_fc_tail,
1989 fc_crc));
1990 if (le32_to_cpu(tail->fc_tid) == expected_tid &&
1991 le32_to_cpu(tail->fc_crc) == state->fc_crc) {
1992 state->fc_replay_num_tags = state->fc_cur_tag;
1993 state->fc_regions_valid =
1994 state->fc_regions_used;
1995 } else {
1996 ret = state->fc_replay_num_tags ?
1997 JBD2_FC_REPLAY_STOP : -EFSBADCRC;
1999 state->fc_crc = 0;
2000 break;
2001 case EXT4_FC_TAG_HEAD:
2002 head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
2003 if (le32_to_cpu(head->fc_features) &
2004 ~EXT4_FC_SUPPORTED_FEATURES) {
2005 ret = -EOPNOTSUPP;
2006 break;
2008 if (le32_to_cpu(head->fc_tid) != expected_tid) {
2009 ret = JBD2_FC_REPLAY_STOP;
2010 break;
2012 state->fc_cur_tag++;
2013 state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
2014 sizeof(*tl) + ext4_fc_tag_len(tl));
2015 break;
2016 default:
2017 ret = state->fc_replay_num_tags ?
2018 JBD2_FC_REPLAY_STOP : -ECANCELED;
2020 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2021 break;
2024 out_err:
2025 trace_ext4_fc_replay_scan(sb, ret, off);
2026 return ret;
2030 * Main recovery path entry point.
2031 * The meaning of return codes is similar as above.
2033 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2034 enum passtype pass, int off, tid_t expected_tid)
2036 struct super_block *sb = journal->j_private;
2037 struct ext4_sb_info *sbi = EXT4_SB(sb);
2038 struct ext4_fc_tl *tl;
2039 __u8 *start, *end;
2040 int ret = JBD2_FC_REPLAY_CONTINUE;
2041 struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2042 struct ext4_fc_tail *tail;
2044 if (pass == PASS_SCAN) {
2045 state->fc_current_pass = PASS_SCAN;
2046 return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2049 if (state->fc_current_pass != pass) {
2050 state->fc_current_pass = pass;
2051 sbi->s_mount_state |= EXT4_FC_REPLAY;
2053 if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2054 jbd_debug(1, "Replay stops\n");
2055 ext4_fc_set_bitmaps_and_counters(sb);
2056 return 0;
2059 #ifdef CONFIG_EXT4_DEBUG
2060 if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2061 pr_warn("Dropping fc block %d because max_replay set\n", off);
2062 return JBD2_FC_REPLAY_STOP;
2064 #endif
2066 start = (u8 *)bh->b_data;
2067 end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2069 fc_for_each_tl(start, end, tl) {
2070 if (state->fc_replay_num_tags == 0) {
2071 ret = JBD2_FC_REPLAY_STOP;
2072 ext4_fc_set_bitmaps_and_counters(sb);
2073 break;
2075 jbd_debug(3, "Replay phase, tag:%s\n",
2076 tag2str(le16_to_cpu(tl->fc_tag)));
2077 state->fc_replay_num_tags--;
2078 switch (le16_to_cpu(tl->fc_tag)) {
2079 case EXT4_FC_TAG_LINK:
2080 ret = ext4_fc_replay_link(sb, tl);
2081 break;
2082 case EXT4_FC_TAG_UNLINK:
2083 ret = ext4_fc_replay_unlink(sb, tl);
2084 break;
2085 case EXT4_FC_TAG_ADD_RANGE:
2086 ret = ext4_fc_replay_add_range(sb, tl);
2087 break;
2088 case EXT4_FC_TAG_CREAT:
2089 ret = ext4_fc_replay_create(sb, tl);
2090 break;
2091 case EXT4_FC_TAG_DEL_RANGE:
2092 ret = ext4_fc_replay_del_range(sb, tl);
2093 break;
2094 case EXT4_FC_TAG_INODE:
2095 ret = ext4_fc_replay_inode(sb, tl);
2096 break;
2097 case EXT4_FC_TAG_PAD:
2098 trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2099 ext4_fc_tag_len(tl), 0);
2100 break;
2101 case EXT4_FC_TAG_TAIL:
2102 trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2103 ext4_fc_tag_len(tl), 0);
2104 tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
2105 WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
2106 break;
2107 case EXT4_FC_TAG_HEAD:
2108 break;
2109 default:
2110 trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
2111 ext4_fc_tag_len(tl), 0);
2112 ret = -ECANCELED;
2113 break;
2115 if (ret < 0)
2116 break;
2117 ret = JBD2_FC_REPLAY_CONTINUE;
2119 return ret;
2122 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2125 * We set replay callback even if fast commit disabled because we may
2126 * could still have fast commit blocks that need to be replayed even if
2127 * fast commit has now been turned off.
2129 journal->j_fc_replay_callback = ext4_fc_replay;
2130 if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2131 return;
2132 journal->j_fc_cleanup_callback = ext4_fc_cleanup;
/*
 * Human readable names for the fast commit ineligibility reasons, indexed by
 * reason code when printed by ext4_fc_info_show().
 */
static const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed",
};
2148 int ext4_fc_info_show(struct seq_file *seq, void *v)
2150 struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2151 struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2152 int i;
2154 if (v != SEQ_START_TOKEN)
2155 return 0;
2157 seq_printf(seq,
2158 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2159 stats->fc_num_commits, stats->fc_ineligible_commits,
2160 stats->fc_numblks,
2161 div_u64(sbi->s_fc_avg_commit_time, 1000));
2162 seq_puts(seq, "Ineligible reasons:\n");
2163 for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2164 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2165 stats->fc_ineligible_reason_count[i]);
2167 return 0;
2170 int __init ext4_fc_init_dentry_cache(void)
2172 ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2173 SLAB_RECLAIM_ACCOUNT);
2175 if (ext4_fc_dentry_cachep == NULL)
2176 return -ENOMEM;
2178 return 0;