fs/ext4/fast_commit.c

   1 // SPDX-License-Identifier: GPL-2.0
   2
   3 /*
   4  * fs/ext4/fast_commit.c
   5  *
   6  * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
   7  *
   8  * Ext4 fast commits routines.
   9  */
  10 #include "ext4.h"
  11 #include "ext4_jbd2.h"
  12 #include "ext4_extents.h"
  13 #include "mballoc.h"
  14
  15 /*
  16  * Ext4 Fast Commits
  17  * -----------------
  18  *
  19  * Ext4 fast commits implement fine grained journalling for Ext4.
  20  *
  21  * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
  22  * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
  23  * TLV during the recovery phase. For the scenarios for which we currently
  24  * don't have replay code, fast commit falls back to full commits.
  25  * Fast commits record delta in one of the following three categories.
  26  *
  27  * (A) Directory entry updates:
  28  *
  29  * - EXT4_FC_TAG_UNLINK         - records directory entry unlink
  30  * - EXT4_FC_TAG_LINK           - records directory entry link
  31  * - EXT4_FC_TAG_CREAT          - records inode and directory entry creation
  32  *
  33  * (B) File specific data range updates:
  34  *
  35  * - EXT4_FC_TAG_ADD_RANGE      - records addition of new blocks to an inode
  36  * - EXT4_FC_TAG_DEL_RANGE      - records deletion of blocks from an inode
  37  *
  38  * (C) Inode metadata (mtime / ctime etc):
  39  *
  40  * - EXT4_FC_TAG_INODE          - record the inode that should be replayed
  41  *                                during recovery. Note that iblocks field is
  42  *                                not replayed and instead derived during
  43  *                                replay.
  44  * Commit Operation
  45  * ----------------
  46  * With fast commits, we maintain all the directory entry operations in the
  47  * order in which they are issued in an in-memory queue. This queue is flushed
  48  * to disk during the commit operation. We also maintain a list of inodes
  49  * that need to be committed during a fast commit in another in memory queue of
  50  * inodes. During the commit operation, we commit in the following order:
  51  *
  52  * [1] Lock inodes for any further data updates by setting COMMITTING state
  53  * [2] Submit data buffers of all the inodes
  54  * [3] Wait for [2] to complete
  55  * [4] Commit all the directory entry updates in the fast commit space
  56  * [5] Commit all the changed inode structures
  57  * [6] Write tail tag (this tag ensures the atomicity, please read the following
  58  *     section for more details).
  59  * [7] Wait for [4], [5] and [6] to complete.
  60  *
  61  * All the inode updates must call ext4_fc_start_update() before starting an
  62  * update. If such an ongoing update is present, fast commit waits for it to
  63  * complete. The completion of such an update is marked by
  64  * ext4_fc_stop_update().
  65  *
  66  * Fast Commit Ineligibility
  67  * -------------------------
  68  * Not all operations are supported by fast commits today (e.g extended
  69  * attributes). Fast commit ineligibility is marked by calling one of the
  70  * two following functions:
  71  *
  72  * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
  73  *   back to full commit. This is useful in case of transient errors.
  74  *
  75  * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
  76  *   the fast commits happening between ext4_fc_start_ineligible() and
  77  *   ext4_fc_stop_ineligible() and one fast commit after the call to
  78  *   ext4_fc_stop_ineligible() to fall back to full commits. It is important to
  79  *   make one more fast commit to fall back to full commit after stop call so
  80  *   that it is guaranteed that the fast commit ineligible operation contained
  81  *   within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
  82  *   followed by at least 1 full commit.
  83  *
  84  * Atomicity of commits
  85  * --------------------
  86  * In order to guarantee atomicity during the commit operation, fast commit
  87  * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
  88  * tag contains CRC of the contents and TID of the transaction after which
  89  * this fast commit should be applied. Recovery code replays fast commit
  90  * logs only if there's at least 1 valid tail present. For every fast commit
  91  * operation, there is 1 tail. This means, we may end up with multiple tails
  92  * in the fast commit space. Here's an example:
  93  *
  94  * - Create a new file A and remove existing file B
  95  * - fsync()
  96  * - Append contents to file A
  97  * - Truncate file A
  98  * - fsync()
  99  *
 100  * The fast commit space at the end of above operations would look like this:
 101  *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 102  *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 103  *
 104  * Replay code should thus check for all the valid tails in the FC area.
 105  *
 106  * Fast Commit Replay Idempotence
 107  * ------------------------------
 108  *
 109  * Fast commits tags are idempotent in nature provided the recovery code follows
 110  * certain rules. The guiding principle that the commit path follows while
 111  * committing is that it stores the result of a particular operation instead of
 112  * storing the procedure.
 113  *
 114  * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 115  * was associated with inode 10. During fast commit, instead of storing this
 116  * operation as a procedure "rename a to b", we store the resulting file system
 117  * state as a "series" of outcomes:
 118  *
 119  * - Link dirent b to inode 10
 120  * - Unlink dirent a
 121  * - Inode <10> with valid refcount
 122  *
 123  * Now when recovery code runs, it needs to "enforce" this state on the file
 124  * system. This is what guarantees idempotence of fast commit replay.
 125  *
 126  * Let's take an example of a procedure that is not idempotent and see how fast
 127  * commits make it idempotent. Consider following sequence of operations:
 128  *
 129  *     rm A;    mv B A;    read A
 130  *  (x)     (y)        (z)
 131  *
 132  * (x), (y) and (z) are the points at which we can crash. If we store this
 133  * sequence of operations as is then the replay is not idempotent. Let's say
 134  * while in replay, we crash at (z). During the second replay, file A (which was
 135  * actually created as a result of "mv B A" operation) would get deleted. Thus,
 136  * file named A would be absent when we try to read A. So, this sequence of
 137  * operations is not idempotent. However, as mentioned above, instead of storing
 138  * the procedure fast commits store the outcome of each procedure. Thus the fast
 139  * commit log for above procedure would be as follows:
 140  *
 141  * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 142  * inode 11 before the replay)
 143  *
 144  *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 145  * (w)          (x)                    (y)          (z)
 146  *
 147  * If we crash at (z), we will have file A linked to inode 11. During the second
 148  * replay, we will remove file A (inode 11). But we will create it back and make
 149  * it point to inode 11. We won't find B, so we'll just skip that step. At this
 150  * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 151  * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
 152  * similarly. Thus, by converting a non-idempotent procedure into a series of
 153  * idempotent outcomes, fast commits ensured idempotence during the replay.
 154  *
 155  * TODOs
 156  * -----
 157  *
 158  * 0) Fast commit replay path hardening: Fast commit replay code should use
 159  *    journal handles to make sure all the updates it does during the replay
 160  *    path are atomic. With that if we crash during fast commit replay, after
 161  *    trying to do recovery again, we will find a file system where fast commit
 162  *    area is invalid (because new full commit would be found). In order to deal
 163  *    with that, fast commit replay code should ensure that the "FC_REPLAY"
 164  *    superblock state is persisted before starting the replay, so that after
 165  *    the crash, fast commit recovery code can look at that flag and perform
 166  *    fast commit recovery even if that area is invalidated by later full
 167  *    commits.
 168  *
 169  * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
 170  *    eligible update must be protected within ext4_fc_start_update() and
 171  *    ext4_fc_stop_update(). These routines are called at much higher
 172  *    routines. This can be made more fine grained by combining with
 173  *    ext4_journal_start().
 174  *
 175  * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
 176  *
 177  * 3) Handle more ineligible cases.
 178  */
 179
 180 #include <trace/events/ext4.h>
 181 static struct kmem_cache *ext4_fc_dentry_cachep;
 182
 183 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 184 {
 185         BUFFER_TRACE(bh, "");
 186         if (uptodate) {
 187                 ext4_debug("%s: Block %lld up-to-date",
 188                            __func__, bh->b_blocknr);
 189                 set_buffer_uptodate(bh);
 190         } else {
 191                 ext4_debug("%s: Block %lld not up-to-date",
 192                            __func__, bh->b_blocknr);
 193                 clear_buffer_uptodate(bh);
 194         }
 195
 196         unlock_buffer(bh);
 197 }
 198
 199 static inline void ext4_fc_reset_inode(struct inode *inode)
 200 {
 201         struct ext4_inode_info *ei = EXT4_I(inode);
 202
 203         ei->i_fc_lblk_start = 0;
 204         ei->i_fc_lblk_len = 0;
 205 }
 206
 207 void ext4_fc_init_inode(struct inode *inode)
 208 {
 209         struct ext4_inode_info *ei = EXT4_I(inode);
 210
 211         ext4_fc_reset_inode(inode);
 212         ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
 213         INIT_LIST_HEAD(&ei->i_fc_list);
 214         init_waitqueue_head(&ei->i_fc_wait);
 215         atomic_set(&ei->i_fc_updates, 0);
 216 }
 217
 218 /* This function must be called with sbi->s_fc_lock held. */
 219 static void ext4_fc_wait_committing_inode(struct inode *inode)
 220 __releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
 221 {
 222         wait_queue_head_t *wq;
 223         struct ext4_inode_info *ei = EXT4_I(inode);
 224
 225 #if (BITS_PER_LONG < 64)
 226         DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
 227                         EXT4_STATE_FC_COMMITTING);
 228         wq = bit_waitqueue(&ei->i_state_flags,
 229                                 EXT4_STATE_FC_COMMITTING);
 230 #else
 231         DEFINE_WAIT_BIT(wait, &ei->i_flags,
 232                         EXT4_STATE_FC_COMMITTING);
 233         wq = bit_waitqueue(&ei->i_flags,
 234                                 EXT4_STATE_FC_COMMITTING);
 235 #endif
 236         lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
 237         prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 238         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 239         schedule();
 240         finish_wait(wq, &wait.wq_entry);
 241 }
 242
 243 /*
 244  * Inform Ext4's fast about start of an inode update
 245  *
 246  * This function is called by the high level call VFS callbacks before
 247  * performing any inode update. This function blocks if there's an ongoing
 248  * fast commit on the inode in question.
 249  */
 250 void ext4_fc_start_update(struct inode *inode)
 251 {
 252         struct ext4_inode_info *ei = EXT4_I(inode);
 253
 254         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 255             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 256                 return;
 257
 258 restart:
 259         spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 260         if (list_empty(&ei->i_fc_list))
 261                 goto out;
 262
 263         if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 264                 ext4_fc_wait_committing_inode(inode);
 265                 goto restart;
 266         }
 267 out:
 268         atomic_inc(&ei->i_fc_updates);
 269         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 270 }
 271
 272 /*
 273  * Stop inode update and wake up waiting fast commits if any.
 274  */
 275 void ext4_fc_stop_update(struct inode *inode)
 276 {
 277         struct ext4_inode_info *ei = EXT4_I(inode);
 278
 279         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 280             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 281                 return;
 282
 283         if (atomic_dec_and_test(&ei->i_fc_updates))
 284                 wake_up_all(&ei->i_fc_wait);
 285 }
 286
 287 /*
 288  * Remove inode from fast commit list. If the inode is being committed
 289  * we wait until inode commit is done.
 290  */
 291 void ext4_fc_del(struct inode *inode)
 292 {
 293         struct ext4_inode_info *ei = EXT4_I(inode);
 294
 295         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 296             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 297                 return;
 298
 299 restart:
 300         spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 301         if (list_empty(&ei->i_fc_list)) {
 302                 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 303                 return;
 304         }
 305
 306         if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 307                 ext4_fc_wait_committing_inode(inode);
 308                 goto restart;
 309         }
 310         list_del_init(&ei->i_fc_list);
 311         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 312 }
 313
 314 /*
 315  * Mark file system as fast commit ineligible. This means that next commit
 316  * operation would result in a full jbd2 commit.
 317  */
 318 void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
 319 {
 320         struct ext4_sb_info *sbi = EXT4_SB(sb);
 321
 322         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 323             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 324                 return;
 325
 326         ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 327         WARN_ON(reason >= EXT4_FC_REASON_MAX);
 328         sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 329 }
 330
 331 /*
 332  * Start a fast commit ineligible update. Any commits that happen while
 333  * such an operation is in progress fall back to full commits.
 334  */
 335 void ext4_fc_start_ineligible(struct super_block *sb, int reason)
 336 {
 337         struct ext4_sb_info *sbi = EXT4_SB(sb);
 338
 339         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 340             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 341                 return;
 342
 343         WARN_ON(reason >= EXT4_FC_REASON_MAX);
 344         sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 345         atomic_inc(&sbi->s_fc_ineligible_updates);
 346 }
 347
 348 /*
 349  * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
 350  * to ensure that after stopping the ineligible update, at least one full
 351  * commit takes place.
 352  */
 353 void ext4_fc_stop_ineligible(struct super_block *sb)
 354 {
 355         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 356             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 357                 return;
 358
 359         ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 360         atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
 361 }
 362
 363 static inline int ext4_fc_is_ineligible(struct super_block *sb)
 364 {
 365         return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
 366                 atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
 367 }
 368
 369 /*
 370  * Generic fast commit tracking function. If this is the first time this we are
 371  * called after a full commit, we initialize fast commit fields and then call
 372  * __fc_track_fn() with update = 0. If we have already been called after a full
 373  * commit, we pass update = 1. Based on that, the track function can determine
 374  * if it needs to track a field for the first time or if it needs to just
 375  * update the previously tracked value.
 376  *
 377  * If enqueue is set, this function enqueues the inode in fast commit list.
 378  */
 379 static int ext4_fc_track_template(
 380         handle_t *handle, struct inode *inode,
 381         int (*__fc_track_fn)(struct inode *, void *, bool),
 382         void *args, int enqueue)
 383 {
 384         bool update = false;
 385         struct ext4_inode_info *ei = EXT4_I(inode);
 386         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 387         tid_t tid = 0;
 388         int ret;
 389
 390         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 391             (sbi->s_mount_state & EXT4_FC_REPLAY))
 392                 return -EOPNOTSUPP;
 393
 394         if (ext4_fc_is_ineligible(inode->i_sb))
 395                 return -EINVAL;
 396
 397         tid = handle->h_transaction->t_tid;
 398         mutex_lock(&ei->i_fc_lock);
 399         if (tid == ei->i_sync_tid) {
 400                 update = true;
 401         } else {
 402                 ext4_fc_reset_inode(inode);
 403                 ei->i_sync_tid = tid;
 404         }
 405         ret = __fc_track_fn(inode, args, update);
 406         mutex_unlock(&ei->i_fc_lock);
 407
 408         if (!enqueue)
 409                 return ret;
 410
 411         spin_lock(&sbi->s_fc_lock);
 412         if (list_empty(&EXT4_I(inode)->i_fc_list))
 413                 list_add_tail(&EXT4_I(inode)->i_fc_list,
 414                                 (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
 415                                 &sbi->s_fc_q[FC_Q_STAGING] :
 416                                 &sbi->s_fc_q[FC_Q_MAIN]);
 417         spin_unlock(&sbi->s_fc_lock);
 418
 419         return ret;
 420 }
 421
 422 struct __track_dentry_update_args {
 423         struct dentry *dentry;
 424         int op;
 425 };
 426
 427 /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
 428 static int __track_dentry_update(struct inode *inode, void *arg, bool update)
 429 {
 430         struct ext4_fc_dentry_update *node;
 431         struct ext4_inode_info *ei = EXT4_I(inode);
 432         struct __track_dentry_update_args *dentry_update =
 433                 (struct __track_dentry_update_args *)arg;
 434         struct dentry *dentry = dentry_update->dentry;
 435         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 436
 437         mutex_unlock(&ei->i_fc_lock);
 438         node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
 439         if (!node) {
 440                 ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
 441                 mutex_lock(&ei->i_fc_lock);
 442                 return -ENOMEM;
 443         }
 444
 445         node->fcd_op = dentry_update->op;
 446         node->fcd_parent = dentry->d_parent->d_inode->i_ino;
 447         node->fcd_ino = inode->i_ino;
 448         if (dentry->d_name.len > DNAME_INLINE_LEN) {
 449                 node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
 450                 if (!node->fcd_name.name) {
 451                         kmem_cache_free(ext4_fc_dentry_cachep, node);
 452                         ext4_fc_mark_ineligible(inode->i_sb,
 453                                 EXT4_FC_REASON_NOMEM);
 454                         mutex_lock(&ei->i_fc_lock);
 455                         return -ENOMEM;
 456                 }
 457                 memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
 458                         dentry->d_name.len);
 459         } else {
 460                 memcpy(node->fcd_iname, dentry->d_name.name,
 461                         dentry->d_name.len);
 462                 node->fcd_name.name = node->fcd_iname;
 463         }
 464         node->fcd_name.len = dentry->d_name.len;
 465
 466         spin_lock(&sbi->s_fc_lock);
 467         if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
 468                 list_add_tail(&node->fcd_list,
 469                                 &sbi->s_fc_dentry_q[FC_Q_STAGING]);
 470         else
 471                 list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
 472         spin_unlock(&sbi->s_fc_lock);
 473         mutex_lock(&ei->i_fc_lock);
 474
 475         return 0;
 476 }
 477
 478 void __ext4_fc_track_unlink(handle_t *handle,
 479                 struct inode *inode, struct dentry *dentry)
 480 {
 481         struct __track_dentry_update_args args;
 482         int ret;
 483
 484         args.dentry = dentry;
 485         args.op = EXT4_FC_TAG_UNLINK;
 486
 487         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 488                                         (void *)&args, 0);
 489         trace_ext4_fc_track_unlink(inode, dentry, ret);
 490 }
 491
 492 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
 493 {
 494         __ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
 495 }
 496
 497 void __ext4_fc_track_link(handle_t *handle,
 498         struct inode *inode, struct dentry *dentry)
 499 {
 500         struct __track_dentry_update_args args;
 501         int ret;
 502
 503         args.dentry = dentry;
 504         args.op = EXT4_FC_TAG_LINK;
 505
 506         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 507                                         (void *)&args, 0);
 508         trace_ext4_fc_track_link(inode, dentry, ret);
 509 }
 510
 511 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
 512 {
 513         __ext4_fc_track_link(handle, d_inode(dentry), dentry);
 514 }
 515
 516 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
 517 {
 518         struct __track_dentry_update_args args;
 519         struct inode *inode = d_inode(dentry);
 520         int ret;
 521
 522         args.dentry = dentry;
 523         args.op = EXT4_FC_TAG_CREAT;
 524
 525         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 526                                         (void *)&args, 0);
 527         trace_ext4_fc_track_create(inode, dentry, ret);
 528 }
 529
 530 /* __track_fn for inode tracking */
 531 static int __track_inode(struct inode *inode, void *arg, bool update)
 532 {
 533         if (update)
 534                 return -EEXIST;
 535
 536         EXT4_I(inode)->i_fc_lblk_len = 0;
 537
 538         return 0;
 539 }
 540
 541 void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
 542 {
 543         int ret;
 544
 545         if (S_ISDIR(inode->i_mode))
 546                 return;
 547
 548         if (ext4_should_journal_data(inode)) {
 549                 ext4_fc_mark_ineligible(inode->i_sb,
 550                                         EXT4_FC_REASON_INODE_JOURNAL_DATA);
 551                 return;
 552         }
 553
 554         ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
 555         trace_ext4_fc_track_inode(inode, ret);
 556 }
 557
 558 struct __track_range_args {
 559         ext4_lblk_t start, end;
 560 };
 561
 562 /* __track_fn for tracking data updates */
 563 static int __track_range(struct inode *inode, void *arg, bool update)
 564 {
 565         struct ext4_inode_info *ei = EXT4_I(inode);
 566         ext4_lblk_t oldstart;
 567         struct __track_range_args *__arg =
 568                 (struct __track_range_args *)arg;
 569
 570         if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
 571                 ext4_debug("Special inode %ld being modified\n", inode->i_ino);
 572                 return -ECANCELED;
 573         }
 574
 575         oldstart = ei->i_fc_lblk_start;
 576
 577         if (update && ei->i_fc_lblk_len > 0) {
 578                 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
 579                 ei->i_fc_lblk_len =
 580                         max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
 581                                 ei->i_fc_lblk_start + 1;
 582         } else {
 583                 ei->i_fc_lblk_start = __arg->start;
 584                 ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
 585         }
 586
 587         return 0;
 588 }
 589
 590 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
 591                          ext4_lblk_t end)
 592 {
 593         struct __track_range_args args;
 594         int ret;
 595
 596         if (S_ISDIR(inode->i_mode))
 597                 return;
 598
 599         args.start = start;
 600         args.end = end;
 601
 602         ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
 603
 604         trace_ext4_fc_track_range(inode, start, end, ret);
 605 }
 606
 607 static void ext4_fc_submit_bh(struct super_block *sb)
 608 {
 609         int write_flags = REQ_SYNC;
 610         struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
 611
 612         /* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */
 613         if (test_opt(sb, BARRIER))
 614                 write_flags |= REQ_FUA | REQ_PREFLUSH;
 615         lock_buffer(bh);
 616         set_buffer_dirty(bh);
 617         set_buffer_uptodate(bh);
 618         bh->b_end_io = ext4_end_buffer_io_sync;
 619         submit_bh(REQ_OP_WRITE, write_flags, bh);
 620         EXT4_SB(sb)->s_fc_bh = NULL;
 621 }
 622
 623 /* Ext4 commit path routines */
 624
 625 /* memzero and update CRC */
 626 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
 627                                 u32 *crc)
 628 {
 629         void *ret;
 630
 631         ret = memset(dst, 0, len);
 632         if (crc)
 633                 *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
 634         return ret;
 635 }
 636
 637 /*
 638  * Allocate len bytes on a fast commit buffer.
 639  *
 640  * During the commit time this function is used to manage fast commit
 641  * block space. We don't split a fast commit log onto different
 642  * blocks. So this function makes sure that if there's not enough space
 643  * on the current block, the remaining space in the current block is
 644  * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
 645  * new block is from jbd2 and CRC is updated to reflect the padding
 646  * we added.
 647  */
 648 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
 649 {
 650         struct ext4_fc_tl *tl;
 651         struct ext4_sb_info *sbi = EXT4_SB(sb);
 652         struct buffer_head *bh;
 653         int bsize = sbi->s_journal->j_blocksize;
 654         int ret, off = sbi->s_fc_bytes % bsize;
 655         int pad_len;
 656
 657         /*
 658          * After allocating len, we should have space at least for a 0 byte
 659          * padding.
 660          */
 661         if (len + sizeof(struct ext4_fc_tl) > bsize)
 662                 return NULL;
 663
 664         if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
 665                 /*
 666                  * Only allocate from current buffer if we have enough space for
 667                  * this request AND we have space to add a zero byte padding.
 668                  */
 669                 if (!sbi->s_fc_bh) {
 670                         ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 671                         if (ret)
 672                                 return NULL;
 673                         sbi->s_fc_bh = bh;
 674                 }
 675                 sbi->s_fc_bytes += len;
 676                 return sbi->s_fc_bh->b_data + off;
 677         }
 678         /* Need to add PAD tag */
 679         tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
 680         tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
 681         pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
 682         tl->fc_len = cpu_to_le16(pad_len);
 683         if (crc)
 684                 *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
 685         if (pad_len > 0)
 686                 ext4_fc_memzero(sb, tl + 1, pad_len, crc);
 687         ext4_fc_submit_bh(sb);
 688
 689         ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 690         if (ret)
 691                 return NULL;
 692         sbi->s_fc_bh = bh;
 693         sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
 694         return sbi->s_fc_bh->b_data;
 695 }
 696
 697 /* memcpy to fc reserved space and update CRC */
 698 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
 699                                 int len, u32 *crc)
 700 {
 701         if (crc)
 702                 *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
 703         return memcpy(dst, src, len);
 704 }
 705
 706 /*
 707  * Complete a fast commit by writing tail tag.
 708  *
 709  * Writing tail tag marks the end of a fast commit. In order to guarantee
 710  * atomicity, after writing tail tag, even if there's space remaining
 711  * in the block, next commit shouldn't use it. That's why tail tag
 712  * has the length as that of the remaining space on the block.
 713  */
 714 static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
 715 {
 716         struct ext4_sb_info *sbi = EXT4_SB(sb);
 717         struct ext4_fc_tl tl;
 718         struct ext4_fc_tail tail;
 719         int off, bsize = sbi->s_journal->j_blocksize;
 720         u8 *dst;
 721
 722         /*
 723          * ext4_fc_reserve_space takes care of allocating an extra block if
 724          * there's no enough space on this block for accommodating this tail.
 725          */
 726         dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
 727         if (!dst)
 728                 return -ENOSPC;
 729
 730         off = sbi->s_fc_bytes % bsize;
 731
 732         tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
 733         tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
 734         sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
 735
 736         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
 737         dst += sizeof(tl);
 738         tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
 739         ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
 740         dst += sizeof(tail.fc_tid);
 741         tail.fc_crc = cpu_to_le32(crc);
 742         ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
 743
 744         ext4_fc_submit_bh(sb);
 745
 746         return 0;
 747 }
 748
 749 /*
 750  * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 751  * Returns false if there's not enough space.
 752  */
 753 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
 754                            u32 *crc)
 755 {
 756         struct ext4_fc_tl tl;
 757         u8 *dst;
 758
 759         dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
 760         if (!dst)
 761                 return false;
 762
 763         tl.fc_tag = cpu_to_le16(tag);
 764         tl.fc_len = cpu_to_le16(len);
 765
 766         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
 767         ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
 768
 769         return true;
 770 }
 771
 772 /* Same as above, but adds dentry tlv. */
 773 static  bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
 774                                         int parent_ino, int ino, int dlen,
 775                                         const unsigned char *dname,
 776                                         u32 *crc)
 777 {
 778         struct ext4_fc_dentry_info fcd;
 779         struct ext4_fc_tl tl;
 780         u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
 781                                         crc);
 782
 783         if (!dst)
 784                 return false;
 785
 786         fcd.fc_parent_ino = cpu_to_le32(parent_ino);
 787         fcd.fc_ino = cpu_to_le32(ino);
 788         tl.fc_tag = cpu_to_le16(tag);
 789         tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
 790         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
 791         dst += sizeof(tl);
 792         ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
 793         dst += sizeof(fcd);
 794         ext4_fc_memcpy(sb, dst, dname, dlen, crc);
 795         dst += dlen;
 796
 797         return true;
 798 }
 799
 800 /*
 801  * Writes inode in the fast commit space under TLV with tag @tag.
 802  * Returns 0 on success, error on failure.
 803  */
 804 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
 805 {
 806         struct ext4_inode_info *ei = EXT4_I(inode);
 807         int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
 808         int ret;
 809         struct ext4_iloc iloc;
 810         struct ext4_fc_inode fc_inode;
 811         struct ext4_fc_tl tl;
 812         u8 *dst;
 813
 814         ret = ext4_get_inode_loc(inode, &iloc);
 815         if (ret)
 816                 return ret;
 817
 818         if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
 819                 inode_len += ei->i_extra_isize;
 820
 821         fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
 822         tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
 823         tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
 824
 825         dst = ext4_fc_reserve_space(inode->i_sb,
 826                         sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
 827         if (!dst)
 828                 return -ECANCELED;
 829
 830         if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
 831                 return -ECANCELED;
 832         dst += sizeof(tl);
 833         if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
 834                 return -ECANCELED;
 835         dst += sizeof(fc_inode);
 836         if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
 837                                         inode_len, crc))
 838                 return -ECANCELED;
 839
 840         return 0;
 841 }
 842
 843 /*
 844  * Writes updated data ranges for the inode in question. Updates CRC.
 845  * Returns 0 on success, error otherwise.
 846  */
 847 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 848 {
 849         ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
 850         struct ext4_inode_info *ei = EXT4_I(inode);
 851         struct ext4_map_blocks map;
 852         struct ext4_fc_add_range fc_ext;
 853         struct ext4_fc_del_range lrange;
 854         struct ext4_extent *ex;
 855         int ret;
 856
 857         mutex_lock(&ei->i_fc_lock);
 858         if (ei->i_fc_lblk_len == 0) {
 859                 mutex_unlock(&ei->i_fc_lock);
 860                 return 0;
 861         }
 862         old_blk_size = ei->i_fc_lblk_start;
 863         new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
 864         ei->i_fc_lblk_len = 0;
 865         mutex_unlock(&ei->i_fc_lock);
 866
 867         cur_lblk_off = old_blk_size;
 868         jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
 869                   __func__, cur_lblk_off, new_blk_size, inode->i_ino);
 870
 871         while (cur_lblk_off <= new_blk_size) {
 872                 map.m_lblk = cur_lblk_off;
 873                 map.m_len = new_blk_size - cur_lblk_off + 1;
 874                 ret = ext4_map_blocks(NULL, inode, &map, 0);
 875                 if (ret < 0)
 876                         return -ECANCELED;
 877
 878                 if (map.m_len == 0) {
 879                         cur_lblk_off++;
 880                         continue;
 881                 }
 882
 883                 if (ret == 0) {
 884                         lrange.fc_ino = cpu_to_le32(inode->i_ino);
 885                         lrange.fc_lblk = cpu_to_le32(map.m_lblk);
 886                         lrange.fc_len = cpu_to_le32(map.m_len);
 887                         if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
 888                                             sizeof(lrange), (u8 *)&lrange, crc))
 889                                 return -ENOSPC;
 890                 } else {
 891                         fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
 892                         ex = (struct ext4_extent *)&fc_ext.fc_ex;
 893                         ex->ee_block = cpu_to_le32(map.m_lblk);
 894                         ex->ee_len = cpu_to_le16(map.m_len);
 895                         ext4_ext_store_pblock(ex, map.m_pblk);
 896                         if (map.m_flags & EXT4_MAP_UNWRITTEN)
 897                                 ext4_ext_mark_unwritten(ex);
 898                         else
 899                                 ext4_ext_mark_initialized(ex);
 900                         if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
 901                                             sizeof(fc_ext), (u8 *)&fc_ext, crc))
 902                                 return -ENOSPC;
 903                 }
 904
 905                 cur_lblk_off += map.m_len;
 906         }
 907
 908         return 0;
 909 }
 910
 911
 912 /* Submit data for all the fast commit inodes */
 913 static int ext4_fc_submit_inode_data_all(journal_t *journal)
 914 {
 915         struct super_block *sb = (struct super_block *)(journal->j_private);
 916         struct ext4_sb_info *sbi = EXT4_SB(sb);
 917         struct ext4_inode_info *ei;
 918         struct list_head *pos;
 919         int ret = 0;
 920
 921         spin_lock(&sbi->s_fc_lock);
 922         ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
 923         list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
 924                 ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
 925                 ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
 926                 while (atomic_read(&ei->i_fc_updates)) {
 927                         DEFINE_WAIT(wait);
 928
 929                         prepare_to_wait(&ei->i_fc_wait, &wait,
 930                                                 TASK_UNINTERRUPTIBLE);
 931                         if (atomic_read(&ei->i_fc_updates)) {
 932                                 spin_unlock(&sbi->s_fc_lock);
 933                                 schedule();
 934                                 spin_lock(&sbi->s_fc_lock);
 935                         }
 936                         finish_wait(&ei->i_fc_wait, &wait);
 937                 }
 938                 spin_unlock(&sbi->s_fc_lock);
 939                 ret = jbd2_submit_inode_data(ei->jinode);
 940                 if (ret)
 941                         return ret;
 942                 spin_lock(&sbi->s_fc_lock);
 943         }
 944         spin_unlock(&sbi->s_fc_lock);
 945
 946         return ret;
 947 }
 948
 949 /* Wait for completion of data for all the fast commit inodes */
 950 static int ext4_fc_wait_inode_data_all(journal_t *journal)
 951 {
 952         struct super_block *sb = (struct super_block *)(journal->j_private);
 953         struct ext4_sb_info *sbi = EXT4_SB(sb);
 954         struct ext4_inode_info *pos, *n;
 955         int ret = 0;
 956
 957         spin_lock(&sbi->s_fc_lock);
 958         list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 959                 if (!ext4_test_inode_state(&pos->vfs_inode,
 960                                            EXT4_STATE_FC_COMMITTING))
 961                         continue;
 962                 spin_unlock(&sbi->s_fc_lock);
 963
 964                 ret = jbd2_wait_inode_data(journal, pos->jinode);
 965                 if (ret)
 966                         return ret;
 967                 spin_lock(&sbi->s_fc_lock);
 968         }
 969         spin_unlock(&sbi->s_fc_lock);
 970
 971         return 0;
 972 }
 973
 974 /* Commit all the directory entry updates */
 975 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
 976 __acquires(&sbi->s_fc_lock)
 977 __releases(&sbi->s_fc_lock)
 978 {
 979         struct super_block *sb = (struct super_block *)(journal->j_private);
 980         struct ext4_sb_info *sbi = EXT4_SB(sb);
 981         struct ext4_fc_dentry_update *fc_dentry;
 982         struct inode *inode;
 983         struct list_head *pos, *n, *fcd_pos, *fcd_n;
 984         struct ext4_inode_info *ei;
 985         int ret;
 986
 987         if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
 988                 return 0;
 989         list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
 990                 fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
 991                                         fcd_list);
 992                 if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
 993                         spin_unlock(&sbi->s_fc_lock);
 994                         if (!ext4_fc_add_dentry_tlv(
 995                                 sb, fc_dentry->fcd_op,
 996                                 fc_dentry->fcd_parent, fc_dentry->fcd_ino,
 997                                 fc_dentry->fcd_name.len,
 998                                 fc_dentry->fcd_name.name, crc)) {
 999                                 ret = -ENOSPC;
1000                                 goto lock_and_exit;
1001                         }
1002                         spin_lock(&sbi->s_fc_lock);
1003                         continue;
1004                 }
1005
1006                 inode = NULL;
1007                 list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
1008                         ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
1009                         if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
1010                                 inode = &ei->vfs_inode;
1011                                 break;
1012                         }
1013                 }
1014                 /*
1015                  * If we don't find inode in our list, then it was deleted,
1016                  * in which case, we don't need to record it's create tag.
1017                  */
1018                 if (!inode)
1019                         continue;
1020                 spin_unlock(&sbi->s_fc_lock);
1021
1022                 /*
1023                  * We first write the inode and then the create dirent. This
1024                  * allows the recovery code to create an unnamed inode first
1025                  * and then link it to a directory entry. This allows us
1026                  * to use namei.c routines almost as is and simplifies
1027                  * the recovery code.
1028                  */
1029                 ret = ext4_fc_write_inode(inode, crc);
1030                 if (ret)
1031                         goto lock_and_exit;
1032
1033                 ret = ext4_fc_write_inode_data(inode, crc);
1034                 if (ret)
1035                         goto lock_and_exit;
1036
1037                 if (!ext4_fc_add_dentry_tlv(
1038                         sb, fc_dentry->fcd_op,
1039                         fc_dentry->fcd_parent, fc_dentry->fcd_ino,
1040                         fc_dentry->fcd_name.len,
1041                         fc_dentry->fcd_name.name, crc)) {
1042                         ret = -ENOSPC;
1043                         goto lock_and_exit;
1044                 }
1045
1046                 spin_lock(&sbi->s_fc_lock);
1047         }
1048         return 0;
1049 lock_and_exit:
1050         spin_lock(&sbi->s_fc_lock);
1051         return ret;
1052 }
1053
1054 static int ext4_fc_perform_commit(journal_t *journal)
1055 {
1056         struct super_block *sb = (struct super_block *)(journal->j_private);
1057         struct ext4_sb_info *sbi = EXT4_SB(sb);
1058         struct ext4_inode_info *iter;
1059         struct ext4_fc_head head;
1060         struct list_head *pos;
1061         struct inode *inode;
1062         struct blk_plug plug;
1063         int ret = 0;
1064         u32 crc = 0;
1065
1066         ret = ext4_fc_submit_inode_data_all(journal);
1067         if (ret)
1068                 return ret;
1069
1070         ret = ext4_fc_wait_inode_data_all(journal);
1071         if (ret)
1072                 return ret;
1073
1074         /*
1075          * If file system device is different from journal device, issue a cache
1076          * flush before we start writing fast commit blocks.
1077          */
1078         if (journal->j_fs_dev != journal->j_dev)
1079                 blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);
1080
1081         blk_start_plug(&plug);
1082         if (sbi->s_fc_bytes == 0) {
1083                 /*
1084                  * Add a head tag only if this is the first fast commit
1085                  * in this TID.
1086                  */
1087                 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1088                 head.fc_tid = cpu_to_le32(
1089                         sbi->s_journal->j_running_transaction->t_tid);
1090                 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1091                         (u8 *)&head, &crc))
1092                         goto out;
1093         }
1094
1095         spin_lock(&sbi->s_fc_lock);
1096         ret = ext4_fc_commit_dentry_updates(journal, &crc);
1097         if (ret) {
1098                 spin_unlock(&sbi->s_fc_lock);
1099                 goto out;
1100         }
1101
1102         list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
1103                 iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1104                 inode = &iter->vfs_inode;
1105                 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1106                         continue;
1107
1108                 spin_unlock(&sbi->s_fc_lock);
1109                 ret = ext4_fc_write_inode_data(inode, &crc);
1110                 if (ret)
1111                         goto out;
1112                 ret = ext4_fc_write_inode(inode, &crc);
1113                 if (ret)
1114                         goto out;
1115                 spin_lock(&sbi->s_fc_lock);
1116         }
1117         spin_unlock(&sbi->s_fc_lock);
1118
1119         ret = ext4_fc_write_tail(sb, crc);
1120
1121 out:
1122         blk_finish_plug(&plug);
1123         return ret;
1124 }
1125
1126 /*
1127  * The main commit entry point. Performs a fast commit for transaction
1128  * commit_tid if needed. If it's not possible to perform a fast commit
1129  * due to various reasons, we fall back to full commit. Returns 0
1130  * on success, error otherwise.
1131  */
1132 int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1133 {
1134         struct super_block *sb = (struct super_block *)(journal->j_private);
1135         struct ext4_sb_info *sbi = EXT4_SB(sb);
1136         int nblks = 0, ret, bsize = journal->j_blocksize;
1137         int subtid = atomic_read(&sbi->s_fc_subtid);
1138         int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
1139         ktime_t start_time, commit_time;
1140
1141         trace_ext4_fc_commit_start(sb);
1142
1143         start_time = ktime_get();
1144
1145         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
1146                 (ext4_fc_is_ineligible(sb))) {
1147                 reason = EXT4_FC_REASON_INELIGIBLE;
1148                 goto out;
1149         }
1150
1151 restart_fc:
1152         ret = jbd2_fc_begin_commit(journal, commit_tid);
1153         if (ret == -EALREADY) {
1154                 /* There was an ongoing commit, check if we need to restart */
1155                 if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1156                         commit_tid > journal->j_commit_sequence)
1157                         goto restart_fc;
1158                 reason = EXT4_FC_REASON_ALREADY_COMMITTED;
1159                 goto out;
1160         } else if (ret) {
1161                 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1162                 reason = EXT4_FC_REASON_FC_START_FAILED;
1163                 goto out;
1164         }
1165
1166         fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1167         ret = ext4_fc_perform_commit(journal);
1168         if (ret < 0) {
1169                 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1170                 reason = EXT4_FC_REASON_FC_FAILED;
1171                 goto out;
1172         }
1173         nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1174         ret = jbd2_fc_wait_bufs(journal, nblks);
1175         if (ret < 0) {
1176                 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1177                 reason = EXT4_FC_REASON_FC_FAILED;
1178                 goto out;
1179         }
1180         atomic_inc(&sbi->s_fc_subtid);
1181         jbd2_fc_end_commit(journal);
1182 out:
1183         /* Has any ineligible update happened since we started? */
1184         if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
1185                 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1186                 reason = EXT4_FC_REASON_INELIGIBLE;
1187         }
1188
1189         spin_lock(&sbi->s_fc_lock);
1190         if (reason != EXT4_FC_REASON_OK &&
1191                 reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
1192                 sbi->s_fc_stats.fc_ineligible_commits++;
1193         } else {
1194                 sbi->s_fc_stats.fc_num_commits++;
1195                 sbi->s_fc_stats.fc_numblks += nblks;
1196         }
1197         spin_unlock(&sbi->s_fc_lock);
1198         nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
1199         trace_ext4_fc_commit_stop(sb, nblks, reason);
1200         commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1201         /*
1202          * weight the commit time higher than the average time so we don't
1203          * react too strongly to vast changes in the commit time
1204          */
1205         if (likely(sbi->s_fc_avg_commit_time))
1206                 sbi->s_fc_avg_commit_time = (commit_time +
1207                                 sbi->s_fc_avg_commit_time * 3) / 4;
1208         else
1209                 sbi->s_fc_avg_commit_time = commit_time;
1210         jbd_debug(1,
1211                 "Fast commit ended with blks = %d, reason = %d, subtid - %d",
1212                 nblks, reason, subtid);
1213         if (reason == EXT4_FC_REASON_FC_FAILED)
1214                 return jbd2_fc_end_commit_fallback(journal);
1215         if (reason == EXT4_FC_REASON_FC_START_FAILED ||
1216                 reason == EXT4_FC_REASON_INELIGIBLE)
1217                 return jbd2_complete_transaction(journal, commit_tid);
1218         return 0;
1219 }
1220
1221 /*
1222  * Fast commit cleanup routine. This is called after every fast commit and
1223  * full commit. full is true if we are called after a full commit.
1224  */
1225 static void ext4_fc_cleanup(journal_t *journal, int full)
1226 {
1227         struct super_block *sb = journal->j_private;
1228         struct ext4_sb_info *sbi = EXT4_SB(sb);
1229         struct ext4_inode_info *iter;
1230         struct ext4_fc_dentry_update *fc_dentry;
1231         struct list_head *pos, *n;
1232
1233         if (full && sbi->s_fc_bh)
1234                 sbi->s_fc_bh = NULL;
1235
1236         jbd2_fc_release_bufs(journal);
1237
1238         spin_lock(&sbi->s_fc_lock);
1239         list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
1240                 iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1241                 list_del_init(&iter->i_fc_list);
1242                 ext4_clear_inode_state(&iter->vfs_inode,
1243                                        EXT4_STATE_FC_COMMITTING);
1244                 ext4_fc_reset_inode(&iter->vfs_inode);
1245                 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1246                 smp_mb();
1247 #if (BITS_PER_LONG < 64)
1248                 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1249 #else
1250                 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1251 #endif
1252         }
1253
1254         while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1255                 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1256                                              struct ext4_fc_dentry_update,
1257                                              fcd_list);
1258                 list_del_init(&fc_dentry->fcd_list);
1259                 spin_unlock(&sbi->s_fc_lock);
1260
1261                 if (fc_dentry->fcd_name.name &&
1262                         fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1263                         kfree(fc_dentry->fcd_name.name);
1264                 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1265                 spin_lock(&sbi->s_fc_lock);
1266         }
1267
1268         list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1269                                 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
1270         list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1271                                 &sbi->s_fc_q[FC_Q_STAGING]);
1272
1273         ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
1274         ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1275
1276         if (full)
1277                 sbi->s_fc_bytes = 0;
1278         spin_unlock(&sbi->s_fc_lock);
1279         trace_ext4_fc_stats(sb);
1280 }
1281
1282 /* Ext4 Replay Path Routines */
1283
/*
 * Decoded form of a dentry TLV, shared by the dentry replay routines.
 * tl_to_darg() fills in every field except inode_len.
 */
struct dentry_info_args {
	int parent_ino;		/* inode number of the parent directory */
	int dname_len;		/* number of bytes at dname */
	int ino;		/* inode number the entry refers to */
	int inode_len;
	char *dname;		/* entry name, points into the TLV value */
};
1289
1290 static inline void tl_to_darg(struct dentry_info_args *darg,
1291                                 struct  ext4_fc_tl *tl)
1292 {
1293         struct ext4_fc_dentry_info *fcd;
1294
1295         fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);
1296
1297         darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
1298         darg->ino = le32_to_cpu(fcd->fc_ino);
1299         darg->dname = fcd->fc_dname;
1300         darg->dname_len = ext4_fc_tag_len(tl) -
1301                         sizeof(struct ext4_fc_dentry_info);
1302 }
1303
1304 /* Unlink replay function */
1305 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
1306 {
1307         struct inode *inode, *old_parent;
1308         struct qstr entry;
1309         struct dentry_info_args darg;
1310         int ret = 0;
1311
1312         tl_to_darg(&darg, tl);
1313
1314         trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1315                         darg.parent_ino, darg.dname_len);
1316
1317         entry.name = darg.dname;
1318         entry.len = darg.dname_len;
1319         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1320
1321         if (IS_ERR_OR_NULL(inode)) {
1322                 jbd_debug(1, "Inode %d not found", darg.ino);
1323                 return 0;
1324         }
1325
1326         old_parent = ext4_iget(sb, darg.parent_ino,
1327                                 EXT4_IGET_NORMAL);
1328         if (IS_ERR_OR_NULL(old_parent)) {
1329                 jbd_debug(1, "Dir with inode  %d not found", darg.parent_ino);
1330                 iput(inode);
1331                 return 0;
1332         }
1333
1334         ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1335         /* -ENOENT ok coz it might not exist anymore. */
1336         if (ret == -ENOENT)
1337                 ret = 0;
1338         iput(old_parent);
1339         iput(inode);
1340         return ret;
1341 }
1342
1343 static int ext4_fc_replay_link_internal(struct super_block *sb,
1344                                 struct dentry_info_args *darg,
1345                                 struct inode *inode)
1346 {
1347         struct inode *dir = NULL;
1348         struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1349         struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1350         int ret = 0;
1351
1352         dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1353         if (IS_ERR(dir)) {
1354                 jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1355                 dir = NULL;
1356                 goto out;
1357         }
1358
1359         dentry_dir = d_obtain_alias(dir);
1360         if (IS_ERR(dentry_dir)) {
1361                 jbd_debug(1, "Failed to obtain dentry");
1362                 dentry_dir = NULL;
1363                 goto out;
1364         }
1365
1366         dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1367         if (!dentry_inode) {
1368                 jbd_debug(1, "Inode dentry not created.");
1369                 ret = -ENOMEM;
1370                 goto out;
1371         }
1372
1373         ret = __ext4_link(dir, inode, dentry_inode);
1374         /*
1375          * It's possible that link already existed since data blocks
1376          * for the dir in question got persisted before we crashed OR
1377          * we replayed this tag and crashed before the entire replay
1378          * could complete.
1379          */
1380         if (ret && ret != -EEXIST) {
1381                 jbd_debug(1, "Failed to link\n");
1382                 goto out;
1383         }
1384
1385         ret = 0;
1386 out:
1387         if (dentry_dir) {
1388                 d_drop(dentry_dir);
1389                 dput(dentry_dir);
1390         } else if (dir) {
1391                 iput(dir);
1392         }
1393         if (dentry_inode) {
1394                 d_drop(dentry_inode);
1395                 dput(dentry_inode);
1396         }
1397
1398         return ret;
1399 }
1400
1401 /* Link replay function */
1402 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
1403 {
1404         struct inode *inode;
1405         struct dentry_info_args darg;
1406         int ret = 0;
1407
1408         tl_to_darg(&darg, tl);
1409         trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1410                         darg.parent_ino, darg.dname_len);
1411
1412         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1413         if (IS_ERR_OR_NULL(inode)) {
1414                 jbd_debug(1, "Inode not found.");
1415                 return 0;
1416         }
1417
1418         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1419         iput(inode);
1420         return ret;
1421 }
1422
1423 /*
1424  * Record all the modified inodes during replay. We use this later to setup
1425  * block bitmaps correctly.
1426  */
1427 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1428 {
1429         struct ext4_fc_replay_state *state;
1430         int i;
1431
1432         state = &EXT4_SB(sb)->s_fc_replay_state;
1433         for (i = 0; i < state->fc_modified_inodes_used; i++)
1434                 if (state->fc_modified_inodes[i] == ino)
1435                         return 0;
1436         if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1437                 state->fc_modified_inodes_size +=
1438                         EXT4_FC_REPLAY_REALLOC_INCREMENT;
1439                 state->fc_modified_inodes = krealloc(
1440                                         state->fc_modified_inodes, sizeof(int) *
1441                                         state->fc_modified_inodes_size,
1442                                         GFP_KERNEL);
1443                 if (!state->fc_modified_inodes)
1444                         return -ENOMEM;
1445         }
1446         state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1447         return 0;
1448 }
1449
1450 /*
1451  * Inode replay function
1452  */
1453 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
1454 {
1455         struct ext4_fc_inode *fc_inode;
1456         struct ext4_inode *raw_inode;
1457         struct ext4_inode *raw_fc_inode;
1458         struct inode *inode = NULL;
1459         struct ext4_iloc iloc;
1460         int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1461         struct ext4_extent_header *eh;
1462
1463         fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);
1464
1465         ino = le32_to_cpu(fc_inode->fc_ino);
1466         trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1467
1468         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1469         if (!IS_ERR_OR_NULL(inode)) {
1470                 ext4_ext_clear_bb(inode);
1471                 iput(inode);
1472         }
1473
1474         ext4_fc_record_modified_inode(sb, ino);
1475
1476         raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
1477         ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1478         if (ret)
1479                 goto out;
1480
1481         inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
1482         raw_inode = ext4_raw_inode(&iloc);
1483
1484         memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1485         memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1486                 inode_len - offsetof(struct ext4_inode, i_generation));
1487         if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1488                 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1489                 if (eh->eh_magic != EXT4_EXT_MAGIC) {
1490                         memset(eh, 0, sizeof(*eh));
1491                         eh->eh_magic = EXT4_EXT_MAGIC;
1492                         eh->eh_max = cpu_to_le16(
1493                                 (sizeof(raw_inode->i_block) -
1494                                  sizeof(struct ext4_extent_header))
1495                                  / sizeof(struct ext4_extent));
1496                 }
1497         } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1498                 memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1499                         sizeof(raw_inode->i_block));
1500         }
1501
1502         /* Immediately update the inode on disk. */
1503         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1504         if (ret)
1505                 goto out;
1506         ret = sync_dirty_buffer(iloc.bh);
1507         if (ret)
1508                 goto out;
1509         ret = ext4_mark_inode_used(sb, ino);
1510         if (ret)
1511                 goto out;
1512
1513         /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1514         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1515         if (IS_ERR_OR_NULL(inode)) {
1516                 jbd_debug(1, "Inode not found.");
1517                 return -EFSCORRUPTED;
1518         }
1519
1520         /*
1521          * Our allocator could have made different decisions than before
1522          * crashing. This should be fixed but until then, we calculate
1523          * the number of blocks the inode.
1524          */
1525         ext4_ext_replay_set_iblocks(inode);
1526
1527         inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1528         ext4_reset_inode_seed(inode);
1529
1530         ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1531         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1532         sync_dirty_buffer(iloc.bh);
1533         brelse(iloc.bh);
1534 out:
1535         iput(inode);
1536         if (!ret)
1537                 blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
1538
1539         return 0;
1540 }
1541
1542 /*
1543  * Dentry create replay function.
1544  *
1545  * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1546  * inode for which we are trying to create a dentry here, should already have
1547  * been replayed before we start here.
1548  */
1549 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
1550 {
1551         int ret = 0;
1552         struct inode *inode = NULL;
1553         struct inode *dir = NULL;
1554         struct dentry_info_args darg;
1555
1556         tl_to_darg(&darg, tl);
1557
1558         trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1559                         darg.parent_ino, darg.dname_len);
1560
1561         /* This takes care of update group descriptor and other metadata */
1562         ret = ext4_mark_inode_used(sb, darg.ino);
1563         if (ret)
1564                 goto out;
1565
1566         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1567         if (IS_ERR_OR_NULL(inode)) {
1568                 jbd_debug(1, "inode %d not found.", darg.ino);
1569                 inode = NULL;
1570                 ret = -EINVAL;
1571                 goto out;
1572         }
1573
1574         if (S_ISDIR(inode->i_mode)) {
1575                 /*
1576                  * If we are creating a directory, we need to make sure that the
1577                  * dot and dot dot dirents are setup properly.
1578                  */
1579                 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1580                 if (IS_ERR_OR_NULL(dir)) {
1581                         jbd_debug(1, "Dir %d not found.", darg.ino);
1582                         goto out;
1583                 }
1584                 ret = ext4_init_new_dir(NULL, dir, inode);
1585                 iput(dir);
1586                 if (ret) {
1587                         ret = 0;
1588                         goto out;
1589                 }
1590         }
1591         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1592         if (ret)
1593                 goto out;
1594         set_nlink(inode, 1);
1595         ext4_mark_inode_dirty(NULL, inode);
1596 out:
1597         if (inode)
1598                 iput(inode);
1599         return ret;
1600 }
1601
1602 /*
1603  * Record physical disk regions which are in use as per fast commit area. Our
1604  * simple replay phase allocator excludes these regions from allocation.
1605  */
1606 static int ext4_fc_record_regions(struct super_block *sb, int ino,
1607                 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1608 {
1609         struct ext4_fc_replay_state *state;
1610         struct ext4_fc_alloc_region *region;
1611
1612         state = &EXT4_SB(sb)->s_fc_replay_state;
1613         if (state->fc_regions_used == state->fc_regions_size) {
1614                 state->fc_regions_size +=
1615                         EXT4_FC_REPLAY_REALLOC_INCREMENT;
1616                 state->fc_regions = krealloc(
1617                                         state->fc_regions,
1618                                         state->fc_regions_size *
1619                                         sizeof(struct ext4_fc_alloc_region),
1620                                         GFP_KERNEL);
1621                 if (!state->fc_regions)
1622                         return -ENOMEM;
1623         }
1624         region = &state->fc_regions[state->fc_regions_used++];
1625         region->ino = ino;
1626         region->lblk = lblk;
1627         region->pblk = pblk;
1628         region->len = len;
1629
1630         return 0;
1631 }
1632
1633 /* Replay add range tag */
1634 static int ext4_fc_replay_add_range(struct super_block *sb,
1635                                 struct ext4_fc_tl *tl)
1636 {
1637         struct ext4_fc_add_range *fc_add_ex;
1638         struct ext4_extent newex, *ex;
1639         struct inode *inode;
1640         ext4_lblk_t start, cur;
1641         int remaining, len;
1642         ext4_fsblk_t start_pblk;
1643         struct ext4_map_blocks map;
1644         struct ext4_ext_path *path = NULL;
1645         int ret;
1646
1647         fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
1648         ex = (struct ext4_extent *)&fc_add_ex->fc_ex;
1649
1650         trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1651                 le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
1652                 ext4_ext_get_actual_len(ex));
1653
1654         inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
1655                                 EXT4_IGET_NORMAL);
1656         if (IS_ERR_OR_NULL(inode)) {
1657                 jbd_debug(1, "Inode not found.");
1658                 return 0;
1659         }
1660
1661         ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1662
1663         start = le32_to_cpu(ex->ee_block);
1664         start_pblk = ext4_ext_pblock(ex);
1665         len = ext4_ext_get_actual_len(ex);
1666
1667         cur = start;
1668         remaining = len;
1669         jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1670                   start, start_pblk, len, ext4_ext_is_unwritten(ex),
1671                   inode->i_ino);
1672
1673         while (remaining > 0) {
1674                 map.m_lblk = cur;
1675                 map.m_len = remaining;
1676                 map.m_pblk = 0;
1677                 ret = ext4_map_blocks(NULL, inode, &map, 0);
1678
1679                 if (ret < 0) {
1680                         iput(inode);
1681                         return 0;
1682                 }
1683
1684                 if (ret == 0) {
1685                         /* Range is not mapped */
1686                         path = ext4_find_extent(inode, cur, NULL, 0);
1687                         if (IS_ERR(path)) {
1688                                 iput(inode);
1689                                 return 0;
1690                         }
1691                         memset(&newex, 0, sizeof(newex));
1692                         newex.ee_block = cpu_to_le32(cur);
1693                         ext4_ext_store_pblock(
1694                                 &newex, start_pblk + cur - start);
1695                         newex.ee_len = cpu_to_le16(map.m_len);
1696                         if (ext4_ext_is_unwritten(ex))
1697                                 ext4_ext_mark_unwritten(&newex);
1698                         down_write(&EXT4_I(inode)->i_data_sem);
1699                         ret = ext4_ext_insert_extent(
1700                                 NULL, inode, &path, &newex, 0);
1701                         up_write((&EXT4_I(inode)->i_data_sem));
1702                         ext4_ext_drop_refs(path);
1703                         kfree(path);
1704                         if (ret) {
1705                                 iput(inode);
1706                                 return 0;
1707                         }
1708                         goto next;
1709                 }
1710
1711                 if (start_pblk + cur - start != map.m_pblk) {
1712                         /*
1713                          * Logical to physical mapping changed. This can happen
1714                          * if this range was removed and then reallocated to
1715                          * map to new physical blocks during a fast commit.
1716                          */
1717                         ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1718                                         ext4_ext_is_unwritten(ex),
1719                                         start_pblk + cur - start);
1720                         if (ret) {
1721                                 iput(inode);
1722                                 return 0;
1723                         }
1724                         /*
1725                          * Mark the old blocks as free since they aren't used
1726                          * anymore. We maintain an array of all the modified
1727                          * inodes. In case these blocks are still used at either
1728                          * a different logical range in the same inode or in
1729                          * some different inode, we will mark them as allocated
1730                          * at the end of the FC replay using our array of
1731                          * modified inodes.
1732                          */
1733                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1734                         goto next;
1735                 }
1736
1737                 /* Range is mapped and needs a state change */
1738                 jbd_debug(1, "Converting from %d to %d %lld",
1739                                 map.m_flags & EXT4_MAP_UNWRITTEN,
1740                         ext4_ext_is_unwritten(ex), map.m_pblk);
1741                 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1742                                         ext4_ext_is_unwritten(ex), map.m_pblk);
1743                 if (ret) {
1744                         iput(inode);
1745                         return 0;
1746                 }
1747                 /*
1748                  * We may have split the extent tree while toggling the state.
1749                  * Try to shrink the extent tree now.
1750                  */
1751                 ext4_ext_replay_shrink_inode(inode, start + len);
1752 next:
1753                 cur += map.m_len;
1754                 remaining -= map.m_len;
1755         }
1756         ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1757                                         sb->s_blocksize_bits);
1758         iput(inode);
1759         return 0;
1760 }
1761
1762 /* Replay DEL_RANGE tag */
1763 static int
1764 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
1765 {
1766         struct inode *inode;
1767         struct ext4_fc_del_range *lrange;
1768         struct ext4_map_blocks map;
1769         ext4_lblk_t cur, remaining;
1770         int ret;
1771
1772         lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
1773         cur = le32_to_cpu(lrange->fc_lblk);
1774         remaining = le32_to_cpu(lrange->fc_len);
1775
1776         trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1777                 le32_to_cpu(lrange->fc_ino), cur, remaining);
1778
1779         inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
1780         if (IS_ERR_OR_NULL(inode)) {
1781                 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
1782                 return 0;
1783         }
1784
1785         ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1786
1787         jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1788                         inode->i_ino, le32_to_cpu(lrange->fc_lblk),
1789                         le32_to_cpu(lrange->fc_len));
1790         while (remaining > 0) {
1791                 map.m_lblk = cur;
1792                 map.m_len = remaining;
1793
1794                 ret = ext4_map_blocks(NULL, inode, &map, 0);
1795                 if (ret < 0) {
1796                         iput(inode);
1797                         return 0;
1798                 }
1799                 if (ret > 0) {
1800                         remaining -= ret;
1801                         cur += ret;
1802                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1803                 } else {
1804                         remaining -= map.m_len;
1805                         cur += map.m_len;
1806                 }
1807         }
1808
1809         ret = ext4_punch_hole(inode,
1810                 le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
1811                 le32_to_cpu(lrange->fc_len) <<  sb->s_blocksize_bits);
1812         if (ret)
1813                 jbd_debug(1, "ext4_punch_hole returned %d", ret);
1814         ext4_ext_replay_shrink_inode(inode,
1815                 i_size_read(inode) >> sb->s_blocksize_bits);
1816         ext4_mark_inode_dirty(NULL, inode);
1817         iput(inode);
1818
1819         return 0;
1820 }
1821
1822 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1823 {
1824         struct ext4_fc_replay_state *state;
1825         struct inode *inode;
1826         struct ext4_ext_path *path = NULL;
1827         struct ext4_map_blocks map;
1828         int i, ret, j;
1829         ext4_lblk_t cur, end;
1830
1831         state = &EXT4_SB(sb)->s_fc_replay_state;
1832         for (i = 0; i < state->fc_modified_inodes_used; i++) {
1833                 inode = ext4_iget(sb, state->fc_modified_inodes[i],
1834                         EXT4_IGET_NORMAL);
1835                 if (IS_ERR_OR_NULL(inode)) {
1836                         jbd_debug(1, "Inode %d not found.",
1837                                 state->fc_modified_inodes[i]);
1838                         continue;
1839                 }
1840                 cur = 0;
1841                 end = EXT_MAX_BLOCKS;
1842                 while (cur < end) {
1843                         map.m_lblk = cur;
1844                         map.m_len = end - cur;
1845
1846                         ret = ext4_map_blocks(NULL, inode, &map, 0);
1847                         if (ret < 0)
1848                                 break;
1849
1850                         if (ret > 0) {
1851                                 path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1852                                 if (!IS_ERR_OR_NULL(path)) {
1853                                         for (j = 0; j < path->p_depth; j++)
1854                                                 ext4_mb_mark_bb(inode->i_sb,
1855                                                         path[j].p_block, 1, 1);
1856                                         ext4_ext_drop_refs(path);
1857                                         kfree(path);
1858                                 }
1859                                 cur += ret;
1860                                 ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1861                                                         map.m_len, 1);
1862                         } else {
1863                                 cur = cur + (map.m_len ? map.m_len : 1);
1864                         }
1865                 }
1866                 iput(inode);
1867         }
1868 }
1869
1870 /*
1871  * Check if block is in excluded regions for block allocation. The simple
1872  * allocator that runs during replay phase is calls this function to see
1873  * if it is okay to use a block.
1874  */
1875 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1876 {
1877         int i;
1878         struct ext4_fc_replay_state *state;
1879
1880         state = &EXT4_SB(sb)->s_fc_replay_state;
1881         for (i = 0; i < state->fc_regions_valid; i++) {
1882                 if (state->fc_regions[i].ino == 0 ||
1883                         state->fc_regions[i].len == 0)
1884                         continue;
1885                 if (blk >= state->fc_regions[i].pblk &&
1886                     blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1887                         return true;
1888         }
1889         return false;
1890 }
1891
1892 /* Cleanup function called after replay */
1893 void ext4_fc_replay_cleanup(struct super_block *sb)
1894 {
1895         struct ext4_sb_info *sbi = EXT4_SB(sb);
1896
1897         sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1898         kfree(sbi->s_fc_replay_state.fc_regions);
1899         kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1900 }
1901
1902 /*
1903  * Recovery Scan phase handler
1904  *
1905  * This function is called during the scan phase and is responsible
1906  * for doing following things:
1907  * - Make sure the fast commit area has valid tags for replay
1908  * - Count number of tags that need to be replayed by the replay handler
1909  * - Verify CRC
1910  * - Create a list of excluded blocks for allocation during replay phase
1911  *
1912  * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1913  * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1914  * to indicate that scan has finished and JBD2 can now start replay phase.
1915  * It returns a negative error to indicate that there was an error. At the end
1916  * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1917  * to indicate the number of tags that need to replayed during the replay phase.
1918  */
1919 static int ext4_fc_replay_scan(journal_t *journal,
1920                                 struct buffer_head *bh, int off,
1921                                 tid_t expected_tid)
1922 {
1923         struct super_block *sb = journal->j_private;
1924         struct ext4_sb_info *sbi = EXT4_SB(sb);
1925         struct ext4_fc_replay_state *state;
1926         int ret = JBD2_FC_REPLAY_CONTINUE;
1927         struct ext4_fc_add_range *ext;
1928         struct ext4_fc_tl *tl;
1929         struct ext4_fc_tail *tail;
1930         __u8 *start, *end;
1931         struct ext4_fc_head *head;
1932         struct ext4_extent *ex;
1933
1934         state = &sbi->s_fc_replay_state;
1935
1936         start = (u8 *)bh->b_data;
1937         end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1938
1939         if (state->fc_replay_expected_off == 0) {
1940                 state->fc_cur_tag = 0;
1941                 state->fc_replay_num_tags = 0;
1942                 state->fc_crc = 0;
1943                 state->fc_regions = NULL;
1944                 state->fc_regions_valid = state->fc_regions_used =
1945                         state->fc_regions_size = 0;
1946                 /* Check if we can stop early */
1947                 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1948                         != EXT4_FC_TAG_HEAD)
1949                         return 0;
1950         }
1951
1952         if (off != state->fc_replay_expected_off) {
1953                 ret = -EFSCORRUPTED;
1954                 goto out_err;
1955         }
1956
1957         state->fc_replay_expected_off++;
1958         fc_for_each_tl(start, end, tl) {
1959                 jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1960                           tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
1961                 switch (le16_to_cpu(tl->fc_tag)) {
1962                 case EXT4_FC_TAG_ADD_RANGE:
1963                         ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
1964                         ex = (struct ext4_extent *)&ext->fc_ex;
1965                         ret = ext4_fc_record_regions(sb,
1966                                 le32_to_cpu(ext->fc_ino),
1967                                 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1968                                 ext4_ext_get_actual_len(ex));
1969                         if (ret < 0)
1970                                 break;
1971                         ret = JBD2_FC_REPLAY_CONTINUE;
1972                         fallthrough;
1973                 case EXT4_FC_TAG_DEL_RANGE:
1974                 case EXT4_FC_TAG_LINK:
1975                 case EXT4_FC_TAG_UNLINK:
1976                 case EXT4_FC_TAG_CREAT:
1977                 case EXT4_FC_TAG_INODE:
1978                 case EXT4_FC_TAG_PAD:
1979                         state->fc_cur_tag++;
1980                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1981                                         sizeof(*tl) + ext4_fc_tag_len(tl));
1982                         break;
1983                 case EXT4_FC_TAG_TAIL:
1984                         state->fc_cur_tag++;
1985                         tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
1986                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1987                                                 sizeof(*tl) +
1988                                                 offsetof(struct ext4_fc_tail,
1989                                                 fc_crc));
1990                         if (le32_to_cpu(tail->fc_tid) == expected_tid &&
1991                                 le32_to_cpu(tail->fc_crc) == state->fc_crc) {
1992                                 state->fc_replay_num_tags = state->fc_cur_tag;
1993                                 state->fc_regions_valid =
1994                                         state->fc_regions_used;
1995                         } else {
1996                                 ret = state->fc_replay_num_tags ?
1997                                         JBD2_FC_REPLAY_STOP : -EFSBADCRC;
1998                         }
1999                         state->fc_crc = 0;
2000                         break;
2001                 case EXT4_FC_TAG_HEAD:
2002                         head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
2003                         if (le32_to_cpu(head->fc_features) &
2004                                 ~EXT4_FC_SUPPORTED_FEATURES) {
2005                                 ret = -EOPNOTSUPP;
2006                                 break;
2007                         }
2008                         if (le32_to_cpu(head->fc_tid) != expected_tid) {
2009                                 ret = JBD2_FC_REPLAY_STOP;
2010                                 break;
2011                         }
2012                         state->fc_cur_tag++;
2013                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
2014                                         sizeof(*tl) + ext4_fc_tag_len(tl));
2015                         break;
2016                 default:
2017                         ret = state->fc_replay_num_tags ?
2018                                 JBD2_FC_REPLAY_STOP : -ECANCELED;
2019                 }
2020                 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2021                         break;
2022         }
2023
2024 out_err:
2025         trace_ext4_fc_replay_scan(sb, ret, off);
2026         return ret;
2027 }
2028
2029 /*
2030  * Main recovery path entry point.
2031  * The meaning of return codes is similar as above.
2032  */
2033 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2034                                 enum passtype pass, int off, tid_t expected_tid)
2035 {
2036         struct super_block *sb = journal->j_private;
2037         struct ext4_sb_info *sbi = EXT4_SB(sb);
2038         struct ext4_fc_tl *tl;
2039         __u8 *start, *end;
2040         int ret = JBD2_FC_REPLAY_CONTINUE;
2041         struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2042         struct ext4_fc_tail *tail;
2043
2044         if (pass == PASS_SCAN) {
2045                 state->fc_current_pass = PASS_SCAN;
2046                 return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2047         }
2048
2049         if (state->fc_current_pass != pass) {
2050                 state->fc_current_pass = pass;
2051                 sbi->s_mount_state |= EXT4_FC_REPLAY;
2052         }
2053         if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2054                 jbd_debug(1, "Replay stops\n");
2055                 ext4_fc_set_bitmaps_and_counters(sb);
2056                 return 0;
2057         }
2058
2059 #ifdef CONFIG_EXT4_DEBUG
2060         if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2061                 pr_warn("Dropping fc block %d because max_replay set\n", off);
2062                 return JBD2_FC_REPLAY_STOP;
2063         }
2064 #endif
2065
2066         start = (u8 *)bh->b_data;
2067         end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2068
2069         fc_for_each_tl(start, end, tl) {
2070                 if (state->fc_replay_num_tags == 0) {
2071                         ret = JBD2_FC_REPLAY_STOP;
2072                         ext4_fc_set_bitmaps_and_counters(sb);
2073                         break;
2074                 }
2075                 jbd_debug(3, "Replay phase, tag:%s\n",
2076                                 tag2str(le16_to_cpu(tl->fc_tag)));
2077                 state->fc_replay_num_tags--;
2078                 switch (le16_to_cpu(tl->fc_tag)) {
2079                 case EXT4_FC_TAG_LINK:
2080                         ret = ext4_fc_replay_link(sb, tl);
2081                         break;
2082                 case EXT4_FC_TAG_UNLINK:
2083                         ret = ext4_fc_replay_unlink(sb, tl);
2084                         break;
2085                 case EXT4_FC_TAG_ADD_RANGE:
2086                         ret = ext4_fc_replay_add_range(sb, tl);
2087                         break;
2088                 case EXT4_FC_TAG_CREAT:
2089                         ret = ext4_fc_replay_create(sb, tl);
2090                         break;
2091                 case EXT4_FC_TAG_DEL_RANGE:
2092                         ret = ext4_fc_replay_del_range(sb, tl);
2093                         break;
2094                 case EXT4_FC_TAG_INODE:
2095                         ret = ext4_fc_replay_inode(sb, tl);
2096                         break;
2097                 case EXT4_FC_TAG_PAD:
2098                         trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2099                                 ext4_fc_tag_len(tl), 0);
2100                         break;
2101                 case EXT4_FC_TAG_TAIL:
2102                         trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2103                                 ext4_fc_tag_len(tl), 0);
2104                         tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
2105                         WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
2106                         break;
2107                 case EXT4_FC_TAG_HEAD:
2108                         break;
2109                 default:
2110                         trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
2111                                 ext4_fc_tag_len(tl), 0);
2112                         ret = -ECANCELED;
2113                         break;
2114                 }
2115                 if (ret < 0)
2116                         break;
2117                 ret = JBD2_FC_REPLAY_CONTINUE;
2118         }
2119         return ret;
2120 }
2121
2122 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2123 {
2124         /*
2125          * We set replay callback even if fast commit disabled because we may
2126          * could still have fast commit blocks that need to be replayed even if
2127          * fast commit has now been turned off.
2128          */
2129         journal->j_fc_replay_callback = ext4_fc_replay;
2130         if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2131                 return;
2132         journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2133 }
2134
/*
 * Human readable descriptions of the reasons a fast commit was ineligible.
 * ext4_fc_info_show() prints entry i for every i below EXT4_FC_REASON_MAX,
 * so the order here presumably has to follow the EXT4_FC_REASON_* values
 * (confirm against the enum when adding a reason).
 *
 * Both the strings and the table itself are read-only.
 */
static const char * const fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};
2147
2148 int ext4_fc_info_show(struct seq_file *seq, void *v)
2149 {
2150         struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2151         struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2152         int i;
2153
2154         if (v != SEQ_START_TOKEN)
2155                 return 0;
2156
2157         seq_printf(seq,
2158                 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2159                    stats->fc_num_commits, stats->fc_ineligible_commits,
2160                    stats->fc_numblks,
2161                    div_u64(sbi->s_fc_avg_commit_time, 1000));
2162         seq_puts(seq, "Ineligible reasons:\n");
2163         for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2164                 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2165                         stats->fc_ineligible_reason_count[i]);
2166
2167         return 0;
2168 }
2169
2170 int __init ext4_fc_init_dentry_cache(void)
2171 {
2172         ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2173                                            SLAB_RECLAIM_ACCOUNT);
2174
2175         if (ext4_fc_dentry_cachep == NULL)
2176                 return -ENOMEM;
2177
2178         return 0;
2179 }