add patch avoid-panic-during-forced-reboot-due-to-aborted-journal
[ext4-patch-queue.git] / add-ext4-journal-lazy-mount-option
blob73d77ce69ff69e9c17202fb6dbcd6a20af781a63
1 ext4: add journal_lazy mount option
3 This option turns on the lazy journalling option, as described in the
4 FAST 2017 paper, "Evolving Ext4 for Shingled Disks"[1].
6 [1] https://www.usenix.org/conference/fast17/technical-sessions/presentation/aghayev
8 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
9 ---
10  fs/ext4/ext4.h  |  1 +
11  fs/ext4/inode.c |  2 +-
12  fs/ext4/ioctl.c | 42 ++++++++++++++++++++++++++++++++----------
13  fs/ext4/super.c | 56 ++++++++++++++++++++++++++++++++++++++++++++------------
14  4 files changed, 78 insertions(+), 23 deletions(-)
16 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
17 index fc2bdaa71c44..e19b6bac2d91 100644
18 --- a/fs/ext4/ext4.h
19 +++ b/fs/ext4/ext4.h
20 @@ -1079,6 +1079,7 @@ struct ext4_inode_info {
21   * Mount flags set via mount options or defaults
22   */
23  #define EXT4_MOUNT_NO_MBCACHE          0x00001 /* Do not use mbcache */
24 +#define EXT4_MOUNT_JOURNAL_LAZY                0x00002 /* Do lazy writeback of journalled metadata */
25  #define EXT4_MOUNT_GRPID               0x00004 /* Create files with directory's group */
26  #define EXT4_MOUNT_DEBUG               0x00008 /* Some debugging messages */
27  #define EXT4_MOUNT_ERRORS_CONT         0x00010 /* Continue on errors */
28 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
29 index 3969d0278469..d43c326f4048 100644
30 --- a/fs/ext4/inode.c
31 +++ b/fs/ext4/inode.c
32 @@ -3287,7 +3287,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
33                 filemap_write_and_wait(mapping);
34         }
36 -       if (EXT4_JOURNAL(inode) &&
37 +       if (EXT4_JOURNAL(inode) && !test_opt(inode->i_sb, JOURNAL_LAZY) &&
38             ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
39                 /*
40                  * This is a REALLY heavyweight approach, but the use of
41 diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
42 index a7074115d6f6..8556d6003d15 100644
43 --- a/fs/ext4/ioctl.c
44 +++ b/fs/ext4/ioctl.c
45 @@ -239,6 +239,20 @@ static int ext4_ioctl_setflags(struct inode *inode,
46                 if (!capable(CAP_SYS_RESOURCE))
47                         goto flags_out;
48         }
50 +       /*
51 +        * Clearing the JOURNAL_DATA flag is *hard* with lazy
52 +        * journalling.  We can't use jbd2_journal_flush(); instead,
53 +        * we would have to make sure all blocks belonging to the file
54 +        * are evacuated from the journal and saved to their final
55 +        * location on disk.  Punt for now.
56 +        */
57 +       if ((oldflags & EXT4_JOURNAL_DATA_FL) && !jflag &&
58 +           test_opt(inode->i_sb, JOURNAL_LAZY)) {
59 +               err = -EOPNOTSUPP;
60 +               goto flags_out;
61 +       }
63         if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
64                 migrate = 1;
66 @@ -626,6 +640,22 @@ static long ext4_ioctl_group_add(struct file *file,
67         return err;
68  }
70 +/*
71 + * If we are using journalling (excepting JBD2 lazy mode), make sure
72 + * the block group descriptors are written out immediately
73 + */
74 +static int flush_fs_group_descriptors(struct super_block *sb)
76 +       int err = 0;
78 +       if (EXT4_SB(sb)->s_journal && !test_opt(sb, JOURNAL_LAZY)) {
79 +               jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
80 +               err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
81 +               jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
82 +       }
83 +       return err;
86  long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
87  {
88         struct inode *inode = file_inode(filp);
89 @@ -744,11 +774,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
90                         goto group_extend_out;
92                 err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
93 -               if (EXT4_SB(sb)->s_journal) {
94 -                       jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
95 -                       err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
96 -                       jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
97 -               }
98 +               err2 = flush_fs_group_descriptors(sb);
99                 if (err == 0)
100                         err = err2;
101                 mnt_drop_write_file(filp);
102 @@ -886,11 +912,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
103                         goto resizefs_out;
105                 err = ext4_resize_fs(sb, n_blocks_count);
106 -               if (EXT4_SB(sb)->s_journal) {
107 -                       jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
108 -                       err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
109 -                       jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
110 -               }
111 +               err2 = flush_fs_group_descriptors(sb);
112                 if (err == 0)
113                         err = err2;
114                 mnt_drop_write_file(filp);
115 diff --git a/fs/ext4/super.c b/fs/ext4/super.c
116 index 9339717b85c8..406e4d4ffae0 100644
117 --- a/fs/ext4/super.c
118 +++ b/fs/ext4/super.c
119 @@ -928,7 +928,8 @@ static void ext4_put_super(struct super_block *sb)
120         ext4_mb_release(sb);
121         ext4_ext_release(sb);
123 -       if (!sb_rdonly(sb) && !aborted && !test_opt(sb, JOURNAL_NOCLEANUP)) {
124 +       if (!sb_rdonly(sb) && !aborted && !test_opt(sb, JOURNAL_NOCLEANUP) &&
125 +           !test_opt(sb, JOURNAL_LAZY)) {
126                 ext4_clear_feature_journal_needs_recovery(sb);
127                 es->s_state = cpu_to_le16(sbi->s_mount_state);
128         }
129 @@ -1384,6 +1385,7 @@ enum {
130         Opt_inode_readahead_blks, Opt_journal_ioprio,
131         Opt_dioread_nolock, Opt_dioread_lock,
132         Opt_journal_nocleanup, Opt_journal_cleanup,
133 +       Opt_journal_nolazy, Opt_journal_lazy,
134         Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
135         Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
136  };
137 @@ -1474,6 +1476,8 @@ static const match_table_t tokens = {
138         {Opt_nombcache, "no_mbcache"},  /* for backward compatibility */
139         {Opt_journal_nocleanup, "journal_nocleanup"},
140         {Opt_journal_cleanup, "journal_cleanup"},
141 +       {Opt_journal_lazy, "journal_lazy"},
142 +       {Opt_journal_nolazy, "journal_nolazy"},
143         {Opt_removed, "check=none"},    /* mount option from ext2/3 */
144         {Opt_removed, "nocheck"},       /* mount option from ext2/3 */
145         {Opt_removed, "reservation"},   /* mount option from ext2/3 */
146 @@ -1686,6 +1690,8 @@ static const struct mount_opts {
147         {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
148         {Opt_journal_nocleanup, EXT4_MOUNT_JOURNAL_NOCLEANUP, MOPT_SET},
149         {Opt_journal_cleanup, EXT4_MOUNT_JOURNAL_NOCLEANUP, MOPT_CLEAR},
150 +       {Opt_journal_lazy, EXT4_MOUNT_JOURNAL_LAZY, MOPT_SET},
151 +       {Opt_journal_nolazy, EXT4_MOUNT_JOURNAL_LAZY, MOPT_CLEAR},
152         {Opt_err, 0, 0}
153  };
155 @@ -4570,6 +4576,10 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
156                 journal->j_flags |= JBD2_NO_CLEANUP;
157         else
158                 journal->j_flags &= ~JBD2_NO_CLEANUP;
159 +       if (test_opt(sb, JOURNAL_LAZY))
160 +               journal->j_flags |= JBD2_LAZY;
161 +       else
162 +               journal->j_flags &= ~JBD2_LAZY;
163         write_unlock(&journal->j_state_lock);
166 @@ -4804,6 +4814,24 @@ static int ext4_load_journal(struct super_block *sb,
168         EXT4_SB(sb)->s_journal = journal;
169         ext4_clear_journal_err(sb, es);
171 +       if (test_opt(sb, JOURNAL_LAZY)) {
172 +               struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
174 +               /* Read the latest version of the superblock from the journal */
175 +               lock_buffer(sbh);
176 +               clear_buffer_uptodate(sbh);
177 +               err = jbd2_bh_submit_read(journal, sbh, __func__);
178 +               if (err) {
179 +                       ext4_msg(sb, KERN_ERR, "error rereading superblock %d",
180 +                               err);
181 +                       set_buffer_uptodate(sbh);
182 +               }
183 +               if (!ext4_superblock_csum_verify(sb, es))
184 +                       ext4_msg(sb, KERN_ERR,
185 +                                "superblock csum doesn't verify "
186 +                                "after journal replay!");
187 +       }
188         return 0;
191 @@ -4894,6 +4922,9 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
193         journal_t *journal = EXT4_SB(sb)->s_journal;
195 +       if (test_opt(sb, JOURNAL_LAZY))
196 +               return;
198         if (!ext4_has_feature_journal(sb)) {
199                 BUG_ON(journal != NULL);
200                 return;
201 @@ -5029,21 +5060,20 @@ static int ext4_freeze(struct super_block *sb)
202         journal = EXT4_SB(sb)->s_journal;
204         if (journal) {
205 -               /* Now we set up the journal barrier. */
206 -               jbd2_journal_lock_updates(journal);
208                 /*
209 -                * Don't clear the needs_recovery flag if we failed to
210 -                * flush the journal.
211 +                * Set the journal barrier, then flush the journal and
212 +                * clear the needs_recovery flag if we are not in
213 +                * JBD2_LAZY mode.
214                  */
215 -               error = jbd2_journal_flush(journal);
216 -               if (error < 0)
217 -                       goto out;
218 +               jbd2_journal_lock_updates(journal);
220 -               /* Journal blocked and flushed, clear needs_recovery flag. */
221 +               if (!test_opt(sb, JOURNAL_LAZY)) {
222 +                       error = jbd2_journal_flush(journal);
223 +                       if (error < 0)
224 +                               goto out;
225 +               }
226                 ext4_clear_feature_journal_needs_recovery(sb);
227         }
229         error = ext4_commit_super(sb, 1);
230  out:
231         if (journal)
232 @@ -5061,7 +5091,7 @@ static int ext4_unfreeze(struct super_block *sb)
233         if (sb_rdonly(sb) || ext4_forced_shutdown(EXT4_SB(sb)))
234                 return 0;
236 -       if (EXT4_SB(sb)->s_journal) {
237 +       if (EXT4_SB(sb)->s_journal && !test_opt(sb, JOURNAL_LAZY)) {
238                 /* Reset the needs_recovery flag before the fs is unlocked. */
239                 ext4_set_feature_journal_needs_recovery(sb);
240         }
241 @@ -5595,6 +5625,8 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
242                  * We don't need to lock updates but journal_flush() could
243                  * otherwise be livelocked...
244                  */
245 +               if (test_opt(sb, JOURNAL_LAZY))
246 +                       return -EOPNOTSUPP;
247                 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
248                 err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
249                 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);