// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/inode.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *	(jj@sunsite.ms.mff.cuni.cz)
 *
 *  Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
 */
#include <linux/mount.h>
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/iomap.h>
#include <linux/iversion.h>

#include "ext4_jbd2.h"

#include <trace/events/ext4.h>
static void ext4_journalled_zero_new_buffers(handle_t *handle,
					     struct inode *inode,
					     struct folio *folio,
					     unsigned from, unsigned to);
static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
			     struct ext4_inode_info *ei)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	__u32 csum;
	__u16 dummy_csum = 0;
	int offset = offsetof(struct ext4_inode, i_checksum_lo);
	unsigned int csum_size = sizeof(dummy_csum);

	csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset);
	csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size);
	offset += csum_size;
	csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
			   EXT4_GOOD_OLD_INODE_SIZE - offset);

	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
		offset = offsetof(struct ext4_inode, i_checksum_hi);
		csum = ext4_chksum(sbi, csum, (__u8 *)raw +
				   EXT4_GOOD_OLD_INODE_SIZE,
				   offset - EXT4_GOOD_OLD_INODE_SIZE);
		if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
			csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum,
					   csum_size);
			offset += csum_size;
		}
		csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
				   EXT4_INODE_SIZE(inode->i_sb) - offset);
	}

	return csum;
}
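
/*
 * Note: the checksum above is seeded with ei->i_csum_seed, and the on-disk
 * checksum fields themselves are folded in as zeroes (dummy_csum), so the
 * computed value can be compared directly with the stored i_checksum_lo/hi.
 */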
static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
				  struct ext4_inode_info *ei)
{
	__u32 provided, calculated;

	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
	    cpu_to_le32(EXT4_OS_LINUX) ||
	    !ext4_has_metadata_csum(inode->i_sb))
		return 1;

	provided = le16_to_cpu(raw->i_checksum_lo);
	calculated = ext4_inode_csum(inode, raw, ei);
	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
	    EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
		provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16;
	else
		calculated &= 0xFFFF;

	return provided == calculated;
}
void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
			 struct ext4_inode_info *ei)
{
	__u32 csum;

	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
	    cpu_to_le32(EXT4_OS_LINUX) ||
	    !ext4_has_metadata_csum(inode->i_sb))
		return;

	csum = ext4_inode_csum(inode, raw, ei);
	raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF);
	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
	    EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
		raw->i_checksum_hi = cpu_to_le16(csum >> 16);
}
static inline int ext4_begin_ordered_truncate(struct inode *inode,
					      loff_t new_size)
{
	trace_ext4_begin_ordered_truncate(inode, new_size);
	/*
	 * If jinode is zero, then we never opened the file for
	 * writing, so there's no need to call
	 * jbd2_journal_begin_ordered_truncate() since there's no
	 * outstanding writes we need to flush.
	 */
	if (!EXT4_I(inode)->jinode)
		return 0;
	return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
						   EXT4_I(inode)->jinode,
						   new_size);
}
static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
				  int pextents);
/*
 * Test whether an inode is a fast symlink.
 * A fast symlink has its symlink data stored in ext4_inode_info->i_data.
 */
int ext4_inode_is_fast_symlink(struct inode *inode)
{
	if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
		int ea_blocks = EXT4_I(inode)->i_file_acl ?
				EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;

		if (ext4_has_inline_data(inode))
			return 0;

		return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
	}
	return S_ISLNK(inode->i_mode) && inode->i_size &&
	       (inode->i_size < EXT4_N_BLOCKS * 4);
}
/*
 * Called at the last iput() if i_nlink is zero.
 */
void ext4_evict_inode(struct inode *inode)
{
	handle_t *handle;
	int err;
	/*
	 * Credits for final inode cleanup and freeing:
	 * sb + inode (ext4_orphan_del()), block bitmap, group descriptor
	 * (xattr block freeing), bitmap, group descriptor (inode freeing)
	 */
	int extra_credits = 6;
	struct ext4_xattr_inode_array *ea_inode_array = NULL;
	bool freeze_protected = false;

	trace_ext4_evict_inode(inode);

	if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
		ext4_evict_ea_inode(inode);
	if (inode->i_nlink) {
		truncate_inode_pages_final(&inode->i_data);
		goto no_delete;
	}

	if (is_bad_inode(inode))
		goto no_delete;
	dquot_initialize(inode);

	if (ext4_should_order_data(inode))
		ext4_begin_ordered_truncate(inode, 0);
	truncate_inode_pages_final(&inode->i_data);

	/*
	 * For inodes with journalled data, transaction commit could have
	 * dirtied the inode. And for inodes with dioread_nolock, unwritten
	 * extents converting worker could merge extents and also have dirtied
	 * the inode. Flush worker is ignoring it because of I_FREEING flag but
	 * we still need to remove the inode from the writeback lists.
	 */
	if (!list_empty_careful(&inode->i_io_list))
		inode_io_list_del(inode);

	/*
	 * Protect us against freezing - iput() caller didn't have to have any
	 * protection against it. When we are in a running transaction though,
	 * we are already protected against freezing and we cannot grab further
	 * protection due to lock ordering constraints.
	 */
	if (!ext4_journal_current_handle()) {
		sb_start_intwrite(inode->i_sb);
		freeze_protected = true;
	}

	if (!IS_NOQUOTA(inode))
		extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);

	/*
	 * Block bitmap, group descriptor, and inode are accounted in both
	 * ext4_blocks_for_truncate() and extra_credits. So subtract 3.
	 */
	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
				    ext4_blocks_for_truncate(inode) + extra_credits - 3);
	if (IS_ERR(handle)) {
		ext4_std_error(inode->i_sb, PTR_ERR(handle));
		/*
		 * If we're going to skip the normal cleanup, we still need to
		 * make sure that the in-core orphan linked list is properly
		 * cleaned up.
		 */
		ext4_orphan_del(NULL, inode);
		if (freeze_protected)
			sb_end_intwrite(inode->i_sb);
		goto no_delete;
	}

	if (IS_SYNC(inode))
		ext4_handle_sync(handle);

	/*
	 * Set inode->i_size to 0 before calling ext4_truncate(). We need
	 * special handling of symlinks here because i_size is used to
	 * determine whether ext4_inode_info->i_data contains symlink data or
	 * block mappings. Setting i_size to 0 will remove its fast symlink
	 * status. Erase i_data so that it becomes a valid empty block map.
	 */
	if (ext4_inode_is_fast_symlink(inode))
		memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data));
	inode->i_size = 0;
	err = ext4_mark_inode_dirty(handle, inode);
	if (err) {
		ext4_warning(inode->i_sb,
			     "couldn't mark inode dirty (err %d)", err);
		goto stop_handle;
	}
	if (inode->i_blocks) {
		err = ext4_truncate(inode);
		if (err) {
			ext4_error_err(inode->i_sb, -err,
				       "couldn't truncate inode %lu (err %d)",
				       inode->i_ino, err);
			goto stop_handle;
		}
	}

	/* Remove xattr references. */
	err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array,
				      extra_credits);
	if (err) {
		ext4_warning(inode->i_sb, "xattr delete (err %d)", err);
stop_handle:
		ext4_journal_stop(handle);
		ext4_orphan_del(NULL, inode);
		if (freeze_protected)
			sb_end_intwrite(inode->i_sb);
		ext4_xattr_inode_array_free(ea_inode_array);
		goto no_delete;
	}

	/*
	 * Kill off the orphan record which ext4_truncate created.
	 * AKPM: I think this can be inside the above `if'.
	 * Note that ext4_orphan_del() has to be able to cope with the
	 * deletion of a non-existent orphan - this is because we don't
	 * know if ext4_truncate() actually created an orphan record.
	 * (Well, we could do this if we need to, but heck - it works)
	 */
	ext4_orphan_del(handle, inode);
	EXT4_I(inode)->i_dtime = (__u32)ktime_get_real_seconds();

	/*
	 * One subtle ordering requirement: if anything has gone wrong
	 * (transaction abort, IO errors, whatever), then we can still
	 * do these next steps (the fs will already have been marked as
	 * having errors), but we can't free the inode if the mark_dirty
	 * fails.
	 */
	if (ext4_mark_inode_dirty(handle, inode))
		/* If that failed, just do the required in-core inode clear. */
		ext4_clear_inode(inode);
	else
		ext4_free_inode(handle, inode);
	ext4_journal_stop(handle);
	if (freeze_protected)
		sb_end_intwrite(inode->i_sb);
	ext4_xattr_inode_array_free(ea_inode_array);
	return;
no_delete:
	/*
	 * Check whether somewhere else accidentally dirtied the evicting
	 * inode, which could cause inode use-after-free issues later.
	 */
	WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list));

	if (!list_empty(&EXT4_I(inode)->i_fc_list))
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
	ext4_clear_inode(inode);	/* We must guarantee clearing of inode... */
}
qsize_t *ext4_get_reserved_space(struct inode *inode)
{
	return &EXT4_I(inode)->i_reserved_quota;
}
/*
 * Called with i_data_sem down, which is important since we can call
 * ext4_discard_preallocations() from here.
 */
void ext4_da_update_reserve_space(struct inode *inode,
				  int used, int quota_claim)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_inode_info *ei = EXT4_I(inode);

	spin_lock(&ei->i_block_reservation_lock);
	trace_ext4_da_update_reserve_space(inode, used, quota_claim);
	if (unlikely(used > ei->i_reserved_data_blocks)) {
		ext4_warning(inode->i_sb, "%s: ino %lu, used %d "
			     "with only %d reserved data blocks",
			     __func__, inode->i_ino, used,
			     ei->i_reserved_data_blocks);
		WARN_ON(1);
		used = ei->i_reserved_data_blocks;
	}

	/* Update per-inode reservations */
	ei->i_reserved_data_blocks -= used;
	percpu_counter_sub(&sbi->s_dirtyclusters_counter, used);

	spin_unlock(&ei->i_block_reservation_lock);

	/* Update quota subsystem for data blocks */
	if (quota_claim)
		dquot_claim_block(inode, EXT4_C2B(sbi, used));
	else {
		/*
		 * We did fallocate with an offset that is already delayed
		 * allocated. So on delayed allocated writeback we should
		 * not re-claim the quota for fallocated blocks.
		 */
		dquot_release_reservation_block(inode, EXT4_C2B(sbi, used));
	}

	/*
	 * If we have done all the pending block allocations and if
	 * there aren't any writers on the inode, we can discard the
	 * inode's preallocations.
	 */
	if ((ei->i_reserved_data_blocks == 0) &&
	    !inode_is_open_for_write(inode))
		ext4_discard_preallocations(inode);
}
static int __check_block_validity(struct inode *inode, const char *func,
				  unsigned int line,
				  struct ext4_map_blocks *map)
{
	if (ext4_has_feature_journal(inode->i_sb) &&
	    (inode->i_ino ==
	     le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
		return 0;
	if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
		ext4_error_inode(inode, func, line, map->m_pblk,
				 "lblock %lu mapped to illegal pblock %llu "
				 "(length %d)", (unsigned long) map->m_lblk,
				 map->m_pblk, map->m_len);
		return -EFSCORRUPTED;
	}
	return 0;
}
int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
		       ext4_lblk_t len)
{
	int ret;

	if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
		return fscrypt_zeroout_range(inode, lblk, pblk, len);

	ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);
	if (ret > 0)
		ret = 0;

	return ret;
}
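
/*
 * The wrapper macro below passes the caller's __func__ and __LINE__ so that
 * __check_block_validity() can report where an invalid mapping was noticed.
 */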
#define check_block_validity(inode, map)	\
	__check_block_validity((inode), __func__, __LINE__, (map))
#ifdef ES_AGGRESSIVE_TEST
static void ext4_map_blocks_es_recheck(handle_t *handle,
				       struct inode *inode,
				       struct ext4_map_blocks *es_map,
				       struct ext4_map_blocks *map,
				       int flags)
{
	int retval;

	/*
	 * There is a race window that the result is not the same.
	 * e.g. xfstests #223 when dioread_nolock is enabled. The reason
	 * is that we lookup a block mapping in the extent status tree
	 * without taking i_data_sem. So at the time the unwritten extent
	 * could be converted.
	 */
	down_read(&EXT4_I(inode)->i_data_sem);
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
		retval = ext4_ext_map_blocks(handle, inode, map, 0);
	} else {
		retval = ext4_ind_map_blocks(handle, inode, map, 0);
	}
	up_read((&EXT4_I(inode)->i_data_sem));

	/*
	 * We don't check m_len because the extent will be collapsed in the
	 * status tree. So the m_len might not be equal.
	 */
	if (es_map->m_lblk != map->m_lblk ||
	    es_map->m_flags != map->m_flags ||
	    es_map->m_pblk != map->m_pblk) {
		printk("ES cache assertion failed for inode: %lu "
		       "es_cached ex [%d/%d/%llu/%x] != "
		       "found ex [%d/%d/%llu/%x] retval %d flags %x\n",
		       inode->i_ino, es_map->m_lblk, es_map->m_len,
		       es_map->m_pblk, es_map->m_flags, map->m_lblk,
		       map->m_len, map->m_pblk, map->m_flags,
		       retval, flags);
	}
}
#endif /* ES_AGGRESSIVE_TEST */
static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
				 struct ext4_map_blocks *map)
{
	unsigned int status;
	int retval;

	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
		retval = ext4_ext_map_blocks(handle, inode, map, 0);
	else
		retval = ext4_ind_map_blocks(handle, inode, map, 0);

	if (retval <= 0)
		return retval;

	if (unlikely(retval != map->m_len)) {
		ext4_warning(inode->i_sb,
			     "ES len assertion failed for inode "
			     "%lu: retval %d != map->m_len %d",
			     inode->i_ino, retval, map->m_len);
		WARN_ON(1);
	}

	status = map->m_flags & EXT4_MAP_UNWRITTEN ?
			EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
	ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
			      map->m_pblk, status, false);
	return retval;
}
static int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
				  struct ext4_map_blocks *map, int flags)
{
	struct extent_status es;
	unsigned int status;
	int err, retval;

	/*
	 * Here we pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE flag
	 * to indicate that the blocks and quotas have already been checked
	 * when the data was copied into the page cache.
	 */
	if (map->m_flags & EXT4_MAP_DELAYED)
		flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;

	/*
	 * Here we clear m_flags because after allocating a new extent,
	 * it will be set again.
	 */
	map->m_flags &= ~EXT4_MAP_FLAGS;

	/*
	 * We need to check for EXT4 here because migrate could have
	 * changed the inode type in between.
	 */
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
		retval = ext4_ext_map_blocks(handle, inode, map, flags);
	} else {
		retval = ext4_ind_map_blocks(handle, inode, map, flags);

		/*
		 * We allocated new blocks which will result in i_data's
		 * format changing. Force the migrate to fail by clearing
		 * the EXT4_STATE_EXT_MIGRATE flag.
		 */
		if (retval > 0 && map->m_flags & EXT4_MAP_NEW)
			ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
	}
	if (retval <= 0)
		return retval;

	if (unlikely(retval != map->m_len)) {
		ext4_warning(inode->i_sb,
			     "ES len assertion failed for inode %lu: "
			     "retval %d != map->m_len %d",
			     inode->i_ino, retval, map->m_len);
		WARN_ON(1);
	}

	/*
	 * We have to zeroout blocks before inserting them into extent
	 * status tree. Otherwise someone could look them up there and
	 * use them before they are really zeroed. We also have to
	 * unmap metadata before zeroing as otherwise writeback can
	 * overwrite zeros with stale data from block device.
	 */
	if (flags & EXT4_GET_BLOCKS_ZERO &&
	    map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) {
		err = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk,
					 map->m_len);
		if (err)
			return err;
	}

	/*
	 * If the extent has been zeroed out, we don't need to update
	 * extent status tree.
	 */
	if (flags & EXT4_GET_BLOCKS_PRE_IO &&
	    ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
		if (ext4_es_is_written(&es))
			return retval;
	}

	status = map->m_flags & EXT4_MAP_UNWRITTEN ?
			EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
	ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk,
			      status, flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE);

	return retval;
}
/*
 * The ext4_map_blocks() function tries to look up the requested blocks,
 * and returns if the blocks are already mapped.
 *
 * Otherwise it takes the write lock of the i_data_sem and allocates blocks
 * and stores the allocated blocks in the result buffer head and marks it
 * mapped.
 *
 * If file type is extents based, it will call ext4_ext_map_blocks();
 * otherwise it calls ext4_ind_map_blocks() to handle the indirect mapping.
 *
 * On success, it returns the number of blocks being mapped or allocated.
 * If flags doesn't contain EXT4_GET_BLOCKS_CREATE and the blocks are
 * pre-allocated and unwritten, the resulting @map is marked as unwritten.
 * If the flags contain EXT4_GET_BLOCKS_CREATE, it will mark @map as mapped.
 *
 * It returns 0 if plain look up failed (blocks have not been allocated), in
 * that case, @map is returned as unmapped but we still do fill map->m_len to
 * indicate the length of a hole starting at map->m_lblk.
 *
 * It returns the error in case of allocation failure.
 */
int ext4_map_blocks(handle_t *handle, struct inode *inode,
		    struct ext4_map_blocks *map, int flags)
{
	struct extent_status es;
	int retval;
	int ret = 0;
#ifdef ES_AGGRESSIVE_TEST
	struct ext4_map_blocks orig_map;

	memcpy(&orig_map, map, sizeof(*map));
#endif

	ext_debug(inode, "flag 0x%x, max_blocks %u, logical block %lu\n",
		  flags, map->m_len, (unsigned long) map->m_lblk);

	/*
	 * ext4_map_blocks returns an int, and m_len is an unsigned int
	 */
	if (unlikely(map->m_len > INT_MAX))
		map->m_len = INT_MAX;

	/* We can handle the block number less than EXT_MAX_BLOCKS */
	if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
		return -EFSCORRUPTED;

	/* Lookup extent status tree firstly */
	if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) &&
	    ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
		if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
			map->m_pblk = ext4_es_pblock(&es) +
					map->m_lblk - es.es_lblk;
			map->m_flags |= ext4_es_is_written(&es) ?
					EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
			retval = es.es_len - (map->m_lblk - es.es_lblk);
			if (retval > map->m_len)
				retval = map->m_len;
			map->m_len = retval;
		} else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
			map->m_pblk = 0;
			map->m_flags |= ext4_es_is_delayed(&es) ?
					EXT4_MAP_DELAYED : 0;
			retval = es.es_len - (map->m_lblk - es.es_lblk);
			if (retval > map->m_len)
				retval = map->m_len;
			map->m_len = retval;
			retval = 0;
		}

		if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT)
			return retval;
#ifdef ES_AGGRESSIVE_TEST
		ext4_map_blocks_es_recheck(handle, inode, map,
					   &orig_map, flags);
#endif
		goto found;
	}
	/*
	 * In the query cache no-wait mode, nothing more can be done if we
	 * cannot find the extent in the cache.
	 */
	if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT)
		return 0;

	/*
	 * Try to see if we can get the block without requesting a new
	 * file system block.
	 */
	down_read(&EXT4_I(inode)->i_data_sem);
	retval = ext4_map_query_blocks(handle, inode, map);
	up_read((&EXT4_I(inode)->i_data_sem));

found:
	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
		ret = check_block_validity(inode, map);
		if (ret != 0)
			return ret;
	}

	/* If it is only a block(s) look up */
	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
		return retval;

	/*
	 * Returns if the blocks have already been allocated.
	 *
	 * Note that if blocks have been preallocated,
	 * ext4_ext_map_blocks() returns with the buffer head unmapped.
	 */
	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
		/*
		 * If we need to convert the extent to unwritten
		 * we continue and do the actual work in
		 * ext4_ext_map_blocks().
		 */
		if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
			return retval;

	/*
	 * New block allocation and/or writing to an unwritten extent
	 * will possibly result in updating i_data, so we take
	 * the write lock of i_data_sem, and call get_block()
	 * with create == 1 flag.
	 */
	down_write(&EXT4_I(inode)->i_data_sem);
	retval = ext4_map_create_blocks(handle, inode, map, flags);
	up_write((&EXT4_I(inode)->i_data_sem));
	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
		ret = check_block_validity(inode, map);
		if (ret != 0)
			return ret;

		/*
		 * Inodes with freshly allocated blocks where contents will be
		 * visible after transaction commit must be on transaction's
		 * ordered data list.
		 */
		if (map->m_flags & EXT4_MAP_NEW &&
		    !(map->m_flags & EXT4_MAP_UNWRITTEN) &&
		    !(flags & EXT4_GET_BLOCKS_ZERO) &&
		    !ext4_is_quota_file(inode) &&
		    ext4_should_order_data(inode)) {
			loff_t start_byte =
				(loff_t)map->m_lblk << inode->i_blkbits;
			loff_t length = (loff_t)map->m_len << inode->i_blkbits;

			if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
				ret = ext4_jbd2_inode_add_wait(handle, inode,
						start_byte, length);
			else
				ret = ext4_jbd2_inode_add_write(handle, inode,
						start_byte, length);
			if (ret)
				return ret;
		}
	}
	if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN ||
			   map->m_flags & EXT4_MAP_MAPPED))
		ext4_fc_track_range(handle, inode, map->m_lblk,
				    map->m_lblk + map->m_len - 1);
	if (retval < 0)
		ext_debug(inode, "failed with err %d\n", retval);
	return retval;
}
/*
 * Update EXT4_MAP_FLAGS in bh->b_state. For buffer heads attached to pages
 * we have to be careful as someone else may be manipulating b_state as well.
 */
static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
{
	unsigned long old_state;
	unsigned long new_state;

	flags &= EXT4_MAP_FLAGS;

	/* Dummy buffer_head? Set non-atomically. */
	if (!bh->b_folio) {
		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags;
		return;
	}
	/*
	 * Someone else may be modifying b_state. Be careful! This is ugly but
	 * once we get rid of using bh as a container for mapping information
	 * to pass to / from get_block functions, this can go away.
	 */
	old_state = READ_ONCE(bh->b_state);
	do {
		new_state = (old_state & ~EXT4_MAP_FLAGS) | flags;
	} while (unlikely(!try_cmpxchg(&bh->b_state, &old_state, new_state)));
}
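
/*
 * _ext4_get_block() below adapts the buffer_head based get_block interface
 * to ext4_map_blocks(): bh->b_size on entry encodes the requested length,
 * and the mapping result is copied back into the bh state and size.
 */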
static int _ext4_get_block(struct inode *inode, sector_t iblock,
			   struct buffer_head *bh, int flags)
{
	struct ext4_map_blocks map;
	int ret = 0;

	if (ext4_has_inline_data(inode))
		return -ERANGE;

	map.m_lblk = iblock;
	map.m_len = bh->b_size >> inode->i_blkbits;

	ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map,
			      flags);
	if (ret > 0) {
		map_bh(bh, inode->i_sb, map.m_pblk);
		ext4_update_bh_state(bh, map.m_flags);
		bh->b_size = inode->i_sb->s_blocksize * map.m_len;
		ret = 0;
	} else if (ret == 0) {
		/* hole case, need to fill in bh->b_size */
		bh->b_size = inode->i_sb->s_blocksize * map.m_len;
	}
	return ret;
}

int ext4_get_block(struct inode *inode, sector_t iblock,
		   struct buffer_head *bh, int create)
{
	return _ext4_get_block(inode, iblock, bh,
			       create ? EXT4_GET_BLOCKS_CREATE : 0);
}

/*
 * Get block function used when preparing for buffered write if we require
 * creating an unwritten extent if blocks haven't been allocated. The extent
 * will be converted to written after the IO is complete.
 */
int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
			     struct buffer_head *bh_result, int create)
{
	int ret = 0;

	ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
		   inode->i_ino, create);
	ret = _ext4_get_block(inode, iblock, bh_result,
			      EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);

	/*
	 * If the buffer is marked unwritten, mark it as new to make sure it is
	 * zeroed out correctly in case of partial writes. Otherwise, there is
	 * a chance of stale data getting exposed.
	 */
	if (ret == 0 && buffer_unwritten(bh_result))
		set_buffer_new(bh_result);

	return ret;
}

/* Maximum number of blocks we map for direct IO at once. */
#define DIO_MAX_BLOCKS 4096
832 * `handle' can be NULL if create is zero
834 struct buffer_head
*ext4_getblk(handle_t
*handle
, struct inode
*inode
,
835 ext4_lblk_t block
, int map_flags
)
837 struct ext4_map_blocks map
;
838 struct buffer_head
*bh
;
839 int create
= map_flags
& EXT4_GET_BLOCKS_CREATE
;
840 bool nowait
= map_flags
& EXT4_GET_BLOCKS_CACHED_NOWAIT
;
843 ASSERT((EXT4_SB(inode
->i_sb
)->s_mount_state
& EXT4_FC_REPLAY
)
844 || handle
!= NULL
|| create
== 0);
845 ASSERT(create
== 0 || !nowait
);
849 err
= ext4_map_blocks(handle
, inode
, &map
, map_flags
);
852 return create
? ERR_PTR(-ENOSPC
) : NULL
;
857 return sb_find_get_block(inode
->i_sb
, map
.m_pblk
);
860 * Since bh could introduce extra ref count such as referred by
861 * journal_head etc. Try to avoid using __GFP_MOVABLE here
862 * as it may fail the migration when journal_head remains.
864 bh
= getblk_unmovable(inode
->i_sb
->s_bdev
, map
.m_pblk
,
865 inode
->i_sb
->s_blocksize
);
868 return ERR_PTR(-ENOMEM
);
869 if (map
.m_flags
& EXT4_MAP_NEW
) {
871 ASSERT((EXT4_SB(inode
->i_sb
)->s_mount_state
& EXT4_FC_REPLAY
)
872 || (handle
!= NULL
));
875 * Now that we do not always journal data, we should
876 * keep in mind whether this should always journal the
877 * new buffer as metadata. For now, regular file
878 * writes use ext4_get_block instead, so it's not a
882 BUFFER_TRACE(bh
, "call get_create_access");
883 err
= ext4_journal_get_create_access(handle
, inode
->i_sb
, bh
,
889 if (!buffer_uptodate(bh
)) {
890 memset(bh
->b_data
, 0, inode
->i_sb
->s_blocksize
);
891 set_buffer_uptodate(bh
);
894 BUFFER_TRACE(bh
, "call ext4_handle_dirty_metadata");
895 err
= ext4_handle_dirty_metadata(handle
, inode
, bh
);
899 BUFFER_TRACE(bh
, "not a new buffer");
906 struct buffer_head
*ext4_bread(handle_t
*handle
, struct inode
*inode
,
907 ext4_lblk_t block
, int map_flags
)
909 struct buffer_head
*bh
;
912 bh
= ext4_getblk(handle
, inode
, block
, map_flags
);
915 if (!bh
|| ext4_buffer_uptodate(bh
))
918 ret
= ext4_read_bh_lock(bh
, REQ_META
| REQ_PRIO
, true);
926 /* Read a contiguous batch of blocks. */
927 int ext4_bread_batch(struct inode
*inode
, ext4_lblk_t block
, int bh_count
,
928 bool wait
, struct buffer_head
**bhs
)
932 for (i
= 0; i
< bh_count
; i
++) {
933 bhs
[i
] = ext4_getblk(NULL
, inode
, block
+ i
, 0 /* map_flags */);
934 if (IS_ERR(bhs
[i
])) {
935 err
= PTR_ERR(bhs
[i
]);
941 for (i
= 0; i
< bh_count
; i
++)
942 /* Note that NULL bhs[i] is valid because of holes. */
943 if (bhs
[i
] && !ext4_buffer_uptodate(bhs
[i
]))
944 ext4_read_bh_lock(bhs
[i
], REQ_META
| REQ_PRIO
, false);
949 for (i
= 0; i
< bh_count
; i
++)
951 wait_on_buffer(bhs
[i
]);
953 for (i
= 0; i
< bh_count
; i
++) {
954 if (bhs
[i
] && !buffer_uptodate(bhs
[i
])) {
962 for (i
= 0; i
< bh_count
; i
++) {
969 int ext4_walk_page_buffers(handle_t
*handle
, struct inode
*inode
,
970 struct buffer_head
*head
,
974 int (*fn
)(handle_t
*handle
, struct inode
*inode
,
975 struct buffer_head
*bh
))
977 struct buffer_head
*bh
;
978 unsigned block_start
, block_end
;
979 unsigned blocksize
= head
->b_size
;
981 struct buffer_head
*next
;
983 for (bh
= head
, block_start
= 0;
984 ret
== 0 && (bh
!= head
|| !block_start
);
985 block_start
= block_end
, bh
= next
) {
986 next
= bh
->b_this_page
;
987 block_end
= block_start
+ blocksize
;
988 if (block_end
<= from
|| block_start
>= to
) {
989 if (partial
&& !buffer_uptodate(bh
))
993 err
= (*fn
)(handle
, inode
, bh
);
/*
 * Helper for handling dirtying of journalled data. We also mark the folio as
 * dirty so that the writeback code knows that this folio (and inode) contains
 * dirty data. ext4_writepages() then commits the appropriate transaction to
 * make the data stable on disk.
 */
static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh)
{
	folio_mark_dirty(bh->b_folio);
	return ext4_handle_dirty_metadata(handle, NULL, bh);
}

int do_journal_get_write_access(handle_t *handle, struct inode *inode,
				struct buffer_head *bh)
{
	if (!buffer_mapped(bh) || buffer_freed(bh))
		return 0;
	BUFFER_TRACE(bh, "get write access");
	return ext4_journal_get_write_access(handle, inode->i_sb, bh,
					     EXT4_JTR_NONE);
}
1022 int ext4_block_write_begin(handle_t
*handle
, struct folio
*folio
,
1023 loff_t pos
, unsigned len
,
1024 get_block_t
*get_block
)
1026 unsigned from
= pos
& (PAGE_SIZE
- 1);
1027 unsigned to
= from
+ len
;
1028 struct inode
*inode
= folio
->mapping
->host
;
1029 unsigned block_start
, block_end
;
1032 unsigned blocksize
= inode
->i_sb
->s_blocksize
;
1034 struct buffer_head
*bh
, *head
, *wait
[2];
1037 bool should_journal_data
= ext4_should_journal_data(inode
);
1039 BUG_ON(!folio_test_locked(folio
));
1040 BUG_ON(from
> PAGE_SIZE
);
1041 BUG_ON(to
> PAGE_SIZE
);
1044 head
= folio_buffers(folio
);
1046 head
= create_empty_buffers(folio
, blocksize
, 0);
1047 bbits
= ilog2(blocksize
);
1048 block
= (sector_t
)folio
->index
<< (PAGE_SHIFT
- bbits
);
1050 for (bh
= head
, block_start
= 0; bh
!= head
|| !block_start
;
1051 block
++, block_start
= block_end
, bh
= bh
->b_this_page
) {
1052 block_end
= block_start
+ blocksize
;
1053 if (block_end
<= from
|| block_start
>= to
) {
1054 if (folio_test_uptodate(folio
)) {
1055 set_buffer_uptodate(bh
);
1060 clear_buffer_new(bh
);
1061 if (!buffer_mapped(bh
)) {
1062 WARN_ON(bh
->b_size
!= blocksize
);
1063 err
= get_block(inode
, block
, bh
, 1);
1066 if (buffer_new(bh
)) {
1068 * We may be zeroing partial buffers or all new
1069 * buffers in case of failure. Prepare JBD2 for
1072 if (should_journal_data
)
1073 do_journal_get_write_access(handle
,
1075 if (folio_test_uptodate(folio
)) {
1077 * Unlike __block_write_begin() we leave
1078 * dirtying of new uptodate buffers to
1079 * ->write_end() time or
1080 * folio_zero_new_buffers().
1082 set_buffer_uptodate(bh
);
1085 if (block_end
> to
|| block_start
< from
)
1086 folio_zero_segments(folio
, to
,
1092 if (folio_test_uptodate(folio
)) {
1093 set_buffer_uptodate(bh
);
1096 if (!buffer_uptodate(bh
) && !buffer_delay(bh
) &&
1097 !buffer_unwritten(bh
) &&
1098 (block_start
< from
|| block_end
> to
)) {
1099 ext4_read_bh_lock(bh
, 0, false);
1100 wait
[nr_wait
++] = bh
;
1104 * If we issued read requests, let them complete.
1106 for (i
= 0; i
< nr_wait
; i
++) {
1107 wait_on_buffer(wait
[i
]);
1108 if (!buffer_uptodate(wait
[i
]))
1111 if (unlikely(err
)) {
1112 if (should_journal_data
)
1113 ext4_journalled_zero_new_buffers(handle
, inode
, folio
,
1116 folio_zero_new_buffers(folio
, from
, to
);
1117 } else if (fscrypt_inode_uses_fs_layer_crypto(inode
)) {
1118 for (i
= 0; i
< nr_wait
; i
++) {
1121 err2
= fscrypt_decrypt_pagecache_blocks(folio
,
1122 blocksize
, bh_offset(wait
[i
]));
1124 clear_buffer_uptodate(wait
[i
]);
/*
 * To preserve ordering, it is essential that the hole instantiation and
 * the data write be encapsulated in a single transaction. We cannot
 * close off a transaction and start a new one between the ext4_get_block()
 * and the ext4_write_end(). So doing the jbd2_journal_start at the start of
 * ext4_write_begin() is the right place.
 */
1140 static int ext4_write_begin(struct file
*file
, struct address_space
*mapping
,
1141 loff_t pos
, unsigned len
,
1142 struct folio
**foliop
, void **fsdata
)
1144 struct inode
*inode
= mapping
->host
;
1145 int ret
, needed_blocks
;
1148 struct folio
*folio
;
1152 if (unlikely(ext4_forced_shutdown(inode
->i_sb
)))
1155 trace_ext4_write_begin(inode
, pos
, len
);
1157 * Reserve one block more for addition to orphan list in case
1158 * we allocate blocks but write fails for some reason
1160 needed_blocks
= ext4_writepage_trans_blocks(inode
) + 1;
1161 index
= pos
>> PAGE_SHIFT
;
1162 from
= pos
& (PAGE_SIZE
- 1);
1165 if (ext4_test_inode_state(inode
, EXT4_STATE_MAY_INLINE_DATA
)) {
1166 ret
= ext4_try_to_write_inline_data(mapping
, inode
, pos
, len
,
1175 * __filemap_get_folio() can take a long time if the
1176 * system is thrashing due to memory pressure, or if the folio
1177 * is being written back. So grab it first before we start
1178 * the transaction handle. This also allows us to allocate
1179 * the folio (if needed) without using GFP_NOFS.
1182 folio
= __filemap_get_folio(mapping
, index
, FGP_WRITEBEGIN
,
1183 mapping_gfp_mask(mapping
));
1185 return PTR_ERR(folio
);
1187 * The same as page allocation, we prealloc buffer heads before
1188 * starting the handle.
1190 if (!folio_buffers(folio
))
1191 create_empty_buffers(folio
, inode
->i_sb
->s_blocksize
, 0);
1193 folio_unlock(folio
);
1196 handle
= ext4_journal_start(inode
, EXT4_HT_WRITE_PAGE
, needed_blocks
);
1197 if (IS_ERR(handle
)) {
1199 return PTR_ERR(handle
);
1203 if (folio
->mapping
!= mapping
) {
1204 /* The folio got truncated from under us */
1205 folio_unlock(folio
);
1207 ext4_journal_stop(handle
);
1210 /* In case writeback began while the folio was unlocked */
1211 folio_wait_stable(folio
);
1213 if (ext4_should_dioread_nolock(inode
))
1214 ret
= ext4_block_write_begin(handle
, folio
, pos
, len
,
1215 ext4_get_block_unwritten
);
1217 ret
= ext4_block_write_begin(handle
, folio
, pos
, len
,
1219 if (!ret
&& ext4_should_journal_data(inode
)) {
1220 ret
= ext4_walk_page_buffers(handle
, inode
,
1221 folio_buffers(folio
), from
, to
,
1222 NULL
, do_journal_get_write_access
);
1226 bool extended
= (pos
+ len
> inode
->i_size
) &&
1227 !ext4_verity_in_progress(inode
);
1229 folio_unlock(folio
);
1231 * ext4_block_write_begin may have instantiated a few blocks
1232 * outside i_size. Trim these off again. Don't need
1233 * i_size_read because we hold i_rwsem.
1235 * Add inode to orphan list in case we crash before
1238 if (extended
&& ext4_can_truncate(inode
))
1239 ext4_orphan_add(handle
, inode
);
1241 ext4_journal_stop(handle
);
1243 ext4_truncate_failed_write(inode
);
1245 * If truncate failed early the inode might
1246 * still be on the orphan list; we need to
1247 * make sure the inode is removed from the
1248 * orphan list in that case.
1251 ext4_orphan_del(NULL
, inode
);
1254 if (ret
== -ENOSPC
&&
1255 ext4_should_retry_alloc(inode
->i_sb
, &retries
))
/* For write_end() in data=journal mode */
static int write_end_fn(handle_t *handle, struct inode *inode,
			struct buffer_head *bh)
{
	int ret;

	if (!buffer_mapped(bh) || buffer_freed(bh))
		return 0;
	set_buffer_uptodate(bh);
	ret = ext4_dirty_journalled_data(handle, bh);
	clear_buffer_meta(bh);
	clear_buffer_prio(bh);
	return ret;
}
/*
 * We need to pick up the new inode size which generic_commit_write gave us;
 * `file' can be NULL - eg, when called from page_symlink().
 *
 * ext4 never places buffers on inode->i_mapping->i_private_list. Metadata
 * buffers are managed internally.
 */
1285 static int ext4_write_end(struct file
*file
,
1286 struct address_space
*mapping
,
1287 loff_t pos
, unsigned len
, unsigned copied
,
1288 struct folio
*folio
, void *fsdata
)
1290 handle_t
*handle
= ext4_journal_current_handle();
1291 struct inode
*inode
= mapping
->host
;
1292 loff_t old_size
= inode
->i_size
;
1294 int i_size_changed
= 0;
1295 bool verity
= ext4_verity_in_progress(inode
);
1297 trace_ext4_write_end(inode
, pos
, len
, copied
);
1299 if (ext4_has_inline_data(inode
) &&
1300 ext4_test_inode_state(inode
, EXT4_STATE_MAY_INLINE_DATA
))
1301 return ext4_write_inline_data_end(inode
, pos
, len
, copied
,
1304 copied
= block_write_end(file
, mapping
, pos
, len
, copied
, folio
, fsdata
);
1306 * it's important to update i_size while still holding folio lock:
1307 * page writeout could otherwise come in and zero beyond i_size.
1309 * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree
1310 * blocks are being written past EOF, so skip the i_size update.
1313 i_size_changed
= ext4_update_inode_size(inode
, pos
+ copied
);
1314 folio_unlock(folio
);
1317 if (old_size
< pos
&& !verity
) {
1318 pagecache_isize_extended(inode
, old_size
, pos
);
1319 ext4_zero_partial_blocks(handle
, inode
, old_size
, pos
- old_size
);
1322 * Don't mark the inode dirty under folio lock. First, it unnecessarily
1323 * makes the holding time of folio lock longer. Second, it forces lock
1324 * ordering of folio lock and transaction start for journaling
1328 ret
= ext4_mark_inode_dirty(handle
, inode
);
1330 if (pos
+ len
> inode
->i_size
&& !verity
&& ext4_can_truncate(inode
))
1331 /* if we have allocated more blocks and copied
1332 * less. We will have blocks allocated outside
1333 * inode->i_size. So truncate them
1335 ext4_orphan_add(handle
, inode
);
1337 ret2
= ext4_journal_stop(handle
);
1341 if (pos
+ len
> inode
->i_size
&& !verity
) {
1342 ext4_truncate_failed_write(inode
);
1344 * If truncate failed early the inode might still be
1345 * on the orphan list; we need to make sure the inode
1346 * is removed from the orphan list in that case.
1349 ext4_orphan_del(NULL
, inode
);
1352 return ret
? ret
: copied
;
1356 * This is a private version of folio_zero_new_buffers() which doesn't
1357 * set the buffer to be dirty, since in data=journalled mode we need
1358 * to call ext4_dirty_journalled_data() instead.
1360 static void ext4_journalled_zero_new_buffers(handle_t
*handle
,
1361 struct inode
*inode
,
1362 struct folio
*folio
,
1363 unsigned from
, unsigned to
)
1365 unsigned int block_start
= 0, block_end
;
1366 struct buffer_head
*head
, *bh
;
1368 bh
= head
= folio_buffers(folio
);
1370 block_end
= block_start
+ bh
->b_size
;
1371 if (buffer_new(bh
)) {
1372 if (block_end
> from
&& block_start
< to
) {
1373 if (!folio_test_uptodate(folio
)) {
1374 unsigned start
, size
;
1376 start
= max(from
, block_start
);
1377 size
= min(to
, block_end
) - start
;
1379 folio_zero_range(folio
, start
, size
);
1381 clear_buffer_new(bh
);
1382 write_end_fn(handle
, inode
, bh
);
1385 block_start
= block_end
;
1386 bh
= bh
->b_this_page
;
1387 } while (bh
!= head
);
1390 static int ext4_journalled_write_end(struct file
*file
,
1391 struct address_space
*mapping
,
1392 loff_t pos
, unsigned len
, unsigned copied
,
1393 struct folio
*folio
, void *fsdata
)
1395 handle_t
*handle
= ext4_journal_current_handle();
1396 struct inode
*inode
= mapping
->host
;
1397 loff_t old_size
= inode
->i_size
;
1401 int size_changed
= 0;
1402 bool verity
= ext4_verity_in_progress(inode
);
1404 trace_ext4_journalled_write_end(inode
, pos
, len
, copied
);
1405 from
= pos
& (PAGE_SIZE
- 1);
1408 BUG_ON(!ext4_handle_valid(handle
));
1410 if (ext4_has_inline_data(inode
))
1411 return ext4_write_inline_data_end(inode
, pos
, len
, copied
,
1414 if (unlikely(copied
< len
) && !folio_test_uptodate(folio
)) {
1416 ext4_journalled_zero_new_buffers(handle
, inode
, folio
,
1419 if (unlikely(copied
< len
))
1420 ext4_journalled_zero_new_buffers(handle
, inode
, folio
,
1422 ret
= ext4_walk_page_buffers(handle
, inode
,
1423 folio_buffers(folio
),
1424 from
, from
+ copied
, &partial
,
1427 folio_mark_uptodate(folio
);
1430 size_changed
= ext4_update_inode_size(inode
, pos
+ copied
);
1431 EXT4_I(inode
)->i_datasync_tid
= handle
->h_transaction
->t_tid
;
1432 folio_unlock(folio
);
1435 if (old_size
< pos
&& !verity
) {
1436 pagecache_isize_extended(inode
, old_size
, pos
);
1437 ext4_zero_partial_blocks(handle
, inode
, old_size
, pos
- old_size
);
1441 ret2
= ext4_mark_inode_dirty(handle
, inode
);
1446 if (pos
+ len
> inode
->i_size
&& !verity
&& ext4_can_truncate(inode
))
1447 /* if we have allocated more blocks and copied
1448 * less. We will have blocks allocated outside
1449 * inode->i_size. So truncate them
1451 ext4_orphan_add(handle
, inode
);
1453 ret2
= ext4_journal_stop(handle
);
1456 if (pos
+ len
> inode
->i_size
&& !verity
) {
1457 ext4_truncate_failed_write(inode
);
1459 * If truncate failed early the inode might still be
1460 * on the orphan list; we need to make sure the inode
1461 * is removed from the orphan list in that case.
1464 ext4_orphan_del(NULL
, inode
);
1467 return ret
? ret
: copied
;
/*
 * Reserve space for 'nr_resv' clusters
 */
static int ext4_da_reserve_space(struct inode *inode, int nr_resv)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_inode_info *ei = EXT4_I(inode);
	int ret;

	/*
	 * We will charge metadata quota at writeout time; this saves
	 * us from metadata over-estimation, though we may go over by
	 * a small amount in the end. Here we just reserve for data.
	 */
	ret = dquot_reserve_block(inode, EXT4_C2B(sbi, nr_resv));
	if (ret)
		return ret;

	spin_lock(&ei->i_block_reservation_lock);
	if (ext4_claim_free_clusters(sbi, nr_resv, 0)) {
		spin_unlock(&ei->i_block_reservation_lock);
		dquot_release_reservation_block(inode, EXT4_C2B(sbi, nr_resv));
		return -ENOSPC;
	}
	ei->i_reserved_data_blocks += nr_resv;
	trace_ext4_da_reserve_space(inode, nr_resv);
	spin_unlock(&ei->i_block_reservation_lock);

	return 0;	/* success */
}
void ext4_da_release_space(struct inode *inode, int to_free)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!to_free)
		return;		/* Nothing to release, exit */

	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);

	trace_ext4_da_release_space(inode, to_free);
	if (unlikely(to_free > ei->i_reserved_data_blocks)) {
		/*
		 * If there aren't enough reserved blocks, then the
		 * counter is messed up somewhere. Since this
		 * function is called from invalidate page, it's
		 * harmless to return without any action.
		 */
		ext4_warning(inode->i_sb, "ext4_da_release_space: "
			     "ino %lu, to_free %d with only %d reserved "
			     "data blocks", inode->i_ino, to_free,
			     ei->i_reserved_data_blocks);
		WARN_ON(1);
		to_free = ei->i_reserved_data_blocks;
	}
	ei->i_reserved_data_blocks -= to_free;

	/* update fs dirty data blocks counter */
	percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free);

	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);

	dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
}
/*
 * Delayed allocation stuff
 */

struct mpage_da_data {
	/* These are input fields for ext4_do_writepages() */
	struct inode *inode;
	struct writeback_control *wbc;
	unsigned int can_map:1;	/* Can writepages call map blocks? */

	/* These are internal state of ext4_do_writepages() */
	pgoff_t first_page;	/* The first page to write */
	pgoff_t next_page;	/* Current page to examine */
	pgoff_t last_page;	/* Last page to examine */
	/*
	 * Extent to map - this can be after first_page because that can be
	 * fully mapped. We somewhat abuse m_flags to store whether the extent
	 * is delalloc or unwritten.
	 */
	struct ext4_map_blocks map;
	struct ext4_io_submit io_submit;	/* IO submission data */
	unsigned int do_map:1;
	unsigned int scanned_until_end:1;
	unsigned int journalled_more_data:1;
};
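
/*
 * One mpage_da_data instance describes a single ext4_do_writepages() pass:
 * the page range to scan and the extent currently being accumulated for
 * mapping and IO submission.
 */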
1562 static void mpage_release_unused_pages(struct mpage_da_data
*mpd
,
1567 struct folio_batch fbatch
;
1568 struct inode
*inode
= mpd
->inode
;
1569 struct address_space
*mapping
= inode
->i_mapping
;
1571 /* This is necessary when next_page == 0. */
1572 if (mpd
->first_page
>= mpd
->next_page
)
1575 mpd
->scanned_until_end
= 0;
1576 index
= mpd
->first_page
;
1577 end
= mpd
->next_page
- 1;
1579 ext4_lblk_t start
, last
;
1580 start
= index
<< (PAGE_SHIFT
- inode
->i_blkbits
);
1581 last
= end
<< (PAGE_SHIFT
- inode
->i_blkbits
);
1584 * avoid racing with extent status tree scans made by
1585 * ext4_insert_delayed_block()
1587 down_write(&EXT4_I(inode
)->i_data_sem
);
1588 ext4_es_remove_extent(inode
, start
, last
- start
+ 1);
1589 up_write(&EXT4_I(inode
)->i_data_sem
);
1592 folio_batch_init(&fbatch
);
1593 while (index
<= end
) {
1594 nr
= filemap_get_folios(mapping
, &index
, end
, &fbatch
);
1597 for (i
= 0; i
< nr
; i
++) {
1598 struct folio
*folio
= fbatch
.folios
[i
];
1600 if (folio
->index
< mpd
->first_page
)
1602 if (folio_next_index(folio
) - 1 > end
)
1604 BUG_ON(!folio_test_locked(folio
));
1605 BUG_ON(folio_test_writeback(folio
));
1607 if (folio_mapped(folio
))
1608 folio_clear_dirty_for_io(folio
);
1609 block_invalidate_folio(folio
, 0,
1611 folio_clear_uptodate(folio
);
1613 folio_unlock(folio
);
1615 folio_batch_release(&fbatch
);
1619 static void ext4_print_free_blocks(struct inode
*inode
)
1621 struct ext4_sb_info
*sbi
= EXT4_SB(inode
->i_sb
);
1622 struct super_block
*sb
= inode
->i_sb
;
1623 struct ext4_inode_info
*ei
= EXT4_I(inode
);
1625 ext4_msg(sb
, KERN_CRIT
, "Total free blocks count %lld",
1626 EXT4_C2B(EXT4_SB(inode
->i_sb
),
1627 ext4_count_free_clusters(sb
)));
1628 ext4_msg(sb
, KERN_CRIT
, "Free/Dirty block details");
1629 ext4_msg(sb
, KERN_CRIT
, "free_blocks=%lld",
1630 (long long) EXT4_C2B(EXT4_SB(sb
),
1631 percpu_counter_sum(&sbi
->s_freeclusters_counter
)));
1632 ext4_msg(sb
, KERN_CRIT
, "dirty_blocks=%lld",
1633 (long long) EXT4_C2B(EXT4_SB(sb
),
1634 percpu_counter_sum(&sbi
->s_dirtyclusters_counter
)));
1635 ext4_msg(sb
, KERN_CRIT
, "Block reservation details");
1636 ext4_msg(sb
, KERN_CRIT
, "i_reserved_data_blocks=%u",
1637 ei
->i_reserved_data_blocks
);
/*
 * Check whether the cluster containing lblk has been allocated or has
 * delalloc reservation.
 *
 * Returns 0 if the cluster doesn't have either, 1 if it has delalloc
 * reservation, 2 if it's already been allocated, negative error code on
 * failure.
 */
static int ext4_clu_alloc_state(struct inode *inode, ext4_lblk_t lblk)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	int ret;

	/* Has delalloc reservation? */
	if (ext4_es_scan_clu(inode, &ext4_es_is_delayed, lblk))
		return 1;

	/* Already been allocated? */
	if (ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk))
		return 2;

	ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk));
	if (ret < 0)
		return ret;
	if (ret > 0)
		return 2;

	return 0;
}
/*
 * ext4_insert_delayed_blocks - adds multiple delayed blocks to the extents
 *                              status tree, incrementing the reserved
 *                              cluster/block count or making pending
 *                              reservations where needed
 *
 * @inode - file containing the newly added block
 * @lblk - start logical block to be added
 * @len - length of blocks to be added
 *
 * Returns 0 on success, negative error code on failure.
 */
1682 static int ext4_insert_delayed_blocks(struct inode
*inode
, ext4_lblk_t lblk
,
1685 struct ext4_sb_info
*sbi
= EXT4_SB(inode
->i_sb
);
1687 bool lclu_allocated
= false;
1688 bool end_allocated
= false;
1689 ext4_lblk_t resv_clu
;
1690 ext4_lblk_t end
= lblk
+ len
- 1;
1693 * If the cluster containing lblk or end is shared with a delayed,
1694 * written, or unwritten extent in a bigalloc file system, it's
1695 * already been accounted for and does not need to be reserved.
1696 * A pending reservation must be made for the cluster if it's
1697 * shared with a written or unwritten extent and doesn't already
1698 * have one. Written and unwritten extents can be purged from the
1699 * extents status tree if the system is under memory pressure, so
1700 * it's necessary to examine the extent tree if a search of the
1701 * extents status tree doesn't get a match.
1703 if (sbi
->s_cluster_ratio
== 1) {
1704 ret
= ext4_da_reserve_space(inode
, len
);
1705 if (ret
!= 0) /* ENOSPC */
1707 } else { /* bigalloc */
1708 resv_clu
= EXT4_B2C(sbi
, end
) - EXT4_B2C(sbi
, lblk
) + 1;
1710 ret
= ext4_clu_alloc_state(inode
, lblk
);
1715 lclu_allocated
= (ret
== 2);
1718 if (EXT4_B2C(sbi
, lblk
) != EXT4_B2C(sbi
, end
)) {
1719 ret
= ext4_clu_alloc_state(inode
, end
);
1724 end_allocated
= (ret
== 2);
1729 ret
= ext4_da_reserve_space(inode
, resv_clu
);
1730 if (ret
!= 0) /* ENOSPC */
1735 ext4_es_insert_delayed_extent(inode
, lblk
, len
, lclu_allocated
,
/*
 * Looks up the requested blocks and sets the delalloc extent map.
 * First try to look up the extent entry that contains the requested blocks
 * in the extent status tree without taking i_data_sem, then try to look up
 * the on-disk extent mapping with i_data_sem held in read mode; finally,
 * hold i_data_sem in write mode, look up again and add a delalloc extent
 * entry if it still couldn't find any extent. Pass out the mapped extent
 * through @map and return 0 on success.
 */
1749 static int ext4_da_map_blocks(struct inode
*inode
, struct ext4_map_blocks
*map
)
1751 struct extent_status es
;
1753 #ifdef ES_AGGRESSIVE_TEST
1754 struct ext4_map_blocks orig_map
;
1756 memcpy(&orig_map
, map
, sizeof(*map
));
1760 ext_debug(inode
, "max_blocks %u, logical block %lu\n", map
->m_len
,
1761 (unsigned long) map
->m_lblk
);
1763 /* Lookup extent status tree firstly */
1764 if (ext4_es_lookup_extent(inode
, map
->m_lblk
, NULL
, &es
)) {
1765 map
->m_len
= min_t(unsigned int, map
->m_len
,
1766 es
.es_len
- (map
->m_lblk
- es
.es_lblk
));
1768 if (ext4_es_is_hole(&es
))
1773 * Delayed extent could be allocated by fallocate.
1774 * So we need to check it.
1776 if (ext4_es_is_delayed(&es
)) {
1777 map
->m_flags
|= EXT4_MAP_DELAYED
;
1781 map
->m_pblk
= ext4_es_pblock(&es
) + map
->m_lblk
- es
.es_lblk
;
1782 if (ext4_es_is_written(&es
))
1783 map
->m_flags
|= EXT4_MAP_MAPPED
;
1784 else if (ext4_es_is_unwritten(&es
))
1785 map
->m_flags
|= EXT4_MAP_UNWRITTEN
;
1789 #ifdef ES_AGGRESSIVE_TEST
1790 ext4_map_blocks_es_recheck(NULL
, inode
, map
, &orig_map
, 0);
1796 * Try to see if we can get the block without requesting a new
1797 * file system block.
1799 down_read(&EXT4_I(inode
)->i_data_sem
);
1800 if (ext4_has_inline_data(inode
))
1803 retval
= ext4_map_query_blocks(NULL
, inode
, map
);
1804 up_read(&EXT4_I(inode
)->i_data_sem
);
1806 return retval
< 0 ? retval
: 0;
1809 down_write(&EXT4_I(inode
)->i_data_sem
);
1811 * Page fault path (ext4_page_mkwrite does not take i_rwsem)
1812 * and fallocate path (no folio lock) can race. Make sure we
1813 * lookup the extent status tree here again while i_data_sem
1814 * is held in write mode, before inserting a new da entry in
1815 * the extent status tree.
1817 if (ext4_es_lookup_extent(inode
, map
->m_lblk
, NULL
, &es
)) {
1818 map
->m_len
= min_t(unsigned int, map
->m_len
,
1819 es
.es_len
- (map
->m_lblk
- es
.es_lblk
));
1821 if (!ext4_es_is_hole(&es
)) {
1822 up_write(&EXT4_I(inode
)->i_data_sem
);
1825 } else if (!ext4_has_inline_data(inode
)) {
1826 retval
= ext4_map_query_blocks(NULL
, inode
, map
);
1828 up_write(&EXT4_I(inode
)->i_data_sem
);
1829 return retval
< 0 ? retval
: 0;
1833 map
->m_flags
|= EXT4_MAP_DELAYED
;
1834 retval
= ext4_insert_delayed_blocks(inode
, map
->m_lblk
, map
->m_len
);
1835 up_write(&EXT4_I(inode
)->i_data_sem
);
/*
 * This is a special get_block_t callback which is used by
 * ext4_da_write_begin(). It will either return a mapped block or
 * reserve space for a single block.
 *
 * For a delayed buffer_head we have BH_Mapped, BH_New and BH_Delay set.
 * We also have b_blocknr = -1 and b_bdev initialized properly.
 *
 * For an unwritten buffer_head we have BH_Mapped, BH_New and BH_Unwritten
 * set. We also have b_blocknr = the physical block mapping the unwritten
 * extent and b_bdev initialized properly.
 */
1852 int ext4_da_get_block_prep(struct inode
*inode
, sector_t iblock
,
1853 struct buffer_head
*bh
, int create
)
1855 struct ext4_map_blocks map
;
1856 sector_t invalid_block
= ~((sector_t
) 0xffff);
1859 BUG_ON(create
== 0);
1860 BUG_ON(bh
->b_size
!= inode
->i_sb
->s_blocksize
);
1862 if (invalid_block
< ext4_blocks_count(EXT4_SB(inode
->i_sb
)->s_es
))
1865 map
.m_lblk
= iblock
;
1869 * first, we need to know whether the block is allocated already
1870 * preallocated blocks are unmapped but should treated
1871 * the same as allocated blocks.
1873 ret
= ext4_da_map_blocks(inode
, &map
);
1877 if (map
.m_flags
& EXT4_MAP_DELAYED
) {
1878 map_bh(bh
, inode
->i_sb
, invalid_block
);
1880 set_buffer_delay(bh
);
1884 map_bh(bh
, inode
->i_sb
, map
.m_pblk
);
1885 ext4_update_bh_state(bh
, map
.m_flags
);
1887 if (buffer_unwritten(bh
)) {
1888 /* A delayed write to unwritten bh should be marked
1889 * new and mapped. Mapped ensures that we don't do
1890 * get_block multiple times when we write to the same
1891 * offset and new ensures that we do proper zero out
1892 * for partial write.
1895 set_buffer_mapped(bh
);
1900 static void mpage_folio_done(struct mpage_da_data
*mpd
, struct folio
*folio
)
1902 mpd
->first_page
+= folio_nr_pages(folio
);
1903 folio_unlock(folio
);
1906 static int mpage_submit_folio(struct mpage_da_data
*mpd
, struct folio
*folio
)
1912 BUG_ON(folio
->index
!= mpd
->first_page
);
1913 folio_clear_dirty_for_io(folio
);
1915 * We have to be very careful here! Nothing protects writeback path
1916 * against i_size changes and the page can be writeably mapped into
1917 * page tables. So an application can be growing i_size and writing
1918 * data through mmap while writeback runs. folio_clear_dirty_for_io()
1919 * write-protects our page in page tables and the page cannot get
1920 * written to again until we release folio lock. So only after
1921 * folio_clear_dirty_for_io() we are safe to sample i_size for
1922 * ext4_bio_write_folio() to zero-out tail of the written page. We rely
1923 * on the barrier provided by folio_test_clear_dirty() in
1924 * folio_clear_dirty_for_io() to make sure i_size is really sampled only
1925 * after page tables are updated.
1927 size
= i_size_read(mpd
->inode
);
1928 len
= folio_size(folio
);
1929 if (folio_pos(folio
) + len
> size
&&
1930 !ext4_verity_in_progress(mpd
->inode
))
1931 len
= size
& (len
- 1);
1932 err
= ext4_bio_write_folio(&mpd
->io_submit
, folio
, len
);
1934 mpd
->wbc
->nr_to_write
--;
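
/*
 * Buffer state bits that mpage_add_bh_to_extent() copies into
 * mpd->map.m_flags to remember whether the extent being built is
 * delalloc or unwritten.
 */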
#define BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay))

/*
 * mballoc gives us at most this number of blocks...
 * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
 * The rest of mballoc seems to handle chunks up to full group size.
 */
#define MAX_WRITEPAGES_EXTENT_LEN 2048
/*
 * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
 *
 * @mpd - extent of blocks
 * @lblk - logical number of the block in the file
 * @bh - buffer head we want to add to the extent
 *
 * The function is used to collect contiguous blocks in the same state. If the
 * buffer doesn't require mapping for writeback and we haven't started the
 * extent of buffers to map yet, the function returns 'true' immediately - the
 * caller can write the buffer right away. Otherwise the function returns true
 * if the block has been added to the extent, false if the block couldn't be
 * added.
 */
1962 static bool mpage_add_bh_to_extent(struct mpage_da_data
*mpd
, ext4_lblk_t lblk
,
1963 struct buffer_head
*bh
)
1965 struct ext4_map_blocks
*map
= &mpd
->map
;
1967 /* Buffer that doesn't need mapping for writeback? */
1968 if (!buffer_dirty(bh
) || !buffer_mapped(bh
) ||
1969 (!buffer_delay(bh
) && !buffer_unwritten(bh
))) {
1970 /* So far no extent to map => we write the buffer right away */
1971 if (map
->m_len
== 0)
1976 /* First block in the extent? */
1977 if (map
->m_len
== 0) {
1978 /* We cannot map unless handle is started... */
1983 map
->m_flags
= bh
->b_state
& BH_FLAGS
;
1987 /* Don't go larger than mballoc is willing to allocate */
1988 if (map
->m_len
>= MAX_WRITEPAGES_EXTENT_LEN
)
1991 /* Can we merge the block to our big extent? */
1992 if (lblk
== map
->m_lblk
+ map
->m_len
&&
1993 (bh
->b_state
& BH_FLAGS
) == map
->m_flags
) {
/*
 * mpage_process_page_bufs - submit page buffers for IO or add them to extent
 *
 * @mpd - extent of blocks for mapping
 * @head - the first buffer in the page
 * @bh - buffer we should start processing from
 * @lblk - logical number of the block in the file corresponding to @bh
 *
 * Walk through page buffers from @bh up to @head (exclusive) and either submit
 * the page for IO if all buffers in this page were mapped and there's no
 * accumulated extent of buffers to map, or add buffers in the page to the
 * extent of buffers to map. The function returns 1 if the caller can continue
 * by processing the next page, 0 if it should stop adding buffers to the
 * extent to map because we cannot extend it anymore. It can also return a
 * value < 0 in case of error during IO submission.
 */
2016 static int mpage_process_page_bufs(struct mpage_da_data
*mpd
,
2017 struct buffer_head
*head
,
2018 struct buffer_head
*bh
,
2021 struct inode
*inode
= mpd
->inode
;
2023 ext4_lblk_t blocks
= (i_size_read(inode
) + i_blocksize(inode
) - 1)
2024 >> inode
->i_blkbits
;
2026 if (ext4_verity_in_progress(inode
))
2027 blocks
= EXT_MAX_BLOCKS
;
2030 BUG_ON(buffer_locked(bh
));
2032 if (lblk
>= blocks
|| !mpage_add_bh_to_extent(mpd
, lblk
, bh
)) {
2033 /* Found extent to map? */
2036 /* Buffer needs mapping and handle is not started? */
2039 /* Everything mapped so far and we hit EOF */
2042 } while (lblk
++, (bh
= bh
->b_this_page
) != head
);
2043 /* So far everything mapped? Submit the page for IO. */
2044 if (mpd
->map
.m_len
== 0) {
2045 err
= mpage_submit_folio(mpd
, head
->b_folio
);
2048 mpage_folio_done(mpd
, head
->b_folio
);
2050 if (lblk
>= blocks
) {
2051 mpd
->scanned_until_end
= 1;
/*
 * mpage_process_folio - update folio buffers corresponding to changed extent
 *			 and may submit fully mapped page for IO
 *
 * @mpd: description of extent to map, on return next extent to map
 * @folio: Contains these buffers.
 * @m_lblk: logical block mapping.
 * @m_pblk: corresponding physical mapping.
 * @map_bh: determines on return whether this page requires any further
 *	    mapping or not
 *
 * Scan given folio buffers corresponding to changed extent and update buffer
 * state according to new extent state.
 * We map delalloc buffers to their physical location, clear unwritten bits.
 * If the given folio is not fully mapped, we update @mpd to the next extent in
 * the given folio that needs mapping and return @map_bh as true.
 */
2073 static int mpage_process_folio(struct mpage_da_data
*mpd
, struct folio
*folio
,
2074 ext4_lblk_t
*m_lblk
, ext4_fsblk_t
*m_pblk
,
2077 struct buffer_head
*head
, *bh
;
2078 ext4_io_end_t
*io_end
= mpd
->io_submit
.io_end
;
2079 ext4_lblk_t lblk
= *m_lblk
;
2080 ext4_fsblk_t pblock
= *m_pblk
;
2082 int blkbits
= mpd
->inode
->i_blkbits
;
2083 ssize_t io_end_size
= 0;
2084 struct ext4_io_end_vec
*io_end_vec
= ext4_last_io_end_vec(io_end
);
2086 bh
= head
= folio_buffers(folio
);
2088 if (lblk
< mpd
->map
.m_lblk
)
2090 if (lblk
>= mpd
->map
.m_lblk
+ mpd
->map
.m_len
) {
2092 * Buffer after end of mapped extent.
2093 * Find next buffer in the folio to map.
2096 mpd
->map
.m_flags
= 0;
2097 io_end_vec
->size
+= io_end_size
;
2099 err
= mpage_process_page_bufs(mpd
, head
, bh
, lblk
);
2102 if (!err
&& mpd
->map
.m_len
&& mpd
->map
.m_lblk
> lblk
) {
2103 io_end_vec
= ext4_alloc_io_end_vec(io_end
);
2104 if (IS_ERR(io_end_vec
)) {
2105 err
= PTR_ERR(io_end_vec
);
2108 io_end_vec
->offset
= (loff_t
)mpd
->map
.m_lblk
<< blkbits
;
2113 if (buffer_delay(bh
)) {
2114 clear_buffer_delay(bh
);
2115 bh
->b_blocknr
= pblock
++;
2117 clear_buffer_unwritten(bh
);
2118 io_end_size
+= (1 << blkbits
);
2119 } while (lblk
++, (bh
= bh
->b_this_page
) != head
);
2121 io_end_vec
->size
+= io_end_size
;
 * mpage_map_and_submit_buffers - update buffers corresponding to changed
 *				  extent and submit fully mapped pages for IO
 *
 * @mpd - description of extent to map, on return next extent to map
 *
 * Scan buffers corresponding to changed extent (we expect corresponding pages
 * to be already locked) and update buffer state according to new extent state.
 * We map delalloc buffers to their physical location, clear unwritten bits,
 * and mark buffers as uninit when we perform writes to unwritten extents
 * and do extent conversion after IO is finished. If the last page is not fully
 * mapped, we update @mpd to the next extent in the last page that needs
 * mapping. Otherwise we submit the page for IO.
2143 static int mpage_map_and_submit_buffers(struct mpage_da_data
*mpd
)
2145 struct folio_batch fbatch
;
2147 struct inode
*inode
= mpd
->inode
;
2148 int bpp_bits
= PAGE_SHIFT
- inode
->i_blkbits
;
2151 ext4_fsblk_t pblock
;
2153 bool map_bh
= false;
2155 start
= mpd
->map
.m_lblk
>> bpp_bits
;
2156 end
= (mpd
->map
.m_lblk
+ mpd
->map
.m_len
- 1) >> bpp_bits
;
2157 lblk
= start
<< bpp_bits
;
2158 pblock
= mpd
->map
.m_pblk
;
2160 folio_batch_init(&fbatch
);
2161 while (start
<= end
) {
2162 nr
= filemap_get_folios(inode
->i_mapping
, &start
, end
, &fbatch
);
2165 for (i
= 0; i
< nr
; i
++) {
2166 struct folio
*folio
= fbatch
.folios
[i
];
2168 err
= mpage_process_folio(mpd
, folio
, &lblk
, &pblock
,
 * If map_bh is true, it means the page may require further bh
 * mapping, or maybe the page was submitted for IO.
 * So we return to do further extent mapping.
2175 if (err
< 0 || map_bh
)
2177 /* Page fully mapped - let IO run! */
2178 err
= mpage_submit_folio(mpd
, folio
);
2181 mpage_folio_done(mpd
, folio
);
2183 folio_batch_release(&fbatch
);
2185 /* Extent fully mapped and matches with page boundary. We are done. */
2187 mpd
->map
.m_flags
= 0;
2190 folio_batch_release(&fbatch
);
2194 static int mpage_map_one_extent(handle_t
*handle
, struct mpage_da_data
*mpd
)
2196 struct inode
*inode
= mpd
->inode
;
2197 struct ext4_map_blocks
*map
= &mpd
->map
;
2198 int get_blocks_flags
;
2199 int err
, dioread_nolock
;
2201 trace_ext4_da_write_pages_extent(inode
, map
);
 * Call ext4_map_blocks() to allocate any delayed allocation blocks, or
 * to convert an unwritten extent to be initialized (in the case
 * where we have written into one or more preallocated blocks). It is
 * possible that we're going to need more metadata blocks than
 * previously reserved. However we must not fail because we're in
 * writeback and there is nothing we can do about it so it might result
 * in data loss. So use reserved blocks to allocate metadata if
 * possible.
2212 get_blocks_flags
= EXT4_GET_BLOCKS_CREATE
|
2213 EXT4_GET_BLOCKS_METADATA_NOFAIL
|
2214 EXT4_GET_BLOCKS_IO_SUBMIT
;
2215 dioread_nolock
= ext4_should_dioread_nolock(inode
);
2217 get_blocks_flags
|= EXT4_GET_BLOCKS_IO_CREATE_EXT
;
2219 err
= ext4_map_blocks(handle
, inode
, map
, get_blocks_flags
);
2222 if (dioread_nolock
&& (map
->m_flags
& EXT4_MAP_UNWRITTEN
)) {
2223 if (!mpd
->io_submit
.io_end
->handle
&&
2224 ext4_handle_valid(handle
)) {
2225 mpd
->io_submit
.io_end
->handle
= handle
->h_rsv_handle
;
2226 handle
->h_rsv_handle
= NULL
;
2228 ext4_set_io_unwritten_flag(inode
, mpd
->io_submit
.io_end
);
2231 BUG_ON(map
->m_len
== 0);
 * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
 *				 mpd->len and submit pages underlying it for IO
 *
 * @handle - handle for journal operations
 * @mpd - extent to map
 * @give_up_on_write - we set this to true iff there is a fatal error and there
 *                     is no hope of writing the data. The caller should discard
 *                     dirty pages to avoid infinite loops.
 *
 * The function maps extent starting at mpd->lblk of length mpd->len. If it is
 * delayed, blocks are allocated, if it is unwritten, we may need to convert
 * them to initialized or split the described range from larger unwritten
 * extent. Note that we need not map all the described range since allocation
 * can return fewer blocks or the range is covered by more unwritten extents.
 * We cannot map more because we are limited by reserved transaction credits.
 * On the other hand we always make sure that the last touched page is fully
 * mapped so that it can be written out (and thus forward progress is
 * guaranteed). After mapping we submit all mapped pages for IO.
2255 static int mpage_map_and_submit_extent(handle_t
*handle
,
2256 struct mpage_da_data
*mpd
,
2257 bool *give_up_on_write
)
2259 struct inode
*inode
= mpd
->inode
;
2260 struct ext4_map_blocks
*map
= &mpd
->map
;
2264 ext4_io_end_t
*io_end
= mpd
->io_submit
.io_end
;
2265 struct ext4_io_end_vec
*io_end_vec
;
2267 io_end_vec
= ext4_alloc_io_end_vec(io_end
);
2268 if (IS_ERR(io_end_vec
))
2269 return PTR_ERR(io_end_vec
);
2270 io_end_vec
->offset
= ((loff_t
)map
->m_lblk
) << inode
->i_blkbits
;
2272 err
= mpage_map_one_extent(handle
, mpd
);
2274 struct super_block
*sb
= inode
->i_sb
;
2276 if (ext4_forced_shutdown(sb
))
2277 goto invalidate_dirty_pages
;
 * Let the upper layers retry transient errors.
 * In the case of ENOSPC, if ext4_count_free_blocks()
 * is non-zero, a commit should free up blocks.
2283 if ((err
== -ENOMEM
) ||
2284 (err
== -ENOSPC
&& ext4_count_free_clusters(sb
))) {
2286 goto update_disksize
;
2289 ext4_msg(sb
, KERN_CRIT
,
2290 "Delayed block allocation failed for "
2291 "inode %lu at logical offset %llu with"
2292 " max blocks %u with error %d",
2294 (unsigned long long)map
->m_lblk
,
2295 (unsigned)map
->m_len
, -err
);
2296 ext4_msg(sb
, KERN_CRIT
,
2297 "This should not happen!! Data will "
2300 ext4_print_free_blocks(inode
);
2301 invalidate_dirty_pages
:
2302 *give_up_on_write
= true;
 * Update buffer state, submit mapped pages, and get us new
 * extent to map
2310 err
= mpage_map_and_submit_buffers(mpd
);
2312 goto update_disksize
;
2313 } while (map
->m_len
);
2317 * Update on-disk size after IO is submitted. Races with
2318 * truncate are avoided by checking i_size under i_data_sem.
2320 disksize
= ((loff_t
)mpd
->first_page
) << PAGE_SHIFT
;
2321 if (disksize
> READ_ONCE(EXT4_I(inode
)->i_disksize
)) {
2325 down_write(&EXT4_I(inode
)->i_data_sem
);
2326 i_size
= i_size_read(inode
);
2327 if (disksize
> i_size
)
2329 if (disksize
> EXT4_I(inode
)->i_disksize
)
2330 EXT4_I(inode
)->i_disksize
= disksize
;
2331 up_write(&EXT4_I(inode
)->i_data_sem
);
2332 err2
= ext4_mark_inode_dirty(handle
, inode
);
2334 ext4_error_err(inode
->i_sb
, -err2
,
2335 "Failed to mark inode %lu dirty",
2345 * Calculate the total number of credits to reserve for one writepages
2346 * iteration. This is called from ext4_writepages(). We map an extent of
2347 * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
2348 * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
2349 * bpp - 1 blocks in bpp different extents.
static int ext4_da_writepages_trans_blocks(struct inode *inode)
{
	int bpp = ext4_journal_blocks_per_page(inode);

	return ext4_meta_trans_blocks(inode,
				      MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
}
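/*
 * Worked example (standalone userspace arithmetic, not part of this file) of
 * the bound described above. The MAX_WRITEPAGES_EXTENT_LEN value used here is
 * an assumption for the demo, not necessarily the kernel's definition.
 */
#include <stdio.h>

#define DEMO_MAX_WRITEPAGES_EXTENT_LEN 2048

int main(void)
{
	unsigned int page_size = 4096, block_size = 1024;
	unsigned int bpp = page_size / block_size;	/* blocks per page */
	/* one iteration maps at most this many blocks, in up to bpp extents */
	unsigned int worst_case = DEMO_MAX_WRITEPAGES_EXTENT_LEN + bpp - 1;

	printf("blocks per page: %u, worst-case blocks per iteration: %u\n",
	       bpp, worst_case);
	return 0;
}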
2359 static int ext4_journal_folio_buffers(handle_t
*handle
, struct folio
*folio
,
2362 struct buffer_head
*page_bufs
= folio_buffers(folio
);
2363 struct inode
*inode
= folio
->mapping
->host
;
2366 ret
= ext4_walk_page_buffers(handle
, inode
, page_bufs
, 0, len
,
2367 NULL
, do_journal_get_write_access
);
2368 err
= ext4_walk_page_buffers(handle
, inode
, page_bufs
, 0, len
,
2369 NULL
, write_end_fn
);
2372 err
= ext4_jbd2_inode_add_write(handle
, inode
, folio_pos(folio
), len
);
2375 EXT4_I(inode
)->i_datasync_tid
= handle
->h_transaction
->t_tid
;
2380 static int mpage_journal_page_buffers(handle_t
*handle
,
2381 struct mpage_da_data
*mpd
,
2382 struct folio
*folio
)
2384 struct inode
*inode
= mpd
->inode
;
2385 loff_t size
= i_size_read(inode
);
2386 size_t len
= folio_size(folio
);
2388 folio_clear_checked(folio
);
2389 mpd
->wbc
->nr_to_write
--;
2391 if (folio_pos(folio
) + len
> size
&&
2392 !ext4_verity_in_progress(inode
))
2393 len
= size
& (len
- 1);
2395 return ext4_journal_folio_buffers(handle
, folio
, len
);
 * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
 *				 needing mapping, submit mapped pages
 *
 * @mpd - where to look for pages
 *
 * Walk dirty pages in the mapping. If they are fully mapped, submit them for
 * IO immediately. If we cannot map blocks, we submit just already mapped
 * buffers in the page for IO and keep page dirty. When we can map blocks and
 * we find a page which isn't mapped we start accumulating extent of buffers
 * underlying these pages that needs mapping (formed by either delayed or
 * unwritten buffers). We also lock the pages containing these buffers. The
 * extent found is returned in @mpd structure (starting at mpd->lblk with
 * length mpd->len blocks).
 *
 * Note that this function can attach bios to one io_end structure which are
 * neither logically nor physically contiguous. Although it may seem like an
 * unnecessary complication, it is actually inevitable in blocksize < pagesize
 * case as we need to track IO to all buffers underlying a page in one io_end.
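/*
 * Small sketch (standalone, not part of this file) of the index arithmetic
 * the scan above relies on when the block size is smaller than the page size:
 * a page-cache index maps to a run of file-logical blocks.
 */
#include <stdint.h>

/* first logical block covered by a given page-cache index */
static uint64_t demo_index_to_lblk(uint64_t index, unsigned int page_shift,
				   unsigned int blkbits)
{
	return index << (page_shift - blkbits);
}

/* page-cache index containing a given logical block */
static uint64_t demo_lblk_to_index(uint64_t lblk, unsigned int page_shift,
				   unsigned int blkbits)
{
	return lblk >> (page_shift - blkbits);
}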
2418 static int mpage_prepare_extent_to_map(struct mpage_da_data
*mpd
)
2420 struct address_space
*mapping
= mpd
->inode
->i_mapping
;
2421 struct folio_batch fbatch
;
2422 unsigned int nr_folios
;
2423 pgoff_t index
= mpd
->first_page
;
2424 pgoff_t end
= mpd
->last_page
;
2427 int blkbits
= mpd
->inode
->i_blkbits
;
2429 struct buffer_head
*head
;
2430 handle_t
*handle
= NULL
;
2431 int bpp
= ext4_journal_blocks_per_page(mpd
->inode
);
2433 if (mpd
->wbc
->sync_mode
== WB_SYNC_ALL
|| mpd
->wbc
->tagged_writepages
)
2434 tag
= PAGECACHE_TAG_TOWRITE
;
2436 tag
= PAGECACHE_TAG_DIRTY
;
2439 mpd
->next_page
= index
;
2440 if (ext4_should_journal_data(mpd
->inode
)) {
2441 handle
= ext4_journal_start(mpd
->inode
, EXT4_HT_WRITE_PAGE
,
2444 return PTR_ERR(handle
);
2446 folio_batch_init(&fbatch
);
2447 while (index
<= end
) {
2448 nr_folios
= filemap_get_folios_tag(mapping
, &index
, end
,
2453 for (i
= 0; i
< nr_folios
; i
++) {
2454 struct folio
*folio
= fbatch
.folios
[i
];
2457 * Accumulated enough dirty pages? This doesn't apply
2458 * to WB_SYNC_ALL mode. For integrity sync we have to
2459 * keep going because someone may be concurrently
2460 * dirtying pages, and we might have synced a lot of
2461 * newly appeared dirty pages, but have not synced all
2462 * of the old dirty pages.
2464 if (mpd
->wbc
->sync_mode
== WB_SYNC_NONE
&&
2465 mpd
->wbc
->nr_to_write
<=
2466 mpd
->map
.m_len
>> (PAGE_SHIFT
- blkbits
))
2469 /* If we can't merge this page, we are done. */
2470 if (mpd
->map
.m_len
> 0 && mpd
->next_page
!= folio
->index
)
2474 err
= ext4_journal_ensure_credits(handle
, bpp
,
2482 * If the page is no longer dirty, or its mapping no
2483 * longer corresponds to inode we are writing (which
2484 * means it has been truncated or invalidated), or the
2485 * page is already under writeback and we are not doing
2486 * a data integrity writeback, skip the page
2488 if (!folio_test_dirty(folio
) ||
2489 (folio_test_writeback(folio
) &&
2490 (mpd
->wbc
->sync_mode
== WB_SYNC_NONE
)) ||
2491 unlikely(folio
->mapping
!= mapping
)) {
2492 folio_unlock(folio
);
2496 folio_wait_writeback(folio
);
2497 BUG_ON(folio_test_writeback(folio
));
 * Should never happen but for buggy code in
 * other subsystems that call
 * set_page_dirty() without properly warning
 * the file system first. See [1] for more
 * information.
 *
 * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz
2508 if (!folio_buffers(folio
)) {
2509 ext4_warning_inode(mpd
->inode
, "page %lu does not have buffers attached", folio
->index
);
2510 folio_clear_dirty(folio
);
2511 folio_unlock(folio
);
2515 if (mpd
->map
.m_len
== 0)
2516 mpd
->first_page
= folio
->index
;
2517 mpd
->next_page
= folio_next_index(folio
);
 * Writeout when we cannot modify metadata is simple.
 * Just submit the page. For data=journal mode we
 * first handle writeout of the page for checkpoint and
 * only after that handle delayed page dirtying. This
 * makes sure current data is checkpointed to the final
 * location before possibly journalling it again which
 * is desirable when the page is frequently dirtied
 * through a pin.
2528 if (!mpd
->can_map
) {
2529 err
= mpage_submit_folio(mpd
, folio
);
2532 /* Pending dirtying of journalled data? */
2533 if (folio_test_checked(folio
)) {
2534 err
= mpage_journal_page_buffers(handle
,
2538 mpd
->journalled_more_data
= 1;
2540 mpage_folio_done(mpd
, folio
);
2542 /* Add all dirty buffers to mpd */
2543 lblk
= ((ext4_lblk_t
)folio
->index
) <<
2544 (PAGE_SHIFT
- blkbits
);
2545 head
= folio_buffers(folio
);
2546 err
= mpage_process_page_bufs(mpd
, head
, head
,
2553 folio_batch_release(&fbatch
);
2556 mpd
->scanned_until_end
= 1;
2558 ext4_journal_stop(handle
);
2561 folio_batch_release(&fbatch
);
2563 ext4_journal_stop(handle
);
2567 static int ext4_do_writepages(struct mpage_da_data
*mpd
)
2569 struct writeback_control
*wbc
= mpd
->wbc
;
2570 pgoff_t writeback_index
= 0;
2571 long nr_to_write
= wbc
->nr_to_write
;
2572 int range_whole
= 0;
2574 handle_t
*handle
= NULL
;
2575 struct inode
*inode
= mpd
->inode
;
2576 struct address_space
*mapping
= inode
->i_mapping
;
2577 int needed_blocks
, rsv_blocks
= 0, ret
= 0;
2578 struct ext4_sb_info
*sbi
= EXT4_SB(mapping
->host
->i_sb
);
2579 struct blk_plug plug
;
2580 bool give_up_on_write
= false;
2582 trace_ext4_writepages(inode
, wbc
);
2585 * No pages to write? This is mainly a kludge to avoid starting
2586 * a transaction for special inodes like journal inode on last iput()
2587 * because that could violate lock ordering on umount
2589 if (!mapping
->nrpages
|| !mapping_tagged(mapping
, PAGECACHE_TAG_DIRTY
))
2590 goto out_writepages
;
 * If the filesystem has aborted, it is read-only, so return
 * right away instead of dumping stack traces later on that
 * will obscure the real source of the problem. We test
 * fs shutdown state instead of sb->s_flag's SB_RDONLY because
 * the latter could be true if the filesystem is mounted
 * read-only, and in that case, ext4_writepages should
 * *never* be called, so if that ever happens, we would want
 * the stack trace.
2602 if (unlikely(ext4_forced_shutdown(mapping
->host
->i_sb
))) {
2604 goto out_writepages
;
2608 * If we have inline data and arrive here, it means that
2609 * we will soon create the block for the 1st page, so
2610 * we'd better clear the inline data here.
2612 if (ext4_has_inline_data(inode
)) {
2613 /* Just inode will be modified... */
2614 handle
= ext4_journal_start(inode
, EXT4_HT_INODE
, 1);
2615 if (IS_ERR(handle
)) {
2616 ret
= PTR_ERR(handle
);
2617 goto out_writepages
;
2619 BUG_ON(ext4_test_inode_state(inode
,
2620 EXT4_STATE_MAY_INLINE_DATA
));
2621 ext4_destroy_inline_data(handle
, inode
);
2622 ext4_journal_stop(handle
);
2626 * data=journal mode does not do delalloc so we just need to writeout /
2627 * journal already mapped buffers. On the other hand we need to commit
2628 * transaction to make data stable. We expect all the data to be
2629 * already in the journal (the only exception are DMA pinned pages
2630 * dirtied behind our back) so we commit transaction here and run the
2631 * writeback loop to checkpoint them. The checkpointing is not actually
2632 * necessary to make data persistent *but* quite a few places (extent
2633 * shifting operations, fsverity, ...) depend on being able to drop
2634 * pagecache pages after calling filemap_write_and_wait() and for that
2635 * checkpointing needs to happen.
2637 if (ext4_should_journal_data(inode
)) {
2639 if (wbc
->sync_mode
== WB_SYNC_ALL
)
2640 ext4_fc_commit(sbi
->s_journal
,
2641 EXT4_I(inode
)->i_datasync_tid
);
2643 mpd
->journalled_more_data
= 0;
2645 if (ext4_should_dioread_nolock(inode
)) {
2647 * We may need to convert up to one extent per block in
2648 * the page and we may dirty the inode.
2650 rsv_blocks
= 1 + ext4_chunk_trans_blocks(inode
,
2651 PAGE_SIZE
>> inode
->i_blkbits
);
2654 if (wbc
->range_start
== 0 && wbc
->range_end
== LLONG_MAX
)
2657 if (wbc
->range_cyclic
) {
2658 writeback_index
= mapping
->writeback_index
;
2659 if (writeback_index
)
2661 mpd
->first_page
= writeback_index
;
2662 mpd
->last_page
= -1;
2664 mpd
->first_page
= wbc
->range_start
>> PAGE_SHIFT
;
2665 mpd
->last_page
= wbc
->range_end
>> PAGE_SHIFT
;
2668 ext4_io_submit_init(&mpd
->io_submit
, wbc
);
2670 if (wbc
->sync_mode
== WB_SYNC_ALL
|| wbc
->tagged_writepages
)
2671 tag_pages_for_writeback(mapping
, mpd
->first_page
,
2673 blk_start_plug(&plug
);
 * First writeback pages that don't need mapping - we can avoid
 * starting a transaction unnecessarily and also avoid being blocked
 * in the block layer on device congestion while having transaction
 * started.
2682 mpd
->scanned_until_end
= 0;
2683 mpd
->io_submit
.io_end
= ext4_init_io_end(inode
, GFP_KERNEL
);
2684 if (!mpd
->io_submit
.io_end
) {
2688 ret
= mpage_prepare_extent_to_map(mpd
);
2689 /* Unlock pages we didn't use */
2690 mpage_release_unused_pages(mpd
, false);
2691 /* Submit prepared bio */
2692 ext4_io_submit(&mpd
->io_submit
);
2693 ext4_put_io_end_defer(mpd
->io_submit
.io_end
);
2694 mpd
->io_submit
.io_end
= NULL
;
2698 while (!mpd
->scanned_until_end
&& wbc
->nr_to_write
> 0) {
2699 /* For each extent of pages we use new io_end */
2700 mpd
->io_submit
.io_end
= ext4_init_io_end(inode
, GFP_KERNEL
);
2701 if (!mpd
->io_submit
.io_end
) {
2706 WARN_ON_ONCE(!mpd
->can_map
);
 * We have two constraints: We find one extent to map and we
 * must always write out the whole page (makes a difference when
 * blocksize < pagesize) so that we don't block on IO when we
 * try to write out the rest of the page. Journalled mode is
 * not supported by delalloc.
2714 BUG_ON(ext4_should_journal_data(inode
));
2715 needed_blocks
= ext4_da_writepages_trans_blocks(inode
);
2717 /* start a new transaction */
2718 handle
= ext4_journal_start_with_reserve(inode
,
2719 EXT4_HT_WRITE_PAGE
, needed_blocks
, rsv_blocks
);
2720 if (IS_ERR(handle
)) {
2721 ret
= PTR_ERR(handle
);
2722 ext4_msg(inode
->i_sb
, KERN_CRIT
, "%s: jbd2_start: "
2723 "%ld pages, ino %lu; err %d", __func__
,
2724 wbc
->nr_to_write
, inode
->i_ino
, ret
);
2725 /* Release allocated io_end */
2726 ext4_put_io_end(mpd
->io_submit
.io_end
);
2727 mpd
->io_submit
.io_end
= NULL
;
2732 trace_ext4_da_write_pages(inode
, mpd
->first_page
, wbc
);
2733 ret
= mpage_prepare_extent_to_map(mpd
);
2734 if (!ret
&& mpd
->map
.m_len
)
2735 ret
= mpage_map_and_submit_extent(handle
, mpd
,
2738 * Caution: If the handle is synchronous,
2739 * ext4_journal_stop() can wait for transaction commit
2740 * to finish which may depend on writeback of pages to
2741 * complete or on page lock to be released. In that
2742 * case, we have to wait until after we have
2743 * submitted all the IO, released page locks we hold,
2744 * and dropped io_end reference (for extent conversion
2745 * to be able to complete) before stopping the handle.
2747 if (!ext4_handle_valid(handle
) || handle
->h_sync
== 0) {
2748 ext4_journal_stop(handle
);
2752 /* Unlock pages we didn't use */
2753 mpage_release_unused_pages(mpd
, give_up_on_write
);
2754 /* Submit prepared bio */
2755 ext4_io_submit(&mpd
->io_submit
);
2758 * Drop our io_end reference we got from init. We have
2759 * to be careful and use deferred io_end finishing if
2760 * we are still holding the transaction as we can
2761 * release the last reference to io_end which may end
2762 * up doing unwritten extent conversion.
2765 ext4_put_io_end_defer(mpd
->io_submit
.io_end
);
2766 ext4_journal_stop(handle
);
2768 ext4_put_io_end(mpd
->io_submit
.io_end
);
2769 mpd
->io_submit
.io_end
= NULL
;
2771 if (ret
== -ENOSPC
&& sbi
->s_journal
) {
 * Commit the transaction which would
 * free blocks released in the transaction
 * and try again.
2777 jbd2_journal_force_commit_nested(sbi
->s_journal
);
2781 /* Fatal error - ENOMEM, EIO... */
2786 blk_finish_plug(&plug
);
2787 if (!ret
&& !cycled
&& wbc
->nr_to_write
> 0) {
2789 mpd
->last_page
= writeback_index
- 1;
2790 mpd
->first_page
= 0;
2795 if (wbc
->range_cyclic
|| (range_whole
&& wbc
->nr_to_write
> 0))
2797 * Set the writeback_index so that range_cyclic
2798 * mode will write it back later
2800 mapping
->writeback_index
= mpd
->first_page
;
2803 trace_ext4_writepages_result(inode
, wbc
, ret
,
2804 nr_to_write
- wbc
->nr_to_write
);
2808 static int ext4_writepages(struct address_space
*mapping
,
2809 struct writeback_control
*wbc
)
2811 struct super_block
*sb
= mapping
->host
->i_sb
;
2812 struct mpage_da_data mpd
= {
2813 .inode
= mapping
->host
,
2820 if (unlikely(ext4_forced_shutdown(sb
)))
2823 alloc_ctx
= ext4_writepages_down_read(sb
);
2824 ret
= ext4_do_writepages(&mpd
);
2826 * For data=journal writeback we could have come across pages marked
2827 * for delayed dirtying (PageChecked) which were just added to the
2828 * running transaction. Try once more to get them to stable storage.
2830 if (!ret
&& mpd
.journalled_more_data
)
2831 ret
= ext4_do_writepages(&mpd
);
2832 ext4_writepages_up_read(sb
, alloc_ctx
);
2837 int ext4_normal_submit_inode_data_buffers(struct jbd2_inode
*jinode
)
2839 struct writeback_control wbc
= {
2840 .sync_mode
= WB_SYNC_ALL
,
2841 .nr_to_write
= LONG_MAX
,
2842 .range_start
= jinode
->i_dirty_start
,
2843 .range_end
= jinode
->i_dirty_end
,
2845 struct mpage_da_data mpd
= {
2846 .inode
= jinode
->i_vfs_inode
,
2850 return ext4_do_writepages(&mpd
);
2853 static int ext4_dax_writepages(struct address_space
*mapping
,
2854 struct writeback_control
*wbc
)
2857 long nr_to_write
= wbc
->nr_to_write
;
2858 struct inode
*inode
= mapping
->host
;
2861 if (unlikely(ext4_forced_shutdown(inode
->i_sb
)))
2864 alloc_ctx
= ext4_writepages_down_read(inode
->i_sb
);
2865 trace_ext4_writepages(inode
, wbc
);
2867 ret
= dax_writeback_mapping_range(mapping
,
2868 EXT4_SB(inode
->i_sb
)->s_daxdev
, wbc
);
2869 trace_ext4_writepages_result(inode
, wbc
, ret
,
2870 nr_to_write
- wbc
->nr_to_write
);
2871 ext4_writepages_up_read(inode
->i_sb
, alloc_ctx
);
2875 static int ext4_nonda_switch(struct super_block
*sb
)
2877 s64 free_clusters
, dirty_clusters
;
2878 struct ext4_sb_info
*sbi
= EXT4_SB(sb
);
 * switch to non delalloc mode if we are running low
 * on free blocks. The free block accounting via percpu
 * counters can get slightly wrong with percpu_counter_batch getting
 * accumulated on each CPU without updating global counters.
 * Delalloc needs an accurate free block accounting. So switch
 * to non delalloc when we are near the error range.
2889 percpu_counter_read_positive(&sbi
->s_freeclusters_counter
);
2891 percpu_counter_read_positive(&sbi
->s_dirtyclusters_counter
);
2893 * Start pushing delalloc when 1/2 of free blocks are dirty.
2895 if (dirty_clusters
&& (free_clusters
< 2 * dirty_clusters
))
2896 try_to_writeback_inodes_sb(sb
, WB_REASON_FS_FREE_SPACE
);
2898 if (2 * free_clusters
< 3 * dirty_clusters
||
2899 free_clusters
< (dirty_clusters
+ EXT4_FREECLUSTERS_WATERMARK
)) {
 * free block count is less than 150% of dirty blocks
 * or the free block count is less than the watermark
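/*
 * Illustrative restatement (standalone, not part of this file) of the two
 * thresholds described above; all demo_* names are hypothetical.
 */
#include <stdbool.h>
#include <stdint.h>

/* kick delalloc writeback once half of the free clusters are already dirty */
static bool demo_should_push_delalloc(int64_t free_clusters,
				      int64_t dirty_clusters)
{
	return dirty_clusters && free_clusters < 2 * dirty_clusters;
}

/* fall back to non-delalloc writes when free clusters drop below 150% of the
 * dirty clusters, or below dirty clusters plus the watermark */
static bool demo_switch_to_nondelalloc(int64_t free_clusters,
				       int64_t dirty_clusters,
				       int64_t watermark)
{
	return 2 * free_clusters < 3 * dirty_clusters ||
	       free_clusters < dirty_clusters + watermark;
}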
2909 static int ext4_da_write_begin(struct file
*file
, struct address_space
*mapping
,
2910 loff_t pos
, unsigned len
,
2911 struct folio
**foliop
, void **fsdata
)
2913 int ret
, retries
= 0;
2914 struct folio
*folio
;
2916 struct inode
*inode
= mapping
->host
;
2918 if (unlikely(ext4_forced_shutdown(inode
->i_sb
)))
2921 index
= pos
>> PAGE_SHIFT
;
2923 if (ext4_nonda_switch(inode
->i_sb
) || ext4_verity_in_progress(inode
)) {
2924 *fsdata
= (void *)FALL_BACK_TO_NONDELALLOC
;
2925 return ext4_write_begin(file
, mapping
, pos
,
2926 len
, foliop
, fsdata
);
2928 *fsdata
= (void *)0;
2929 trace_ext4_da_write_begin(inode
, pos
, len
);
2931 if (ext4_test_inode_state(inode
, EXT4_STATE_MAY_INLINE_DATA
)) {
2932 ret
= ext4_da_write_inline_data_begin(mapping
, inode
, pos
, len
,
2941 folio
= __filemap_get_folio(mapping
, index
, FGP_WRITEBEGIN
,
2942 mapping_gfp_mask(mapping
));
2944 return PTR_ERR(folio
);
2946 ret
= ext4_block_write_begin(NULL
, folio
, pos
, len
,
2947 ext4_da_get_block_prep
);
2949 folio_unlock(folio
);
2952 * block_write_begin may have instantiated a few blocks
2953 * outside i_size. Trim these off again. Don't need
2954 * i_size_read because we hold inode lock.
2956 if (pos
+ len
> inode
->i_size
)
2957 ext4_truncate_failed_write(inode
);
2959 if (ret
== -ENOSPC
&&
2960 ext4_should_retry_alloc(inode
->i_sb
, &retries
))
 * Check if we should update i_disksize
 * when writing to the end of the file without requiring block allocation
2973 static int ext4_da_should_update_i_disksize(struct folio
*folio
,
2974 unsigned long offset
)
2976 struct buffer_head
*bh
;
2977 struct inode
*inode
= folio
->mapping
->host
;
2981 bh
= folio_buffers(folio
);
2982 idx
= offset
>> inode
->i_blkbits
;
2984 for (i
= 0; i
< idx
; i
++)
2985 bh
= bh
->b_this_page
;
2987 if (!buffer_mapped(bh
) || (buffer_delay(bh
)) || buffer_unwritten(bh
))
2992 static int ext4_da_do_write_end(struct address_space
*mapping
,
2993 loff_t pos
, unsigned len
, unsigned copied
,
2994 struct folio
*folio
)
2996 struct inode
*inode
= mapping
->host
;
2997 loff_t old_size
= inode
->i_size
;
2998 bool disksize_changed
= false;
2999 loff_t new_i_size
, zero_len
= 0;
3002 if (unlikely(!folio_buffers(folio
))) {
3003 folio_unlock(folio
);
 * block_write_end() will mark the inode as dirty with the I_DIRTY_PAGES
 * flag, which is all that's needed to trigger page writeback.
3011 copied
= block_write_end(NULL
, mapping
, pos
, len
, copied
,
3013 new_i_size
= pos
+ copied
;
 * It's important to update i_size while still holding folio lock,
 * because folio writeout could otherwise come in and zero beyond
 * i_size.
 *
 * Since we are holding inode lock, we are sure i_disksize <=
 * i_size. We also know that if i_disksize < i_size, there are
 * delalloc writes pending in the range up to i_size. If the end of
 * the current write is <= i_size, there's no need to touch
 * i_disksize since writeback will push i_disksize up to i_size
 * eventually. If the end of the current write is > i_size and
 * inside an allocated block which ext4_da_should_update_i_disksize()
 * checked, we need to update i_disksize here because certain
 * ext4_writepages() paths that do not allocate blocks will not update
 * i_disksize.
3030 if (new_i_size
> inode
->i_size
) {
3033 i_size_write(inode
, new_i_size
);
3034 end
= (new_i_size
- 1) & (PAGE_SIZE
- 1);
3035 if (copied
&& ext4_da_should_update_i_disksize(folio
, end
)) {
3036 ext4_update_i_disksize(inode
, new_i_size
);
3037 disksize_changed
= true;
3041 folio_unlock(folio
);
3044 if (pos
> old_size
) {
3045 pagecache_isize_extended(inode
, old_size
, pos
);
3046 zero_len
= pos
- old_size
;
3049 if (!disksize_changed
&& !zero_len
)
3052 handle
= ext4_journal_start(inode
, EXT4_HT_INODE
, 2);
3054 return PTR_ERR(handle
);
3056 ext4_zero_partial_blocks(handle
, inode
, old_size
, zero_len
);
3057 ext4_mark_inode_dirty(handle
, inode
);
3058 ext4_journal_stop(handle
);
3063 static int ext4_da_write_end(struct file
*file
,
3064 struct address_space
*mapping
,
3065 loff_t pos
, unsigned len
, unsigned copied
,
3066 struct folio
*folio
, void *fsdata
)
3068 struct inode
*inode
= mapping
->host
;
3069 int write_mode
= (int)(unsigned long)fsdata
;
3071 if (write_mode
== FALL_BACK_TO_NONDELALLOC
)
3072 return ext4_write_end(file
, mapping
, pos
,
3073 len
, copied
, folio
, fsdata
);
3075 trace_ext4_da_write_end(inode
, pos
, len
, copied
);
3077 if (write_mode
!= CONVERT_INLINE_DATA
&&
3078 ext4_test_inode_state(inode
, EXT4_STATE_MAY_INLINE_DATA
) &&
3079 ext4_has_inline_data(inode
))
3080 return ext4_write_inline_data_end(inode
, pos
, len
, copied
,
3083 if (unlikely(copied
< len
) && !folio_test_uptodate(folio
))
3086 return ext4_da_do_write_end(mapping
, pos
, len
, copied
, folio
);
3090 * Force all delayed allocation blocks to be allocated for a given inode.
3092 int ext4_alloc_da_blocks(struct inode
*inode
)
3094 trace_ext4_alloc_da_blocks(inode
);
3096 if (!EXT4_I(inode
)->i_reserved_data_blocks
)
3100 * We do something simple for now. The filemap_flush() will
3101 * also start triggering a write of the data blocks, which is
3102 * not strictly speaking necessary (and for users of
3103 * laptop_mode, not even desirable). However, to do otherwise
3104 * would require replicating code paths in:
3106 * ext4_writepages() ->
3107 * write_cache_pages() ---> (via passed in callback function)
3108 * __mpage_da_writepage() -->
3109 * mpage_add_bh_to_extent()
3110 * mpage_da_map_blocks()
3112 * The problem is that write_cache_pages(), located in
3113 * mm/page-writeback.c, marks pages clean in preparation for
3114 * doing I/O, which is not desirable if we're not planning on
3117 * We could call write_cache_pages(), and then redirty all of
3118 * the pages by calling redirty_page_for_writepage() but that
3119 * would be ugly in the extreme. So instead we would need to
3120 * replicate parts of the code in the above functions,
3121 * simplifying them because we wouldn't actually intend to
3122 * write out the pages, but rather only collect contiguous
3123 * logical block extents, call the multi-block allocator, and
3124 * then update the buffer heads with the block allocations.
3126 * For now, though, we'll cheat by calling filemap_flush(),
3127 * which will map the blocks, and start the I/O, but not
3128 * actually wait for the I/O to complete.
3130 return filemap_flush(inode
->i_mapping
);
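/*
 * Hedged userspace analogue (standalone program, not kernel code) of the
 * "start writeback but don't wait" behaviour described above: from an
 * application, sync_file_range(SYNC_FILE_RANGE_WRITE) similarly kicks off
 * writeback of dirty pages, forcing delayed allocations to be resolved,
 * without blocking until the IO completes.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* start writeback of the whole file, but do not wait for completion */
	if (sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE) < 0)
		perror("sync_file_range");

	close(fd);
	return 0;
}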
 * bmap() is special. It gets used by applications such as lilo and by
 * the swapper to find the on-disk block of a specific piece of data.
 *
 * Naturally, this is dangerous if the block concerned is still in the
 * journal. If somebody makes a swapfile on an ext4 data-journaling
 * filesystem and enables swap, then they may get a nasty shock when the
 * data getting swapped to that swapfile suddenly gets overwritten by
 * the original zeros written out previously to the journal and
 * awaiting writeback in the kernel's buffer cache.
 *
 * So, if we see any bmap calls here on a modified, data-journaled file,
 * take extra steps to flush any blocks which might be in the cache.
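/*
 * Hedged userspace illustration (standalone program, not kernel code) of the
 * bmap usage mentioned above: a boot-loader style tool queries the on-disk
 * block of a file block with the FIBMAP ioctl, which is what ultimately
 * reaches ->bmap (ext4_bmap below). Needs CAP_SYS_RAWIO.
 */
#include <fcntl.h>
#include <linux/fs.h>		/* FIBMAP */
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s <file> <logical-block>\n", argv[0]);
		return 1;
	}

	int fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	int block = atoi(argv[2]);	/* in: logical block, out: physical block */
	if (ioctl(fd, FIBMAP, &block) < 0) {
		perror("FIBMAP");
		close(fd);
		return 1;
	}

	printf("physical block: %d\n", block);
	close(fd);
	return 0;
}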
3147 static sector_t
ext4_bmap(struct address_space
*mapping
, sector_t block
)
3149 struct inode
*inode
= mapping
->host
;
3152 inode_lock_shared(inode
);
3154 * We can get here for an inline file via the FIBMAP ioctl
3156 if (ext4_has_inline_data(inode
))
3159 if (mapping_tagged(mapping
, PAGECACHE_TAG_DIRTY
) &&
3160 (test_opt(inode
->i_sb
, DELALLOC
) ||
3161 ext4_should_journal_data(inode
))) {
 * With delalloc or journalled data we want to sync the file so
 * that we can make sure we allocate blocks for the file and the
 * data is in place for the user to see it
3167 filemap_write_and_wait(mapping
);
3170 ret
= iomap_bmap(mapping
, block
, &ext4_iomap_ops
);
3173 inode_unlock_shared(inode
);
3177 static int ext4_read_folio(struct file
*file
, struct folio
*folio
)
3180 struct inode
*inode
= folio
->mapping
->host
;
3182 trace_ext4_read_folio(inode
, folio
);
3184 if (ext4_has_inline_data(inode
))
3185 ret
= ext4_readpage_inline(inode
, folio
);
3188 return ext4_mpage_readpages(inode
, NULL
, folio
);
3193 static void ext4_readahead(struct readahead_control
*rac
)
3195 struct inode
*inode
= rac
->mapping
->host
;
3197 /* If the file has inline data, no need to do readahead. */
3198 if (ext4_has_inline_data(inode
))
3201 ext4_mpage_readpages(inode
, rac
, NULL
);
3204 static void ext4_invalidate_folio(struct folio
*folio
, size_t offset
,
3207 trace_ext4_invalidate_folio(folio
, offset
, length
);
3209 /* No journalling happens on data buffers when this function is used */
3210 WARN_ON(folio_buffers(folio
) && buffer_jbd(folio_buffers(folio
)));
3212 block_invalidate_folio(folio
, offset
, length
);
3215 static int __ext4_journalled_invalidate_folio(struct folio
*folio
,
3216 size_t offset
, size_t length
)
3218 journal_t
*journal
= EXT4_JOURNAL(folio
->mapping
->host
);
3220 trace_ext4_journalled_invalidate_folio(folio
, offset
, length
);
3223 * If it's a full truncate we just forget about the pending dirtying
3225 if (offset
== 0 && length
== folio_size(folio
))
3226 folio_clear_checked(folio
);
3228 return jbd2_journal_invalidate_folio(journal
, folio
, offset
, length
);
3231 /* Wrapper for aops... */
3232 static void ext4_journalled_invalidate_folio(struct folio
*folio
,
3236 WARN_ON(__ext4_journalled_invalidate_folio(folio
, offset
, length
) < 0);
3239 static bool ext4_release_folio(struct folio
*folio
, gfp_t wait
)
3241 struct inode
*inode
= folio
->mapping
->host
;
3242 journal_t
*journal
= EXT4_JOURNAL(inode
);
3244 trace_ext4_release_folio(inode
, folio
);
3246 /* Page has dirty journalled data -> cannot release */
3247 if (folio_test_checked(folio
))
3250 return jbd2_journal_try_to_free_buffers(journal
, folio
);
3252 return try_to_free_buffers(folio
);
3255 static bool ext4_inode_datasync_dirty(struct inode
*inode
)
3257 journal_t
*journal
= EXT4_SB(inode
->i_sb
)->s_journal
;
3260 if (jbd2_transaction_committed(journal
,
3261 EXT4_I(inode
)->i_datasync_tid
))
3263 if (test_opt2(inode
->i_sb
, JOURNAL_FAST_COMMIT
))
3264 return !list_empty(&EXT4_I(inode
)->i_fc_list
);
3268 /* Any metadata buffers to write? */
3269 if (!list_empty(&inode
->i_mapping
->i_private_list
))
3271 return inode
->i_state
& I_DIRTY_DATASYNC
;
3274 static void ext4_set_iomap(struct inode
*inode
, struct iomap
*iomap
,
3275 struct ext4_map_blocks
*map
, loff_t offset
,
3276 loff_t length
, unsigned int flags
)
3278 u8 blkbits
= inode
->i_blkbits
;
 * Writes that span EOF might trigger an I/O size update on completion,
 * so consider them to be dirty for the purpose of O_DSYNC, even if
 * there are no other metadata changes being made or pending.
3286 if (ext4_inode_datasync_dirty(inode
) ||
3287 offset
+ length
> i_size_read(inode
))
3288 iomap
->flags
|= IOMAP_F_DIRTY
;
3290 if (map
->m_flags
& EXT4_MAP_NEW
)
3291 iomap
->flags
|= IOMAP_F_NEW
;
3293 if (flags
& IOMAP_DAX
)
3294 iomap
->dax_dev
= EXT4_SB(inode
->i_sb
)->s_daxdev
;
3296 iomap
->bdev
= inode
->i_sb
->s_bdev
;
3297 iomap
->offset
= (u64
) map
->m_lblk
<< blkbits
;
3298 iomap
->length
= (u64
) map
->m_len
<< blkbits
;
3300 if ((map
->m_flags
& EXT4_MAP_MAPPED
) &&
3301 !ext4_test_inode_flag(inode
, EXT4_INODE_EXTENTS
))
3302 iomap
->flags
|= IOMAP_F_MERGED
;
 * Flags passed to ext4_map_blocks() for direct I/O writes can result
 * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
 * set. In order for any allocated unwritten extents to be converted
 * into written extents correctly within the ->end_io() handler, we
 * need to ensure that the iomap->type is set appropriately. Hence, the
 * reason why we need to check whether the EXT4_MAP_UNWRITTEN bit has
 * been set first.
3313 if (map
->m_flags
& EXT4_MAP_UNWRITTEN
) {
3314 iomap
->type
= IOMAP_UNWRITTEN
;
3315 iomap
->addr
= (u64
) map
->m_pblk
<< blkbits
;
3316 if (flags
& IOMAP_DAX
)
3317 iomap
->addr
+= EXT4_SB(inode
->i_sb
)->s_dax_part_off
;
3318 } else if (map
->m_flags
& EXT4_MAP_MAPPED
) {
3319 iomap
->type
= IOMAP_MAPPED
;
3320 iomap
->addr
= (u64
) map
->m_pblk
<< blkbits
;
3321 if (flags
& IOMAP_DAX
)
3322 iomap
->addr
+= EXT4_SB(inode
->i_sb
)->s_dax_part_off
;
3323 } else if (map
->m_flags
& EXT4_MAP_DELAYED
) {
3324 iomap
->type
= IOMAP_DELALLOC
;
3325 iomap
->addr
= IOMAP_NULL_ADDR
;
3327 iomap
->type
= IOMAP_HOLE
;
3328 iomap
->addr
= IOMAP_NULL_ADDR
;
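/*
 * Illustrative restatement (standalone, not part of this file) of the
 * decision order implemented just above: the unwritten bit is tested before
 * the mapped bit because a direct I/O write can set both, and only the
 * unwritten type triggers extent conversion in ->end_io(). The demo_* enums
 * are hypothetical, not the kernel definitions.
 */
enum demo_map_state { DEMO_UNWRITTEN = 1, DEMO_MAPPED = 2, DEMO_DELAYED = 4 };
enum demo_iomap_type { DEMO_HOLE, DEMO_DELALLOC, DEMO_MAPPED_EXTENT,
		       DEMO_UNWRITTEN_EXTENT };

static enum demo_iomap_type demo_pick_iomap_type(unsigned int state)
{
	if (state & DEMO_UNWRITTEN)
		return DEMO_UNWRITTEN_EXTENT;	/* checked first, see above */
	if (state & DEMO_MAPPED)
		return DEMO_MAPPED_EXTENT;
	if (state & DEMO_DELAYED)
		return DEMO_DELALLOC;
	return DEMO_HOLE;
}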
3332 static int ext4_iomap_alloc(struct inode
*inode
, struct ext4_map_blocks
*map
,
3336 u8 blkbits
= inode
->i_blkbits
;
3337 int ret
, dio_credits
, m_flags
= 0, retries
= 0;
3340 * Trim the mapping request to the maximum value that we can map at
3341 * once for direct I/O.
3343 if (map
->m_len
> DIO_MAX_BLOCKS
)
3344 map
->m_len
= DIO_MAX_BLOCKS
;
3345 dio_credits
= ext4_chunk_trans_blocks(inode
, map
->m_len
);
3349 * Either we allocate blocks and then don't get an unwritten extent, so
3350 * in that case we have reserved enough credits. Or, the blocks are
3351 * already allocated and unwritten. In that case, the extent conversion
3352 * fits into the credits as well.
3354 handle
= ext4_journal_start(inode
, EXT4_HT_MAP_BLOCKS
, dio_credits
);
3356 return PTR_ERR(handle
);
3359 * DAX and direct I/O are the only two operations that are currently
3360 * supported with IOMAP_WRITE.
3362 WARN_ON(!(flags
& (IOMAP_DAX
| IOMAP_DIRECT
)));
3363 if (flags
& IOMAP_DAX
)
3364 m_flags
= EXT4_GET_BLOCKS_CREATE_ZERO
;
3366 * We use i_size instead of i_disksize here because delalloc writeback
3367 * can complete at any point during the I/O and subsequently push the
3368 * i_disksize out to i_size. This could be beyond where direct I/O is
3369 * happening and thus expose allocated blocks to direct I/O reads.
3371 else if (((loff_t
)map
->m_lblk
<< blkbits
) >= i_size_read(inode
))
3372 m_flags
= EXT4_GET_BLOCKS_CREATE
;
3373 else if (ext4_test_inode_flag(inode
, EXT4_INODE_EXTENTS
))
3374 m_flags
= EXT4_GET_BLOCKS_IO_CREATE_EXT
;
3376 ret
= ext4_map_blocks(handle
, inode
, map
, m_flags
);
 * We cannot fill holes in indirect tree based inodes as that could
 * expose stale data in the case of a crash. Use the magic error code
 * to fall back to buffered I/O.
3383 if (!m_flags
&& !ret
)
3386 ext4_journal_stop(handle
);
3387 if (ret
== -ENOSPC
&& ext4_should_retry_alloc(inode
->i_sb
, &retries
))
3394 static int ext4_iomap_begin(struct inode
*inode
, loff_t offset
, loff_t length
,
3395 unsigned flags
, struct iomap
*iomap
, struct iomap
*srcmap
)
3398 struct ext4_map_blocks map
;
3399 u8 blkbits
= inode
->i_blkbits
;
3401 if ((offset
>> blkbits
) > EXT4_MAX_LOGICAL_BLOCK
)
3404 if (WARN_ON_ONCE(ext4_has_inline_data(inode
)))
3408 * Calculate the first and last logical blocks respectively.
3410 map
.m_lblk
= offset
>> blkbits
;
3411 map
.m_len
= min_t(loff_t
, (offset
+ length
- 1) >> blkbits
,
3412 EXT4_MAX_LOGICAL_BLOCK
) - map
.m_lblk
+ 1;
3414 if (flags
& IOMAP_WRITE
) {
 * We check here whether the blocks are already allocated; if so, we
 * don't need to start a journal txn and we can directly return
 * the mapping information. This could boost performance,
 * especially in multi-threaded overwrite requests.
3421 if (offset
+ length
<= i_size_read(inode
)) {
3422 ret
= ext4_map_blocks(NULL
, inode
, &map
, 0);
3423 if (ret
> 0 && (map
.m_flags
& EXT4_MAP_MAPPED
))
3426 ret
= ext4_iomap_alloc(inode
, &map
, flags
);
3428 ret
= ext4_map_blocks(NULL
, inode
, &map
, 0);
3435 * When inline encryption is enabled, sometimes I/O to an encrypted file
3436 * has to be broken up to guarantee DUN contiguity. Handle this by
3437 * limiting the length of the mapping returned.
3439 map
.m_len
= fscrypt_limit_io_blocks(inode
, map
.m_lblk
, map
.m_len
);
3441 ext4_set_iomap(inode
, iomap
, &map
, offset
, length
, flags
);
3446 static int ext4_iomap_overwrite_begin(struct inode
*inode
, loff_t offset
,
3447 loff_t length
, unsigned flags
, struct iomap
*iomap
,
3448 struct iomap
*srcmap
)
3453 * Even for writes we don't need to allocate blocks, so just pretend
3454 * we are reading to save overhead of starting a transaction.
3456 flags
&= ~IOMAP_WRITE
;
3457 ret
= ext4_iomap_begin(inode
, offset
, length
, flags
, iomap
, srcmap
);
3458 WARN_ON_ONCE(!ret
&& iomap
->type
!= IOMAP_MAPPED
);
3462 static inline bool ext4_want_directio_fallback(unsigned flags
, ssize_t written
)
3464 /* must be a directio to fall back to buffered */
3465 if ((flags
& (IOMAP_WRITE
| IOMAP_DIRECT
)) !=
3466 (IOMAP_WRITE
| IOMAP_DIRECT
))
3469 /* atomic writes are all-or-nothing */
3470 if (flags
& IOMAP_ATOMIC
)
3473 /* can only try again if we wrote nothing */
3474 return written
== 0;
3477 static int ext4_iomap_end(struct inode
*inode
, loff_t offset
, loff_t length
,
3478 ssize_t written
, unsigned flags
, struct iomap
*iomap
)
 * Check to see whether an error occurred while writing out the data to
 * the allocated blocks. If so, return the magic error code for a
 * non-atomic write so that we fall back to buffered I/O and attempt to
 * complete the remainder of the I/O.
 * For non-atomic writes, any blocks that may have been
 * allocated in preparation for the direct I/O will be reused during
 * buffered I/O. For atomic writes, we never fall back to buffered I/O.
3489 if (ext4_want_directio_fallback(flags
, written
))
3495 const struct iomap_ops ext4_iomap_ops
= {
3496 .iomap_begin
= ext4_iomap_begin
,
3497 .iomap_end
= ext4_iomap_end
,
3500 const struct iomap_ops ext4_iomap_overwrite_ops
= {
3501 .iomap_begin
= ext4_iomap_overwrite_begin
,
3502 .iomap_end
= ext4_iomap_end
,
3505 static int ext4_iomap_begin_report(struct inode
*inode
, loff_t offset
,
3506 loff_t length
, unsigned int flags
,
3507 struct iomap
*iomap
, struct iomap
*srcmap
)
3510 struct ext4_map_blocks map
;
3511 u8 blkbits
= inode
->i_blkbits
;
3513 if ((offset
>> blkbits
) > EXT4_MAX_LOGICAL_BLOCK
)
3516 if (ext4_has_inline_data(inode
)) {
3517 ret
= ext4_inline_data_iomap(inode
, iomap
);
3518 if (ret
!= -EAGAIN
) {
3519 if (ret
== 0 && offset
>= iomap
->length
)
3526 * Calculate the first and last logical block respectively.
3528 map
.m_lblk
= offset
>> blkbits
;
3529 map
.m_len
= min_t(loff_t
, (offset
+ length
- 1) >> blkbits
,
3530 EXT4_MAX_LOGICAL_BLOCK
) - map
.m_lblk
+ 1;
 * Fiemap callers may call for an offset beyond s_bitmap_maxbytes.
 * So handle it here instead of querying ext4_map_blocks(),
 * since ext4_map_blocks() would warn about it and return an error.
3538 if (!(ext4_test_inode_flag(inode
, EXT4_INODE_EXTENTS
))) {
3539 struct ext4_sb_info
*sbi
= EXT4_SB(inode
->i_sb
);
3541 if (offset
>= sbi
->s_bitmap_maxbytes
) {
3547 ret
= ext4_map_blocks(NULL
, inode
, &map
, 0);
3551 ext4_set_iomap(inode
, iomap
, &map
, offset
, length
, flags
);
3556 const struct iomap_ops ext4_iomap_report_ops
= {
3557 .iomap_begin
= ext4_iomap_begin_report
,
3561 * For data=journal mode, folio should be marked dirty only when it was
3562 * writeably mapped. When that happens, it was already attached to the
3563 * transaction and marked as jbddirty (we take care of this in
3564 * ext4_page_mkwrite()). On transaction commit, we writeprotect page mappings
3565 * so we should have nothing to do here, except for the case when someone
3566 * had the page pinned and dirtied the page through this pin (e.g. by doing
3567 * direct IO to it). In that case we'd need to attach buffers here to the
3568 * transaction but we cannot due to lock ordering. We cannot just dirty the
3569 * folio and leave attached buffers clean, because the buffers' dirty state is
3570 * "definitive". We cannot just set the buffers dirty or jbddirty because all
3571 * the journalling code will explode. So what we do is to mark the folio
3572 * "pending dirty" and next time ext4_writepages() is called, attach buffers
3573 * to the transaction appropriately.
3575 static bool ext4_journalled_dirty_folio(struct address_space
*mapping
,
3576 struct folio
*folio
)
3578 WARN_ON_ONCE(!folio_buffers(folio
));
3579 if (folio_maybe_dma_pinned(folio
))
3580 folio_set_checked(folio
);
3581 return filemap_dirty_folio(mapping
, folio
);
3584 static bool ext4_dirty_folio(struct address_space
*mapping
, struct folio
*folio
)
3586 WARN_ON_ONCE(!folio_test_locked(folio
) && !folio_test_dirty(folio
));
3587 WARN_ON_ONCE(!folio_buffers(folio
));
3588 return block_dirty_folio(mapping
, folio
);
static int ext4_iomap_swap_activate(struct swap_info_struct *sis,
				    struct file *file, sector_t *span)
{
	return iomap_swapfile_activate(sis, file, span,
				       &ext4_iomap_report_ops);
}
static const struct address_space_operations ext4_aops = {
	.read_folio		= ext4_read_folio,
	.readahead		= ext4_readahead,
	.writepages		= ext4_writepages,
	.write_begin		= ext4_write_begin,
	.write_end		= ext4_write_end,
	.dirty_folio		= ext4_dirty_folio,
	.invalidate_folio	= ext4_invalidate_folio,
	.release_folio		= ext4_release_folio,
	.migrate_folio		= buffer_migrate_folio,
	.is_partially_uptodate	= block_is_partially_uptodate,
	.error_remove_folio	= generic_error_remove_folio,
	.swap_activate		= ext4_iomap_swap_activate,
};

static const struct address_space_operations ext4_journalled_aops = {
	.read_folio		= ext4_read_folio,
	.readahead		= ext4_readahead,
	.writepages		= ext4_writepages,
	.write_begin		= ext4_write_begin,
	.write_end		= ext4_journalled_write_end,
	.dirty_folio		= ext4_journalled_dirty_folio,
	.invalidate_folio	= ext4_journalled_invalidate_folio,
	.release_folio		= ext4_release_folio,
	.migrate_folio		= buffer_migrate_folio_norefs,
	.is_partially_uptodate	= block_is_partially_uptodate,
	.error_remove_folio	= generic_error_remove_folio,
	.swap_activate		= ext4_iomap_swap_activate,
};

static const struct address_space_operations ext4_da_aops = {
	.read_folio		= ext4_read_folio,
	.readahead		= ext4_readahead,
	.writepages		= ext4_writepages,
	.write_begin		= ext4_da_write_begin,
	.write_end		= ext4_da_write_end,
	.dirty_folio		= ext4_dirty_folio,
	.invalidate_folio	= ext4_invalidate_folio,
	.release_folio		= ext4_release_folio,
	.migrate_folio		= buffer_migrate_folio,
	.is_partially_uptodate	= block_is_partially_uptodate,
	.error_remove_folio	= generic_error_remove_folio,
	.swap_activate		= ext4_iomap_swap_activate,
};

static const struct address_space_operations ext4_dax_aops = {
	.writepages		= ext4_dax_writepages,
	.dirty_folio		= noop_dirty_folio,
	.swap_activate		= ext4_iomap_swap_activate,
};
3653 void ext4_set_aops(struct inode
*inode
)
3655 switch (ext4_inode_journal_mode(inode
)) {
3656 case EXT4_INODE_ORDERED_DATA_MODE
:
3657 case EXT4_INODE_WRITEBACK_DATA_MODE
:
3659 case EXT4_INODE_JOURNAL_DATA_MODE
:
3660 inode
->i_mapping
->a_ops
= &ext4_journalled_aops
;
3666 inode
->i_mapping
->a_ops
= &ext4_dax_aops
;
3667 else if (test_opt(inode
->i_sb
, DELALLOC
))
3668 inode
->i_mapping
->a_ops
= &ext4_da_aops
;
3670 inode
->i_mapping
->a_ops
= &ext4_aops
;
 * Here we can't skip an unwritten buffer even though it usually reads zero
 * because it might have data in the pagecache (e.g., if called from
 * ext4_zero_range, ext4_punch_hole, etc.) which needs to be properly zeroed
 * out. Otherwise a racing writeback can come later and flush the stale
 * pagecache to disk.
3679 static int __ext4_block_zero_page_range(handle_t
*handle
,
3680 struct address_space
*mapping
, loff_t from
, loff_t length
)
3682 ext4_fsblk_t index
= from
>> PAGE_SHIFT
;
3683 unsigned offset
= from
& (PAGE_SIZE
-1);
3684 unsigned blocksize
, pos
;
3686 struct inode
*inode
= mapping
->host
;
3687 struct buffer_head
*bh
;
3688 struct folio
*folio
;
3691 folio
= __filemap_get_folio(mapping
, from
>> PAGE_SHIFT
,
3692 FGP_LOCK
| FGP_ACCESSED
| FGP_CREAT
,
3693 mapping_gfp_constraint(mapping
, ~__GFP_FS
));
3695 return PTR_ERR(folio
);
3697 blocksize
= inode
->i_sb
->s_blocksize
;
3699 iblock
= index
<< (PAGE_SHIFT
- inode
->i_sb
->s_blocksize_bits
);
3701 bh
= folio_buffers(folio
);
3703 bh
= create_empty_buffers(folio
, blocksize
, 0);
3705 /* Find the buffer that contains "offset" */
3707 while (offset
>= pos
) {
3708 bh
= bh
->b_this_page
;
3712 if (buffer_freed(bh
)) {
3713 BUFFER_TRACE(bh
, "freed: skip");
3716 if (!buffer_mapped(bh
)) {
3717 BUFFER_TRACE(bh
, "unmapped");
3718 ext4_get_block(inode
, iblock
, bh
, 0);
3719 /* unmapped? It's a hole - nothing to do */
3720 if (!buffer_mapped(bh
)) {
3721 BUFFER_TRACE(bh
, "still unmapped");
3726 /* Ok, it's mapped. Make sure it's up-to-date */
3727 if (folio_test_uptodate(folio
))
3728 set_buffer_uptodate(bh
);
3730 if (!buffer_uptodate(bh
)) {
3731 err
= ext4_read_bh_lock(bh
, 0, true);
3734 if (fscrypt_inode_uses_fs_layer_crypto(inode
)) {
3735 /* We expect the key to be set. */
3736 BUG_ON(!fscrypt_has_encryption_key(inode
));
3737 err
= fscrypt_decrypt_pagecache_blocks(folio
,
3741 clear_buffer_uptodate(bh
);
3746 if (ext4_should_journal_data(inode
)) {
3747 BUFFER_TRACE(bh
, "get write access");
3748 err
= ext4_journal_get_write_access(handle
, inode
->i_sb
, bh
,
3753 folio_zero_range(folio
, offset
, length
);
3754 BUFFER_TRACE(bh
, "zeroed end of block");
3756 if (ext4_should_journal_data(inode
)) {
3757 err
= ext4_dirty_journalled_data(handle
, bh
);
3760 mark_buffer_dirty(bh
);
3761 if (ext4_should_order_data(inode
))
3762 err
= ext4_jbd2_inode_add_write(handle
, inode
, from
,
3767 folio_unlock(folio
);
 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
 * starting from file offset 'from'. The range to be zeroed must
 * be contained within one block. If the specified range exceeds
 * the end of the block it will be shortened to the end of the block
 * that corresponds to 'from'.
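/*
 * Illustrative arithmetic (standalone, not part of this file): clamping a
 * zeroing request so it never crosses the block that contains 'from', which
 * is the rule stated above.
 */
#include <stdint.h>

static uint64_t demo_clamp_zero_len(uint64_t from, uint64_t length,
				    unsigned int blocksize)
{
	uint64_t max = blocksize - (from & (blocksize - 1));

	return length > max ? max : length;
}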
3779 static int ext4_block_zero_page_range(handle_t
*handle
,
3780 struct address_space
*mapping
, loff_t from
, loff_t length
)
3782 struct inode
*inode
= mapping
->host
;
3783 unsigned offset
= from
& (PAGE_SIZE
-1);
3784 unsigned blocksize
= inode
->i_sb
->s_blocksize
;
3785 unsigned max
= blocksize
- (offset
& (blocksize
- 1));
3788 * correct length if it does not fall between
3789 * 'from' and the end of the block
3791 if (length
> max
|| length
< 0)
3794 if (IS_DAX(inode
)) {
3795 return dax_zero_range(inode
, from
, length
, NULL
,
3798 return __ext4_block_zero_page_range(handle
, mapping
, from
, length
);
 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
 * up to the end of the block which corresponds to `from'.
 * This is required during truncate. We need to physically zero the tail end
 * of that block so it doesn't yield old data if the file is later grown.
3807 static int ext4_block_truncate_page(handle_t
*handle
,
3808 struct address_space
*mapping
, loff_t from
)
3810 unsigned offset
= from
& (PAGE_SIZE
-1);
3813 struct inode
*inode
= mapping
->host
;
3815 /* If we are processing an encrypted inode during orphan list handling */
3816 if (IS_ENCRYPTED(inode
) && !fscrypt_has_encryption_key(inode
))
3819 blocksize
= inode
->i_sb
->s_blocksize
;
3820 length
= blocksize
- (offset
& (blocksize
- 1));
3822 return ext4_block_zero_page_range(handle
, mapping
, from
, length
);
3825 int ext4_zero_partial_blocks(handle_t
*handle
, struct inode
*inode
,
3826 loff_t lstart
, loff_t length
)
3828 struct super_block
*sb
= inode
->i_sb
;
3829 struct address_space
*mapping
= inode
->i_mapping
;
3830 unsigned partial_start
, partial_end
;
3831 ext4_fsblk_t start
, end
;
3832 loff_t byte_end
= (lstart
+ length
- 1);
3835 partial_start
= lstart
& (sb
->s_blocksize
- 1);
3836 partial_end
= byte_end
& (sb
->s_blocksize
- 1);
3838 start
= lstart
>> sb
->s_blocksize_bits
;
3839 end
= byte_end
>> sb
->s_blocksize_bits
;
3841 /* Handle partial zero within the single block */
3843 (partial_start
|| (partial_end
!= sb
->s_blocksize
- 1))) {
3844 err
= ext4_block_zero_page_range(handle
, mapping
,
3848 /* Handle partial zero out on the start of the range */
3849 if (partial_start
) {
3850 err
= ext4_block_zero_page_range(handle
, mapping
,
3851 lstart
, sb
->s_blocksize
);
3855 /* Handle partial zero out on the end of the range */
3856 if (partial_end
!= sb
->s_blocksize
- 1)
3857 err
= ext4_block_zero_page_range(handle
, mapping
,
3858 byte_end
- partial_end
,
3863 int ext4_can_truncate(struct inode
*inode
)
3865 if (S_ISREG(inode
->i_mode
))
3867 if (S_ISDIR(inode
->i_mode
))
3869 if (S_ISLNK(inode
->i_mode
))
3870 return !ext4_inode_is_fast_symlink(inode
);
3875 * We have to make sure i_disksize gets properly updated before we truncate
3876 * page cache due to hole punching or zero range. Otherwise i_disksize update
3877 * can get lost as it may have been postponed to submission of writeback but
3878 * that will never happen after we truncate page cache.
3880 int ext4_update_disksize_before_punch(struct inode
*inode
, loff_t offset
,
3886 loff_t size
= i_size_read(inode
);
3888 WARN_ON(!inode_is_locked(inode
));
3889 if (offset
> size
|| offset
+ len
< size
)
3892 if (EXT4_I(inode
)->i_disksize
>= size
)
3895 handle
= ext4_journal_start(inode
, EXT4_HT_MISC
, 1);
3897 return PTR_ERR(handle
);
3898 ext4_update_i_disksize(inode
, size
);
3899 ret
= ext4_mark_inode_dirty(handle
, inode
);
3900 ext4_journal_stop(handle
);
3905 static void ext4_wait_dax_page(struct inode
*inode
)
3907 filemap_invalidate_unlock(inode
->i_mapping
);
3909 filemap_invalidate_lock(inode
->i_mapping
);
3912 int ext4_break_layouts(struct inode
*inode
)
3917 if (WARN_ON_ONCE(!rwsem_is_locked(&inode
->i_mapping
->invalidate_lock
)))
3921 page
= dax_layout_busy_page(inode
->i_mapping
);
3925 error
= ___wait_var_event(&page
->_refcount
,
3926 atomic_read(&page
->_refcount
) == 1,
3927 TASK_INTERRUPTIBLE
, 0, 0,
3928 ext4_wait_dax_page(inode
));
3929 } while (error
== 0);
3935 * ext4_punch_hole: punches a hole in a file by releasing the blocks
3936 * associated with the given offset and length
3938 * @inode: File inode
3939 * @offset: The offset where the hole will begin
3940 * @len: The length of the hole
3942 * Returns: 0 on success or negative on failure
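/*
 * Hedged userspace illustration (standalone program, not kernel code): an
 * application punches a hole with fallocate(), and that request is what
 * eventually reaches ext4_punch_hole() below. FALLOC_FL_PUNCH_HOLE must be
 * combined with FALLOC_FL_KEEP_SIZE.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* release 1 MiB of blocks starting at offset 4096, keeping i_size */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      4096, 1024 * 1024) < 0)
		perror("fallocate");

	close(fd);
	return 0;
}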
3945 int ext4_punch_hole(struct file
*file
, loff_t offset
, loff_t length
)
3947 struct inode
*inode
= file_inode(file
);
3948 struct super_block
*sb
= inode
->i_sb
;
3949 ext4_lblk_t first_block
, stop_block
;
3950 struct address_space
*mapping
= inode
->i_mapping
;
3951 loff_t first_block_offset
, last_block_offset
, max_length
;
3952 struct ext4_sb_info
*sbi
= EXT4_SB(inode
->i_sb
);
3954 unsigned int credits
;
3955 int ret
= 0, ret2
= 0;
3957 trace_ext4_punch_hole(inode
, offset
, length
, 0);
 * Write out all dirty pages to avoid race conditions,
 * then release them.
3963 if (mapping_tagged(mapping
, PAGECACHE_TAG_DIRTY
)) {
3964 ret
= filemap_write_and_wait_range(mapping
, offset
,
3965 offset
+ length
- 1);
3972 /* No need to punch hole beyond i_size */
3973 if (offset
>= inode
->i_size
)
3977 * If the hole extends beyond i_size, set the hole
3978 * to end after the page that contains i_size
3980 if (offset
+ length
> inode
->i_size
) {
3981 length
= inode
->i_size
+
3982 PAGE_SIZE
- (inode
->i_size
& (PAGE_SIZE
- 1)) -
3987 * For punch hole the length + offset needs to be within one block
3988 * before last range. Adjust the length if it goes beyond that limit.
3990 max_length
= sbi
->s_bitmap_maxbytes
- inode
->i_sb
->s_blocksize
;
3991 if (offset
+ length
> max_length
)
3992 length
= max_length
- offset
;
3994 if (offset
& (sb
->s_blocksize
- 1) ||
3995 (offset
+ length
) & (sb
->s_blocksize
- 1)) {
 * Attach jinode to inode for jbd2 if we do any zeroing of
 * partial block.
4000 ret
= ext4_inode_attach_jinode(inode
);
/* Wait for all existing dio workers, newcomers will block on i_rwsem */
4007 inode_dio_wait(inode
);
4009 ret
= file_modified(file
);
 * Prevent page faults from reinstantiating pages we have released from
 * page cache.
4017 filemap_invalidate_lock(mapping
);
4019 ret
= ext4_break_layouts(inode
);
4023 first_block_offset
= round_up(offset
, sb
->s_blocksize
);
4024 last_block_offset
= round_down((offset
+ length
), sb
->s_blocksize
) - 1;
4026 /* Now release the pages and zero block aligned part of pages*/
4027 if (last_block_offset
> first_block_offset
) {
4028 ret
= ext4_update_disksize_before_punch(inode
, offset
, length
);
4031 truncate_pagecache_range(inode
, first_block_offset
,
4035 if (ext4_test_inode_flag(inode
, EXT4_INODE_EXTENTS
))
4036 credits
= ext4_writepage_trans_blocks(inode
);
4038 credits
= ext4_blocks_for_truncate(inode
);
4039 handle
= ext4_journal_start(inode
, EXT4_HT_TRUNCATE
, credits
);
4040 if (IS_ERR(handle
)) {
4041 ret
= PTR_ERR(handle
);
4042 ext4_std_error(sb
, ret
);
4046 ret
= ext4_zero_partial_blocks(handle
, inode
, offset
,
4051 first_block
= (offset
+ sb
->s_blocksize
- 1) >>
4052 EXT4_BLOCK_SIZE_BITS(sb
);
4053 stop_block
= (offset
+ length
) >> EXT4_BLOCK_SIZE_BITS(sb
);
4055 /* If there are blocks to remove, do it */
4056 if (stop_block
> first_block
) {
4057 ext4_lblk_t hole_len
= stop_block
- first_block
;
4059 down_write(&EXT4_I(inode
)->i_data_sem
);
4060 ext4_discard_preallocations(inode
);
4062 ext4_es_remove_extent(inode
, first_block
, hole_len
);
4064 if (ext4_test_inode_flag(inode
, EXT4_INODE_EXTENTS
))
4065 ret
= ext4_ext_remove_space(inode
, first_block
,
4068 ret
= ext4_ind_remove_space(handle
, inode
, first_block
,
4071 ext4_es_insert_extent(inode
, first_block
, hole_len
, ~0,
4072 EXTENT_STATUS_HOLE
, 0);
4073 up_write(&EXT4_I(inode
)->i_data_sem
);
4075 ext4_fc_track_range(handle
, inode
, first_block
, stop_block
);
4077 ext4_handle_sync(handle
);
4079 inode_set_mtime_to_ts(inode
, inode_set_ctime_current(inode
));
4080 ret2
= ext4_mark_inode_dirty(handle
, inode
);
4084 ext4_update_inode_fsync_trans(handle
, inode
, 1);
4086 ext4_journal_stop(handle
);
4088 filemap_invalidate_unlock(mapping
);
4090 inode_unlock(inode
);
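/*
 * Illustrative example (added note, not in the original source): with a
 * 4096-byte block size, punching offset=5000 len=10000 gives
 * first_block = (5000 + 4095) >> 12 = 2 and stop_block = 15000 >> 12 = 3,
 * so only block 2 is removed outright; the partial tails in blocks 1 and 3
 * (bytes 5000-8191 and 12288-14999) are zeroed by ext4_zero_partial_blocks()
 * rather than freed.
 */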
int ext4_inode_attach_jinode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct jbd2_inode *jinode;

	if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal)
		return 0;

	jinode = jbd2_alloc_inode(GFP_KERNEL);
	spin_lock(&inode->i_lock);
	if (!ei->jinode) {
		if (!jinode) {
			spin_unlock(&inode->i_lock);
			return -ENOMEM;
		}
		ei->jinode = jinode;
		jbd2_journal_init_jbd_inode(ei->jinode, inode);
		jinode = NULL;
	}
	spin_unlock(&inode->i_lock);
	if (unlikely(jinode != NULL))
		jbd2_free_inode(jinode);
	return 0;
}
/*
 * ext4_truncate()
 *
 * We block out ext4_get_block() block instantiations across the entire
 * transaction, and VFS/VM ensures that ext4_truncate() cannot run
 * simultaneously on behalf of the same inode.
 *
 * As we work through the truncate and commit bits of it to the journal there
 * is one core, guiding principle: the file's tree must always be consistent on
 * disk.  We must be able to restart the truncate after a crash.
 *
 * The file's tree may be transiently inconsistent in memory (although it
 * probably isn't), but whenever we close off and commit a journal transaction,
 * the contents of (the filesystem + the journal) must be consistent and
 * restartable.  It's pretty simple, really: bottom up, right to left (although
 * left-to-right works OK too).
 *
 * Note that at recovery time, journal replay occurs *before* the restart of
 * truncate against the orphan inode list.
 *
 * The committed inode has the new, desired i_size (which is the same as
 * i_disksize in this case).  After a crash, ext4_orphan_cleanup() will see
 * that this inode's truncate did not complete and it will again call
 * ext4_truncate() to have another go.  So there will be instantiated blocks
 * to the right of the truncation point in a crashed ext4 filesystem.  But
 * that's fine - as long as they are linked from the inode, the post-crash
 * ext4_truncate() run will find them and release them.
 */
int ext4_truncate(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	unsigned int credits;
	int err = 0, err2;
	handle_t *handle;
	struct address_space *mapping = inode->i_mapping;

	/*
	 * There is a possibility that we're either freeing the inode
	 * or it's a completely new inode. In those cases we might not
	 * have i_rwsem locked because it's not necessary.
	 */
	if (!(inode->i_state & (I_NEW|I_FREEING)))
		WARN_ON(!inode_is_locked(inode));
	trace_ext4_truncate_enter(inode);

	if (!ext4_can_truncate(inode))
		goto out_trace;

	if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
		ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);

	if (ext4_has_inline_data(inode)) {
		int has_inline = 1;

		err = ext4_inline_data_truncate(inode, &has_inline);
		if (err || has_inline)
			goto out_trace;
	}

	/* If we zero-out tail of the page, we have to create jinode for jbd2 */
	if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
		err = ext4_inode_attach_jinode(inode);
		if (err)
			goto out_trace;
	}

	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
		credits = ext4_writepage_trans_blocks(inode);
	else
		credits = ext4_blocks_for_truncate(inode);

	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
	if (IS_ERR(handle)) {
		err = PTR_ERR(handle);
		goto out_trace;
	}

	if (inode->i_size & (inode->i_sb->s_blocksize - 1))
		ext4_block_truncate_page(handle, mapping, inode->i_size);

	/*
	 * We add the inode to the orphan list, so that if this
	 * truncate spans multiple transactions, and we crash, we will
	 * resume the truncate when the filesystem recovers.  It also
	 * marks the inode dirty, to catch the new size.
	 *
	 * Implication: the file must always be in a sane, consistent
	 * truncatable state while each transaction commits.
	 */
	err = ext4_orphan_add(handle, inode);
	if (err)
		goto out_stop;

	down_write(&EXT4_I(inode)->i_data_sem);

	ext4_discard_preallocations(inode);

	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
		err = ext4_ext_truncate(handle, inode);
	else
		ext4_ind_truncate(handle, inode);

	up_write(&ei->i_data_sem);
	if (err)
		goto out_stop;

	if (IS_SYNC(inode))
		ext4_handle_sync(handle);

out_stop:
	/*
	 * If this was a simple ftruncate() and the file will remain alive,
	 * then we need to clear up the orphan record which we created above.
	 * However, if this was a real unlink then we were called by
	 * ext4_evict_inode(), and we allow that function to clean up the
	 * orphan info for us.
	 */
	if (inode->i_nlink)
		ext4_orphan_del(handle, inode);

	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
	err2 = ext4_mark_inode_dirty(handle, inode);
	if (unlikely(err2 && !err))
		err = err2;
	ext4_journal_stop(handle);

out_trace:
	trace_ext4_truncate_exit(inode);
	return err;
}
static inline u64 ext4_inode_peek_iversion(const struct inode *inode)
{
	if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
		return inode_peek_iversion_raw(inode);
	else
		return inode_peek_iversion(inode);
}
static int ext4_inode_blocks_set(struct ext4_inode *raw_inode,
				 struct ext4_inode_info *ei)
{
	struct inode *inode = &(ei->vfs_inode);
	u64 i_blocks = READ_ONCE(inode->i_blocks);
	struct super_block *sb = inode->i_sb;

	if (i_blocks <= ~0U) {
		/*
		 * i_blocks can be represented in a 32 bit variable
		 * as multiple of 512 bytes
		 */
		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
		raw_inode->i_blocks_high = 0;
		ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
		return 0;
	}

	/*
	 * This should never happen since sb->s_maxbytes should not have
	 * allowed this, sb->s_maxbytes was set according to the huge_file
	 * feature in ext4_fill_super().
	 */
	if (!ext4_has_feature_huge_file(sb))
		return -EFSCORRUPTED;

	if (i_blocks <= 0xffffffffffffULL) {
		/*
		 * i_blocks can be represented in a 48 bit variable
		 * as multiple of 512 bytes
		 */
		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
		raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
		ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
	} else {
		ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
		/* i_block is stored in file system block size */
		i_blocks = i_blocks >> (inode->i_blkbits - 9);
		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
		raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
	}
	return 0;
}
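/*
 * Worked example (added note, not in the original source): i_blocks is kept
 * in 512-byte units. A value of 0x100000000 (2^32 sectors, i.e. 2 TiB) no
 * longer fits in 32 bits but fits in 48, so it is stored as
 * i_blocks_lo = 0x00000000 and i_blocks_high = 0x0001 with HUGE_FILE clear.
 * Only when even 48 bits are insufficient is EXT4_INODE_HUGE_FILE set and
 * the count rescaled to filesystem-block units via i_blocks >> (i_blkbits - 9).
 */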
4302 static int ext4_fill_raw_inode(struct inode
*inode
, struct ext4_inode
*raw_inode
)
4304 struct ext4_inode_info
*ei
= EXT4_I(inode
);
4311 err
= ext4_inode_blocks_set(raw_inode
, ei
);
4313 raw_inode
->i_mode
= cpu_to_le16(inode
->i_mode
);
4314 i_uid
= i_uid_read(inode
);
4315 i_gid
= i_gid_read(inode
);
4316 i_projid
= from_kprojid(&init_user_ns
, ei
->i_projid
);
4317 if (!(test_opt(inode
->i_sb
, NO_UID32
))) {
4318 raw_inode
->i_uid_low
= cpu_to_le16(low_16_bits(i_uid
));
4319 raw_inode
->i_gid_low
= cpu_to_le16(low_16_bits(i_gid
));
4321 * Fix up interoperability with old kernels. Otherwise,
4322 * old inodes get re-used with the upper 16 bits of the
4325 if (ei
->i_dtime
&& list_empty(&ei
->i_orphan
)) {
4326 raw_inode
->i_uid_high
= 0;
4327 raw_inode
->i_gid_high
= 0;
4329 raw_inode
->i_uid_high
=
4330 cpu_to_le16(high_16_bits(i_uid
));
4331 raw_inode
->i_gid_high
=
4332 cpu_to_le16(high_16_bits(i_gid
));
4335 raw_inode
->i_uid_low
= cpu_to_le16(fs_high2lowuid(i_uid
));
4336 raw_inode
->i_gid_low
= cpu_to_le16(fs_high2lowgid(i_gid
));
4337 raw_inode
->i_uid_high
= 0;
4338 raw_inode
->i_gid_high
= 0;
4340 raw_inode
->i_links_count
= cpu_to_le16(inode
->i_nlink
);
4342 EXT4_INODE_SET_CTIME(inode
, raw_inode
);
4343 EXT4_INODE_SET_MTIME(inode
, raw_inode
);
4344 EXT4_INODE_SET_ATIME(inode
, raw_inode
);
4345 EXT4_EINODE_SET_XTIME(i_crtime
, ei
, raw_inode
);
4347 raw_inode
->i_dtime
= cpu_to_le32(ei
->i_dtime
);
4348 raw_inode
->i_flags
= cpu_to_le32(ei
->i_flags
& 0xFFFFFFFF);
4349 if (likely(!test_opt2(inode
->i_sb
, HURD_COMPAT
)))
4350 raw_inode
->i_file_acl_high
=
4351 cpu_to_le16(ei
->i_file_acl
>> 32);
4352 raw_inode
->i_file_acl_lo
= cpu_to_le32(ei
->i_file_acl
);
4353 ext4_isize_set(raw_inode
, ei
->i_disksize
);
4355 raw_inode
->i_generation
= cpu_to_le32(inode
->i_generation
);
4356 if (S_ISCHR(inode
->i_mode
) || S_ISBLK(inode
->i_mode
)) {
4357 if (old_valid_dev(inode
->i_rdev
)) {
4358 raw_inode
->i_block
[0] =
4359 cpu_to_le32(old_encode_dev(inode
->i_rdev
));
4360 raw_inode
->i_block
[1] = 0;
4362 raw_inode
->i_block
[0] = 0;
4363 raw_inode
->i_block
[1] =
4364 cpu_to_le32(new_encode_dev(inode
->i_rdev
));
4365 raw_inode
->i_block
[2] = 0;
4367 } else if (!ext4_has_inline_data(inode
)) {
4368 for (block
= 0; block
< EXT4_N_BLOCKS
; block
++)
4369 raw_inode
->i_block
[block
] = ei
->i_data
[block
];
4372 if (likely(!test_opt2(inode
->i_sb
, HURD_COMPAT
))) {
4373 u64 ivers
= ext4_inode_peek_iversion(inode
);
4375 raw_inode
->i_disk_version
= cpu_to_le32(ivers
);
4376 if (ei
->i_extra_isize
) {
4377 if (EXT4_FITS_IN_INODE(raw_inode
, ei
, i_version_hi
))
4378 raw_inode
->i_version_hi
=
4379 cpu_to_le32(ivers
>> 32);
4380 raw_inode
->i_extra_isize
=
4381 cpu_to_le16(ei
->i_extra_isize
);
4385 if (i_projid
!= EXT4_DEF_PROJID
&&
4386 !ext4_has_feature_project(inode
->i_sb
))
4387 err
= err
?: -EFSCORRUPTED
;
4389 if (EXT4_INODE_SIZE(inode
->i_sb
) > EXT4_GOOD_OLD_INODE_SIZE
&&
4390 EXT4_FITS_IN_INODE(raw_inode
, ei
, i_projid
))
4391 raw_inode
->i_projid
= cpu_to_le32(i_projid
);
4393 ext4_inode_csum_set(inode
, raw_inode
, ei
);
4398 * ext4_get_inode_loc returns with an extra refcount against the inode's
4399 * underlying buffer_head on success. If we pass 'inode' and it does not
4400 * have in-inode xattr, we have all inode data in memory that is needed
4401 * to recreate the on-disk version of this inode.
4403 static int __ext4_get_inode_loc(struct super_block
*sb
, unsigned long ino
,
4404 struct inode
*inode
, struct ext4_iloc
*iloc
,
4405 ext4_fsblk_t
*ret_block
)
4407 struct ext4_group_desc
*gdp
;
4408 struct buffer_head
*bh
;
4410 struct blk_plug plug
;
4411 int inodes_per_block
, inode_offset
;
4414 if (ino
< EXT4_ROOT_INO
||
4415 ino
> le32_to_cpu(EXT4_SB(sb
)->s_es
->s_inodes_count
))
4416 return -EFSCORRUPTED
;
4418 iloc
->block_group
= (ino
- 1) / EXT4_INODES_PER_GROUP(sb
);
4419 gdp
= ext4_get_group_desc(sb
, iloc
->block_group
, NULL
);
4424 * Figure out the offset within the block group inode table
4426 inodes_per_block
= EXT4_SB(sb
)->s_inodes_per_block
;
4427 inode_offset
= ((ino
- 1) %
4428 EXT4_INODES_PER_GROUP(sb
));
4429 iloc
->offset
= (inode_offset
% inodes_per_block
) * EXT4_INODE_SIZE(sb
);
4431 block
= ext4_inode_table(sb
, gdp
);
4432 if ((block
<= le32_to_cpu(EXT4_SB(sb
)->s_es
->s_first_data_block
)) ||
4433 (block
>= ext4_blocks_count(EXT4_SB(sb
)->s_es
))) {
4434 ext4_error(sb
, "Invalid inode table block %llu in "
4435 "block_group %u", block
, iloc
->block_group
);
4436 return -EFSCORRUPTED
;
4438 block
+= (inode_offset
/ inodes_per_block
);
4440 bh
= sb_getblk(sb
, block
);
4443 if (ext4_buffer_uptodate(bh
))
4447 if (ext4_buffer_uptodate(bh
)) {
4448 /* Someone brought it uptodate while we waited */
4454 * If we have all information of the inode in memory and this
4455 * is the only valid inode in the block, we need not read the
4458 if (inode
&& !ext4_test_inode_state(inode
, EXT4_STATE_XATTR
)) {
4459 struct buffer_head
*bitmap_bh
;
4462 start
= inode_offset
& ~(inodes_per_block
- 1);
4464 /* Is the inode bitmap in cache? */
4465 bitmap_bh
= sb_getblk(sb
, ext4_inode_bitmap(sb
, gdp
));
4466 if (unlikely(!bitmap_bh
))
4470 * If the inode bitmap isn't in cache then the
4471 * optimisation may end up performing two reads instead
4472 * of one, so skip it.
4474 if (!buffer_uptodate(bitmap_bh
)) {
4478 for (i
= start
; i
< start
+ inodes_per_block
; i
++) {
4479 if (i
== inode_offset
)
4481 if (ext4_test_bit(i
, bitmap_bh
->b_data
))
4485 if (i
== start
+ inodes_per_block
) {
4486 struct ext4_inode
*raw_inode
=
4487 (struct ext4_inode
*) (bh
->b_data
+ iloc
->offset
);
4489 /* all other inodes are free, so skip I/O */
4490 memset(bh
->b_data
, 0, bh
->b_size
);
4491 if (!ext4_test_inode_state(inode
, EXT4_STATE_NEW
))
4492 ext4_fill_raw_inode(inode
, raw_inode
);
4493 set_buffer_uptodate(bh
);
4501 * If we need to do any I/O, try to pre-readahead extra
4502 * blocks from the inode table.
4504 blk_start_plug(&plug
);
4505 if (EXT4_SB(sb
)->s_inode_readahead_blks
) {
4506 ext4_fsblk_t b
, end
, table
;
4508 __u32 ra_blks
= EXT4_SB(sb
)->s_inode_readahead_blks
;
4510 table
= ext4_inode_table(sb
, gdp
);
4511 /* s_inode_readahead_blks is always a power of 2 */
4512 b
= block
& ~((ext4_fsblk_t
) ra_blks
- 1);
4516 num
= EXT4_INODES_PER_GROUP(sb
);
4517 if (ext4_has_group_desc_csum(sb
))
4518 num
-= ext4_itable_unused_count(sb
, gdp
);
4519 table
+= num
/ inodes_per_block
;
4523 ext4_sb_breadahead_unmovable(sb
, b
++);
4527 * There are other valid inodes in the buffer, this inode
4528 * has in-inode xattrs, or we don't have this inode in memory.
4529 * Read the block from disk.
4531 trace_ext4_load_inode(sb
, ino
);
4532 ext4_read_bh_nowait(bh
, REQ_META
| REQ_PRIO
, NULL
,
4533 ext4_simulate_fail(sb
, EXT4_SIM_INODE_EIO
));
4534 blk_finish_plug(&plug
);
4536 if (!buffer_uptodate(bh
)) {
static int __ext4_get_inode_loc_noinmem(struct inode *inode,
					struct ext4_iloc *iloc)
{
	ext4_fsblk_t err_blk = 0;
	int ret;

	ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc,
					&err_blk);

	if (ret == -EIO)
		ext4_error_inode_block(inode, err_blk, EIO,
					"unable to read itable block");

	return ret;
}

int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
{
	ext4_fsblk_t err_blk = 0;
	int ret;

	ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc,
					&err_blk);

	if (ret == -EIO)
		ext4_error_inode_block(inode, err_blk, EIO,
					"unable to read itable block");

	return ret;
}

int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
			  struct ext4_iloc *iloc)
{
	return __ext4_get_inode_loc(sb, ino, NULL, iloc, NULL);
}
static bool ext4_should_enable_dax(struct inode *inode)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	if (test_opt2(inode->i_sb, DAX_NEVER))
		return false;
	if (!S_ISREG(inode->i_mode))
		return false;
	if (ext4_should_journal_data(inode))
		return false;
	if (ext4_has_inline_data(inode))
		return false;
	if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT))
		return false;
	if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY))
		return false;
	if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags))
		return false;
	if (test_opt(inode->i_sb, DAX_ALWAYS))
		return true;

	return ext4_test_inode_flag(inode, EXT4_INODE_DAX);
}
void ext4_set_inode_flags(struct inode *inode, bool init)
{
	unsigned int flags = EXT4_I(inode)->i_flags;
	unsigned int new_fl = 0;

	WARN_ON_ONCE(IS_DAX(inode) && init);

	if (flags & EXT4_SYNC_FL)
		new_fl |= S_SYNC;
	if (flags & EXT4_APPEND_FL)
		new_fl |= S_APPEND;
	if (flags & EXT4_IMMUTABLE_FL)
		new_fl |= S_IMMUTABLE;
	if (flags & EXT4_NOATIME_FL)
		new_fl |= S_NOATIME;
	if (flags & EXT4_DIRSYNC_FL)
		new_fl |= S_DIRSYNC;

	/* Because of the way inode_set_flags() works we must preserve S_DAX
	 * here if already set. */
	new_fl |= (inode->i_flags & S_DAX);
	if (init && ext4_should_enable_dax(inode))
		new_fl |= S_DAX;

	if (flags & EXT4_ENCRYPT_FL)
		new_fl |= S_ENCRYPTED;
	if (flags & EXT4_CASEFOLD_FL)
		new_fl |= S_CASEFOLD;
	if (flags & EXT4_VERITY_FL)
		new_fl |= S_VERITY;
	inode_set_flags(inode, new_fl,
			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX|
			S_ENCRYPTED|S_CASEFOLD|S_VERITY);
}
static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
				  struct ext4_inode_info *ei)
{
	blkcnt_t i_blocks;
	struct inode *inode = &(ei->vfs_inode);
	struct super_block *sb = inode->i_sb;

	if (ext4_has_feature_huge_file(sb)) {
		/* we are using combined 48 bit field */
		i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
					le32_to_cpu(raw_inode->i_blocks_lo);
		if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) {
			/* i_blocks represent file system block size */
			return i_blocks << (inode->i_blkbits - 9);
		} else {
			return i_blocks;
		}
	} else {
		return le32_to_cpu(raw_inode->i_blocks_lo);
	}
}
static inline int ext4_iget_extra_inode(struct inode *inode,
					struct ext4_inode *raw_inode,
					struct ext4_inode_info *ei)
{
	__le32 *magic = (void *)raw_inode +
			EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;

	if (EXT4_INODE_HAS_XATTR_SPACE(inode) &&
	    *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
		int err;

		ext4_set_inode_state(inode, EXT4_STATE_XATTR);
		err = ext4_find_inline_data_nolock(inode);
		if (!err && ext4_has_inline_data(inode))
			ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
		return err;
	} else
		EXT4_I(inode)->i_inline_off = 0;
	return 0;
}
int ext4_get_projid(struct inode *inode, kprojid_t *projid)
{
	if (!ext4_has_feature_project(inode->i_sb))
		return -EOPNOTSUPP;
	*projid = EXT4_I(inode)->i_projid;
	return 0;
}
/*
 * ext4 has self-managed i_version for ea inodes, it stores the lower 32bit of
 * refcount in i_version, so use raw values if inode has EXT4_EA_INODE_FL flag
 * set.
 */
static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val)
{
	if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
		inode_set_iversion_raw(inode, val);
	else
		inode_set_iversion_queried(inode, val);
}
static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags)
{
	if (flags & EXT4_IGET_EA_INODE) {
		if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
			return "missing EA_INODE flag";
		if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
		    EXT4_I(inode)->i_file_acl)
			return "ea_inode with extended attributes";
	} else {
		if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
			return "unexpected EA_INODE flag";
	}
	if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD))
		return "unexpected bad inode w/o EXT4_IGET_BAD";
	return NULL;
}
4726 struct inode
*__ext4_iget(struct super_block
*sb
, unsigned long ino
,
4727 ext4_iget_flags flags
, const char *function
,
4730 struct ext4_iloc iloc
;
4731 struct ext4_inode
*raw_inode
;
4732 struct ext4_inode_info
*ei
;
4733 struct ext4_super_block
*es
= EXT4_SB(sb
)->s_es
;
4734 struct inode
*inode
;
4735 const char *err_str
;
4736 journal_t
*journal
= EXT4_SB(sb
)->s_journal
;
4744 if ((!(flags
& EXT4_IGET_SPECIAL
) &&
4745 ((ino
< EXT4_FIRST_INO(sb
) && ino
!= EXT4_ROOT_INO
) ||
4746 ino
== le32_to_cpu(es
->s_usr_quota_inum
) ||
4747 ino
== le32_to_cpu(es
->s_grp_quota_inum
) ||
4748 ino
== le32_to_cpu(es
->s_prj_quota_inum
) ||
4749 ino
== le32_to_cpu(es
->s_orphan_file_inum
))) ||
4750 (ino
< EXT4_ROOT_INO
) ||
4751 (ino
> le32_to_cpu(es
->s_inodes_count
))) {
4752 if (flags
& EXT4_IGET_HANDLE
)
4753 return ERR_PTR(-ESTALE
);
4754 __ext4_error(sb
, function
, line
, false, EFSCORRUPTED
, 0,
4755 "inode #%lu: comm %s: iget: illegal inode #",
4756 ino
, current
->comm
);
4757 return ERR_PTR(-EFSCORRUPTED
);
4760 inode
= iget_locked(sb
, ino
);
4762 return ERR_PTR(-ENOMEM
);
4763 if (!(inode
->i_state
& I_NEW
)) {
4764 if ((err_str
= check_igot_inode(inode
, flags
)) != NULL
) {
4765 ext4_error_inode(inode
, function
, line
, 0, err_str
);
4767 return ERR_PTR(-EFSCORRUPTED
);
4775 ret
= __ext4_get_inode_loc_noinmem(inode
, &iloc
);
4778 raw_inode
= ext4_raw_inode(&iloc
);
4780 if ((flags
& EXT4_IGET_HANDLE
) &&
4781 (raw_inode
->i_links_count
== 0) && (raw_inode
->i_mode
== 0)) {
4786 if (EXT4_INODE_SIZE(inode
->i_sb
) > EXT4_GOOD_OLD_INODE_SIZE
) {
4787 ei
->i_extra_isize
= le16_to_cpu(raw_inode
->i_extra_isize
);
4788 if (EXT4_GOOD_OLD_INODE_SIZE
+ ei
->i_extra_isize
>
4789 EXT4_INODE_SIZE(inode
->i_sb
) ||
4790 (ei
->i_extra_isize
& 3)) {
4791 ext4_error_inode(inode
, function
, line
, 0,
4792 "iget: bad extra_isize %u "
4795 EXT4_INODE_SIZE(inode
->i_sb
));
4796 ret
= -EFSCORRUPTED
;
4800 ei
->i_extra_isize
= 0;
4802 /* Precompute checksum seed for inode metadata */
4803 if (ext4_has_metadata_csum(sb
)) {
4804 struct ext4_sb_info
*sbi
= EXT4_SB(inode
->i_sb
);
4806 __le32 inum
= cpu_to_le32(inode
->i_ino
);
4807 __le32 gen
= raw_inode
->i_generation
;
4808 csum
= ext4_chksum(sbi
, sbi
->s_csum_seed
, (__u8
*)&inum
,
4810 ei
->i_csum_seed
= ext4_chksum(sbi
, csum
, (__u8
*)&gen
,
4814 if ((!ext4_inode_csum_verify(inode
, raw_inode
, ei
) ||
4815 ext4_simulate_fail(sb
, EXT4_SIM_INODE_CRC
)) &&
4816 (!(EXT4_SB(sb
)->s_mount_state
& EXT4_FC_REPLAY
))) {
4817 ext4_error_inode_err(inode
, function
, line
, 0,
4818 EFSBADCRC
, "iget: checksum invalid");
4823 inode
->i_mode
= le16_to_cpu(raw_inode
->i_mode
);
4824 i_uid
= (uid_t
)le16_to_cpu(raw_inode
->i_uid_low
);
4825 i_gid
= (gid_t
)le16_to_cpu(raw_inode
->i_gid_low
);
4826 if (ext4_has_feature_project(sb
) &&
4827 EXT4_INODE_SIZE(sb
) > EXT4_GOOD_OLD_INODE_SIZE
&&
4828 EXT4_FITS_IN_INODE(raw_inode
, ei
, i_projid
))
4829 i_projid
= (projid_t
)le32_to_cpu(raw_inode
->i_projid
);
4831 i_projid
= EXT4_DEF_PROJID
;
4833 if (!(test_opt(inode
->i_sb
, NO_UID32
))) {
4834 i_uid
|= le16_to_cpu(raw_inode
->i_uid_high
) << 16;
4835 i_gid
|= le16_to_cpu(raw_inode
->i_gid_high
) << 16;
4837 i_uid_write(inode
, i_uid
);
4838 i_gid_write(inode
, i_gid
);
4839 ei
->i_projid
= make_kprojid(&init_user_ns
, i_projid
);
4840 set_nlink(inode
, le16_to_cpu(raw_inode
->i_links_count
));
4842 ext4_clear_state_flags(ei
); /* Only relevant on 32-bit archs */
4843 ei
->i_inline_off
= 0;
4844 ei
->i_dir_start_lookup
= 0;
4845 ei
->i_dtime
= le32_to_cpu(raw_inode
->i_dtime
);
4846 /* We now have enough fields to check if the inode was active or not.
4847 * This is needed because nfsd might try to access dead inodes
4848 * the test is that same one that e2fsck uses
4849 * NeilBrown 1999oct15
4851 if (inode
->i_nlink
== 0) {
4852 if ((inode
->i_mode
== 0 || flags
& EXT4_IGET_SPECIAL
||
4853 !(EXT4_SB(inode
->i_sb
)->s_mount_state
& EXT4_ORPHAN_FS
)) &&
4854 ino
!= EXT4_BOOT_LOADER_INO
) {
4855 /* this inode is deleted or unallocated */
4856 if (flags
& EXT4_IGET_SPECIAL
) {
4857 ext4_error_inode(inode
, function
, line
, 0,
4858 "iget: special inode unallocated");
4859 ret
= -EFSCORRUPTED
;
4864 /* The only unlinked inodes we let through here have
4865 * valid i_mode and are being read by the orphan
4866 * recovery code: that's fine, we're about to complete
4867 * the process of deleting those.
4868 * OR it is the EXT4_BOOT_LOADER_INO which is
4869 * not initialized on a new filesystem. */
4871 ei
->i_flags
= le32_to_cpu(raw_inode
->i_flags
);
4872 ext4_set_inode_flags(inode
, true);
4873 inode
->i_blocks
= ext4_inode_blocks(raw_inode
, ei
);
4874 ei
->i_file_acl
= le32_to_cpu(raw_inode
->i_file_acl_lo
);
4875 if (ext4_has_feature_64bit(sb
))
4877 ((__u64
)le16_to_cpu(raw_inode
->i_file_acl_high
)) << 32;
4878 inode
->i_size
= ext4_isize(sb
, raw_inode
);
4879 if ((size
= i_size_read(inode
)) < 0) {
4880 ext4_error_inode(inode
, function
, line
, 0,
4881 "iget: bad i_size value: %lld", size
);
4882 ret
= -EFSCORRUPTED
;
4886 * If dir_index is not enabled but there's dir with INDEX flag set,
4887 * we'd normally treat htree data as empty space. But with metadata
4888 * checksumming that corrupts checksums so forbid that.
4890 if (!ext4_has_feature_dir_index(sb
) && ext4_has_metadata_csum(sb
) &&
4891 ext4_test_inode_flag(inode
, EXT4_INODE_INDEX
)) {
4892 ext4_error_inode(inode
, function
, line
, 0,
4893 "iget: Dir with htree data on filesystem without dir_index feature.");
4894 ret
= -EFSCORRUPTED
;
4897 ei
->i_disksize
= inode
->i_size
;
4899 ei
->i_reserved_quota
= 0;
4901 inode
->i_generation
= le32_to_cpu(raw_inode
->i_generation
);
4902 ei
->i_block_group
= iloc
.block_group
;
4903 ei
->i_last_alloc_group
= ~0;
4905 * NOTE! The in-memory inode i_data array is in little-endian order
4906 * even on big-endian machines: we do NOT byteswap the block numbers!
4908 for (block
= 0; block
< EXT4_N_BLOCKS
; block
++)
4909 ei
->i_data
[block
] = raw_inode
->i_block
[block
];
4910 INIT_LIST_HEAD(&ei
->i_orphan
);
4911 ext4_fc_init_inode(&ei
->vfs_inode
);
4914 * Set transaction id's of transactions that have to be committed
4915 * to finish f[data]sync. We set them to currently running transaction
4916 * as we cannot be sure that the inode or some of its metadata isn't
4917 * part of the transaction - the inode could have been reclaimed and
4918 * now it is reread from disk.
4921 transaction_t
*transaction
;
4924 read_lock(&journal
->j_state_lock
);
4925 if (journal
->j_running_transaction
)
4926 transaction
= journal
->j_running_transaction
;
4928 transaction
= journal
->j_committing_transaction
;
4930 tid
= transaction
->t_tid
;
4932 tid
= journal
->j_commit_sequence
;
4933 read_unlock(&journal
->j_state_lock
);
4934 ei
->i_sync_tid
= tid
;
4935 ei
->i_datasync_tid
= tid
;
4938 if (EXT4_INODE_SIZE(inode
->i_sb
) > EXT4_GOOD_OLD_INODE_SIZE
) {
4939 if (ei
->i_extra_isize
== 0) {
4940 /* The extra space is currently unused. Use it. */
4941 BUILD_BUG_ON(sizeof(struct ext4_inode
) & 3);
4942 ei
->i_extra_isize
= sizeof(struct ext4_inode
) -
4943 EXT4_GOOD_OLD_INODE_SIZE
;
4945 ret
= ext4_iget_extra_inode(inode
, raw_inode
, ei
);
4951 EXT4_INODE_GET_CTIME(inode
, raw_inode
);
4952 EXT4_INODE_GET_ATIME(inode
, raw_inode
);
4953 EXT4_INODE_GET_MTIME(inode
, raw_inode
);
4954 EXT4_EINODE_GET_XTIME(i_crtime
, ei
, raw_inode
);
4956 if (likely(!test_opt2(inode
->i_sb
, HURD_COMPAT
))) {
4957 u64 ivers
= le32_to_cpu(raw_inode
->i_disk_version
);
4959 if (EXT4_INODE_SIZE(inode
->i_sb
) > EXT4_GOOD_OLD_INODE_SIZE
) {
4960 if (EXT4_FITS_IN_INODE(raw_inode
, ei
, i_version_hi
))
4962 (__u64
)(le32_to_cpu(raw_inode
->i_version_hi
)) << 32;
4964 ext4_inode_set_iversion_queried(inode
, ivers
);
4968 if (ei
->i_file_acl
&&
4969 !ext4_inode_block_valid(inode
, ei
->i_file_acl
, 1)) {
4970 ext4_error_inode(inode
, function
, line
, 0,
4971 "iget: bad extended attribute block %llu",
4973 ret
= -EFSCORRUPTED
;
4975 } else if (!ext4_has_inline_data(inode
)) {
4976 /* validate the block references in the inode */
4977 if (!(EXT4_SB(sb
)->s_mount_state
& EXT4_FC_REPLAY
) &&
4978 (S_ISREG(inode
->i_mode
) || S_ISDIR(inode
->i_mode
) ||
4979 (S_ISLNK(inode
->i_mode
) &&
4980 !ext4_inode_is_fast_symlink(inode
)))) {
4981 if (ext4_test_inode_flag(inode
, EXT4_INODE_EXTENTS
))
4982 ret
= ext4_ext_check_inode(inode
);
4984 ret
= ext4_ind_check_inode(inode
);
4990 if (S_ISREG(inode
->i_mode
)) {
4991 inode
->i_op
= &ext4_file_inode_operations
;
4992 inode
->i_fop
= &ext4_file_operations
;
4993 ext4_set_aops(inode
);
4994 } else if (S_ISDIR(inode
->i_mode
)) {
4995 inode
->i_op
= &ext4_dir_inode_operations
;
4996 inode
->i_fop
= &ext4_dir_operations
;
4997 } else if (S_ISLNK(inode
->i_mode
)) {
4998 /* VFS does not allow setting these so must be corruption */
4999 if (IS_APPEND(inode
) || IS_IMMUTABLE(inode
)) {
5000 ext4_error_inode(inode
, function
, line
, 0,
5001 "iget: immutable or append flags "
5002 "not allowed on symlinks");
5003 ret
= -EFSCORRUPTED
;
5006 if (IS_ENCRYPTED(inode
)) {
5007 inode
->i_op
= &ext4_encrypted_symlink_inode_operations
;
5008 } else if (ext4_inode_is_fast_symlink(inode
)) {
5009 inode
->i_link
= (char *)ei
->i_data
;
5010 inode
->i_op
= &ext4_fast_symlink_inode_operations
;
5011 nd_terminate_link(ei
->i_data
, inode
->i_size
,
5012 sizeof(ei
->i_data
) - 1);
5014 inode
->i_op
= &ext4_symlink_inode_operations
;
5016 } else if (S_ISCHR(inode
->i_mode
) || S_ISBLK(inode
->i_mode
) ||
5017 S_ISFIFO(inode
->i_mode
) || S_ISSOCK(inode
->i_mode
)) {
5018 inode
->i_op
= &ext4_special_inode_operations
;
5019 if (raw_inode
->i_block
[0])
5020 init_special_inode(inode
, inode
->i_mode
,
5021 old_decode_dev(le32_to_cpu(raw_inode
->i_block
[0])));
5023 init_special_inode(inode
, inode
->i_mode
,
5024 new_decode_dev(le32_to_cpu(raw_inode
->i_block
[1])));
5025 } else if (ino
== EXT4_BOOT_LOADER_INO
) {
5026 make_bad_inode(inode
);
5028 ret
= -EFSCORRUPTED
;
5029 ext4_error_inode(inode
, function
, line
, 0,
5030 "iget: bogus i_mode (%o)", inode
->i_mode
);
5033 if (IS_CASEFOLDED(inode
) && !ext4_has_feature_casefold(inode
->i_sb
)) {
5034 ext4_error_inode(inode
, function
, line
, 0,
5035 "casefold flag without casefold feature");
5036 ret
= -EFSCORRUPTED
;
5039 if ((err_str
= check_igot_inode(inode
, flags
)) != NULL
) {
5040 ext4_error_inode(inode
, function
, line
, 0, err_str
);
5041 ret
= -EFSCORRUPTED
;
5046 unlock_new_inode(inode
);
5052 return ERR_PTR(ret
);
static void __ext4_update_other_inode_time(struct super_block *sb,
					   unsigned long orig_ino,
					   unsigned long ino,
					   struct ext4_inode *raw_inode)
{
	struct inode *inode;

	inode = find_inode_by_ino_rcu(sb, ino);
	if (!inode)
		return;

	if (!inode_is_dirtytime_only(inode))
		return;

	spin_lock(&inode->i_lock);
	if (inode_is_dirtytime_only(inode)) {
		struct ext4_inode_info *ei = EXT4_I(inode);

		inode->i_state &= ~I_DIRTY_TIME;
		spin_unlock(&inode->i_lock);

		spin_lock(&ei->i_raw_lock);
		EXT4_INODE_SET_CTIME(inode, raw_inode);
		EXT4_INODE_SET_MTIME(inode, raw_inode);
		EXT4_INODE_SET_ATIME(inode, raw_inode);
		ext4_inode_csum_set(inode, raw_inode, ei);
		spin_unlock(&ei->i_raw_lock);
		trace_ext4_other_inode_update_time(inode, orig_ino);
		return;
	}
	spin_unlock(&inode->i_lock);
}
/*
 * Opportunistically update the other time fields for other inodes in
 * the same inode table block.
 */
static void ext4_update_other_inodes_time(struct super_block *sb,
					  unsigned long orig_ino, char *buf)
{
	unsigned long ino;
	int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
	int inode_size = EXT4_INODE_SIZE(sb);

	/*
	 * Calculate the first inode in the inode table block.  Inode
	 * numbers are one-based.  That is, the first inode in a block
	 * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1).
	 */
	ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1;
	rcu_read_lock();
	for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
		if (ino == orig_ino)
			continue;
		__ext4_update_other_inode_time(sb, orig_ino, ino,
					       (struct ext4_inode *)buf);
	}
	rcu_read_unlock();
}
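/*
 * Worked example (added note, not in the original source): with 16 inodes
 * per block, orig_ino = 23 gives ino = ((23 - 1) & ~15) + 1 = 17, so the
 * loop walks inodes 17..32, i.e. every slot of the table block that
 * contains inode 23, skipping inode 23 itself.
 */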
5116 * Post the struct inode info into an on-disk inode location in the
5117 * buffer-cache. This gobbles the caller's reference to the
5118 * buffer_head in the inode location struct.
5120 * The caller must have write access to iloc->bh.
5122 static int ext4_do_update_inode(handle_t
*handle
,
5123 struct inode
*inode
,
5124 struct ext4_iloc
*iloc
)
5126 struct ext4_inode
*raw_inode
= ext4_raw_inode(iloc
);
5127 struct ext4_inode_info
*ei
= EXT4_I(inode
);
5128 struct buffer_head
*bh
= iloc
->bh
;
5129 struct super_block
*sb
= inode
->i_sb
;
5131 int need_datasync
= 0, set_large_file
= 0;
5133 spin_lock(&ei
->i_raw_lock
);
5136 * For fields not tracked in the in-memory inode, initialise them
5137 * to zero for new inodes.
5139 if (ext4_test_inode_state(inode
, EXT4_STATE_NEW
))
5140 memset(raw_inode
, 0, EXT4_SB(inode
->i_sb
)->s_inode_size
);
5142 if (READ_ONCE(ei
->i_disksize
) != ext4_isize(inode
->i_sb
, raw_inode
))
5144 if (ei
->i_disksize
> 0x7fffffffULL
) {
5145 if (!ext4_has_feature_large_file(sb
) ||
5146 EXT4_SB(sb
)->s_es
->s_rev_level
== cpu_to_le32(EXT4_GOOD_OLD_REV
))
5150 err
= ext4_fill_raw_inode(inode
, raw_inode
);
5151 spin_unlock(&ei
->i_raw_lock
);
5153 EXT4_ERROR_INODE(inode
, "corrupted inode contents");
5157 if (inode
->i_sb
->s_flags
& SB_LAZYTIME
)
5158 ext4_update_other_inodes_time(inode
->i_sb
, inode
->i_ino
,
5161 BUFFER_TRACE(bh
, "call ext4_handle_dirty_metadata");
5162 err
= ext4_handle_dirty_metadata(handle
, NULL
, bh
);
5165 ext4_clear_inode_state(inode
, EXT4_STATE_NEW
);
5166 if (set_large_file
) {
5167 BUFFER_TRACE(EXT4_SB(sb
)->s_sbh
, "get write access");
5168 err
= ext4_journal_get_write_access(handle
, sb
,
5173 lock_buffer(EXT4_SB(sb
)->s_sbh
);
5174 ext4_set_feature_large_file(sb
);
5175 ext4_superblock_csum_set(sb
);
5176 unlock_buffer(EXT4_SB(sb
)->s_sbh
);
5177 ext4_handle_sync(handle
);
5178 err
= ext4_handle_dirty_metadata(handle
, NULL
,
5179 EXT4_SB(sb
)->s_sbh
);
5181 ext4_update_inode_fsync_trans(handle
, inode
, need_datasync
);
5183 ext4_std_error(inode
->i_sb
, err
);
5190 * ext4_write_inode()
5192 * We are called from a few places:
5194 * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
5195 * Here, there will be no transaction running. We wait for any running
5196 * transaction to commit.
5198 * - Within flush work (sys_sync(), kupdate and such).
5199 * We wait on commit, if told to.
5201 * - Within iput_final() -> write_inode_now()
5202 * We wait on commit, if told to.
5204 * In all cases it is actually safe for us to return without doing anything,
5205 * because the inode has been copied into a raw inode buffer in
5206 * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL
5209 * Note that we are absolutely dependent upon all inode dirtiers doing the
5210 * right thing: they *must* call mark_inode_dirty() after dirtying info in
5211 * which we are interested.
5213 * It would be a bug for them to not do this. The code:
5215 * mark_inode_dirty(inode)
5217 * inode->i_size = expr;
5219 * is in error because write_inode() could occur while `stuff()' is running,
5220 * and the new i_size will be lost. Plus the inode will no longer be on the
5221 * superblock's dirty inode list.
5223 int ext4_write_inode(struct inode
*inode
, struct writeback_control
*wbc
)
5227 if (WARN_ON_ONCE(current
->flags
& PF_MEMALLOC
))
5230 if (unlikely(ext4_forced_shutdown(inode
->i_sb
)))
5233 if (EXT4_SB(inode
->i_sb
)->s_journal
) {
5234 if (ext4_journal_current_handle()) {
5235 ext4_debug("called recursively, non-PF_MEMALLOC!\n");
5241 * No need to force transaction in WB_SYNC_NONE mode. Also
5242 * ext4_sync_fs() will force the commit after everything is
5245 if (wbc
->sync_mode
!= WB_SYNC_ALL
|| wbc
->for_sync
)
5248 err
= ext4_fc_commit(EXT4_SB(inode
->i_sb
)->s_journal
,
5249 EXT4_I(inode
)->i_sync_tid
);
5251 struct ext4_iloc iloc
;
5253 err
= __ext4_get_inode_loc_noinmem(inode
, &iloc
);
5257 * sync(2) will flush the whole buffer cache. No need to do
5258 * it here separately for each inode.
5260 if (wbc
->sync_mode
== WB_SYNC_ALL
&& !wbc
->for_sync
)
5261 sync_dirty_buffer(iloc
.bh
);
5262 if (buffer_req(iloc
.bh
) && !buffer_uptodate(iloc
.bh
)) {
5263 ext4_error_inode_block(inode
, iloc
.bh
->b_blocknr
, EIO
,
5264 "IO error syncing inode");
5273 * In data=journal mode ext4_journalled_invalidate_folio() may fail to invalidate
5274 * buffers that are attached to a folio straddling i_size and are undergoing
5275 * commit. In that case we have to wait for commit to finish and try again.
5277 static void ext4_wait_for_tail_page_commit(struct inode
*inode
)
5280 journal_t
*journal
= EXT4_SB(inode
->i_sb
)->s_journal
;
5283 bool has_transaction
;
5285 offset
= inode
->i_size
& (PAGE_SIZE
- 1);
5287 * If the folio is fully truncated, we don't need to wait for any commit
5288 * (and we even should not as __ext4_journalled_invalidate_folio() may
5289 * strip all buffers from the folio but keep the folio dirty which can then
5290 * confuse e.g. concurrent ext4_writepages() seeing dirty folio without
5291 * buffers). Also we don't need to wait for any commit if all buffers in
5292 * the folio remain valid. This is most beneficial for the common case of
5293 * blocksize == PAGESIZE.
5295 if (!offset
|| offset
> (PAGE_SIZE
- i_blocksize(inode
)))
5298 struct folio
*folio
= filemap_lock_folio(inode
->i_mapping
,
5299 inode
->i_size
>> PAGE_SHIFT
);
5302 ret
= __ext4_journalled_invalidate_folio(folio
, offset
,
5303 folio_size(folio
) - offset
);
5304 folio_unlock(folio
);
5308 has_transaction
= false;
5309 read_lock(&journal
->j_state_lock
);
5310 if (journal
->j_committing_transaction
) {
5311 commit_tid
= journal
->j_committing_transaction
->t_tid
;
5312 has_transaction
= true;
5314 read_unlock(&journal
->j_state_lock
);
5315 if (has_transaction
)
5316 jbd2_log_wait_commit(journal
, commit_tid
);
5323 * Called from notify_change.
5325 * We want to trap VFS attempts to truncate the file as soon as
5326 * possible. In particular, we want to make sure that when the VFS
5327 * shrinks i_size, we put the inode on the orphan list and modify
5328 * i_disksize immediately, so that during the subsequent flushing of
5329 * dirty pages and freeing of disk blocks, we can guarantee that any
5330 * commit will leave the blocks being flushed in an unused state on
5331 * disk. (On recovery, the inode will get truncated and the blocks will
5332 * be freed, so we have a strong guarantee that no future commit will
5333 * leave these blocks visible to the user.)
5335 * Another thing we have to assure is that if we are in ordered mode
5336 * and inode is still attached to the committing transaction, we must
5337 * we start writeout of all the dirty pages which are being truncated.
5338 * This way we are sure that all the data written in the previous
5339 * transaction are already on disk (truncate waits for pages under
5342 * Called with inode->i_rwsem down.
5344 int ext4_setattr(struct mnt_idmap
*idmap
, struct dentry
*dentry
,
5347 struct inode
*inode
= d_inode(dentry
);
5350 const unsigned int ia_valid
= attr
->ia_valid
;
5351 bool inc_ivers
= true;
5353 if (unlikely(ext4_forced_shutdown(inode
->i_sb
)))
5356 if (unlikely(IS_IMMUTABLE(inode
)))
5359 if (unlikely(IS_APPEND(inode
) &&
5360 (ia_valid
& (ATTR_MODE
| ATTR_UID
|
5361 ATTR_GID
| ATTR_TIMES_SET
))))
5364 error
= setattr_prepare(idmap
, dentry
, attr
);
5368 error
= fscrypt_prepare_setattr(dentry
, attr
);
5372 error
= fsverity_prepare_setattr(dentry
, attr
);
5376 if (is_quota_modification(idmap
, inode
, attr
)) {
5377 error
= dquot_initialize(inode
);
5382 if (i_uid_needs_update(idmap
, attr
, inode
) ||
5383 i_gid_needs_update(idmap
, attr
, inode
)) {
5386 /* (user+group)*(old+new) structure, inode write (sb,
5387 * inode block, ? - but truncate inode update has it) */
5388 handle
= ext4_journal_start(inode
, EXT4_HT_QUOTA
,
5389 (EXT4_MAXQUOTAS_INIT_BLOCKS(inode
->i_sb
) +
5390 EXT4_MAXQUOTAS_DEL_BLOCKS(inode
->i_sb
)) + 3);
5391 if (IS_ERR(handle
)) {
5392 error
= PTR_ERR(handle
);
5396 /* dquot_transfer() calls back ext4_get_inode_usage() which
5397 * counts xattr inode references.
5399 down_read(&EXT4_I(inode
)->xattr_sem
);
5400 error
= dquot_transfer(idmap
, inode
, attr
);
5401 up_read(&EXT4_I(inode
)->xattr_sem
);
5404 ext4_journal_stop(handle
);
5407 /* Update corresponding info in inode so that everything is in
5408 * one transaction */
5409 i_uid_update(idmap
, attr
, inode
);
5410 i_gid_update(idmap
, attr
, inode
);
5411 error
= ext4_mark_inode_dirty(handle
, inode
);
5412 ext4_journal_stop(handle
);
5413 if (unlikely(error
)) {
5418 if (attr
->ia_valid
& ATTR_SIZE
) {
5420 loff_t oldsize
= inode
->i_size
;
5421 loff_t old_disksize
;
5422 int shrink
= (attr
->ia_size
< inode
->i_size
);
5424 if (!(ext4_test_inode_flag(inode
, EXT4_INODE_EXTENTS
))) {
5425 struct ext4_sb_info
*sbi
= EXT4_SB(inode
->i_sb
);
5427 if (attr
->ia_size
> sbi
->s_bitmap_maxbytes
) {
5431 if (!S_ISREG(inode
->i_mode
)) {
5435 if (attr
->ia_size
== inode
->i_size
)
5439 if (ext4_should_order_data(inode
)) {
5440 error
= ext4_begin_ordered_truncate(inode
,
5446 * Blocks are going to be removed from the inode. Wait
5447 * for dio in flight.
5449 inode_dio_wait(inode
);
5452 filemap_invalidate_lock(inode
->i_mapping
);
5454 rc
= ext4_break_layouts(inode
);
5456 filemap_invalidate_unlock(inode
->i_mapping
);
5460 if (attr
->ia_size
!= inode
->i_size
) {
5461 /* attach jbd2 jinode for EOF folio tail zeroing */
5462 if (attr
->ia_size
& (inode
->i_sb
->s_blocksize
- 1) ||
5463 oldsize
& (inode
->i_sb
->s_blocksize
- 1)) {
5464 error
= ext4_inode_attach_jinode(inode
);
5469 handle
= ext4_journal_start(inode
, EXT4_HT_INODE
, 3);
5470 if (IS_ERR(handle
)) {
5471 error
= PTR_ERR(handle
);
5474 if (ext4_handle_valid(handle
) && shrink
) {
5475 error
= ext4_orphan_add(handle
, inode
);
5479 * Update c/mtime and tail zero the EOF folio on
5480 * truncate up. ext4_truncate() handles the shrink case
5484 inode_set_mtime_to_ts(inode
,
5485 inode_set_ctime_current(inode
));
5486 if (oldsize
& (inode
->i_sb
->s_blocksize
- 1))
5487 ext4_block_truncate_page(handle
,
5488 inode
->i_mapping
, oldsize
);
5492 ext4_fc_track_range(handle
, inode
,
5493 (attr
->ia_size
> 0 ? attr
->ia_size
- 1 : 0) >>
5494 inode
->i_sb
->s_blocksize_bits
,
5495 EXT_MAX_BLOCKS
- 1);
5497 ext4_fc_track_range(
5499 (oldsize
> 0 ? oldsize
- 1 : oldsize
) >>
5500 inode
->i_sb
->s_blocksize_bits
,
5501 (attr
->ia_size
> 0 ? attr
->ia_size
- 1 : 0) >>
5502 inode
->i_sb
->s_blocksize_bits
);
5504 down_write(&EXT4_I(inode
)->i_data_sem
);
5505 old_disksize
= EXT4_I(inode
)->i_disksize
;
5506 EXT4_I(inode
)->i_disksize
= attr
->ia_size
;
5507 rc
= ext4_mark_inode_dirty(handle
, inode
);
5511 * We have to update i_size under i_data_sem together
5512 * with i_disksize to avoid races with writeback code
5513 * running ext4_wb_update_i_disksize().
5516 i_size_write(inode
, attr
->ia_size
);
5518 EXT4_I(inode
)->i_disksize
= old_disksize
;
5519 up_write(&EXT4_I(inode
)->i_data_sem
);
5520 ext4_journal_stop(handle
);
5524 pagecache_isize_extended(inode
, oldsize
,
5526 } else if (ext4_should_journal_data(inode
)) {
5527 ext4_wait_for_tail_page_commit(inode
);
5532 * Truncate pagecache after we've waited for commit
5533 * in data=journal mode to make pages freeable.
5535 truncate_pagecache(inode
, inode
->i_size
);
5537 * Call ext4_truncate() even if i_size didn't change to
5538 * truncate possible preallocated blocks.
5540 if (attr
->ia_size
<= oldsize
) {
5541 rc
= ext4_truncate(inode
);
5546 filemap_invalidate_unlock(inode
->i_mapping
);
5551 inode_inc_iversion(inode
);
5552 setattr_copy(idmap
, inode
, attr
);
5553 mark_inode_dirty(inode
);
5557 * If the call to ext4_truncate failed to get a transaction handle at
5558 * all, we need to clean up the in-core orphan list manually.
5560 if (orphan
&& inode
->i_nlink
)
5561 ext4_orphan_del(NULL
, inode
);
5563 if (!error
&& (ia_valid
& ATTR_MODE
))
5564 rc
= posix_acl_chmod(idmap
, dentry
, inode
->i_mode
);
5568 ext4_std_error(inode
->i_sb
, error
);
u32 ext4_dio_alignment(struct inode *inode)
{
	if (fsverity_active(inode))
		return 0;
	if (ext4_should_journal_data(inode))
		return 0;
	if (ext4_has_inline_data(inode))
		return 0;
	if (IS_ENCRYPTED(inode)) {
		if (!fscrypt_dio_supported(inode))
			return 0;
		return i_blocksize(inode);
	}
	return 1; /* use the iomap defaults */
}
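/*
 * Added note (not in the original source): callers such as ext4_getattr()
 * treat the return value as 0 = direct I/O unsupported (verity, data
 * journalling, inline data, or encryption without DIO support), 1 = no
 * constraint beyond the iomap/bdev defaults, and otherwise the filesystem
 * block size as the required alignment for encrypted files.
 */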
5590 int ext4_getattr(struct mnt_idmap
*idmap
, const struct path
*path
,
5591 struct kstat
*stat
, u32 request_mask
, unsigned int query_flags
)
5593 struct inode
*inode
= d_inode(path
->dentry
);
5594 struct ext4_inode
*raw_inode
;
5595 struct ext4_inode_info
*ei
= EXT4_I(inode
);
5598 if ((request_mask
& STATX_BTIME
) &&
5599 EXT4_FITS_IN_INODE(raw_inode
, ei
, i_crtime
)) {
5600 stat
->result_mask
|= STATX_BTIME
;
5601 stat
->btime
.tv_sec
= ei
->i_crtime
.tv_sec
;
5602 stat
->btime
.tv_nsec
= ei
->i_crtime
.tv_nsec
;
5606 * Return the DIO alignment restrictions if requested. We only return
5607 * this information when requested, since on encrypted files it might
5608 * take a fair bit of work to get if the file wasn't opened recently.
5610 if ((request_mask
& STATX_DIOALIGN
) && S_ISREG(inode
->i_mode
)) {
5611 u32 dio_align
= ext4_dio_alignment(inode
);
5613 stat
->result_mask
|= STATX_DIOALIGN
;
5614 if (dio_align
== 1) {
5615 struct block_device
*bdev
= inode
->i_sb
->s_bdev
;
5617 /* iomap defaults */
5618 stat
->dio_mem_align
= bdev_dma_alignment(bdev
) + 1;
5619 stat
->dio_offset_align
= bdev_logical_block_size(bdev
);
5621 stat
->dio_mem_align
= dio_align
;
5622 stat
->dio_offset_align
= dio_align
;
5626 if ((request_mask
& STATX_WRITE_ATOMIC
) && S_ISREG(inode
->i_mode
)) {
5627 struct ext4_sb_info
*sbi
= EXT4_SB(inode
->i_sb
);
5628 unsigned int awu_min
= 0, awu_max
= 0;
5630 if (ext4_inode_can_atomic_write(inode
)) {
5631 awu_min
= sbi
->s_awu_min
;
5632 awu_max
= sbi
->s_awu_max
;
5635 generic_fill_statx_atomic_writes(stat
, awu_min
, awu_max
);
5638 flags
= ei
->i_flags
& EXT4_FL_USER_VISIBLE
;
5639 if (flags
& EXT4_APPEND_FL
)
5640 stat
->attributes
|= STATX_ATTR_APPEND
;
5641 if (flags
& EXT4_COMPR_FL
)
5642 stat
->attributes
|= STATX_ATTR_COMPRESSED
;
5643 if (flags
& EXT4_ENCRYPT_FL
)
5644 stat
->attributes
|= STATX_ATTR_ENCRYPTED
;
5645 if (flags
& EXT4_IMMUTABLE_FL
)
5646 stat
->attributes
|= STATX_ATTR_IMMUTABLE
;
5647 if (flags
& EXT4_NODUMP_FL
)
5648 stat
->attributes
|= STATX_ATTR_NODUMP
;
5649 if (flags
& EXT4_VERITY_FL
)
5650 stat
->attributes
|= STATX_ATTR_VERITY
;
5652 stat
->attributes_mask
|= (STATX_ATTR_APPEND
|
5653 STATX_ATTR_COMPRESSED
|
5654 STATX_ATTR_ENCRYPTED
|
5655 STATX_ATTR_IMMUTABLE
|
5659 generic_fillattr(idmap
, request_mask
, inode
, stat
);
5663 int ext4_file_getattr(struct mnt_idmap
*idmap
,
5664 const struct path
*path
, struct kstat
*stat
,
5665 u32 request_mask
, unsigned int query_flags
)
5667 struct inode
*inode
= d_inode(path
->dentry
);
5668 u64 delalloc_blocks
;
5670 ext4_getattr(idmap
, path
, stat
, request_mask
, query_flags
);
5673 * If there is inline data in the inode, the inode will normally not
5674 * have data blocks allocated (it may have an external xattr block).
5675 * Report at least one sector for such files, so tools like tar, rsync,
5676 * others don't incorrectly think the file is completely sparse.
5678 if (unlikely(ext4_has_inline_data(inode
)))
5679 stat
->blocks
+= (stat
->size
+ 511) >> 9;
5682 * We can't update i_blocks if the block allocation is delayed
5683 * otherwise in the case of system crash before the real block
5684 * allocation is done, we will have i_blocks inconsistent with
5685 * on-disk file blocks.
5686 * We always keep i_blocks updated together with real
5687 * allocation. But to not confuse with user, stat
5688 * will return the blocks that include the delayed allocation
5689 * blocks for this file.
5691 delalloc_blocks
= EXT4_C2B(EXT4_SB(inode
->i_sb
),
5692 EXT4_I(inode
)->i_reserved_data_blocks
);
5693 stat
->blocks
+= delalloc_blocks
<< (inode
->i_sb
->s_blocksize_bits
- 9);
static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
				   int pextents)
{
	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
		return ext4_ind_trans_blocks(inode, lblocks);
	return ext4_ext_index_trans_blocks(inode, pextents);
}
/*
 * Account for index blocks, block group bitmaps and block group
 * descriptor blocks if we modify data blocks and index blocks.
 * In the worst case, the index blocks spread over different block groups.
 *
 * If data blocks are discontiguous, they may spread over different
 * block groups too. If they are contiguous, with flexbg, they could
 * still cross a block group boundary.
 *
 * Also account for superblock, inode, quota and xattr blocks.
 */
static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
				  int pextents)
{
	ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
	int gdpblocks;
	int idxblocks;
	int ret;

	/*
	 * How many index blocks need to touch to map @lblocks logical blocks
	 * to @pextents physical extents?
	 */
	idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);

	ret = idxblocks;

	/*
	 * Now let's see how many group bitmaps and group descriptors need
	 * to account
	 */
	groups = idxblocks + pextents;
	gdpblocks = groups;
	if (groups > ngroups)
		groups = ngroups;
	if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
		gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;

	/* bitmaps and block group descriptor blocks */
	ret += groups + gdpblocks;

	/* Blocks for super block, inode, quota and xattr blocks */
	ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);

	return ret;
}
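/*
 * Added note (not in the original source): before the clamping by ngroups
 * and s_gdb_count, the estimate is bounded by
 * idxblocks + 2 * (idxblocks + pextents) + EXT4_META_TRANS_BLOCKS(sb),
 * i.e. the worst case assumes every touched index or data block lives in a
 * different block group and so needs its own bitmap and descriptor update.
 */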
/*
 * Calculate the total number of credits to reserve to fit
 * the modification of a single page into a single transaction,
 * which may include multiple chunks of block allocations.
 *
 * This could be called via ext4_write_begin().
 *
 * We need to consider the worst case of one new block per extent.
 */
int ext4_writepage_trans_blocks(struct inode *inode)
{
	int bpp = ext4_journal_blocks_per_page(inode);
	int ret;

	ret = ext4_meta_trans_blocks(inode, bpp, bpp);

	/* Account for data blocks for journalled mode */
	if (ext4_should_journal_data(inode))
		ret += bpp;
	return ret;
}
/*
 * Calculate the journal credits for a chunk of data modification.
 *
 * This is called from DIO, fallocate or whoever calls
 * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks.
 *
 * Journal buffers for data blocks are not included here, as DIO
 * and fallocate do not need to journal data buffers.
 */
int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
{
	return ext4_meta_trans_blocks(inode, nrblocks, 1);
}
5790 * The caller must have previously called ext4_reserve_inode_write().
5791 * Give this, we know that the caller already has write access to iloc->bh.
5793 int ext4_mark_iloc_dirty(handle_t
*handle
,
5794 struct inode
*inode
, struct ext4_iloc
*iloc
)
5798 if (unlikely(ext4_forced_shutdown(inode
->i_sb
))) {
5802 ext4_fc_track_inode(handle
, inode
);
5804 /* the do_update_inode consumes one bh->b_count */
5807 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
5808 err
= ext4_do_update_inode(handle
, inode
, iloc
);
5814 * On success, We end up with an outstanding reference count against
5815 * iloc->bh. This _must_ be cleaned up later.
5819 ext4_reserve_inode_write(handle_t
*handle
, struct inode
*inode
,
5820 struct ext4_iloc
*iloc
)
5824 if (unlikely(ext4_forced_shutdown(inode
->i_sb
)))
5827 err
= ext4_get_inode_loc(inode
, iloc
);
5829 BUFFER_TRACE(iloc
->bh
, "get_write_access");
5830 err
= ext4_journal_get_write_access(handle
, inode
->i_sb
,
5831 iloc
->bh
, EXT4_JTR_NONE
);
5837 ext4_std_error(inode
->i_sb
, err
);
5841 static int __ext4_expand_extra_isize(struct inode
*inode
,
5842 unsigned int new_extra_isize
,
5843 struct ext4_iloc
*iloc
,
5844 handle_t
*handle
, int *no_expand
)
5846 struct ext4_inode
*raw_inode
;
5847 struct ext4_xattr_ibody_header
*header
;
5848 unsigned int inode_size
= EXT4_INODE_SIZE(inode
->i_sb
);
5849 struct ext4_inode_info
*ei
= EXT4_I(inode
);
5852 /* this was checked at iget time, but double check for good measure */
5853 if ((EXT4_GOOD_OLD_INODE_SIZE
+ ei
->i_extra_isize
> inode_size
) ||
5854 (ei
->i_extra_isize
& 3)) {
5855 EXT4_ERROR_INODE(inode
, "bad extra_isize %u (inode size %u)",
5857 EXT4_INODE_SIZE(inode
->i_sb
));
5858 return -EFSCORRUPTED
;
5860 if ((new_extra_isize
< ei
->i_extra_isize
) ||
5861 (new_extra_isize
< 4) ||
5862 (new_extra_isize
> inode_size
- EXT4_GOOD_OLD_INODE_SIZE
))
5863 return -EINVAL
; /* Should never happen */
5865 raw_inode
= ext4_raw_inode(iloc
);
5867 header
= IHDR(inode
, raw_inode
);
5869 /* No extended attributes present */
5870 if (!ext4_test_inode_state(inode
, EXT4_STATE_XATTR
) ||
5871 header
->h_magic
!= cpu_to_le32(EXT4_XATTR_MAGIC
)) {
5872 memset((void *)raw_inode
+ EXT4_GOOD_OLD_INODE_SIZE
+
5873 EXT4_I(inode
)->i_extra_isize
, 0,
5874 new_extra_isize
- EXT4_I(inode
)->i_extra_isize
);
5875 EXT4_I(inode
)->i_extra_isize
= new_extra_isize
;
5880 * We may need to allocate external xattr block so we need quotas
5881 * initialized. Here we can be called with various locks held so we
5882 * cannot affort to initialize quotas ourselves. So just bail.
5884 if (dquot_initialize_needed(inode
))
5887 /* try to expand with EAs present */
5888 error
= ext4_expand_extra_isize_ea(inode
, new_extra_isize
,
5892 * Inode size expansion failed; don't try again
5901 * Expand an inode by new_extra_isize bytes.
5902 * Returns 0 on success or negative error number on failure.
5904 static int ext4_try_to_expand_extra_isize(struct inode
*inode
,
5905 unsigned int new_extra_isize
,
5906 struct ext4_iloc iloc
,
5912 if (ext4_test_inode_state(inode
, EXT4_STATE_NO_EXPAND
))
5916 * In nojournal mode, we can immediately attempt to expand
5917 * the inode. When journaled, we first need to obtain extra
5918 * buffer credits since we may write into the EA block
5919 * with this same handle. If journal_extend fails, then it will
5920 * only result in a minor loss of functionality for that inode.
5921 * If this is felt to be critical, then e2fsck should be run to
5922 * force a large enough s_min_extra_isize.
5924 if (ext4_journal_extend(handle
,
5925 EXT4_DATA_TRANS_BLOCKS(inode
->i_sb
), 0) != 0)
5928 if (ext4_write_trylock_xattr(inode
, &no_expand
) == 0)
5931 error
= __ext4_expand_extra_isize(inode
, new_extra_isize
, &iloc
,
5932 handle
, &no_expand
);
5933 ext4_write_unlock_xattr(inode
, &no_expand
);
5938 int ext4_expand_extra_isize(struct inode
*inode
,
5939 unsigned int new_extra_isize
,
5940 struct ext4_iloc
*iloc
)
5946 if (ext4_test_inode_state(inode
, EXT4_STATE_NO_EXPAND
)) {
5951 handle
= ext4_journal_start(inode
, EXT4_HT_INODE
,
5952 EXT4_DATA_TRANS_BLOCKS(inode
->i_sb
));
5953 if (IS_ERR(handle
)) {
5954 error
= PTR_ERR(handle
);
5959 ext4_write_lock_xattr(inode
, &no_expand
);
5961 BUFFER_TRACE(iloc
->bh
, "get_write_access");
5962 error
= ext4_journal_get_write_access(handle
, inode
->i_sb
, iloc
->bh
,
5969 error
= __ext4_expand_extra_isize(inode
, new_extra_isize
, iloc
,
5970 handle
, &no_expand
);
5972 rc
= ext4_mark_iloc_dirty(handle
, inode
, iloc
);
5977 ext4_write_unlock_xattr(inode
, &no_expand
);
5978 ext4_journal_stop(handle
);
5983 * What we do here is to mark the in-core inode as clean with respect to inode
5984 * dirtiness (it may still be data-dirty).
5985 * This means that the in-core inode may be reaped by prune_icache
5986 * without having to perform any I/O. This is a very good thing,
5987 * because *any* task may call prune_icache - even ones which
5988 * have a transaction open against a different journal.
5990 * Is this cheating? Not really. Sure, we haven't written the
5991 * inode out, but prune_icache isn't a user-visible syncing function.
5992 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
5993 * we start and wait on commits.
5995 int __ext4_mark_inode_dirty(handle_t
*handle
, struct inode
*inode
,
5996 const char *func
, unsigned int line
)
5998 struct ext4_iloc iloc
;
5999 struct ext4_sb_info
*sbi
= EXT4_SB(inode
->i_sb
);
6003 trace_ext4_mark_inode_dirty(inode
, _RET_IP_
);
6004 err
= ext4_reserve_inode_write(handle
, inode
, &iloc
);
6008 if (EXT4_I(inode
)->i_extra_isize
< sbi
->s_want_extra_isize
)
6009 ext4_try_to_expand_extra_isize(inode
, sbi
->s_want_extra_isize
,
6012 err
= ext4_mark_iloc_dirty(handle
, inode
, &iloc
);
6015 ext4_error_inode_err(inode
, func
, line
, 0, err
,
6016 "mark_inode_dirty error");
/*
 * ext4_dirty_inode() is called from __mark_inode_dirty()
 *
 * We're really interested in the case where a file is being extended.
 * i_size has been changed by generic_commit_write() and we thus need
 * to include the updated inode in the current transaction.
 *
 * Also, dquot_alloc_block() will always dirty the inode when blocks
 * are allocated to the file.
 *
 * If the inode is marked synchronous, we don't honour that here - doing
 * so would cause a commit on atime updates, which we don't bother doing.
 * We handle synchronous inodes at the highest possible level.
 */
void ext4_dirty_inode(struct inode *inode, int flags)
{
	handle_t *handle;

	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
	if (IS_ERR(handle))
		return;
	ext4_mark_inode_dirty(handle, inode);
	ext4_journal_stop(handle);
}
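
/*
 * Illustrative sketch (an assumption, not part of the original file):
 * the "extend the file, then log the inode" pattern described in the
 * comment above ext4_dirty_inode().  Once i_size has been bumped in
 * core, ext4_mark_inode_dirty() copies the inode into the running
 * transaction so the new size reaches the journal together with the
 * allocation that extended it.  The caller is assumed to hold the
 * inode lock and a valid handle.
 */
static int __maybe_unused ext4_log_new_size_example(handle_t *handle,
						     struct inode *inode,
						     loff_t new_size)
{
	if (new_size > i_size_read(inode))
		i_size_write(inode, new_size);	/* in-core change only */
	return ext4_mark_inode_dirty(handle, inode);
}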
int ext4_change_inode_journal_flag(struct inode *inode, int val)
{
	journal_t *journal;
	handle_t *handle;
	int err;
	int alloc_ctx;

	/*
	 * We have to be very careful here: changing a data block's
	 * journaling status dynamically is dangerous.  If we write a
	 * data block to the journal, change the status and then delete
	 * that block, we risk forgetting to revoke the old log record
	 * from the journal and so a subsequent replay can corrupt data.
	 * So, first we make sure that the journal is empty and that
	 * nobody is changing anything.
	 */

	journal = EXT4_JOURNAL(inode);
	if (!journal)
		return 0;
	if (is_journal_aborted(journal))
		return -EROFS;

	/* Wait for all existing dio workers */
	inode_dio_wait(inode);

	/*
	 * Before flushing the journal and switching inode's aops, we have
	 * to flush all dirty data the inode has. There can be outstanding
	 * delayed allocations, there can be unwritten extents created by
	 * fallocate or buffered writes in dioread_nolock mode covered by
	 * dirty data which can be converted only after flushing the dirty
	 * data (and journalled aops don't know how to handle these cases).
	 */
	if (val) {
		filemap_invalidate_lock(inode->i_mapping);
		err = filemap_write_and_wait(inode->i_mapping);
		if (err < 0) {
			filemap_invalidate_unlock(inode->i_mapping);
			return err;
		}
	}

	alloc_ctx = ext4_writepages_down_write(inode->i_sb);
	jbd2_journal_lock_updates(journal);

	/*
	 * OK, there are no updates running now, and all cached data is
	 * synced to disk.  We are now in a completely consistent state
	 * which doesn't have anything in the journal, and we know that
	 * no filesystem updates are running, so it is safe to modify
	 * the inode's in-core data-journaling state flag now.
	 */

	if (val)
		ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
	else {
		err = jbd2_journal_flush(journal, 0);
		if (err < 0) {
			jbd2_journal_unlock_updates(journal);
			ext4_writepages_up_write(inode->i_sb, alloc_ctx);
			return err;
		}
		ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
	}
	ext4_set_aops(inode);

	jbd2_journal_unlock_updates(journal);
	ext4_writepages_up_write(inode->i_sb, alloc_ctx);

	if (val)
		filemap_invalidate_unlock(inode->i_mapping);

	/* Finally we can mark the inode as dirty. */

	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	ext4_fc_mark_ineligible(inode->i_sb,
				EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle);
	err = ext4_mark_inode_dirty(handle, inode);
	ext4_handle_sync(handle);
	ext4_journal_stop(handle);
	ext4_std_error(inode->i_sb, err);

	return err;
}
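
#if 0	/* Illustrative userspace sketch (an assumption, not part of the
	 * original file).  Per-file data journalling is toggled from user
	 * space via FS_IOC_SETFLAGS (which is what 'chattr +j' does); the
	 * kernel side of that flag change ends up in
	 * ext4_change_inode_journal_flag() above.
	 */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

static int set_data_journalling(int fd, int on)
{
	int flags;

	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0)
		return -1;
	if (on)
		flags |= FS_JOURNAL_DATA_FL;
	else
		flags &= ~FS_JOURNAL_DATA_FL;
	return ioctl(fd, FS_IOC_SETFLAGS, &flags);
}
#endif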
static int ext4_bh_unmapped(handle_t *handle, struct inode *inode,
			    struct buffer_head *bh)
{
	return !buffer_mapped(bh);
}
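
#if 0	/* Illustrative userspace sketch (an assumption, not part of the
	 * original file).  A shared, writable mapping is what drives
	 * ext4_page_mkwrite() below: the first store into a clean page
	 * takes a write fault, and the filesystem gets a chance to
	 * allocate or reserve blocks before the page may be dirtied.
	 */
#include <sys/mman.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int dirty_mapped_page(const char *path)
{
	int fd = open(path, O_RDWR);
	char *p;

	if (fd < 0)
		return -1;
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		close(fd);
		return -1;
	}
	memcpy(p, "hello", 5);		/* write fault -> page_mkwrite */
	munmap(p, 4096);
	close(fd);
	return 0;
}
#endif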
vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio = page_folio(vmf->page);
	loff_t size;
	unsigned long len;
	int err;
	vm_fault_t ret;
	struct file *file = vma->vm_file;
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	handle_t *handle;
	get_block_t *get_block;
	int retries = 0;

	if (unlikely(IS_IMMUTABLE(inode)))
		return VM_FAULT_SIGBUS;

	sb_start_pagefault(inode->i_sb);
	file_update_time(vma->vm_file);

	filemap_invalidate_lock_shared(mapping);

	err = ext4_convert_inline_data(inode);
	if (err)
		goto out_ret;

	/*
	 * On data journalling we skip straight to the transaction handle:
	 * there's no delalloc; page truncated will be checked later; the
	 * early return w/ all buffers mapped (calculates size/len) can't
	 * be used; and there's no dioread_nolock, so only ext4_get_block.
	 */
	if (ext4_should_journal_data(inode))
		goto retry_alloc;

	/* Delalloc case is easy... */
	if (test_opt(inode->i_sb, DELALLOC) &&
	    !ext4_nonda_switch(inode->i_sb)) {
		do {
			err = block_page_mkwrite(vma, vmf,
						 ext4_da_get_block_prep);
		} while (err == -ENOSPC &&
			 ext4_should_retry_alloc(inode->i_sb, &retries));
		goto out_ret;
	}

	folio_lock(folio);
	size = i_size_read(inode);
	/* Page got truncated from under us? */
	if (folio->mapping != mapping || folio_pos(folio) > size) {
		folio_unlock(folio);
		ret = VM_FAULT_NOPAGE;
		goto out;
	}

	len = folio_size(folio);
	if (folio_pos(folio) + len > size)
		len = size - folio_pos(folio);
	/*
	 * Return if we have all the buffers mapped. This avoids the need to do
	 * journal_start/journal_stop which can block and take a long time
	 *
	 * This cannot be done for data journalling, as we have to add the
	 * inode to the transaction's list to writeprotect pages on commit.
	 */
	if (folio_buffers(folio)) {
		if (!ext4_walk_page_buffers(NULL, inode, folio_buffers(folio),
					    0, len, NULL,
					    ext4_bh_unmapped)) {
			/* Wait so that we don't change page under IO */
			folio_wait_stable(folio);
			ret = VM_FAULT_LOCKED;
			goto out;
		}
	}
	folio_unlock(folio);
	/* OK, we need to fill the hole... */
	if (ext4_should_dioread_nolock(inode))
		get_block = ext4_get_block_unwritten;
	else
		get_block = ext4_get_block;
retry_alloc:
	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
				    ext4_writepage_trans_blocks(inode));
	if (IS_ERR(handle)) {
		ret = VM_FAULT_SIGBUS;
		goto out;
	}
	/*
	 * Data journalling can't use block_page_mkwrite() because it
	 * will set_buffer_dirty() before do_journal_get_write_access()
	 * thus might hit warning messages for dirty metadata buffers.
	 */
	if (!ext4_should_journal_data(inode)) {
		err = block_page_mkwrite(vma, vmf, get_block);
	} else {
		folio_lock(folio);
		size = i_size_read(inode);
		/* Page got truncated from under us? */
		if (folio->mapping != mapping || folio_pos(folio) > size) {
			ret = VM_FAULT_NOPAGE;
			goto out_error;
		}

		len = folio_size(folio);
		if (folio_pos(folio) + len > size)
			len = size - folio_pos(folio);

		err = ext4_block_write_begin(handle, folio, 0, len,
					     ext4_get_block);
		if (!err) {
			ret = VM_FAULT_SIGBUS;
			if (ext4_journal_folio_buffers(handle, folio, len))
				goto out_error;
		} else {
			folio_unlock(folio);
		}
	}
	ext4_journal_stop(handle);
	if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry_alloc;
out_ret:
	ret = vmf_fs_error(err);
out:
	filemap_invalidate_unlock_shared(mapping);
	sb_end_pagefault(inode->i_sb);
	return ret;
out_error:
	folio_unlock(folio);
	ext4_journal_stop(handle);