1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
4 * Written by Alex Tomas <alex@clusterfs.com>
6 * Architecture independence:
7 * Copyright (c) 2005, Bull S.A.
8 * Written by Pierre Peiffer <pierre.peiffer@bull.net>
12 * Extents support for EXT4
15 * - ext4*_error() should be used in some situations
16 * - analyze all BUG()/BUG_ON(), use -EIO where appropriate
17 * - smart tree reduction
21 #include <linux/time.h>
22 #include <linux/jbd2.h>
23 #include <linux/highuid.h>
24 #include <linux/pagemap.h>
25 #include <linux/quotaops.h>
26 #include <linux/string.h>
27 #include <linux/slab.h>
28 #include <linux/uaccess.h>
29 #include <linux/fiemap.h>
30 #include <linux/iomap.h>
31 #include <linux/sched/mm.h>
32 #include "ext4_jbd2.h"
33 #include "ext4_extents.h"
36 #include <trace/events/ext4.h>
39 * used by extent splitting.
41 #define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \
43 #define EXT4_EXT_MARK_UNWRIT1 0x2 /* mark first half unwritten */
44 #define EXT4_EXT_MARK_UNWRIT2 0x4 /* mark second half unwritten */
46 #define EXT4_EXT_DATA_VALID1 0x8 /* first half contains valid data */
47 #define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */
49 static __le32
ext4_extent_block_csum(struct inode
*inode
,
50 struct ext4_extent_header
*eh
)
52 struct ext4_inode_info
*ei
= EXT4_I(inode
);
53 struct ext4_sb_info
*sbi
= EXT4_SB(inode
->i_sb
);
56 csum
= ext4_chksum(sbi
, ei
->i_csum_seed
, (__u8
*)eh
,
57 EXT4_EXTENT_TAIL_OFFSET(eh
));
58 return cpu_to_le32(csum
);
61 static int ext4_extent_block_csum_verify(struct inode
*inode
,
62 struct ext4_extent_header
*eh
)
64 struct ext4_extent_tail
*et
;
66 if (!ext4_has_metadata_csum(inode
->i_sb
))
69 et
= find_ext4_extent_tail(eh
);
70 if (et
->et_checksum
!= ext4_extent_block_csum(inode
, eh
))
75 static void ext4_extent_block_csum_set(struct inode
*inode
,
76 struct ext4_extent_header
*eh
)
78 struct ext4_extent_tail
*et
;
80 if (!ext4_has_metadata_csum(inode
->i_sb
))
83 et
= find_ext4_extent_tail(eh
);
84 et
->et_checksum
= ext4_extent_block_csum(inode
, eh
);
87 static struct ext4_ext_path
*ext4_split_extent_at(handle_t
*handle
,
89 struct ext4_ext_path
*path
,
91 int split_flag
, int flags
);
93 static int ext4_ext_trunc_restart_fn(struct inode
*inode
, int *dropped
)
96 * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
97 * moment, get_block can be called only for blocks inside i_size since
98 * page cache has been already dropped and writes are blocked by
99 * i_rwsem. So we can safely drop the i_data_sem here.
101 BUG_ON(EXT4_JOURNAL(inode
) == NULL
);
102 ext4_discard_preallocations(inode
);
103 up_write(&EXT4_I(inode
)->i_data_sem
);
108 static inline void ext4_ext_path_brelse(struct ext4_ext_path
*path
)
114 static void ext4_ext_drop_refs(struct ext4_ext_path
*path
)
118 if (IS_ERR_OR_NULL(path
))
120 depth
= path
->p_depth
;
121 for (i
= 0; i
<= depth
; i
++, path
++)
122 ext4_ext_path_brelse(path
);
125 void ext4_free_ext_path(struct ext4_ext_path
*path
)
127 if (IS_ERR_OR_NULL(path
))
129 ext4_ext_drop_refs(path
);
134 * Make sure 'handle' has at least 'check_cred' credits. If not, restart
135 * transaction with 'restart_cred' credits. The function drops i_data_sem
136 * when restarting transaction and gets it after transaction is restarted.
138 * The function returns 0 on success, 1 if transaction had to be restarted,
139 * and < 0 in case of fatal error.
141 int ext4_datasem_ensure_credits(handle_t
*handle
, struct inode
*inode
,
142 int check_cred
, int restart_cred
,
148 ret
= ext4_journal_ensure_credits_fn(handle
, check_cred
, restart_cred
,
149 revoke_cred
, ext4_ext_trunc_restart_fn(inode
, &dropped
));
151 down_write(&EXT4_I(inode
)->i_data_sem
);
160 static int ext4_ext_get_access(handle_t
*handle
, struct inode
*inode
,
161 struct ext4_ext_path
*path
)
166 /* path points to block */
167 BUFFER_TRACE(path
->p_bh
, "get_write_access");
168 err
= ext4_journal_get_write_access(handle
, inode
->i_sb
,
169 path
->p_bh
, EXT4_JTR_NONE
);
171 * The extent buffer's verified bit will be set again in
172 * __ext4_ext_dirty(). We could leave an inconsistent
173 * buffer if the extents updating procedure breaks off due
174 * to some error, so force it to be checked again.
177 clear_buffer_verified(path
->p_bh
);
179 /* path points to leaf/index in inode body */
180 /* we use in-core data, no need to protect them */
190 static int __ext4_ext_dirty(const char *where
, unsigned int line
,
191 handle_t
*handle
, struct inode
*inode
,
192 struct ext4_ext_path
*path
)
196 WARN_ON(!rwsem_is_locked(&EXT4_I(inode
)->i_data_sem
));
198 ext4_extent_block_csum_set(inode
, ext_block_hdr(path
->p_bh
));
199 /* path points to block */
200 err
= __ext4_handle_dirty_metadata(where
, line
, handle
,
202 /* Extents updating done, re-set verified flag */
204 set_buffer_verified(path
->p_bh
);
206 /* path points to leaf/index in inode body */
207 err
= ext4_mark_inode_dirty(handle
, inode
);
212 #define ext4_ext_dirty(handle, inode, path) \
213 __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
215 static ext4_fsblk_t
ext4_ext_find_goal(struct inode
*inode
,
216 struct ext4_ext_path
*path
,
220 int depth
= path
->p_depth
;
221 struct ext4_extent
*ex
;
224 * Try to predict block placement assuming that we are
225 * filling in a file which will eventually be
226 * non-sparse --- i.e., in the case of libbfd writing
227 * an ELF object sections out-of-order but in a way
228 * that eventually results in a contiguous object or
229 * executable file, or some database extending a table
230 * space file. However, this is actually somewhat
231 * non-ideal if we are writing a sparse file such as
232 * qemu or KVM writing a raw image file that is going
233 * to stay fairly sparse, since it will end up
234 * fragmenting the file system's free space. Maybe we
235 * should have some heuristics or some way to allow
236 * userspace to pass a hint to the file system,
237 * especially if the latter case turns out to be
240 ex
= path
[depth
].p_ext
;
242 ext4_fsblk_t ext_pblk
= ext4_ext_pblock(ex
);
243 ext4_lblk_t ext_block
= le32_to_cpu(ex
->ee_block
);
245 if (block
> ext_block
)
246 return ext_pblk
+ (block
- ext_block
);
248 return ext_pblk
- (ext_block
- block
);
251 /* it looks like index is empty;
252 * try to find starting block from index itself */
253 if (path
[depth
].p_bh
)
254 return path
[depth
].p_bh
->b_blocknr
;
257 /* OK. use inode's group */
258 return ext4_inode_to_goal_block(inode
);
262 * Allocation for a meta data block
265 ext4_ext_new_meta_block(handle_t
*handle
, struct inode
*inode
,
266 struct ext4_ext_path
*path
,
267 struct ext4_extent
*ex
, int *err
, unsigned int flags
)
269 ext4_fsblk_t goal
, newblock
;
271 goal
= ext4_ext_find_goal(inode
, path
, le32_to_cpu(ex
->ee_block
));
272 newblock
= ext4_new_meta_blocks(handle
, inode
, goal
, flags
,
277 static inline int ext4_ext_space_block(struct inode
*inode
, int check
)
281 size
= (inode
->i_sb
->s_blocksize
- sizeof(struct ext4_extent_header
))
282 / sizeof(struct ext4_extent
);
283 #ifdef AGGRESSIVE_TEST
284 if (!check
&& size
> 6)
290 static inline int ext4_ext_space_block_idx(struct inode
*inode
, int check
)
294 size
= (inode
->i_sb
->s_blocksize
- sizeof(struct ext4_extent_header
))
295 / sizeof(struct ext4_extent_idx
);
296 #ifdef AGGRESSIVE_TEST
297 if (!check
&& size
> 5)
303 static inline int ext4_ext_space_root(struct inode
*inode
, int check
)
307 size
= sizeof(EXT4_I(inode
)->i_data
);
308 size
-= sizeof(struct ext4_extent_header
);
309 size
/= sizeof(struct ext4_extent
);
310 #ifdef AGGRESSIVE_TEST
311 if (!check
&& size
> 3)
317 static inline int ext4_ext_space_root_idx(struct inode
*inode
, int check
)
321 size
= sizeof(EXT4_I(inode
)->i_data
);
322 size
-= sizeof(struct ext4_extent_header
);
323 size
/= sizeof(struct ext4_extent_idx
);
324 #ifdef AGGRESSIVE_TEST
325 if (!check
&& size
> 4)
331 static inline struct ext4_ext_path
*
332 ext4_force_split_extent_at(handle_t
*handle
, struct inode
*inode
,
333 struct ext4_ext_path
*path
, ext4_lblk_t lblk
,
336 int unwritten
= ext4_ext_is_unwritten(path
[path
->p_depth
].p_ext
);
337 int flags
= EXT4_EX_NOCACHE
| EXT4_GET_BLOCKS_PRE_IO
;
340 flags
|= EXT4_GET_BLOCKS_METADATA_NOFAIL
| EXT4_EX_NOFAIL
;
342 return ext4_split_extent_at(handle
, inode
, path
, lblk
, unwritten
?
343 EXT4_EXT_MARK_UNWRIT1
|EXT4_EXT_MARK_UNWRIT2
: 0,
348 ext4_ext_max_entries(struct inode
*inode
, int depth
)
352 if (depth
== ext_depth(inode
)) {
354 max
= ext4_ext_space_root(inode
, 1);
356 max
= ext4_ext_space_root_idx(inode
, 1);
359 max
= ext4_ext_space_block(inode
, 1);
361 max
= ext4_ext_space_block_idx(inode
, 1);
367 static int ext4_valid_extent(struct inode
*inode
, struct ext4_extent
*ext
)
369 ext4_fsblk_t block
= ext4_ext_pblock(ext
);
370 int len
= ext4_ext_get_actual_len(ext
);
371 ext4_lblk_t lblock
= le32_to_cpu(ext
->ee_block
);
376 * - overflow/wrap-around
378 if (lblock
+ len
<= lblock
)
380 return ext4_inode_block_valid(inode
, block
, len
);
383 static int ext4_valid_extent_idx(struct inode
*inode
,
384 struct ext4_extent_idx
*ext_idx
)
386 ext4_fsblk_t block
= ext4_idx_pblock(ext_idx
);
388 return ext4_inode_block_valid(inode
, block
, 1);
391 static int ext4_valid_extent_entries(struct inode
*inode
,
392 struct ext4_extent_header
*eh
,
393 ext4_lblk_t lblk
, ext4_fsblk_t
*pblk
,
396 unsigned short entries
;
397 ext4_lblk_t lblock
= 0;
400 if (eh
->eh_entries
== 0)
403 entries
= le16_to_cpu(eh
->eh_entries
);
407 struct ext4_extent
*ext
= EXT_FIRST_EXTENT(eh
);
410 * The logical block in the first entry should be equal to
411 * the number in the index block.
413 if (depth
!= ext_depth(inode
) &&
414 lblk
!= le32_to_cpu(ext
->ee_block
))
417 if (!ext4_valid_extent(inode
, ext
))
420 /* Check for overlapping extents */
421 lblock
= le32_to_cpu(ext
->ee_block
);
423 *pblk
= ext4_ext_pblock(ext
);
426 cur
= lblock
+ ext4_ext_get_actual_len(ext
);
431 struct ext4_extent_idx
*ext_idx
= EXT_FIRST_INDEX(eh
);
434 * The logical block in the first entry should be equal to
435 * the number in the parent index block.
437 if (depth
!= ext_depth(inode
) &&
438 lblk
!= le32_to_cpu(ext_idx
->ei_block
))
441 if (!ext4_valid_extent_idx(inode
, ext_idx
))
444 /* Check for overlapping index extents */
445 lblock
= le32_to_cpu(ext_idx
->ei_block
);
447 *pblk
= ext4_idx_pblock(ext_idx
);
458 static int __ext4_ext_check(const char *function
, unsigned int line
,
459 struct inode
*inode
, struct ext4_extent_header
*eh
,
460 int depth
, ext4_fsblk_t pblk
, ext4_lblk_t lblk
)
462 const char *error_msg
;
463 int max
= 0, err
= -EFSCORRUPTED
;
465 if (unlikely(eh
->eh_magic
!= EXT4_EXT_MAGIC
)) {
466 error_msg
= "invalid magic";
469 if (unlikely(le16_to_cpu(eh
->eh_depth
) != depth
)) {
470 error_msg
= "unexpected eh_depth";
473 if (unlikely(eh
->eh_max
== 0)) {
474 error_msg
= "invalid eh_max";
477 max
= ext4_ext_max_entries(inode
, depth
);
478 if (unlikely(le16_to_cpu(eh
->eh_max
) > max
)) {
479 error_msg
= "too large eh_max";
482 if (unlikely(le16_to_cpu(eh
->eh_entries
) > le16_to_cpu(eh
->eh_max
))) {
483 error_msg
= "invalid eh_entries";
486 if (unlikely((eh
->eh_entries
== 0) && (depth
> 0))) {
487 error_msg
= "eh_entries is 0 but eh_depth is > 0";
490 if (!ext4_valid_extent_entries(inode
, eh
, lblk
, &pblk
, depth
)) {
491 error_msg
= "invalid extent entries";
494 if (unlikely(depth
> 32)) {
495 error_msg
= "too large eh_depth";
498 /* Verify checksum on non-root extent tree nodes */
499 if (ext_depth(inode
) != depth
&&
500 !ext4_extent_block_csum_verify(inode
, eh
)) {
501 error_msg
= "extent tree corrupted";
508 ext4_error_inode_err(inode
, function
, line
, 0, -err
,
509 "pblk %llu bad header/extent: %s - magic %x, "
510 "entries %u, max %u(%u), depth %u(%u)",
511 (unsigned long long) pblk
, error_msg
,
512 le16_to_cpu(eh
->eh_magic
),
513 le16_to_cpu(eh
->eh_entries
),
514 le16_to_cpu(eh
->eh_max
),
515 max
, le16_to_cpu(eh
->eh_depth
), depth
);
519 #define ext4_ext_check(inode, eh, depth, pblk) \
520 __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk), 0)
522 int ext4_ext_check_inode(struct inode
*inode
)
524 return ext4_ext_check(inode
, ext_inode_hdr(inode
), ext_depth(inode
), 0);
527 static void ext4_cache_extents(struct inode
*inode
,
528 struct ext4_extent_header
*eh
)
530 struct ext4_extent
*ex
= EXT_FIRST_EXTENT(eh
);
531 ext4_lblk_t prev
= 0;
534 for (i
= le16_to_cpu(eh
->eh_entries
); i
> 0; i
--, ex
++) {
535 unsigned int status
= EXTENT_STATUS_WRITTEN
;
536 ext4_lblk_t lblk
= le32_to_cpu(ex
->ee_block
);
537 int len
= ext4_ext_get_actual_len(ex
);
539 if (prev
&& (prev
!= lblk
))
540 ext4_es_cache_extent(inode
, prev
, lblk
- prev
, ~0,
543 if (ext4_ext_is_unwritten(ex
))
544 status
= EXTENT_STATUS_UNWRITTEN
;
545 ext4_es_cache_extent(inode
, lblk
, len
,
546 ext4_ext_pblock(ex
), status
);
551 static struct buffer_head
*
552 __read_extent_tree_block(const char *function
, unsigned int line
,
553 struct inode
*inode
, struct ext4_extent_idx
*idx
,
554 int depth
, int flags
)
556 struct buffer_head
*bh
;
558 gfp_t gfp_flags
= __GFP_MOVABLE
| GFP_NOFS
;
561 if (flags
& EXT4_EX_NOFAIL
)
562 gfp_flags
|= __GFP_NOFAIL
;
564 pblk
= ext4_idx_pblock(idx
);
565 bh
= sb_getblk_gfp(inode
->i_sb
, pblk
, gfp_flags
);
567 return ERR_PTR(-ENOMEM
);
569 if (!bh_uptodate_or_lock(bh
)) {
570 trace_ext4_ext_load_extent(inode
, pblk
, _RET_IP_
);
571 err
= ext4_read_bh(bh
, 0, NULL
, false);
575 if (buffer_verified(bh
) && !(flags
& EXT4_EX_FORCE_CACHE
))
577 err
= __ext4_ext_check(function
, line
, inode
, ext_block_hdr(bh
),
578 depth
, pblk
, le32_to_cpu(idx
->ei_block
));
581 set_buffer_verified(bh
);
583 * If this is a leaf block, cache all of its entries
585 if (!(flags
& EXT4_EX_NOCACHE
) && depth
== 0) {
586 struct ext4_extent_header
*eh
= ext_block_hdr(bh
);
587 ext4_cache_extents(inode
, eh
);
596 #define read_extent_tree_block(inode, idx, depth, flags) \
597 __read_extent_tree_block(__func__, __LINE__, (inode), (idx), \
601 * This function is called to cache a file's extent information in the
604 int ext4_ext_precache(struct inode
*inode
)
606 struct ext4_inode_info
*ei
= EXT4_I(inode
);
607 struct ext4_ext_path
*path
= NULL
;
608 struct buffer_head
*bh
;
609 int i
= 0, depth
, ret
= 0;
611 if (!ext4_test_inode_flag(inode
, EXT4_INODE_EXTENTS
))
612 return 0; /* not an extent-mapped inode */
614 down_read(&ei
->i_data_sem
);
615 depth
= ext_depth(inode
);
617 /* Don't cache anything if there are no external extent blocks */
619 up_read(&ei
->i_data_sem
);
623 path
= kcalloc(depth
+ 1, sizeof(struct ext4_ext_path
),
626 up_read(&ei
->i_data_sem
);
630 path
[0].p_hdr
= ext_inode_hdr(inode
);
631 ret
= ext4_ext_check(inode
, path
[0].p_hdr
, depth
, 0);
634 path
[0].p_idx
= EXT_FIRST_INDEX(path
[0].p_hdr
);
637 * If this is a leaf block or we've reached the end of
638 * the index block, go up
641 path
[i
].p_idx
> EXT_LAST_INDEX(path
[i
].p_hdr
)) {
642 ext4_ext_path_brelse(path
+ i
);
646 bh
= read_extent_tree_block(inode
, path
[i
].p_idx
++,
648 EXT4_EX_FORCE_CACHE
);
655 path
[i
].p_hdr
= ext_block_hdr(bh
);
656 path
[i
].p_idx
= EXT_FIRST_INDEX(path
[i
].p_hdr
);
658 ext4_set_inode_state(inode
, EXT4_STATE_EXT_PRECACHED
);
660 up_read(&ei
->i_data_sem
);
661 ext4_free_ext_path(path
);
666 static void ext4_ext_show_path(struct inode
*inode
, struct ext4_ext_path
*path
)
668 int k
, l
= path
->p_depth
;
670 ext_debug(inode
, "path:");
671 for (k
= 0; k
<= l
; k
++, path
++) {
673 ext_debug(inode
, " %d->%llu",
674 le32_to_cpu(path
->p_idx
->ei_block
),
675 ext4_idx_pblock(path
->p_idx
));
676 } else if (path
->p_ext
) {
677 ext_debug(inode
, " %d:[%d]%d:%llu ",
678 le32_to_cpu(path
->p_ext
->ee_block
),
679 ext4_ext_is_unwritten(path
->p_ext
),
680 ext4_ext_get_actual_len(path
->p_ext
),
681 ext4_ext_pblock(path
->p_ext
));
683 ext_debug(inode
, " []");
685 ext_debug(inode
, "\n");
688 static void ext4_ext_show_leaf(struct inode
*inode
, struct ext4_ext_path
*path
)
690 int depth
= ext_depth(inode
);
691 struct ext4_extent_header
*eh
;
692 struct ext4_extent
*ex
;
695 if (IS_ERR_OR_NULL(path
))
698 eh
= path
[depth
].p_hdr
;
699 ex
= EXT_FIRST_EXTENT(eh
);
701 ext_debug(inode
, "Displaying leaf extents\n");
703 for (i
= 0; i
< le16_to_cpu(eh
->eh_entries
); i
++, ex
++) {
704 ext_debug(inode
, "%d:[%d]%d:%llu ", le32_to_cpu(ex
->ee_block
),
705 ext4_ext_is_unwritten(ex
),
706 ext4_ext_get_actual_len(ex
), ext4_ext_pblock(ex
));
708 ext_debug(inode
, "\n");
711 static void ext4_ext_show_move(struct inode
*inode
, struct ext4_ext_path
*path
,
712 ext4_fsblk_t newblock
, int level
)
714 int depth
= ext_depth(inode
);
715 struct ext4_extent
*ex
;
717 if (depth
!= level
) {
718 struct ext4_extent_idx
*idx
;
719 idx
= path
[level
].p_idx
;
720 while (idx
<= EXT_MAX_INDEX(path
[level
].p_hdr
)) {
721 ext_debug(inode
, "%d: move %d:%llu in new index %llu\n",
722 level
, le32_to_cpu(idx
->ei_block
),
723 ext4_idx_pblock(idx
), newblock
);
730 ex
= path
[depth
].p_ext
;
731 while (ex
<= EXT_MAX_EXTENT(path
[depth
].p_hdr
)) {
732 ext_debug(inode
, "move %d:%llu:[%d]%d in new leaf %llu\n",
733 le32_to_cpu(ex
->ee_block
),
735 ext4_ext_is_unwritten(ex
),
736 ext4_ext_get_actual_len(ex
),
743 #define ext4_ext_show_path(inode, path)
744 #define ext4_ext_show_leaf(inode, path)
745 #define ext4_ext_show_move(inode, path, newblock, level)
749 * ext4_ext_binsearch_idx:
750 * binary search for the closest index of the given block
751 * the header must be checked before calling this
754 ext4_ext_binsearch_idx(struct inode
*inode
,
755 struct ext4_ext_path
*path
, ext4_lblk_t block
)
757 struct ext4_extent_header
*eh
= path
->p_hdr
;
758 struct ext4_extent_idx
*r
, *l
, *m
;
761 ext_debug(inode
, "binsearch for %u(idx): ", block
);
763 l
= EXT_FIRST_INDEX(eh
) + 1;
764 r
= EXT_LAST_INDEX(eh
);
767 ext_debug(inode
, "%p(%u):%p(%u):%p(%u) ", l
,
768 le32_to_cpu(l
->ei_block
), m
, le32_to_cpu(m
->ei_block
),
769 r
, le32_to_cpu(r
->ei_block
));
771 if (block
< le32_to_cpu(m
->ei_block
))
778 ext_debug(inode
, " -> %u->%lld ", le32_to_cpu(path
->p_idx
->ei_block
),
779 ext4_idx_pblock(path
->p_idx
));
781 #ifdef CHECK_BINSEARCH
783 struct ext4_extent_idx
*chix
, *ix
;
786 chix
= ix
= EXT_FIRST_INDEX(eh
);
787 for (k
= 0; k
< le16_to_cpu(eh
->eh_entries
); k
++, ix
++) {
788 if (k
!= 0 && le32_to_cpu(ix
->ei_block
) <=
789 le32_to_cpu(ix
[-1].ei_block
)) {
790 printk(KERN_DEBUG
"k=%d, ix=0x%p, "
792 ix
, EXT_FIRST_INDEX(eh
));
793 printk(KERN_DEBUG
"%u <= %u\n",
794 le32_to_cpu(ix
->ei_block
),
795 le32_to_cpu(ix
[-1].ei_block
));
797 BUG_ON(k
&& le32_to_cpu(ix
->ei_block
)
798 <= le32_to_cpu(ix
[-1].ei_block
));
799 if (block
< le32_to_cpu(ix
->ei_block
))
803 BUG_ON(chix
!= path
->p_idx
);
810 * ext4_ext_binsearch:
811 * binary search for closest extent of the given block
812 * the header must be checked before calling this
815 ext4_ext_binsearch(struct inode
*inode
,
816 struct ext4_ext_path
*path
, ext4_lblk_t block
)
818 struct ext4_extent_header
*eh
= path
->p_hdr
;
819 struct ext4_extent
*r
, *l
, *m
;
821 if (eh
->eh_entries
== 0) {
823 * this leaf is empty:
824 * we get such a leaf in split/add case
829 ext_debug(inode
, "binsearch for %u: ", block
);
831 l
= EXT_FIRST_EXTENT(eh
) + 1;
832 r
= EXT_LAST_EXTENT(eh
);
836 ext_debug(inode
, "%p(%u):%p(%u):%p(%u) ", l
,
837 le32_to_cpu(l
->ee_block
), m
, le32_to_cpu(m
->ee_block
),
838 r
, le32_to_cpu(r
->ee_block
));
840 if (block
< le32_to_cpu(m
->ee_block
))
847 ext_debug(inode
, " -> %d:%llu:[%d]%d ",
848 le32_to_cpu(path
->p_ext
->ee_block
),
849 ext4_ext_pblock(path
->p_ext
),
850 ext4_ext_is_unwritten(path
->p_ext
),
851 ext4_ext_get_actual_len(path
->p_ext
));
853 #ifdef CHECK_BINSEARCH
855 struct ext4_extent
*chex
, *ex
;
858 chex
= ex
= EXT_FIRST_EXTENT(eh
);
859 for (k
= 0; k
< le16_to_cpu(eh
->eh_entries
); k
++, ex
++) {
860 BUG_ON(k
&& le32_to_cpu(ex
->ee_block
)
861 <= le32_to_cpu(ex
[-1].ee_block
));
862 if (block
< le32_to_cpu(ex
->ee_block
))
866 BUG_ON(chex
!= path
->p_ext
);
872 void ext4_ext_tree_init(handle_t
*handle
, struct inode
*inode
)
874 struct ext4_extent_header
*eh
;
876 eh
= ext_inode_hdr(inode
);
879 eh
->eh_magic
= EXT4_EXT_MAGIC
;
880 eh
->eh_max
= cpu_to_le16(ext4_ext_space_root(inode
, 0));
881 eh
->eh_generation
= 0;
882 ext4_mark_inode_dirty(handle
, inode
);
885 struct ext4_ext_path
*
886 ext4_find_extent(struct inode
*inode
, ext4_lblk_t block
,
887 struct ext4_ext_path
*path
, int flags
)
889 struct ext4_extent_header
*eh
;
890 struct buffer_head
*bh
;
891 short int depth
, i
, ppos
= 0;
893 gfp_t gfp_flags
= GFP_NOFS
;
895 if (flags
& EXT4_EX_NOFAIL
)
896 gfp_flags
|= __GFP_NOFAIL
;
898 eh
= ext_inode_hdr(inode
);
899 depth
= ext_depth(inode
);
900 if (depth
< 0 || depth
> EXT4_MAX_EXTENT_DEPTH
) {
901 EXT4_ERROR_INODE(inode
, "inode has invalid extent depth: %d",
908 ext4_ext_drop_refs(path
);
909 if (depth
> path
[0].p_maxdepth
) {
915 /* account for a possible depth increase */
916 path
= kcalloc(depth
+ 2, sizeof(struct ext4_ext_path
),
919 return ERR_PTR(-ENOMEM
);
920 path
[0].p_maxdepth
= depth
+ 1;
926 if (!(flags
& EXT4_EX_NOCACHE
) && depth
== 0)
927 ext4_cache_extents(inode
, eh
);
928 /* walk through the tree */
930 ext_debug(inode
, "depth %d: num %d, max %d\n",
931 ppos
, le16_to_cpu(eh
->eh_entries
), le16_to_cpu(eh
->eh_max
));
933 ext4_ext_binsearch_idx(inode
, path
+ ppos
, block
);
934 path
[ppos
].p_block
= ext4_idx_pblock(path
[ppos
].p_idx
);
935 path
[ppos
].p_depth
= i
;
936 path
[ppos
].p_ext
= NULL
;
938 bh
= read_extent_tree_block(inode
, path
[ppos
].p_idx
, --i
, flags
);
944 eh
= ext_block_hdr(bh
);
946 path
[ppos
].p_bh
= bh
;
947 path
[ppos
].p_hdr
= eh
;
950 path
[ppos
].p_depth
= i
;
951 path
[ppos
].p_ext
= NULL
;
952 path
[ppos
].p_idx
= NULL
;
955 ext4_ext_binsearch(inode
, path
+ ppos
, block
);
956 /* if not an empty leaf */
957 if (path
[ppos
].p_ext
)
958 path
[ppos
].p_block
= ext4_ext_pblock(path
[ppos
].p_ext
);
960 ext4_ext_show_path(inode
, path
);
965 ext4_free_ext_path(path
);
970 * ext4_ext_insert_index:
971 * insert new index [@logical;@ptr] into the block at @curp;
972 * check where to insert: before @curp or after @curp
974 static int ext4_ext_insert_index(handle_t
*handle
, struct inode
*inode
,
975 struct ext4_ext_path
*curp
,
976 int logical
, ext4_fsblk_t ptr
)
978 struct ext4_extent_idx
*ix
;
981 err
= ext4_ext_get_access(handle
, inode
, curp
);
985 if (unlikely(logical
== le32_to_cpu(curp
->p_idx
->ei_block
))) {
986 EXT4_ERROR_INODE(inode
,
987 "logical %d == ei_block %d!",
988 logical
, le32_to_cpu(curp
->p_idx
->ei_block
));
989 return -EFSCORRUPTED
;
992 if (unlikely(le16_to_cpu(curp
->p_hdr
->eh_entries
)
993 >= le16_to_cpu(curp
->p_hdr
->eh_max
))) {
994 EXT4_ERROR_INODE(inode
,
995 "eh_entries %d >= eh_max %d!",
996 le16_to_cpu(curp
->p_hdr
->eh_entries
),
997 le16_to_cpu(curp
->p_hdr
->eh_max
));
998 return -EFSCORRUPTED
;
1001 if (logical
> le32_to_cpu(curp
->p_idx
->ei_block
)) {
1003 ext_debug(inode
, "insert new index %d after: %llu\n",
1005 ix
= curp
->p_idx
+ 1;
1008 ext_debug(inode
, "insert new index %d before: %llu\n",
1013 if (unlikely(ix
> EXT_MAX_INDEX(curp
->p_hdr
))) {
1014 EXT4_ERROR_INODE(inode
, "ix > EXT_MAX_INDEX!");
1015 return -EFSCORRUPTED
;
1018 len
= EXT_LAST_INDEX(curp
->p_hdr
) - ix
+ 1;
1021 ext_debug(inode
, "insert new index %d: "
1022 "move %d indices from 0x%p to 0x%p\n",
1023 logical
, len
, ix
, ix
+ 1);
1024 memmove(ix
+ 1, ix
, len
* sizeof(struct ext4_extent_idx
));
1027 ix
->ei_block
= cpu_to_le32(logical
);
1028 ext4_idx_store_pblock(ix
, ptr
);
1029 le16_add_cpu(&curp
->p_hdr
->eh_entries
, 1);
1031 if (unlikely(ix
> EXT_LAST_INDEX(curp
->p_hdr
))) {
1032 EXT4_ERROR_INODE(inode
, "ix > EXT_LAST_INDEX!");
1033 return -EFSCORRUPTED
;
1036 err
= ext4_ext_dirty(handle
, inode
, curp
);
1037 ext4_std_error(inode
->i_sb
, err
);
1044 * inserts new subtree into the path, using free index entry
1046 * - allocates all needed blocks (new leaf and all intermediate index blocks)
1047 * - makes decision where to split
1048 * - moves remaining extents and index entries (right to the split point)
1049 * into the newly allocated blocks
1050 * - initializes subtree
1052 static int ext4_ext_split(handle_t
*handle
, struct inode
*inode
,
1054 struct ext4_ext_path
*path
,
1055 struct ext4_extent
*newext
, int at
)
1057 struct buffer_head
*bh
= NULL
;
1058 int depth
= ext_depth(inode
);
1059 struct ext4_extent_header
*neh
;
1060 struct ext4_extent_idx
*fidx
;
1061 int i
= at
, k
, m
, a
;
1062 ext4_fsblk_t newblock
, oldblock
;
1064 ext4_fsblk_t
*ablocks
= NULL
; /* array of allocated blocks */
1065 gfp_t gfp_flags
= GFP_NOFS
;
1067 size_t ext_size
= 0;
1069 if (flags
& EXT4_EX_NOFAIL
)
1070 gfp_flags
|= __GFP_NOFAIL
;
1072 /* make decision: where to split? */
1073 /* FIXME: now decision is simplest: at current extent */
1075 /* if current leaf will be split, then we should use
1076 * border from split point */
1077 if (unlikely(path
[depth
].p_ext
> EXT_MAX_EXTENT(path
[depth
].p_hdr
))) {
1078 EXT4_ERROR_INODE(inode
, "p_ext > EXT_MAX_EXTENT!");
1079 return -EFSCORRUPTED
;
1081 if (path
[depth
].p_ext
!= EXT_MAX_EXTENT(path
[depth
].p_hdr
)) {
1082 border
= path
[depth
].p_ext
[1].ee_block
;
1083 ext_debug(inode
, "leaf will be split."
1084 " next leaf starts at %d\n",
1085 le32_to_cpu(border
));
1087 border
= newext
->ee_block
;
1088 ext_debug(inode
, "leaf will be added."
1089 " next leaf starts at %d\n",
1090 le32_to_cpu(border
));
1094 * If an error occurs, then we break processing
1095 * and mark the filesystem read-only. The index won't
1096 * be inserted and the tree will be in a consistent
1097 * state. The next mount will repair buffers too.
1101 * Get array to track all allocated blocks.
1102 * We need this to handle errors and free blocks
1105 ablocks
= kcalloc(depth
, sizeof(ext4_fsblk_t
), gfp_flags
);
1109 /* allocate all needed blocks */
1110 ext_debug(inode
, "allocate %d blocks for indexes/leaf\n", depth
- at
);
1111 for (a
= 0; a
< depth
- at
; a
++) {
1112 newblock
= ext4_ext_new_meta_block(handle
, inode
, path
,
1113 newext
, &err
, flags
);
1116 ablocks
[a
] = newblock
;
1119 /* initialize new leaf */
1120 newblock
= ablocks
[--a
];
1121 if (unlikely(newblock
== 0)) {
1122 EXT4_ERROR_INODE(inode
, "newblock == 0!");
1123 err
= -EFSCORRUPTED
;
1126 bh
= sb_getblk_gfp(inode
->i_sb
, newblock
, __GFP_MOVABLE
| GFP_NOFS
);
1127 if (unlikely(!bh
)) {
1133 err
= ext4_journal_get_create_access(handle
, inode
->i_sb
, bh
,
1138 neh
= ext_block_hdr(bh
);
1139 neh
->eh_entries
= 0;
1140 neh
->eh_max
= cpu_to_le16(ext4_ext_space_block(inode
, 0));
1141 neh
->eh_magic
= EXT4_EXT_MAGIC
;
1143 neh
->eh_generation
= 0;
1145 /* move remainder of path[depth] to the new leaf */
1146 if (unlikely(path
[depth
].p_hdr
->eh_entries
!=
1147 path
[depth
].p_hdr
->eh_max
)) {
1148 EXT4_ERROR_INODE(inode
, "eh_entries %d != eh_max %d!",
1149 path
[depth
].p_hdr
->eh_entries
,
1150 path
[depth
].p_hdr
->eh_max
);
1151 err
= -EFSCORRUPTED
;
1154 /* start copy from next extent */
1155 m
= EXT_MAX_EXTENT(path
[depth
].p_hdr
) - path
[depth
].p_ext
++;
1156 ext4_ext_show_move(inode
, path
, newblock
, depth
);
1158 struct ext4_extent
*ex
;
1159 ex
= EXT_FIRST_EXTENT(neh
);
1160 memmove(ex
, path
[depth
].p_ext
, sizeof(struct ext4_extent
) * m
);
1161 le16_add_cpu(&neh
->eh_entries
, m
);
1164 /* zero out unused area in the extent block */
1165 ext_size
= sizeof(struct ext4_extent_header
) +
1166 sizeof(struct ext4_extent
) * le16_to_cpu(neh
->eh_entries
);
1167 memset(bh
->b_data
+ ext_size
, 0, inode
->i_sb
->s_blocksize
- ext_size
);
1168 ext4_extent_block_csum_set(inode
, neh
);
1169 set_buffer_uptodate(bh
);
1172 err
= ext4_handle_dirty_metadata(handle
, inode
, bh
);
1178 /* correct old leaf */
1180 err
= ext4_ext_get_access(handle
, inode
, path
+ depth
);
1183 le16_add_cpu(&path
[depth
].p_hdr
->eh_entries
, -m
);
1184 err
= ext4_ext_dirty(handle
, inode
, path
+ depth
);
1190 /* create intermediate indexes */
1192 if (unlikely(k
< 0)) {
1193 EXT4_ERROR_INODE(inode
, "k %d < 0!", k
);
1194 err
= -EFSCORRUPTED
;
1198 ext_debug(inode
, "create %d intermediate indices\n", k
);
1199 /* insert new index into current index block */
1200 /* current depth stored in i var */
1203 oldblock
= newblock
;
1204 newblock
= ablocks
[--a
];
1205 bh
= sb_getblk(inode
->i_sb
, newblock
);
1206 if (unlikely(!bh
)) {
1212 err
= ext4_journal_get_create_access(handle
, inode
->i_sb
, bh
,
1217 neh
= ext_block_hdr(bh
);
1218 neh
->eh_entries
= cpu_to_le16(1);
1219 neh
->eh_magic
= EXT4_EXT_MAGIC
;
1220 neh
->eh_max
= cpu_to_le16(ext4_ext_space_block_idx(inode
, 0));
1221 neh
->eh_depth
= cpu_to_le16(depth
- i
);
1222 neh
->eh_generation
= 0;
1223 fidx
= EXT_FIRST_INDEX(neh
);
1224 fidx
->ei_block
= border
;
1225 ext4_idx_store_pblock(fidx
, oldblock
);
1227 ext_debug(inode
, "int.index at %d (block %llu): %u -> %llu\n",
1228 i
, newblock
, le32_to_cpu(border
), oldblock
);
1230 /* move remainder of path[i] to the new index block */
1231 if (unlikely(EXT_MAX_INDEX(path
[i
].p_hdr
) !=
1232 EXT_LAST_INDEX(path
[i
].p_hdr
))) {
1233 EXT4_ERROR_INODE(inode
,
1234 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
1235 le32_to_cpu(path
[i
].p_ext
->ee_block
));
1236 err
= -EFSCORRUPTED
;
1239 /* start copy indexes */
1240 m
= EXT_MAX_INDEX(path
[i
].p_hdr
) - path
[i
].p_idx
++;
1241 ext_debug(inode
, "cur 0x%p, last 0x%p\n", path
[i
].p_idx
,
1242 EXT_MAX_INDEX(path
[i
].p_hdr
));
1243 ext4_ext_show_move(inode
, path
, newblock
, i
);
1245 memmove(++fidx
, path
[i
].p_idx
,
1246 sizeof(struct ext4_extent_idx
) * m
);
1247 le16_add_cpu(&neh
->eh_entries
, m
);
1249 /* zero out unused area in the extent block */
1250 ext_size
= sizeof(struct ext4_extent_header
) +
1251 (sizeof(struct ext4_extent
) * le16_to_cpu(neh
->eh_entries
));
1252 memset(bh
->b_data
+ ext_size
, 0,
1253 inode
->i_sb
->s_blocksize
- ext_size
);
1254 ext4_extent_block_csum_set(inode
, neh
);
1255 set_buffer_uptodate(bh
);
1258 err
= ext4_handle_dirty_metadata(handle
, inode
, bh
);
1264 /* correct old index */
1266 err
= ext4_ext_get_access(handle
, inode
, path
+ i
);
1269 le16_add_cpu(&path
[i
].p_hdr
->eh_entries
, -m
);
1270 err
= ext4_ext_dirty(handle
, inode
, path
+ i
);
1278 /* insert new index */
1279 err
= ext4_ext_insert_index(handle
, inode
, path
+ at
,
1280 le32_to_cpu(border
), newblock
);
1284 if (buffer_locked(bh
))
1290 /* free all allocated blocks in error case */
1291 for (i
= 0; i
< depth
; i
++) {
1294 ext4_free_blocks(handle
, inode
, NULL
, ablocks
[i
], 1,
1295 EXT4_FREE_BLOCKS_METADATA
);
1304 * ext4_ext_grow_indepth:
1305 * implements tree growing procedure:
1306 * - allocates new block
1307 * - moves top-level data (index block or leaf) into the new block
1308 * - initializes new top-level, creating index that points to the
1309 * just created block
1311 static int ext4_ext_grow_indepth(handle_t
*handle
, struct inode
*inode
,
1314 struct ext4_extent_header
*neh
;
1315 struct buffer_head
*bh
;
1316 ext4_fsblk_t newblock
, goal
= 0;
1317 struct ext4_super_block
*es
= EXT4_SB(inode
->i_sb
)->s_es
;
1319 size_t ext_size
= 0;
1321 /* Try to prepend new index to old one */
1322 if (ext_depth(inode
))
1323 goal
= ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode
)));
1324 if (goal
> le32_to_cpu(es
->s_first_data_block
)) {
1325 flags
|= EXT4_MB_HINT_TRY_GOAL
;
1328 goal
= ext4_inode_to_goal_block(inode
);
1329 newblock
= ext4_new_meta_blocks(handle
, inode
, goal
, flags
,
1334 bh
= sb_getblk_gfp(inode
->i_sb
, newblock
, __GFP_MOVABLE
| GFP_NOFS
);
1339 err
= ext4_journal_get_create_access(handle
, inode
->i_sb
, bh
,
1346 ext_size
= sizeof(EXT4_I(inode
)->i_data
);
1347 /* move top-level index/leaf into new block */
1348 memmove(bh
->b_data
, EXT4_I(inode
)->i_data
, ext_size
);
1349 /* zero out unused area in the extent block */
1350 memset(bh
->b_data
+ ext_size
, 0, inode
->i_sb
->s_blocksize
- ext_size
);
1352 /* set size of new block */
1353 neh
= ext_block_hdr(bh
);
1354 /* old root could hold either indexes or leaves,
1355 * so calculate eh_max the right way */
1356 if (ext_depth(inode
))
1357 neh
->eh_max
= cpu_to_le16(ext4_ext_space_block_idx(inode
, 0));
1359 neh
->eh_max
= cpu_to_le16(ext4_ext_space_block(inode
, 0));
1360 neh
->eh_magic
= EXT4_EXT_MAGIC
;
1361 ext4_extent_block_csum_set(inode
, neh
);
1362 set_buffer_uptodate(bh
);
1363 set_buffer_verified(bh
);
1366 err
= ext4_handle_dirty_metadata(handle
, inode
, bh
);
1370 /* Update top-level index: num,max,pointer */
1371 neh
= ext_inode_hdr(inode
);
1372 neh
->eh_entries
= cpu_to_le16(1);
1373 ext4_idx_store_pblock(EXT_FIRST_INDEX(neh
), newblock
);
1374 if (neh
->eh_depth
== 0) {
1375 /* Root extent block becomes index block */
1376 neh
->eh_max
= cpu_to_le16(ext4_ext_space_root_idx(inode
, 0));
1377 EXT_FIRST_INDEX(neh
)->ei_block
=
1378 EXT_FIRST_EXTENT(neh
)->ee_block
;
1380 ext_debug(inode
, "new root: num %d(%d), lblock %d, ptr %llu\n",
1381 le16_to_cpu(neh
->eh_entries
), le16_to_cpu(neh
->eh_max
),
1382 le32_to_cpu(EXT_FIRST_INDEX(neh
)->ei_block
),
1383 ext4_idx_pblock(EXT_FIRST_INDEX(neh
)));
1385 le16_add_cpu(&neh
->eh_depth
, 1);
1386 err
= ext4_mark_inode_dirty(handle
, inode
);
1394 * ext4_ext_create_new_leaf:
1395 * finds empty index and adds new leaf.
1396 * if no free index is found, then it requests in-depth growing.
1398 static struct ext4_ext_path
*
1399 ext4_ext_create_new_leaf(handle_t
*handle
, struct inode
*inode
,
1400 unsigned int mb_flags
, unsigned int gb_flags
,
1401 struct ext4_ext_path
*path
,
1402 struct ext4_extent
*newext
)
1404 struct ext4_ext_path
*curp
;
1405 int depth
, i
, err
= 0;
1406 ext4_lblk_t ee_block
= le32_to_cpu(newext
->ee_block
);
1409 i
= depth
= ext_depth(inode
);
1411 /* walk up the tree and look for a free index entry */
1412 curp
= path
+ depth
;
1413 while (i
> 0 && !EXT_HAS_FREE_INDEX(curp
)) {
1418 /* we use already allocated block for index block,
1419 * so subsequent data blocks should be contiguous */
1420 if (EXT_HAS_FREE_INDEX(curp
)) {
1421 /* if we found index with free entry, then use that
1422 * entry: create all needed subtree and add new leaf */
1423 err
= ext4_ext_split(handle
, inode
, mb_flags
, path
, newext
, i
);
1428 path
= ext4_find_extent(inode
, ee_block
, path
, gb_flags
);
1432 /* tree is full, time to grow in depth */
1433 err
= ext4_ext_grow_indepth(handle
, inode
, mb_flags
);
1438 path
= ext4_find_extent(inode
, ee_block
, path
, gb_flags
);
1443 * only first (depth 0 -> 1) produces free space;
1444 * in all other cases we have to split the grown tree
1446 depth
= ext_depth(inode
);
1447 if (path
[depth
].p_hdr
->eh_entries
== path
[depth
].p_hdr
->eh_max
) {
1448 /* now we need to split */
1455 ext4_free_ext_path(path
);
1456 return ERR_PTR(err
);
1460 * Search for the closest allocated block to the left of *logical
1461 * and return it at @logical, with its physical address at @phys.
1462 * If *logical is the smallest allocated block, the function
1463 * returns 0 at @phys.
1464 * The return value is 0 (success) or an error code.
1466 static int ext4_ext_search_left(struct inode
*inode
,
1467 struct ext4_ext_path
*path
,
1468 ext4_lblk_t
*logical
, ext4_fsblk_t
*phys
)
1470 struct ext4_extent_idx
*ix
;
1471 struct ext4_extent
*ex
;
1474 if (unlikely(path
== NULL
)) {
1475 EXT4_ERROR_INODE(inode
, "path == NULL *logical %d!", *logical
);
1476 return -EFSCORRUPTED
;
1478 depth
= path
->p_depth
;
1481 if (depth
== 0 && path
->p_ext
== NULL
)
1484 /* usually the extent in the path covers blocks smaller
1485 * than *logical, but it can be that the extent is the
1486 * first one in the file */
1488 ex
= path
[depth
].p_ext
;
1489 ee_len
= ext4_ext_get_actual_len(ex
);
1490 if (*logical
< le32_to_cpu(ex
->ee_block
)) {
1491 if (unlikely(EXT_FIRST_EXTENT(path
[depth
].p_hdr
) != ex
)) {
1492 EXT4_ERROR_INODE(inode
,
1493 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1494 *logical
, le32_to_cpu(ex
->ee_block
));
1495 return -EFSCORRUPTED
;
1497 while (--depth
>= 0) {
1498 ix
= path
[depth
].p_idx
;
1499 if (unlikely(ix
!= EXT_FIRST_INDEX(path
[depth
].p_hdr
))) {
1500 EXT4_ERROR_INODE(inode
,
1501 "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1502 ix
!= NULL
? le32_to_cpu(ix
->ei_block
) : 0,
1503 le32_to_cpu(EXT_FIRST_INDEX(path
[depth
].p_hdr
)->ei_block
),
1505 return -EFSCORRUPTED
;
1511 if (unlikely(*logical
< (le32_to_cpu(ex
->ee_block
) + ee_len
))) {
1512 EXT4_ERROR_INODE(inode
,
1513 "logical %d < ee_block %d + ee_len %d!",
1514 *logical
, le32_to_cpu(ex
->ee_block
), ee_len
);
1515 return -EFSCORRUPTED
;
1518 *logical
= le32_to_cpu(ex
->ee_block
) + ee_len
- 1;
1519 *phys
= ext4_ext_pblock(ex
) + ee_len
- 1;
1524 * Search for the closest allocated block to the right of *logical
1525 * and return it at @logical, with its physical address at @phys.
1526 * If no such block exists, return 0 and set @phys to 0. Return
1527 * 1 if an allocated block was found, in which case ret_ex is valid.
1528 * Otherwise return a (< 0) error code.
1530 static int ext4_ext_search_right(struct inode
*inode
,
1531 struct ext4_ext_path
*path
,
1532 ext4_lblk_t
*logical
, ext4_fsblk_t
*phys
,
1533 struct ext4_extent
*ret_ex
)
1535 struct buffer_head
*bh
= NULL
;
1536 struct ext4_extent_header
*eh
;
1537 struct ext4_extent_idx
*ix
;
1538 struct ext4_extent
*ex
;
1539 int depth
; /* Note, NOT eh_depth; depth from top of tree */
1542 if (unlikely(path
== NULL
)) {
1543 EXT4_ERROR_INODE(inode
, "path == NULL *logical %d!", *logical
);
1544 return -EFSCORRUPTED
;
1546 depth
= path
->p_depth
;
1549 if (depth
== 0 && path
->p_ext
== NULL
)
1552 /* usually the extent in the path covers blocks smaller
1553 * than *logical, but it can be that the extent is the
1554 * first one in the file */
1556 ex
= path
[depth
].p_ext
;
1557 ee_len
= ext4_ext_get_actual_len(ex
);
1558 if (*logical
< le32_to_cpu(ex
->ee_block
)) {
1559 if (unlikely(EXT_FIRST_EXTENT(path
[depth
].p_hdr
) != ex
)) {
1560 EXT4_ERROR_INODE(inode
,
1561 "first_extent(path[%d].p_hdr) != ex",
1563 return -EFSCORRUPTED
;
1565 while (--depth
>= 0) {
1566 ix
= path
[depth
].p_idx
;
1567 if (unlikely(ix
!= EXT_FIRST_INDEX(path
[depth
].p_hdr
))) {
1568 EXT4_ERROR_INODE(inode
,
1569 "ix != EXT_FIRST_INDEX *logical %d!",
1571 return -EFSCORRUPTED
;
1577 if (unlikely(*logical
< (le32_to_cpu(ex
->ee_block
) + ee_len
))) {
1578 EXT4_ERROR_INODE(inode
,
1579 "logical %d < ee_block %d + ee_len %d!",
1580 *logical
, le32_to_cpu(ex
->ee_block
), ee_len
);
1581 return -EFSCORRUPTED
;
1584 if (ex
!= EXT_LAST_EXTENT(path
[depth
].p_hdr
)) {
1585 /* next allocated block in this leaf */
1590 /* go up and search for index to the right */
1591 while (--depth
>= 0) {
1592 ix
= path
[depth
].p_idx
;
1593 if (ix
!= EXT_LAST_INDEX(path
[depth
].p_hdr
))
1597 /* we've gone up to the root and found no index to the right */
1601 /* we've found index to the right, let's
1602 * follow it and find the closest allocated
1603 * block to the right */
1605 while (++depth
< path
->p_depth
) {
1606 /* subtract from p_depth to get proper eh_depth */
1607 bh
= read_extent_tree_block(inode
, ix
, path
->p_depth
- depth
, 0);
1610 eh
= ext_block_hdr(bh
);
1611 ix
= EXT_FIRST_INDEX(eh
);
1615 bh
= read_extent_tree_block(inode
, ix
, path
->p_depth
- depth
, 0);
1618 eh
= ext_block_hdr(bh
);
1619 ex
= EXT_FIRST_EXTENT(eh
);
1621 *logical
= le32_to_cpu(ex
->ee_block
);
1622 *phys
= ext4_ext_pblock(ex
);
1631 * ext4_ext_next_allocated_block:
1632 * returns allocated block in subsequent extent or EXT_MAX_BLOCKS.
1633 * NOTE: it considers block number from index entry as
1634 * allocated block. Thus, index entries have to be consistent
/*
 * Starting at depth == path->p_depth, look for the entry that follows the
 * current position in @path and return its logical start block:
 *   - at depth == path->p_depth: the extent after p_ext (p_ext[1].ee_block),
 *     if p_ext is set and is not the last extent of that header;
 *   - otherwise: the index after p_idx (p_idx[1].ei_block), if p_idx is not
 *     the last index of that header.
 * Returns EXT_MAX_BLOCKS when no such entry is found.
 * NOTE(review): the return-type line, the declaration of depth, the branch
 * separating the two cases and the statement that changes depth between
 * loop iterations are not visible in this chunk - presumably depth is
 * decremented on every pass; confirm against the full file.
 */
1638 ext4_ext_next_allocated_block(struct ext4_ext_path
*path
)
/* A NULL path is treated as a caller bug. */
1642 BUG_ON(path
== NULL
);
1643 depth
= path
->p_depth
;
/* Depth-0 tree with no extent at the path position: nothing can follow. */
1645 if (depth
== 0 && path
->p_ext
== NULL
)
1646 return EXT_MAX_BLOCKS
;
1648 while (depth
>= 0) {
1649 struct ext4_ext_path
*p
= &path
[depth
];
/* depth == path->p_depth is the level whose p_ext is examined. */
1651 if (depth
== path
->p_depth
) {
1653 if (p
->p_ext
&& p
->p_ext
!= EXT_LAST_EXTENT(p
->p_hdr
))
1654 return le32_to_cpu(p
->p_ext
[1].ee_block
);
/* Other levels: use the start block of the next index entry. */
1657 if (p
->p_idx
!= EXT_LAST_INDEX(p
->p_hdr
))
1658 return le32_to_cpu(p
->p_idx
[1].ei_block
);
/* No level had a following entry. */
1663 return EXT_MAX_BLOCKS
;
1667 * ext4_ext_next_leaf_block:
1668 * returns first allocated block from next leaf or EXT_MAX_BLOCKS
/*
 * Scan the levels of @path for one whose current index entry (p_idx) is not
 * the last index of its header, and return the logical start block of the
 * entry right after it (p_idx[1].ei_block). Returns EXT_MAX_BLOCKS if no
 * level has a following index entry.
 * NOTE(review): the declaration of depth, the condition guarding the first
 * early return and the statements that adjust depth are not visible in this
 * chunk; the existing comments suggest the early return covers a tree of
 * depth 0 and that the scan starts one level above the leaf - confirm.
 */
1670 static ext4_lblk_t
ext4_ext_next_leaf_block(struct ext4_ext_path
*path
)
/* A NULL path is treated as a caller bug. */
1674 BUG_ON(path
== NULL
);
1675 depth
= path
->p_depth
;
1677 /* zero-tree has no leaf blocks at all */
1679 return EXT_MAX_BLOCKS
;
1681 /* go to index block */
1684 while (depth
>= 0) {
/* A following index entry exists at this level: its ei_block is the answer. */
1685 if (path
[depth
].p_idx
!=
1686 EXT_LAST_INDEX(path
[depth
].p_hdr
))
1687 return (ext4_lblk_t
)
1688 le32_to_cpu(path
[depth
].p_idx
[1].ei_block
);
/* Every examined level was already at its last index. */
1692 return EXT_MAX_BLOCKS
;
1696 * ext4_ext_correct_indexes:
1697 * if leaf gets modified and modified extent is first in the leaf,
1698 * then we have to correct all indexes above.
1699 * TODO: do we need to correct tree in all cases?
1701 static int ext4_ext_correct_indexes(handle_t
*handle
, struct inode
*inode
,
1702 struct ext4_ext_path
*path
)
1704 struct ext4_extent_header
*eh
;
1705 int depth
= ext_depth(inode
);
1706 struct ext4_extent
*ex
;
1710 eh
= path
[depth
].p_hdr
;
1711 ex
= path
[depth
].p_ext
;
1713 if (unlikely(ex
== NULL
|| eh
== NULL
)) {
1714 EXT4_ERROR_INODE(inode
,
1715 "ex %p == NULL or eh %p == NULL", ex
, eh
);
1716 return -EFSCORRUPTED
;
1720 /* there is no tree at all */
1724 if (ex
!= EXT_FIRST_EXTENT(eh
)) {
1725 /* we correct tree if first leaf got modified only */
1730 * TODO: we need correction if border is smaller than current one
1733 border
= path
[depth
].p_ext
->ee_block
;
1734 err
= ext4_ext_get_access(handle
, inode
, path
+ k
);
1737 path
[k
].p_idx
->ei_block
= border
;
1738 err
= ext4_ext_dirty(handle
, inode
, path
+ k
);
1743 /* change all left-side indexes */
1744 if (path
[k
+1].p_idx
!= EXT_FIRST_INDEX(path
[k
+1].p_hdr
))
1746 err
= ext4_ext_get_access(handle
, inode
, path
+ k
);
1749 path
[k
].p_idx
->ei_block
= border
;
1750 err
= ext4_ext_dirty(handle
, inode
, path
+ k
);
1758 * The path[k].p_bh is either unmodified or with no verified bit
1759 * set (see ext4_ext_get_access()). So just clear the verified bit
1760 * of the successfully modified extents buffers, which will force
1761 * these extents to be checked to avoid using inconsistent data.
1764 clear_buffer_verified(path
[k
].p_bh
);
/*
 * Decide whether @ex2 can be merged onto the end of @ex1. The visible checks
 * are, in order:
 *   - both extents have the same unwritten state;
 *   - @ex2 starts at the logical block right after the end of @ex1;
 *   - the combined length does not exceed EXT_INIT_MAX_LEN, nor
 *     EXT_UNWRITTEN_MAX_LEN when @ex1 is unwritten;
 *   - @ex2 starts at the physical block right after the end of @ex1.
 * Callers in this file treat a non-zero result as "mergeable".
 * NOTE(review): the return statements (and the matching #endif) are not
 * visible in this chunk; confirm the exact return values in the full file.
 */
1769 static int ext4_can_extents_be_merged(struct inode
*inode
,
1770 struct ext4_extent
*ex1
,
1771 struct ext4_extent
*ex2
)
1773 unsigned short ext1_ee_len
, ext2_ee_len
;
/* Written and unwritten extents are compared for equal state first. */
1775 if (ext4_ext_is_unwritten(ex1
) != ext4_ext_is_unwritten(ex2
))
1778 ext1_ee_len
= ext4_ext_get_actual_len(ex1
);
1779 ext2_ee_len
= ext4_ext_get_actual_len(ex2
);
/* Logical adjacency: ex2 must begin exactly where ex1 ends. */
1781 if (le32_to_cpu(ex1
->ee_block
) + ext1_ee_len
!=
1782 le32_to_cpu(ex2
->ee_block
))
/* Length limit for the merged extent. */
1785 if (ext1_ee_len
+ ext2_ee_len
> EXT_INIT_MAX_LEN
)
/* Separate length limit applied when ex1 is unwritten. */
1788 if (ext4_ext_is_unwritten(ex1
) &&
1789 ext1_ee_len
+ ext2_ee_len
> EXT_UNWRITTEN_MAX_LEN
)
/* Extra length check compiled in only when AGGRESSIVE_TEST is defined. */
1791 #ifdef AGGRESSIVE_TEST
1792 if (ext1_ee_len
>= 4)
/* Physical adjacency: ex2's first block must follow ex1's last block. */
1796 if (ext4_ext_pblock(ex1
) + ext1_ee_len
== ext4_ext_pblock(ex2
))
1802 * This function tries to merge the "ex" extent to the next extent in the tree.
1803 * It always tries to merge towards right. If you want to merge towards
1804 * left, pass "ex - 1" as argument instead of "ex".
1805 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1806 * 1 if they got merged.
/*
 * Merge @ex with the extent(s) to its right inside the leaf found at
 * path[ext_depth(inode)]. While @ex is not the last extent of the leaf and
 * ext4_can_extents_be_merged() accepts (ex, ex + 1), the length of ex + 1 is
 * added to @ex, the following extents are shifted one slot left and the
 * leaf's eh_entries is decremented.
 * NOTE(review): the statement taken when the merge check fails, the
 * condition guarding ext4_ext_mark_unwritten(), the update of merge_done and
 * the final return are not visible in this chunk.
 */
1808 static int ext4_ext_try_to_merge_right(struct inode
*inode
,
1809 struct ext4_ext_path
*path
,
1810 struct ext4_extent
*ex
)
1812 struct ext4_extent_header
*eh
;
1813 unsigned int depth
, len
;
1814 int merge_done
= 0, unwritten
;
1816 depth
= ext_depth(inode
);
/* The leaf header must be present in the path; anything else is a bug. */
1817 BUG_ON(path
[depth
].p_hdr
== NULL
);
1818 eh
= path
[depth
].p_hdr
;
/* Only extents that have a right-hand neighbour in this leaf are considered. */
1820 while (ex
< EXT_LAST_EXTENT(eh
)) {
1821 if (!ext4_can_extents_be_merged(inode
, ex
, ex
+ 1))
1823 /* merge with next extent! */
/* Sample the unwritten state before ee_len is overwritten below. */
1824 unwritten
= ext4_ext_is_unwritten(ex
);
1825 ex
->ee_len
= cpu_to_le16(ext4_ext_get_actual_len(ex
)
1826 + ext4_ext_get_actual_len(ex
+ 1));
1828 ext4_ext_mark_unwritten(ex
);
/* If extents follow ex + 1, slide them left over the now-unused slot. */
1830 if (ex
+ 1 < EXT_LAST_EXTENT(eh
)) {
1831 len
= (EXT_LAST_EXTENT(eh
) - ex
- 1)
1832 * sizeof(struct ext4_extent
);
1833 memmove(ex
+ 1, ex
+ 2, len
);
/* The leaf now holds one extent fewer. */
1835 le16_add_cpu(&eh
->eh_entries
, -1);
/* An entry count of zero after a merge is unexpected: warn and report it. */
1837 WARN_ON(eh
->eh_entries
== 0);
1838 if (!eh
->eh_entries
)
1839 EXT4_ERROR_INODE(inode
, "eh->eh_entries = 0!");
1846 * This function does a very simple check to see if we can collapse
1847 * an extent tree with a single extent tree leaf block into the inode.
/*
 * Try to pull the contents of the single block below the root back into the
 * root header (path[0].p_hdr) and free that block.
 * NOTE(review): the declarations of blk and s and the early-return
 * statements after the two checks are not visible in this chunk.
 */
1849 static void ext4_ext_try_to_merge_up(handle_t
*handle
,
1850 struct inode
*inode
,
1851 struct ext4_ext_path
*path
)
/* Capacity used both for the fit check and for the final eh_max. */
1854 unsigned max_root
= ext4_ext_space_root(inode
, 0);
/*
 * Preconditions checked here: the path has depth 1, the root holds exactly
 * one entry, and the block below has no more than max_root entries.
 */
1857 if ((path
[0].p_depth
!= 1) ||
1858 (le16_to_cpu(path
[0].p_hdr
->eh_entries
) != 1) ||
1859 (le16_to_cpu(path
[1].p_hdr
->eh_entries
) > max_root
))
1863 * We need to modify the block allocation bitmap and the block
1864 * group descriptor to release the extent tree block. If we
1865 * can't get the journal credits, give up.
1867 if (ext4_journal_extend(handle
, 2,
1868 ext4_free_metadata_revoke_credits(inode
->i_sb
, 1)))
1872 * Copy the extent data up to the inode
/* Remember the physical block of the child before the root is overwritten. */
1874 blk
= ext4_idx_pblock(path
[0].p_idx
);
/*
 * Bytes to copy: one header plus eh_entries entries.
 * NOTE(review): the entry size is taken from struct ext4_extent_idx although
 * p_ext of path[1] is used below - presumably both entry types have the same
 * size; confirm.
 */
1875 s
= le16_to_cpu(path
[1].p_hdr
->eh_entries
) *
1876 sizeof(struct ext4_extent_idx
);
1877 s
+= sizeof(struct ext4_extent_header
);
1879 path
[1].p_maxdepth
= path
[0].p_maxdepth
;
1880 memcpy(path
[0].p_hdr
, path
[1].p_hdr
, s
);
1881 path
[0].p_depth
= 0;
/* Point p_ext at the same slot offset inside the copied root entries. */
1882 path
[0].p_ext
= EXT_FIRST_EXTENT(path
[0].p_hdr
) +
1883 (path
[1].p_ext
- EXT_FIRST_EXTENT(path
[1].p_hdr
));
/* The memcpy above also copied the child's eh_max; set the root value. */
1884 path
[0].p_hdr
->eh_max
= cpu_to_le16(max_root
);
/* Release the child's buffer and free its block. */
1886 ext4_ext_path_brelse(path
+ 1);
1887 ext4_free_blocks(handle
, inode
, NULL
, blk
, 1,
1888 EXT4_FREE_BLOCKS_METADATA
| EXT4_FREE_BLOCKS_FORGET
);
1892 * This function tries to merge the @ex extent to neighbours in the tree, then
1893 * tries to collapse the extent tree into the inode.
/*
 * Merge @ex with its neighbours in the leaf at path[ext_depth(inode)] and
 * then attempt to collapse the tree via ext4_ext_try_to_merge_up().
 * A left merge is expressed as a right merge starting from ex - 1.
 * NOTE(review): the declarations of depth and merge_done and the condition
 * guarding the second ext4_ext_try_to_merge_right() call are not visible in
 * this chunk - presumably the right merge from @ex is skipped when the left
 * merge already succeeded; confirm.
 */
1895 static void ext4_ext_try_to_merge(handle_t
*handle
,
1896 struct inode
*inode
,
1897 struct ext4_ext_path
*path
,
1898 struct ext4_extent
*ex
)
1900 struct ext4_extent_header
*eh
;
1904 depth
= ext_depth(inode
);
/* The leaf header must be present in the path; anything else is a bug. */
1905 BUG_ON(path
[depth
].p_hdr
== NULL
);
1906 eh
= path
[depth
].p_hdr
;
/* Only try the left neighbour when @ex is not the first extent of the leaf. */
1908 if (ex
> EXT_FIRST_EXTENT(eh
))
1909 merge_done
= ext4_ext_try_to_merge_right(inode
, path
, ex
- 1);
/* Result deliberately ignored here. */
1912 (void) ext4_ext_try_to_merge_right(inode
, path
, ex
);
1914 ext4_ext_try_to_merge_up(handle
, inode
, path
);
1918 * check if a portion of the "newext" extent overlaps with an
1921 * If there is an overlap discovered, it updates the length of the newext
1922 * such that there will be no overlap, and then returns 1.
1923 * If there is no overlap found, it returns 0.
/*
 * Compare the range of @newext (b1, len1) with the start of an existing
 * allocation (b2) taken from @path and shrink newext->ee_len when the range
 * would wrap around or run past b2.
 * NOTE(review): the declarations of b1 and b2, the condition selecting the
 * ext4_ext_next_allocated_block() lookup, the early exits, the assignments
 * to ret and the final return are not visible in this chunk.
 */
1925 static unsigned int ext4_ext_check_overlap(struct ext4_sb_info
*sbi
,
1926 struct inode
*inode
,
1927 struct ext4_extent
*newext
,
1928 struct ext4_ext_path
*path
)
1931 unsigned int depth
, len1
;
1932 unsigned int ret
= 0;
/* b1/len1: logical start and actual length of the extent being inserted. */
1934 b1
= le32_to_cpu(newext
->ee_block
);
1935 len1
= ext4_ext_get_actual_len(newext
);
1936 depth
= ext_depth(inode
);
/* No extent at the leaf position of the path. */
1937 if (!path
[depth
].p_ext
)
/* b2: start of the extent in the path, passed through EXT4_LBLK_CMASK(). */
1939 b2
= EXT4_LBLK_CMASK(sbi
, le32_to_cpu(path
[depth
].p_ext
->ee_block
));
1942 * get the next allocated block if the extent in the path
1943 * is before the requested block(s)
1946 b2
= ext4_ext_next_allocated_block(path
);
/* EXT_MAX_BLOCKS means no allocated block follows. */
1947 if (b2
== EXT_MAX_BLOCKS
)
1949 b2
= EXT4_LBLK_CMASK(sbi
, b2
);
1952 /* check for wrap through zero on extent logical start block*/
/* b1 + len1 < b1 detects wrap-around of the end block; clamp the length. */
1953 if (b1
+ len1
< b1
) {
1954 len1
= EXT_MAX_BLOCKS
- b1
;
1955 newext
->ee_len
= cpu_to_le16(len1
);
1959 /* check for overlap */
/* Trim @newext so that it ends right before b2. */
1960 if (b1
+ len1
> b2
) {
1961 newext
->ee_len
= cpu_to_le16(b2
- b1
);
1969 * ext4_ext_insert_extent:
1970 * tries to merge requested extent into the existing extent or
1971 * inserts requested extent as new one into the tree,
1972 * creating new leaf in the no-space case.
1974 struct ext4_ext_path
*
1975 ext4_ext_insert_extent(handle_t
*handle
, struct inode
*inode
,
1976 struct ext4_ext_path
*path
,
1977 struct ext4_extent
*newext
, int gb_flags
)
1979 struct ext4_extent_header
*eh
;
1980 struct ext4_extent
*ex
, *fex
;
1981 struct ext4_extent
*nearex
; /* nearest extent */
1982 int depth
, len
, err
= 0;
1984 int mb_flags
= 0, unwritten
;
1986 if (gb_flags
& EXT4_GET_BLOCKS_DELALLOC_RESERVE
)
1987 mb_flags
|= EXT4_MB_DELALLOC_RESERVED
;
1988 if (unlikely(ext4_ext_get_actual_len(newext
) == 0)) {
1989 EXT4_ERROR_INODE(inode
, "ext4_ext_get_actual_len(newext) == 0");
1990 err
= -EFSCORRUPTED
;
1993 depth
= ext_depth(inode
);
1994 ex
= path
[depth
].p_ext
;
1995 eh
= path
[depth
].p_hdr
;
1996 if (unlikely(path
[depth
].p_hdr
== NULL
)) {
1997 EXT4_ERROR_INODE(inode
, "path[%d].p_hdr == NULL", depth
);
1998 err
= -EFSCORRUPTED
;
2002 /* try to insert block into found extent and return */
2003 if (ex
&& !(gb_flags
& EXT4_GET_BLOCKS_PRE_IO
)) {
2006 * Try to see whether we should rather test the extent on
2007 * right from ex, or from the left of ex. This is because
2008 * ext4_find_extent() can return either extent on the
2009 * left, or on the right from the searched position. This
2010 * will make merging more effective.
2012 if (ex
< EXT_LAST_EXTENT(eh
) &&
2013 (le32_to_cpu(ex
->ee_block
) +
2014 ext4_ext_get_actual_len(ex
) <
2015 le32_to_cpu(newext
->ee_block
))) {
2018 } else if ((ex
> EXT_FIRST_EXTENT(eh
)) &&
2019 (le32_to_cpu(newext
->ee_block
) +
2020 ext4_ext_get_actual_len(newext
) <
2021 le32_to_cpu(ex
->ee_block
)))
2024 /* Try to append newex to the ex */
2025 if (ext4_can_extents_be_merged(inode
, ex
, newext
)) {
2026 ext_debug(inode
, "append [%d]%d block to %u:[%d]%d"
2028 ext4_ext_is_unwritten(newext
),
2029 ext4_ext_get_actual_len(newext
),
2030 le32_to_cpu(ex
->ee_block
),
2031 ext4_ext_is_unwritten(ex
),
2032 ext4_ext_get_actual_len(ex
),
2033 ext4_ext_pblock(ex
));
2034 err
= ext4_ext_get_access(handle
, inode
,
2038 unwritten
= ext4_ext_is_unwritten(ex
);
2039 ex
->ee_len
= cpu_to_le16(ext4_ext_get_actual_len(ex
)
2040 + ext4_ext_get_actual_len(newext
));
2042 ext4_ext_mark_unwritten(ex
);
2048 /* Try to prepend newex to the ex */
2049 if (ext4_can_extents_be_merged(inode
, newext
, ex
)) {
2050 ext_debug(inode
, "prepend %u[%d]%d block to %u:[%d]%d"
2052 le32_to_cpu(newext
->ee_block
),
2053 ext4_ext_is_unwritten(newext
),
2054 ext4_ext_get_actual_len(newext
),
2055 le32_to_cpu(ex
->ee_block
),
2056 ext4_ext_is_unwritten(ex
),
2057 ext4_ext_get_actual_len(ex
),
2058 ext4_ext_pblock(ex
));
2059 err
= ext4_ext_get_access(handle
, inode
,
2064 unwritten
= ext4_ext_is_unwritten(ex
);
2065 ex
->ee_block
= newext
->ee_block
;
2066 ext4_ext_store_pblock(ex
, ext4_ext_pblock(newext
));
2067 ex
->ee_len
= cpu_to_le16(ext4_ext_get_actual_len(ex
)
2068 + ext4_ext_get_actual_len(newext
));
2070 ext4_ext_mark_unwritten(ex
);
2076 depth
= ext_depth(inode
);
2077 eh
= path
[depth
].p_hdr
;
2078 if (le16_to_cpu(eh
->eh_entries
) < le16_to_cpu(eh
->eh_max
))
2081 /* probably next leaf has space for us? */
2082 fex
= EXT_LAST_EXTENT(eh
);
2083 next
= EXT_MAX_BLOCKS
;
2084 if (le32_to_cpu(newext
->ee_block
) > le32_to_cpu(fex
->ee_block
))
2085 next
= ext4_ext_next_leaf_block(path
);
2086 if (next
!= EXT_MAX_BLOCKS
) {
2087 struct ext4_ext_path
*npath
;
2089 ext_debug(inode
, "next leaf block - %u\n", next
);
2090 npath
= ext4_find_extent(inode
, next
, NULL
, gb_flags
);
2091 if (IS_ERR(npath
)) {
2092 err
= PTR_ERR(npath
);
2095 BUG_ON(npath
->p_depth
!= path
->p_depth
);
2096 eh
= npath
[depth
].p_hdr
;
2097 if (le16_to_cpu(eh
->eh_entries
) < le16_to_cpu(eh
->eh_max
)) {
2098 ext_debug(inode
, "next leaf isn't full(%d)\n",
2099 le16_to_cpu(eh
->eh_entries
));
2100 ext4_free_ext_path(path
);
2104 ext_debug(inode
, "next leaf has no free space(%d,%d)\n",
2105 le16_to_cpu(eh
->eh_entries
), le16_to_cpu(eh
->eh_max
));
2106 ext4_free_ext_path(npath
);
2110 * There is no free space in the found leaf.
2111 * We're gonna add a new leaf in the tree.
2113 if (gb_flags
& EXT4_GET_BLOCKS_METADATA_NOFAIL
)
2114 mb_flags
|= EXT4_MB_USE_RESERVED
;
2115 path
= ext4_ext_create_new_leaf(handle
, inode
, mb_flags
, gb_flags
,
2119 depth
= ext_depth(inode
);
2120 eh
= path
[depth
].p_hdr
;
2123 nearex
= path
[depth
].p_ext
;
2125 err
= ext4_ext_get_access(handle
, inode
, path
+ depth
);
2130 /* there is no extent in this leaf, create first one */
2131 ext_debug(inode
, "first extent in the leaf: %u:%llu:[%d]%d\n",
2132 le32_to_cpu(newext
->ee_block
),
2133 ext4_ext_pblock(newext
),
2134 ext4_ext_is_unwritten(newext
),
2135 ext4_ext_get_actual_len(newext
));
2136 nearex
= EXT_FIRST_EXTENT(eh
);
2138 if (le32_to_cpu(newext
->ee_block
)
2139 > le32_to_cpu(nearex
->ee_block
)) {
2141 ext_debug(inode
, "insert %u:%llu:[%d]%d before: "
2143 le32_to_cpu(newext
->ee_block
),
2144 ext4_ext_pblock(newext
),
2145 ext4_ext_is_unwritten(newext
),
2146 ext4_ext_get_actual_len(newext
),
2151 BUG_ON(newext
->ee_block
== nearex
->ee_block
);
2152 ext_debug(inode
, "insert %u:%llu:[%d]%d after: "
2154 le32_to_cpu(newext
->ee_block
),
2155 ext4_ext_pblock(newext
),
2156 ext4_ext_is_unwritten(newext
),
2157 ext4_ext_get_actual_len(newext
),
2160 len
= EXT_LAST_EXTENT(eh
) - nearex
+ 1;
2162 ext_debug(inode
, "insert %u:%llu:[%d]%d: "
2163 "move %d extents from 0x%p to 0x%p\n",
2164 le32_to_cpu(newext
->ee_block
),
2165 ext4_ext_pblock(newext
),
2166 ext4_ext_is_unwritten(newext
),
2167 ext4_ext_get_actual_len(newext
),
2168 len
, nearex
, nearex
+ 1);
2169 memmove(nearex
+ 1, nearex
,
2170 len
* sizeof(struct ext4_extent
));
2174 le16_add_cpu(&eh
->eh_entries
, 1);
2175 path
[depth
].p_ext
= nearex
;
2176 nearex
->ee_block
= newext
->ee_block
;
2177 ext4_ext_store_pblock(nearex
, ext4_ext_pblock(newext
));
2178 nearex
->ee_len
= newext
->ee_len
;
2181 /* try to merge extents */
2182 if (!(gb_flags
& EXT4_GET_BLOCKS_PRE_IO
))
2183 ext4_ext_try_to_merge(handle
, inode
, path
, nearex
);
2185 /* time to correct all indexes above */
2186 err
= ext4_ext_correct_indexes(handle
, inode
, path
);
2190 err
= ext4_ext_dirty(handle
, inode
, path
+ path
->p_depth
);
2197 ext4_free_ext_path(path
);
2198 return ERR_PTR(err
);
2201 static int ext4_fill_es_cache_info(struct inode
*inode
,
2202 ext4_lblk_t block
, ext4_lblk_t num
,
2203 struct fiemap_extent_info
*fieinfo
)
2205 ext4_lblk_t next
, end
= block
+ num
- 1;
2206 struct extent_status es
;
2207 unsigned char blksize_bits
= inode
->i_sb
->s_blocksize_bits
;
2211 while (block
<= end
) {
2214 if (!ext4_es_lookup_extent(inode
, block
, &next
, &es
))
2216 if (ext4_es_is_unwritten(&es
))
2217 flags
|= FIEMAP_EXTENT_UNWRITTEN
;
2218 if (ext4_es_is_delayed(&es
))
2219 flags
|= (FIEMAP_EXTENT_DELALLOC
|
2220 FIEMAP_EXTENT_UNKNOWN
);
2221 if (ext4_es_is_hole(&es
))
2222 flags
|= EXT4_FIEMAP_EXTENT_HOLE
;
2224 flags
|= FIEMAP_EXTENT_LAST
;
2225 if (flags
& (FIEMAP_EXTENT_DELALLOC
|
2226 EXT4_FIEMAP_EXTENT_HOLE
))
2229 es
.es_pblk
= ext4_es_pblock(&es
);
2230 err
= fiemap_fill_next_extent(fieinfo
,
2231 (__u64
)es
.es_lblk
<< blksize_bits
,
2232 (__u64
)es
.es_pblk
<< blksize_bits
,
2233 (__u64
)es
.es_len
<< blksize_bits
,
2248 * ext4_ext_find_hole - find hole around given block according to the given path
2249 * @inode: inode we lookup in
2250 * @path: path in extent tree to @lblk
2251 * @lblk: pointer to logical block around which we want to determine hole
2253 * Determine hole length (and start if easily possible) around given logical
2254 * block. We don't try too hard to find the beginning of the hole, but if @path
2255 * actually points to an extent before @lblk, we provide it.
2257 * The function returns the length of a hole starting at @lblk. We update @lblk
2258 * to the beginning of the hole if we managed to find it.
2260 static ext4_lblk_t
ext4_ext_find_hole(struct inode
*inode
,
2261 struct ext4_ext_path
*path
,
2264 int depth
= ext_depth(inode
);
2265 struct ext4_extent
*ex
;
2268 ex
= path
[depth
].p_ext
;
2270 /* there is no extent yet, so gap is [0;-] */
2272 len
= EXT_MAX_BLOCKS
;
2273 } else if (*lblk
< le32_to_cpu(ex
->ee_block
)) {
2274 len
= le32_to_cpu(ex
->ee_block
) - *lblk
;
2275 } else if (*lblk
>= le32_to_cpu(ex
->ee_block
)
2276 + ext4_ext_get_actual_len(ex
)) {
2279 *lblk
= le32_to_cpu(ex
->ee_block
) + ext4_ext_get_actual_len(ex
);
2280 next
= ext4_ext_next_allocated_block(path
);
2281 BUG_ON(next
== *lblk
);
2291 * removes index from the index block.
2293 static int ext4_ext_rm_idx(handle_t
*handle
, struct inode
*inode
,
2294 struct ext4_ext_path
*path
, int depth
)
2300 /* free index block */
2301 leaf
= ext4_idx_pblock(path
[k
].p_idx
);
2302 if (unlikely(path
[k
].p_hdr
->eh_entries
== 0)) {
2303 EXT4_ERROR_INODE(inode
, "path[%d].p_hdr->eh_entries == 0", k
);
2304 return -EFSCORRUPTED
;
2306 err
= ext4_ext_get_access(handle
, inode
, path
+ k
);
2310 if (path
[k
].p_idx
!= EXT_LAST_INDEX(path
[k
].p_hdr
)) {
2311 int len
= EXT_LAST_INDEX(path
[k
].p_hdr
) - path
[k
].p_idx
;
2312 len
*= sizeof(struct ext4_extent_idx
);
2313 memmove(path
[k
].p_idx
, path
[k
].p_idx
+ 1, len
);
2316 le16_add_cpu(&path
[k
].p_hdr
->eh_entries
, -1);
2317 err
= ext4_ext_dirty(handle
, inode
, path
+ k
);
2320 ext_debug(inode
, "index is empty, remove it, free block %llu\n", leaf
);
2321 trace_ext4_ext_rm_idx(inode
, leaf
);
2323 ext4_free_blocks(handle
, inode
, NULL
, leaf
, 1,
2324 EXT4_FREE_BLOCKS_METADATA
| EXT4_FREE_BLOCKS_FORGET
);
2327 if (path
[k
+ 1].p_idx
!= EXT_FIRST_INDEX(path
[k
+ 1].p_hdr
))
2329 err
= ext4_ext_get_access(handle
, inode
, path
+ k
);
2332 path
[k
].p_idx
->ei_block
= path
[k
+ 1].p_idx
->ei_block
;
2333 err
= ext4_ext_dirty(handle
, inode
, path
+ k
);
2341 * The path[k].p_bh is either unmodified or with no verified bit
2342 * set (see ext4_ext_get_access()). So just clear the verified bit
2343 * of the successfully modified extents buffers, which will force
2344 * these extents to be checked to avoid using inconsistent data.
2347 clear_buffer_verified(path
[k
].p_bh
);
2353 * ext4_ext_calc_credits_for_single_extent:
2354 * This routine returns the max. credits needed to insert an extent
2355 * into the extent tree.
2356 * When passing the actual path, the caller should calculate credits
2359 int ext4_ext_calc_credits_for_single_extent(struct inode
*inode
, int nrblocks
,
2360 struct ext4_ext_path
*path
)
2363 int depth
= ext_depth(inode
);
2366 /* probably there is space in leaf? */
2367 if (le16_to_cpu(path
[depth
].p_hdr
->eh_entries
)
2368 < le16_to_cpu(path
[depth
].p_hdr
->eh_max
)) {
2371 * There is some space in the leaf tree, so there is no
2372 * need to account for leaf block credit
2374 * bitmaps and block group descriptor blocks
2375 * and other metadata blocks still need to be
2378 /* 1 bitmap, 1 block group descriptor */
2379 ret
= 2 + EXT4_META_TRANS_BLOCKS(inode
->i_sb
);
2384 return ext4_chunk_trans_blocks(inode
, nrblocks
);
2388 * How many index/leaf blocks need to change/allocate to add @extents extents?
2390 * If we add a single extent, then in the worst case, each tree level
2391 * index/leaf needs to be changed in case of a tree split.
2393 * If more extents are inserted, they could cause the whole tree split more
2394 * than once, but this is really rare.
2396 int ext4_ext_index_trans_blocks(struct inode
*inode
, int extents
)
2401 /* If we are converting the inline data, only one is needed here. */
2402 if (ext4_has_inline_data(inode
))
2405 depth
= ext_depth(inode
);
/*
 * Pick the default EXT4_FREE_BLOCKS_* flags for freeing blocks of @inode:
 *   - directories, symlinks and inodes with the EXT4_INODE_EA_INODE flag:
 *     EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
 *   - otherwise, inodes for which ext4_should_journal_data() is true:
 *     EXT4_FREE_BLOCKS_FORGET.
 * NOTE(review): the return for all remaining inodes is not visible in this
 * chunk - presumably no flags (0); confirm.
 */
2415 static inline int get_default_free_blocks_flags(struct inode
*inode
)
2417 if (S_ISDIR(inode
->i_mode
) || S_ISLNK(inode
->i_mode
) ||
2418 ext4_test_inode_flag(inode
, EXT4_INODE_EA_INODE
))
2419 return EXT4_FREE_BLOCKS_METADATA
| EXT4_FREE_BLOCKS_FORGET
;
2420 else if (ext4_should_journal_data(inode
))
2421 return EXT4_FREE_BLOCKS_FORGET
;
2426 * ext4_rereserve_cluster - increment the reserved cluster count when
2427 * freeing a cluster with a pending reservation
2429 * @inode - file containing the cluster
2430 * @lblk - logical block in cluster to be reserved
2432 * Increments the reserved cluster count and adjusts quota in a bigalloc
2433 * file system when freeing a partial cluster containing at least one
2434 * delayed and unwritten block. A partial cluster meeting that
2435 * requirement will have a pending reservation. If so, the
2436 * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to
2437 * defer reserved and allocated space accounting to a subsequent call
/*
 * Re-establish a one-cluster reservation for @inode and drop the pending
 * reservation recorded for @lblk. Callers in this file invoke it right
 * after ext4_free_blocks() was called with
 * EXT4_FREE_BLOCKS_RERESERVE_CLUSTER set.
 */
2440 static void ext4_rereserve_cluster(struct inode
*inode
, ext4_lblk_t lblk
)
2442 struct ext4_sb_info
*sbi
= EXT4_SB(inode
->i_sb
);
2443 struct ext4_inode_info
*ei
= EXT4_I(inode
);
/* Quota side: reclaim one cluster's worth of blocks (EXT4_C2B(sbi, 1)). */
2445 dquot_reclaim_block(inode
, EXT4_C2B(sbi
, 1));
/*
 * The per-inode reserved count and the dirty-clusters counter are bumped
 * together while holding i_block_reservation_lock.
 */
2447 spin_lock(&ei
->i_block_reservation_lock
);
2448 ei
->i_reserved_data_blocks
++;
2449 percpu_counter_add(&sbi
->s_dirtyclusters_counter
, 1);
2450 spin_unlock(&ei
->i_block_reservation_lock
);
/* Done outside the lock: free-clusters counter and pending-entry removal. */
2452 percpu_counter_add(&sbi
->s_freeclusters_counter
, 1);
2453 ext4_remove_pending(inode
, lblk
);
2456 static int ext4_remove_blocks(handle_t
*handle
, struct inode
*inode
,
2457 struct ext4_extent
*ex
,
2458 struct partial_cluster
*partial
,
2459 ext4_lblk_t from
, ext4_lblk_t to
)
2461 struct ext4_sb_info
*sbi
= EXT4_SB(inode
->i_sb
);
2462 unsigned short ee_len
= ext4_ext_get_actual_len(ex
);
2463 ext4_fsblk_t last_pblk
, pblk
;
2467 /* only extent tail removal is allowed */
2468 if (from
< le32_to_cpu(ex
->ee_block
) ||
2469 to
!= le32_to_cpu(ex
->ee_block
) + ee_len
- 1) {
2470 ext4_error(sbi
->s_sb
,
2471 "strange request: removal(2) %u-%u from %u:%u",
2472 from
, to
, le32_to_cpu(ex
->ee_block
), ee_len
);
2476 #ifdef EXTENTS_STATS
2477 spin_lock(&sbi
->s_ext_stats_lock
);
2478 sbi
->s_ext_blocks
+= ee_len
;
2479 sbi
->s_ext_extents
++;
2480 if (ee_len
< sbi
->s_ext_min
)
2481 sbi
->s_ext_min
= ee_len
;
2482 if (ee_len
> sbi
->s_ext_max
)
2483 sbi
->s_ext_max
= ee_len
;
2484 if (ext_depth(inode
) > sbi
->s_depth_max
)
2485 sbi
->s_depth_max
= ext_depth(inode
);
2486 spin_unlock(&sbi
->s_ext_stats_lock
);
2489 trace_ext4_remove_blocks(inode
, ex
, from
, to
, partial
);
2492 * if we have a partial cluster, and it's different from the
2493 * cluster of the last block in the extent, we free it
2495 last_pblk
= ext4_ext_pblock(ex
) + ee_len
- 1;
2497 if (partial
->state
!= initial
&&
2498 partial
->pclu
!= EXT4_B2C(sbi
, last_pblk
)) {
2499 if (partial
->state
== tofree
) {
2500 flags
= get_default_free_blocks_flags(inode
);
2501 if (ext4_is_pending(inode
, partial
->lblk
))
2502 flags
|= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER
;
2503 ext4_free_blocks(handle
, inode
, NULL
,
2504 EXT4_C2B(sbi
, partial
->pclu
),
2505 sbi
->s_cluster_ratio
, flags
);
2506 if (flags
& EXT4_FREE_BLOCKS_RERESERVE_CLUSTER
)
2507 ext4_rereserve_cluster(inode
, partial
->lblk
);
2509 partial
->state
= initial
;
2512 num
= le32_to_cpu(ex
->ee_block
) + ee_len
- from
;
2513 pblk
= ext4_ext_pblock(ex
) + ee_len
- num
;
2516 * We free the partial cluster at the end of the extent (if any),
2517 * unless the cluster is used by another extent (partial_cluster
2518 * state is nofree). If a partial cluster exists here, it must be
2519 * shared with the last block in the extent.
2521 flags
= get_default_free_blocks_flags(inode
);
2523 /* partial, left end cluster aligned, right end unaligned */
2524 if ((EXT4_LBLK_COFF(sbi
, to
) != sbi
->s_cluster_ratio
- 1) &&
2525 (EXT4_LBLK_CMASK(sbi
, to
) >= from
) &&
2526 (partial
->state
!= nofree
)) {
2527 if (ext4_is_pending(inode
, to
))
2528 flags
|= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER
;
2529 ext4_free_blocks(handle
, inode
, NULL
,
2530 EXT4_PBLK_CMASK(sbi
, last_pblk
),
2531 sbi
->s_cluster_ratio
, flags
);
2532 if (flags
& EXT4_FREE_BLOCKS_RERESERVE_CLUSTER
)
2533 ext4_rereserve_cluster(inode
, to
);
2534 partial
->state
= initial
;
2535 flags
= get_default_free_blocks_flags(inode
);
2538 flags
|= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER
;
2541 * For bigalloc file systems, we never free a partial cluster
2542 * at the beginning of the extent. Instead, we check to see if we
2543 * need to free it on a subsequent call to ext4_remove_blocks,
2544 * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
2546 flags
|= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER
;
2547 ext4_free_blocks(handle
, inode
, NULL
, pblk
, num
, flags
);
2549 /* reset the partial cluster if we've freed past it */
2550 if (partial
->state
!= initial
&& partial
->pclu
!= EXT4_B2C(sbi
, pblk
))
2551 partial
->state
= initial
;
2554 * If we've freed the entire extent but the beginning is not left
2555 * cluster aligned and is not marked as ineligible for freeing we
2556 * record the partial cluster at the beginning of the extent. It
2557 * wasn't freed by the preceding ext4_free_blocks() call, and we
2558 * need to look farther to the left to determine if it's to be freed
2559 * (not shared with another extent). Else, reset the partial
2560 * cluster - we're either done freeing or the beginning of the
2561 * extent is left cluster aligned.
2563 if (EXT4_LBLK_COFF(sbi
, from
) && num
== ee_len
) {
2564 if (partial
->state
== initial
) {
2565 partial
->pclu
= EXT4_B2C(sbi
, pblk
);
2566 partial
->lblk
= from
;
2567 partial
->state
= tofree
;
2570 partial
->state
= initial
;
2577 * ext4_ext_rm_leaf() Removes the extents associated with the
2578 * blocks appearing between "start" and "end". Both "start"
2579 * and "end" must appear in the same extent or EIO is returned.
2581 * @handle: The journal handle
2582 * @inode: The files inode
2583 * @path: The path to the leaf
2584 * @partial_cluster: The cluster which we'll have to free if all extents
2585 * has been released from it. However, if this value is
2586 * negative, it's a cluster just to the right of the
2587 * punched region and it must not be freed.
2588 * @start: The first block to remove
2589 * @end: The last block to remove
2592 ext4_ext_rm_leaf(handle_t
*handle
, struct inode
*inode
,
2593 struct ext4_ext_path
*path
,
2594 struct partial_cluster
*partial
,
2595 ext4_lblk_t start
, ext4_lblk_t end
)
2597 struct ext4_sb_info
*sbi
= EXT4_SB(inode
->i_sb
);
2598 int err
= 0, correct_index
= 0;
2599 int depth
= ext_depth(inode
), credits
, revoke_credits
;
2600 struct ext4_extent_header
*eh
;
2603 ext4_lblk_t ex_ee_block
;
2604 unsigned short ex_ee_len
;
2605 unsigned unwritten
= 0;
2606 struct ext4_extent
*ex
;
2609 /* the header must be checked already in ext4_ext_remove_space() */
2610 ext_debug(inode
, "truncate since %u in leaf to %u\n", start
, end
);
2611 if (!path
[depth
].p_hdr
)
2612 path
[depth
].p_hdr
= ext_block_hdr(path
[depth
].p_bh
);
2613 eh
= path
[depth
].p_hdr
;
2614 if (unlikely(path
[depth
].p_hdr
== NULL
)) {
2615 EXT4_ERROR_INODE(inode
, "path[%d].p_hdr == NULL", depth
);
2616 return -EFSCORRUPTED
;
2618 /* find where to start removing */
2619 ex
= path
[depth
].p_ext
;
2621 ex
= EXT_LAST_EXTENT(eh
);
2623 ex_ee_block
= le32_to_cpu(ex
->ee_block
);
2624 ex_ee_len
= ext4_ext_get_actual_len(ex
);
2626 trace_ext4_ext_rm_leaf(inode
, start
, ex
, partial
);
2628 while (ex
>= EXT_FIRST_EXTENT(eh
) &&
2629 ex_ee_block
+ ex_ee_len
> start
) {
2631 if (ext4_ext_is_unwritten(ex
))
2636 ext_debug(inode
, "remove ext %u:[%d]%d\n", ex_ee_block
,
2637 unwritten
, ex_ee_len
);
2638 path
[depth
].p_ext
= ex
;
2640 a
= max(ex_ee_block
, start
);
2641 b
= min(ex_ee_block
+ ex_ee_len
- 1, end
);
2643 ext_debug(inode
, " border %u:%u\n", a
, b
);
2645 /* If this extent is beyond the end of the hole, skip it */
2646 if (end
< ex_ee_block
) {
2648 * We're going to skip this extent and move to another,
2649 * so note that its first cluster is in use to avoid
2650 * freeing it when removing blocks. Eventually, the
2651 * right edge of the truncated/punched region will
2652 * be just to the left.
2654 if (sbi
->s_cluster_ratio
> 1) {
2655 pblk
= ext4_ext_pblock(ex
);
2656 partial
->pclu
= EXT4_B2C(sbi
, pblk
);
2657 partial
->state
= nofree
;
2660 ex_ee_block
= le32_to_cpu(ex
->ee_block
);
2661 ex_ee_len
= ext4_ext_get_actual_len(ex
);
2663 } else if (b
!= ex_ee_block
+ ex_ee_len
- 1) {
2664 EXT4_ERROR_INODE(inode
,
2665 "can not handle truncate %u:%u "
2667 start
, end
, ex_ee_block
,
2668 ex_ee_block
+ ex_ee_len
- 1);
2669 err
= -EFSCORRUPTED
;
2671 } else if (a
!= ex_ee_block
) {
2672 /* remove tail of the extent */
2673 num
= a
- ex_ee_block
;
2675 /* remove whole extent: excellent! */
2679 * 3 for leaf, sb, and inode plus 2 (bmap and group
2680 * descriptor) for each block group; assume two block
2681 * groups plus ex_ee_len/blocks_per_block_group for
2684 credits
= 7 + 2*(ex_ee_len
/EXT4_BLOCKS_PER_GROUP(inode
->i_sb
));
2685 if (ex
== EXT_FIRST_EXTENT(eh
)) {
2687 credits
+= (ext_depth(inode
)) + 1;
2689 credits
+= EXT4_MAXQUOTAS_TRANS_BLOCKS(inode
->i_sb
);
2691 * We may end up freeing some index blocks and data from the
2692 * punched range. Note that partial clusters are accounted for
2693 * by ext4_free_data_revoke_credits().
2696 ext4_free_metadata_revoke_credits(inode
->i_sb
,
2698 ext4_free_data_revoke_credits(inode
, b
- a
+ 1);
2700 err
= ext4_datasem_ensure_credits(handle
, inode
, credits
,
2701 credits
, revoke_credits
);
2708 err
= ext4_ext_get_access(handle
, inode
, path
+ depth
);
2712 err
= ext4_remove_blocks(handle
, inode
, ex
, partial
, a
, b
);
2717 /* this extent is removed; mark slot entirely unused */
2718 ext4_ext_store_pblock(ex
, 0);
2720 ex
->ee_len
= cpu_to_le16(num
);
2722 * Do not mark unwritten if all the blocks in the
2723 * extent have been removed.
2725 if (unwritten
&& num
)
2726 ext4_ext_mark_unwritten(ex
);
2728 * If the extent was completely released,
2729 * we need to remove it from the leaf
2732 if (end
!= EXT_MAX_BLOCKS
- 1) {
2734 * For hole punching, we need to scoot all the
2735 * extents up when an extent is removed so that
2736 * we dont have blank extents in the middle
2738 memmove(ex
, ex
+1, (EXT_LAST_EXTENT(eh
) - ex
) *
2739 sizeof(struct ext4_extent
));
2741 /* Now get rid of the one at the end */
2742 memset(EXT_LAST_EXTENT(eh
), 0,
2743 sizeof(struct ext4_extent
));
2745 le16_add_cpu(&eh
->eh_entries
, -1);
2748 err
= ext4_ext_dirty(handle
, inode
, path
+ depth
);
2752 ext_debug(inode
, "new extent: %u:%u:%llu\n", ex_ee_block
, num
,
2753 ext4_ext_pblock(ex
));
2755 ex_ee_block
= le32_to_cpu(ex
->ee_block
);
2756 ex_ee_len
= ext4_ext_get_actual_len(ex
);
2759 if (correct_index
&& eh
->eh_entries
)
2760 err
= ext4_ext_correct_indexes(handle
, inode
, path
);
2763 * If there's a partial cluster and at least one extent remains in
2764 * the leaf, free the partial cluster if it isn't shared with the
2765 * current extent. If it is shared with the current extent
2766 * we reset the partial cluster because we've reached the start of the
2767 * truncated/punched region and we're done removing blocks.
2769 if (partial
->state
== tofree
&& ex
>= EXT_FIRST_EXTENT(eh
)) {
2770 pblk
= ext4_ext_pblock(ex
) + ex_ee_len
- 1;
2771 if (partial
->pclu
!= EXT4_B2C(sbi
, pblk
)) {
2772 int flags
= get_default_free_blocks_flags(inode
);
2774 if (ext4_is_pending(inode
, partial
->lblk
))
2775 flags
|= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER
;
2776 ext4_free_blocks(handle
, inode
, NULL
,
2777 EXT4_C2B(sbi
, partial
->pclu
),
2778 sbi
->s_cluster_ratio
, flags
);
2779 if (flags
& EXT4_FREE_BLOCKS_RERESERVE_CLUSTER
)
2780 ext4_rereserve_cluster(inode
, partial
->lblk
);
2782 partial
->state
= initial
;
2785 /* if this leaf is free, then we should
2786 * remove it from index block above */
2787 if (err
== 0 && eh
->eh_entries
== 0 && path
[depth
].p_bh
!= NULL
)
2788 err
= ext4_ext_rm_idx(handle
, inode
, path
, depth
);
2795 * ext4_ext_more_to_rm:
2796 * returns 1 if current index has to be freed (even partial)
2799 ext4_ext_more_to_rm(struct ext4_ext_path
*path
)
2801 BUG_ON(path
->p_idx
== NULL
);
2803 if (path
->p_idx
< EXT_FIRST_INDEX(path
->p_hdr
))
2807 * if truncate on deeper level happened, it wasn't partial,
2808 * so we have to consider current index for truncation
2810 if (le16_to_cpu(path
->p_hdr
->eh_entries
) == path
->p_block
)
2815 int ext4_ext_remove_space(struct inode
*inode
, ext4_lblk_t start
,
2818 struct ext4_sb_info
*sbi
= EXT4_SB(inode
->i_sb
);
2819 int depth
= ext_depth(inode
);
2820 struct ext4_ext_path
*path
= NULL
;
2821 struct partial_cluster partial
;
2827 partial
.state
= initial
;
2829 ext_debug(inode
, "truncate since %u to %u\n", start
, end
);
2831 /* probably first extent we're gonna free will be last in block */
2832 handle
= ext4_journal_start_with_revoke(inode
, EXT4_HT_TRUNCATE
,
2834 ext4_free_metadata_revoke_credits(inode
->i_sb
, depth
));
2836 return PTR_ERR(handle
);
2839 trace_ext4_ext_remove_space(inode
, start
, end
, depth
);
2842 * Check if we are removing extents inside the extent tree. If that
2843 * is the case, we are going to punch a hole inside the extent tree
2844 * so we have to check whether we need to split the extent covering
2845 * the last block to remove so we can easily remove the part of it
2846 * in ext4_ext_rm_leaf().
2848 if (end
< EXT_MAX_BLOCKS
- 1) {
2849 struct ext4_extent
*ex
;
2850 ext4_lblk_t ee_block
, ex_end
, lblk
;
2853 /* find extent for or closest extent to this block */
2854 path
= ext4_find_extent(inode
, end
, NULL
,
2855 EXT4_EX_NOCACHE
| EXT4_EX_NOFAIL
);
2857 ext4_journal_stop(handle
);
2858 return PTR_ERR(path
);
2860 depth
= ext_depth(inode
);
2861 /* Leaf not may not exist only if inode has no blocks at all */
2862 ex
= path
[depth
].p_ext
;
2865 EXT4_ERROR_INODE(inode
,
2866 "path[%d].p_hdr == NULL",
2868 err
= -EFSCORRUPTED
;
2873 ee_block
= le32_to_cpu(ex
->ee_block
);
2874 ex_end
= ee_block
+ ext4_ext_get_actual_len(ex
) - 1;
2877 * See if the last block is inside the extent, if so split
2878 * the extent at 'end' block so we can easily remove the
2879 * tail of the first part of the split extent in
2880 * ext4_ext_rm_leaf().
2882 if (end
>= ee_block
&& end
< ex_end
) {
2885 * If we're going to split the extent, note that
2886 * the cluster containing the block after 'end' is
2887 * in use to avoid freeing it when removing blocks.
2889 if (sbi
->s_cluster_ratio
> 1) {
2890 pblk
= ext4_ext_pblock(ex
) + end
- ee_block
+ 1;
2891 partial
.pclu
= EXT4_B2C(sbi
, pblk
);
2892 partial
.state
= nofree
;
2896 * Split the extent in two so that 'end' is the last
2897 * block in the first new extent. Also we should not
2898 * fail removing space due to ENOSPC so try to use
2899 * reserved block if that happens.
2901 path
= ext4_force_split_extent_at(handle
, inode
, path
,
2904 err
= PTR_ERR(path
);
2907 } else if (sbi
->s_cluster_ratio
> 1 && end
>= ex_end
&&
2908 partial
.state
== initial
) {
2910 * If we're punching, there's an extent to the right.
2911 * If the partial cluster hasn't been set, set it to
2912 * that extent's first cluster and its state to nofree
2913 * so it won't be freed should it contain blocks to be
2914 * removed. If it's already set (tofree/nofree), we're
2915 * retrying and keep the original partial cluster info
2916 * so a cluster marked tofree as a result of earlier
2917 * extent removal is not lost.
2920 err
= ext4_ext_search_right(inode
, path
, &lblk
, &pblk
,
2925 partial
.pclu
= EXT4_B2C(sbi
, pblk
);
2926 partial
.state
= nofree
;
2931 * We start scanning from right side, freeing all the blocks
2932 * after i_size and walking into the tree depth-wise.
2934 depth
= ext_depth(inode
);
2939 le16_to_cpu(path
[k
].p_hdr
->eh_entries
)+1;
2941 path
= kcalloc(depth
+ 1, sizeof(struct ext4_ext_path
),
2942 GFP_NOFS
| __GFP_NOFAIL
);
2944 ext4_journal_stop(handle
);
2947 path
[0].p_maxdepth
= path
[0].p_depth
= depth
;
2948 path
[0].p_hdr
= ext_inode_hdr(inode
);
2951 if (ext4_ext_check(inode
, path
[0].p_hdr
, depth
, 0)) {
2952 err
= -EFSCORRUPTED
;
2958 while (i
>= 0 && err
== 0) {
2960 /* this is leaf block */
2961 err
= ext4_ext_rm_leaf(handle
, inode
, path
,
2962 &partial
, start
, end
);
2963 /* root level has p_bh == NULL, brelse() eats this */
2964 ext4_ext_path_brelse(path
+ i
);
2969 /* this is index block */
2970 if (!path
[i
].p_hdr
) {
2971 ext_debug(inode
, "initialize header\n");
2972 path
[i
].p_hdr
= ext_block_hdr(path
[i
].p_bh
);
2975 if (!path
[i
].p_idx
) {
2976 /* this level hasn't been touched yet */
2977 path
[i
].p_idx
= EXT_LAST_INDEX(path
[i
].p_hdr
);
2978 path
[i
].p_block
= le16_to_cpu(path
[i
].p_hdr
->eh_entries
)+1;
2979 ext_debug(inode
, "init index ptr: hdr 0x%p, num %d\n",
2981 le16_to_cpu(path
[i
].p_hdr
->eh_entries
));
2983 /* we were already here, see at next index */
2987 ext_debug(inode
, "level %d - index, first 0x%p, cur 0x%p\n",
2988 i
, EXT_FIRST_INDEX(path
[i
].p_hdr
),
2990 if (ext4_ext_more_to_rm(path
+ i
)) {
2991 struct buffer_head
*bh
;
2992 /* go to the next level */
2993 ext_debug(inode
, "move to level %d (block %llu)\n",
2994 i
+ 1, ext4_idx_pblock(path
[i
].p_idx
));
2995 memset(path
+ i
+ 1, 0, sizeof(*path
));
2996 bh
= read_extent_tree_block(inode
, path
[i
].p_idx
,
3000 /* should we reset i_size? */
3004 /* Yield here to deal with large extent trees.
3005 * Should be a no-op if we did IO above. */
3007 if (WARN_ON(i
+ 1 > depth
)) {
3008 err
= -EFSCORRUPTED
;
3011 path
[i
+ 1].p_bh
= bh
;
3013 /* save actual number of indexes since this
3014 * number is changed at the next iteration */
3015 path
[i
].p_block
= le16_to_cpu(path
[i
].p_hdr
->eh_entries
);
3018 /* we finished processing this index, go up */
3019 if (path
[i
].p_hdr
->eh_entries
== 0 && i
> 0) {
3020 /* index is empty, remove it;
3021 * handle must be already prepared by the
3022 * truncatei_leaf() */
3023 err
= ext4_ext_rm_idx(handle
, inode
, path
, i
);
3025 /* root level has p_bh == NULL, brelse() eats this */
3026 ext4_ext_path_brelse(path
+ i
);
3028 ext_debug(inode
, "return to level %d\n", i
);
3032 trace_ext4_ext_remove_space_done(inode
, start
, end
, depth
, &partial
,
3033 path
->p_hdr
->eh_entries
);
3036 * if there's a partial cluster and we have removed the first extent
3037 * in the file, then we also free the partial cluster, if any
3039 if (partial
.state
== tofree
&& err
== 0) {
3040 int flags
= get_default_free_blocks_flags(inode
);
3042 if (ext4_is_pending(inode
, partial
.lblk
))
3043 flags
|= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER
;
3044 ext4_free_blocks(handle
, inode
, NULL
,
3045 EXT4_C2B(sbi
, partial
.pclu
),
3046 sbi
->s_cluster_ratio
, flags
);
3047 if (flags
& EXT4_FREE_BLOCKS_RERESERVE_CLUSTER
)
3048 ext4_rereserve_cluster(inode
, partial
.lblk
);
3049 partial
.state
= initial
;
3052 /* TODO: flexible tree reduction should be here */
3053 if (path
->p_hdr
->eh_entries
== 0) {
3055 * truncate to zero freed all the tree,
3056 * so we need to correct eh_depth
3058 err
= ext4_ext_get_access(handle
, inode
, path
);
3060 ext_inode_hdr(inode
)->eh_depth
= 0;
3061 ext_inode_hdr(inode
)->eh_max
=
3062 cpu_to_le16(ext4_ext_space_root(inode
, 0));
3063 err
= ext4_ext_dirty(handle
, inode
, path
);
3067 ext4_free_ext_path(path
);
3071 ext4_journal_stop(handle
);
/*
 * called at mount time
 *
 * Does nothing unless the extents feature is enabled on @sb.  With any of
 * the AGGRESSIVE_TEST / CHECK_BINSEARCH / EXTENTS_STATS build options a
 * single informational line naming the enabled options is printed; with
 * EXTENTS_STATS the per-superblock statistics fields are initialised too.
 */
void ext4_ext_init(struct super_block *sb)
{
	/*
	 * possible initialization would be here
	 */

	if (ext4_has_feature_extents(sb)) {
#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
		/* one printk assembled from adjacent string literals */
		printk(KERN_INFO "EXT4-fs: file extents enabled"
#ifdef AGGRESSIVE_TEST
		       ", aggressive tests"
#endif
#ifdef CHECK_BINSEARCH
		       ", check binsearch"
#endif
#ifdef EXTENTS_STATS
		       ", stats"
#endif
		       "\n");
#endif
#ifdef EXTENTS_STATS
		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
		EXT4_SB(sb)->s_ext_min = 1 << 30;
		EXT4_SB(sb)->s_ext_max = 0;
#endif
	}
}
/*
 * called at umount time
 *
 * Does nothing unless the extents feature is enabled on @sb.  With
 * EXTENTS_STATS the collected extent statistics are printed, but only
 * when both counters are non-zero (which also keeps the average below
 * from dividing by zero).
 */
void ext4_ext_release(struct super_block *sb)
{
	if (!ext4_has_feature_extents(sb))
		return;

#ifdef EXTENTS_STATS
	if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
		struct ext4_sb_info *sbi = EXT4_SB(sb);
		printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
			sbi->s_ext_blocks, sbi->s_ext_extents,
			sbi->s_ext_blocks / sbi->s_ext_extents);
		printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
			sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
	}
#endif
}
3127 static void ext4_zeroout_es(struct inode
*inode
, struct ext4_extent
*ex
)
3129 ext4_lblk_t ee_block
;
3130 ext4_fsblk_t ee_pblock
;
3131 unsigned int ee_len
;
3133 ee_block
= le32_to_cpu(ex
->ee_block
);
3134 ee_len
= ext4_ext_get_actual_len(ex
);
3135 ee_pblock
= ext4_ext_pblock(ex
);
3140 ext4_es_insert_extent(inode
, ee_block
, ee_len
, ee_pblock
,
3141 EXTENT_STATUS_WRITTEN
, false);
3144 /* FIXME!! we need to try to merge to left or right after zero-out */
3145 static int ext4_ext_zeroout(struct inode
*inode
, struct ext4_extent
*ex
)
3147 ext4_fsblk_t ee_pblock
;
3148 unsigned int ee_len
;
3150 ee_len
= ext4_ext_get_actual_len(ex
);
3151 ee_pblock
= ext4_ext_pblock(ex
);
3152 return ext4_issue_zeroout(inode
, le32_to_cpu(ex
->ee_block
), ee_pblock
,
3157 * ext4_split_extent_at() splits an extent at given block.
3159 * @handle: the journal handle
3160 * @inode: the file inode
3161 * @path: the path to the extent
3162 * @split: the logical block where the extent is splitted.
3163 * @split_flags: indicates if the extent could be zeroout if split fails, and
3164 * the states(init or unwritten) of new extents.
3165 * @flags: flags used to insert new extent to extent tree.
3168 * Splits extent [a, b] into two extents [a, @split) and [@split, b], states
3169 * of which are determined by split_flag.
3171 * There are two cases:
3172 * a> the extent are splitted into two extent.
3173 * b> split is not needed, and just mark the extent.
3175 * Return an extent path pointer on success, or an error pointer on failure.
3177 static struct ext4_ext_path
*ext4_split_extent_at(handle_t
*handle
,
3178 struct inode
*inode
,
3179 struct ext4_ext_path
*path
,
3181 int split_flag
, int flags
)
3183 ext4_fsblk_t newblock
;
3184 ext4_lblk_t ee_block
;
3185 struct ext4_extent
*ex
, newex
, orig_ex
, zero_ex
;
3186 struct ext4_extent
*ex2
= NULL
;
3187 unsigned int ee_len
, depth
;
3190 BUG_ON((split_flag
& (EXT4_EXT_DATA_VALID1
| EXT4_EXT_DATA_VALID2
)) ==
3191 (EXT4_EXT_DATA_VALID1
| EXT4_EXT_DATA_VALID2
));
3193 ext_debug(inode
, "logical block %llu\n", (unsigned long long)split
);
3195 ext4_ext_show_leaf(inode
, path
);
3197 depth
= ext_depth(inode
);
3198 ex
= path
[depth
].p_ext
;
3199 ee_block
= le32_to_cpu(ex
->ee_block
);
3200 ee_len
= ext4_ext_get_actual_len(ex
);
3201 newblock
= split
- ee_block
+ ext4_ext_pblock(ex
);
3203 BUG_ON(split
< ee_block
|| split
>= (ee_block
+ ee_len
));
3204 BUG_ON(!ext4_ext_is_unwritten(ex
) &&
3205 split_flag
& (EXT4_EXT_MAY_ZEROOUT
|
3206 EXT4_EXT_MARK_UNWRIT1
|
3207 EXT4_EXT_MARK_UNWRIT2
));
3209 err
= ext4_ext_get_access(handle
, inode
, path
+ depth
);
3213 if (split
== ee_block
) {
3215 * case b: block @split is the block that the extent begins with
3216 * then we just change the state of the extent, and splitting
3219 if (split_flag
& EXT4_EXT_MARK_UNWRIT2
)
3220 ext4_ext_mark_unwritten(ex
);
3222 ext4_ext_mark_initialized(ex
);
3224 if (!(flags
& EXT4_GET_BLOCKS_PRE_IO
))
3225 ext4_ext_try_to_merge(handle
, inode
, path
, ex
);
3227 err
= ext4_ext_dirty(handle
, inode
, path
+ path
->p_depth
);
3232 memcpy(&orig_ex
, ex
, sizeof(orig_ex
));
3233 ex
->ee_len
= cpu_to_le16(split
- ee_block
);
3234 if (split_flag
& EXT4_EXT_MARK_UNWRIT1
)
3235 ext4_ext_mark_unwritten(ex
);
3238 * path may lead to new leaf, not to original leaf any more
3239 * after ext4_ext_insert_extent() returns,
3241 err
= ext4_ext_dirty(handle
, inode
, path
+ depth
);
3243 goto fix_extent_len
;
3246 ex2
->ee_block
= cpu_to_le32(split
);
3247 ex2
->ee_len
= cpu_to_le16(ee_len
- (split
- ee_block
));
3248 ext4_ext_store_pblock(ex2
, newblock
);
3249 if (split_flag
& EXT4_EXT_MARK_UNWRIT2
)
3250 ext4_ext_mark_unwritten(ex2
);
3252 path
= ext4_ext_insert_extent(handle
, inode
, path
, &newex
, flags
);
3256 err
= PTR_ERR(path
);
3257 if (err
!= -ENOSPC
&& err
!= -EDQUOT
&& err
!= -ENOMEM
)
3261 * Get a new path to try to zeroout or fix the extent length.
3262 * Using EXT4_EX_NOFAIL guarantees that ext4_find_extent()
3263 * will not return -ENOMEM, otherwise -ENOMEM will cause a
3264 * retry in do_writepages(), and a WARN_ON may be triggered
3265 * in ext4_da_update_reserve_space() due to an incorrect
3266 * ee_len causing the i_reserved_data_blocks exception.
3268 path
= ext4_find_extent(inode
, ee_block
, NULL
, flags
| EXT4_EX_NOFAIL
);
3270 EXT4_ERROR_INODE(inode
, "Failed split extent on %u, err %ld",
3271 split
, PTR_ERR(path
));
3274 depth
= ext_depth(inode
);
3275 ex
= path
[depth
].p_ext
;
3277 if (EXT4_EXT_MAY_ZEROOUT
& split_flag
) {
3278 if (split_flag
& (EXT4_EXT_DATA_VALID1
|EXT4_EXT_DATA_VALID2
)) {
3279 if (split_flag
& EXT4_EXT_DATA_VALID1
) {
3280 err
= ext4_ext_zeroout(inode
, ex2
);
3281 zero_ex
.ee_block
= ex2
->ee_block
;
3282 zero_ex
.ee_len
= cpu_to_le16(
3283 ext4_ext_get_actual_len(ex2
));
3284 ext4_ext_store_pblock(&zero_ex
,
3285 ext4_ext_pblock(ex2
));
3287 err
= ext4_ext_zeroout(inode
, ex
);
3288 zero_ex
.ee_block
= ex
->ee_block
;
3289 zero_ex
.ee_len
= cpu_to_le16(
3290 ext4_ext_get_actual_len(ex
));
3291 ext4_ext_store_pblock(&zero_ex
,
3292 ext4_ext_pblock(ex
));
3295 err
= ext4_ext_zeroout(inode
, &orig_ex
);
3296 zero_ex
.ee_block
= orig_ex
.ee_block
;
3297 zero_ex
.ee_len
= cpu_to_le16(
3298 ext4_ext_get_actual_len(&orig_ex
));
3299 ext4_ext_store_pblock(&zero_ex
,
3300 ext4_ext_pblock(&orig_ex
));
3304 /* update the extent length and mark as initialized */
3305 ex
->ee_len
= cpu_to_le16(ee_len
);
3306 ext4_ext_try_to_merge(handle
, inode
, path
, ex
);
3307 err
= ext4_ext_dirty(handle
, inode
, path
+ path
->p_depth
);
3309 /* update extent status tree */
3310 ext4_zeroout_es(inode
, &zero_ex
);
3311 /* If we failed at this point, we don't know in which
3312 * state the extent tree exactly is so don't try to fix
3313 * length of the original extent as it may do even more
3321 ex
->ee_len
= orig_ex
.ee_len
;
3323 * Ignore ext4_ext_dirty return value since we are already in error path
3324 * and err is a non-zero error code.
3326 ext4_ext_dirty(handle
, inode
, path
+ path
->p_depth
);
3329 ext4_free_ext_path(path
);
3330 path
= ERR_PTR(err
);
3332 ext4_ext_show_leaf(inode
, path
);
3337 * ext4_split_extent() splits an extent and mark extent which is covered
3338 * by @map as split_flags indicates
3340 * It may result in splitting the extent into multiple extents (up to three)
3341 * There are three possibilities:
3342 * a> There is no split required
3343 * b> Splits in two extents: Split is happening at either end of the extent
3344 * c> Splits in three extents: Somone is splitting in middle of the extent
3347 static struct ext4_ext_path
*ext4_split_extent(handle_t
*handle
,
3348 struct inode
*inode
,
3349 struct ext4_ext_path
*path
,
3350 struct ext4_map_blocks
*map
,
3351 int split_flag
, int flags
,
3352 unsigned int *allocated
)
3354 ext4_lblk_t ee_block
;
3355 struct ext4_extent
*ex
;
3356 unsigned int ee_len
, depth
;
3358 int split_flag1
, flags1
;
3360 depth
= ext_depth(inode
);
3361 ex
= path
[depth
].p_ext
;
3362 ee_block
= le32_to_cpu(ex
->ee_block
);
3363 ee_len
= ext4_ext_get_actual_len(ex
);
3364 unwritten
= ext4_ext_is_unwritten(ex
);
3366 if (map
->m_lblk
+ map
->m_len
< ee_block
+ ee_len
) {
3367 split_flag1
= split_flag
& EXT4_EXT_MAY_ZEROOUT
;
3368 flags1
= flags
| EXT4_GET_BLOCKS_PRE_IO
;
3370 split_flag1
|= EXT4_EXT_MARK_UNWRIT1
|
3371 EXT4_EXT_MARK_UNWRIT2
;
3372 if (split_flag
& EXT4_EXT_DATA_VALID2
)
3373 split_flag1
|= EXT4_EXT_DATA_VALID1
;
3374 path
= ext4_split_extent_at(handle
, inode
, path
,
3375 map
->m_lblk
+ map
->m_len
, split_flag1
, flags1
);
3379 * Update path is required because previous ext4_split_extent_at
3380 * may result in split of original leaf or extent zeroout.
3382 path
= ext4_find_extent(inode
, map
->m_lblk
, path
, flags
);
3385 depth
= ext_depth(inode
);
3386 ex
= path
[depth
].p_ext
;
3388 EXT4_ERROR_INODE(inode
, "unexpected hole at %lu",
3389 (unsigned long) map
->m_lblk
);
3390 ext4_free_ext_path(path
);
3391 return ERR_PTR(-EFSCORRUPTED
);
3393 unwritten
= ext4_ext_is_unwritten(ex
);
3396 if (map
->m_lblk
>= ee_block
) {
3397 split_flag1
= split_flag
& EXT4_EXT_DATA_VALID2
;
3399 split_flag1
|= EXT4_EXT_MARK_UNWRIT1
;
3400 split_flag1
|= split_flag
& (EXT4_EXT_MAY_ZEROOUT
|
3401 EXT4_EXT_MARK_UNWRIT2
);
3403 path
= ext4_split_extent_at(handle
, inode
, path
,
3404 map
->m_lblk
, split_flag1
, flags
);
3410 if (map
->m_lblk
+ map
->m_len
> ee_block
+ ee_len
)
3411 *allocated
= ee_len
- (map
->m_lblk
- ee_block
);
3413 *allocated
= map
->m_len
;
3415 ext4_ext_show_leaf(inode
, path
);
3420 * This function is called by ext4_ext_map_blocks() if someone tries to write
3421 * to an unwritten extent. It may result in splitting the unwritten
3422 * extent into multiple extents (up to three - one initialized and two
3424 * There are three possibilities:
3425 * a> There is no split required: Entire extent should be initialized
3426 * b> Splits in two extents: Write is happening at either end of the extent
3427 * c> Splits in three extents: Somone is writing in middle of the extent
3430 * - The extent pointed to by 'path' is unwritten.
3431 * - The extent pointed to by 'path' contains a superset
3432 * of the logical span [map->m_lblk, map->m_lblk + map->m_len).
3434 * Post-conditions on success:
3435 * - the returned value is the number of blocks beyond map->l_lblk
3436 * that are allocated and initialized.
3437 * It is guaranteed to be >= map->m_len.
3439 static struct ext4_ext_path
*
3440 ext4_ext_convert_to_initialized(handle_t
*handle
, struct inode
*inode
,
3441 struct ext4_map_blocks
*map
, struct ext4_ext_path
*path
,
3442 int flags
, unsigned int *allocated
)
3444 struct ext4_sb_info
*sbi
;
3445 struct ext4_extent_header
*eh
;
3446 struct ext4_map_blocks split_map
;
3447 struct ext4_extent zero_ex1
, zero_ex2
;
3448 struct ext4_extent
*ex
, *abut_ex
;
3449 ext4_lblk_t ee_block
, eof_block
;
3450 unsigned int ee_len
, depth
, map_len
= map
->m_len
;
3452 int split_flag
= EXT4_EXT_DATA_VALID2
;
3453 unsigned int max_zeroout
= 0;
3455 ext_debug(inode
, "logical block %llu, max_blocks %u\n",
3456 (unsigned long long)map
->m_lblk
, map_len
);
3458 sbi
= EXT4_SB(inode
->i_sb
);
3459 eof_block
= (EXT4_I(inode
)->i_disksize
+ inode
->i_sb
->s_blocksize
- 1)
3460 >> inode
->i_sb
->s_blocksize_bits
;
3461 if (eof_block
< map
->m_lblk
+ map_len
)
3462 eof_block
= map
->m_lblk
+ map_len
;
3464 depth
= ext_depth(inode
);
3465 eh
= path
[depth
].p_hdr
;
3466 ex
= path
[depth
].p_ext
;
3467 ee_block
= le32_to_cpu(ex
->ee_block
);
3468 ee_len
= ext4_ext_get_actual_len(ex
);
3469 zero_ex1
.ee_len
= 0;
3470 zero_ex2
.ee_len
= 0;
3472 trace_ext4_ext_convert_to_initialized_enter(inode
, map
, ex
);
3474 /* Pre-conditions */
3475 BUG_ON(!ext4_ext_is_unwritten(ex
));
3476 BUG_ON(!in_range(map
->m_lblk
, ee_block
, ee_len
));
3479 * Attempt to transfer newly initialized blocks from the currently
3480 * unwritten extent to its neighbor. This is much cheaper
3481 * than an insertion followed by a merge as those involve costly
3482 * memmove() calls. Transferring to the left is the common case in
3483 * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
3484 * followed by append writes.
3486 * Limitations of the current logic:
3487 * - L1: we do not deal with writes covering the whole extent.
3488 * This would require removing the extent if the transfer
3490 * - L2: we only attempt to merge with an extent stored in the
3491 * same extent tree node.
3494 if ((map
->m_lblk
== ee_block
) &&
3495 /* See if we can merge left */
3496 (map_len
< ee_len
) && /*L1*/
3497 (ex
> EXT_FIRST_EXTENT(eh
))) { /*L2*/
3498 ext4_lblk_t prev_lblk
;
3499 ext4_fsblk_t prev_pblk
, ee_pblk
;
3500 unsigned int prev_len
;
3503 prev_lblk
= le32_to_cpu(abut_ex
->ee_block
);
3504 prev_len
= ext4_ext_get_actual_len(abut_ex
);
3505 prev_pblk
= ext4_ext_pblock(abut_ex
);
3506 ee_pblk
= ext4_ext_pblock(ex
);
3509 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3510 * upon those conditions:
3511 * - C1: abut_ex is initialized,
3512 * - C2: abut_ex is logically abutting ex,
3513 * - C3: abut_ex is physically abutting ex,
3514 * - C4: abut_ex can receive the additional blocks without
3515 * overflowing the (initialized) length limit.
3517 if ((!ext4_ext_is_unwritten(abut_ex
)) && /*C1*/
3518 ((prev_lblk
+ prev_len
) == ee_block
) && /*C2*/
3519 ((prev_pblk
+ prev_len
) == ee_pblk
) && /*C3*/
3520 (prev_len
< (EXT_INIT_MAX_LEN
- map_len
))) { /*C4*/
3521 err
= ext4_ext_get_access(handle
, inode
, path
+ depth
);
3525 trace_ext4_ext_convert_to_initialized_fastpath(inode
,
3528 /* Shift the start of ex by 'map_len' blocks */
3529 ex
->ee_block
= cpu_to_le32(ee_block
+ map_len
);
3530 ext4_ext_store_pblock(ex
, ee_pblk
+ map_len
);
3531 ex
->ee_len
= cpu_to_le16(ee_len
- map_len
);
3532 ext4_ext_mark_unwritten(ex
); /* Restore the flag */
3534 /* Extend abut_ex by 'map_len' blocks */
3535 abut_ex
->ee_len
= cpu_to_le16(prev_len
+ map_len
);
3537 /* Result: number of initialized blocks past m_lblk */
3538 *allocated
= map_len
;
3540 } else if (((map
->m_lblk
+ map_len
) == (ee_block
+ ee_len
)) &&
3541 (map_len
< ee_len
) && /*L1*/
3542 ex
< EXT_LAST_EXTENT(eh
)) { /*L2*/
3543 /* See if we can merge right */
3544 ext4_lblk_t next_lblk
;
3545 ext4_fsblk_t next_pblk
, ee_pblk
;
3546 unsigned int next_len
;
3549 next_lblk
= le32_to_cpu(abut_ex
->ee_block
);
3550 next_len
= ext4_ext_get_actual_len(abut_ex
);
3551 next_pblk
= ext4_ext_pblock(abut_ex
);
3552 ee_pblk
= ext4_ext_pblock(ex
);
3555 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3556 * upon those conditions:
3557 * - C1: abut_ex is initialized,
3558 * - C2: abut_ex is logically abutting ex,
3559 * - C3: abut_ex is physically abutting ex,
3560 * - C4: abut_ex can receive the additional blocks without
3561 * overflowing the (initialized) length limit.
3563 if ((!ext4_ext_is_unwritten(abut_ex
)) && /*C1*/
3564 ((map
->m_lblk
+ map_len
) == next_lblk
) && /*C2*/
3565 ((ee_pblk
+ ee_len
) == next_pblk
) && /*C3*/
3566 (next_len
< (EXT_INIT_MAX_LEN
- map_len
))) { /*C4*/
3567 err
= ext4_ext_get_access(handle
, inode
, path
+ depth
);
3571 trace_ext4_ext_convert_to_initialized_fastpath(inode
,
3574 /* Shift the start of abut_ex by 'map_len' blocks */
3575 abut_ex
->ee_block
= cpu_to_le32(next_lblk
- map_len
);
3576 ext4_ext_store_pblock(abut_ex
, next_pblk
- map_len
);
3577 ex
->ee_len
= cpu_to_le16(ee_len
- map_len
);
3578 ext4_ext_mark_unwritten(ex
); /* Restore the flag */
3580 /* Extend abut_ex by 'map_len' blocks */
3581 abut_ex
->ee_len
= cpu_to_le16(next_len
+ map_len
);
3583 /* Result: number of initialized blocks past m_lblk */
3584 *allocated
= map_len
;
3588 /* Mark the block containing both extents as dirty */
3589 err
= ext4_ext_dirty(handle
, inode
, path
+ depth
);
3591 /* Update path to point to the right extent */
3592 path
[depth
].p_ext
= abut_ex
;
3597 *allocated
= ee_len
- (map
->m_lblk
- ee_block
);
3599 WARN_ON(map
->m_lblk
< ee_block
);
3601 * It is safe to convert extent to initialized via explicit
3602 * zeroout only if extent is fully inside i_size or new_size.
3604 split_flag
|= ee_block
+ ee_len
<= eof_block
? EXT4_EXT_MAY_ZEROOUT
: 0;
3606 if (EXT4_EXT_MAY_ZEROOUT
& split_flag
)
3607 max_zeroout
= sbi
->s_extent_max_zeroout_kb
>>
3608 (inode
->i_sb
->s_blocksize_bits
- 10);
3612 * 1. split the extent into three extents.
3613 * 2. split the extent into two extents, zeroout the head of the first
3615 * 3. split the extent into two extents, zeroout the tail of the second
3617 * 4. split the extent into two extents with out zeroout.
3618 * 5. no splitting needed, just possibly zeroout the head and / or the
3619 * tail of the extent.
3621 split_map
.m_lblk
= map
->m_lblk
;
3622 split_map
.m_len
= map
->m_len
;
3624 if (max_zeroout
&& (*allocated
> split_map
.m_len
)) {
3625 if (*allocated
<= max_zeroout
) {
3628 cpu_to_le32(split_map
.m_lblk
+
3631 cpu_to_le16(*allocated
- split_map
.m_len
);
3632 ext4_ext_store_pblock(&zero_ex1
,
3633 ext4_ext_pblock(ex
) + split_map
.m_lblk
+
3634 split_map
.m_len
- ee_block
);
3635 err
= ext4_ext_zeroout(inode
, &zero_ex1
);
3638 split_map
.m_len
= *allocated
;
3640 if (split_map
.m_lblk
- ee_block
+ split_map
.m_len
<
3643 if (split_map
.m_lblk
!= ee_block
) {
3644 zero_ex2
.ee_block
= ex
->ee_block
;
3645 zero_ex2
.ee_len
= cpu_to_le16(split_map
.m_lblk
-
3647 ext4_ext_store_pblock(&zero_ex2
,
3648 ext4_ext_pblock(ex
));
3649 err
= ext4_ext_zeroout(inode
, &zero_ex2
);
3654 split_map
.m_len
+= split_map
.m_lblk
- ee_block
;
3655 split_map
.m_lblk
= ee_block
;
3656 *allocated
= map
->m_len
;
3661 path
= ext4_split_extent(handle
, inode
, path
, &split_map
, split_flag
,
3666 /* If we have gotten a failure, don't zero out status tree */
3667 ext4_zeroout_es(inode
, &zero_ex1
);
3668 ext4_zeroout_es(inode
, &zero_ex2
);
3672 ext4_free_ext_path(path
);
3673 return ERR_PTR(err
);
3677 * This function is called by ext4_ext_map_blocks() from
3678 * ext4_get_blocks_dio_write() when DIO writes
3679 * to an unwritten extent.
3681 * Writing to an unwritten extent may result in splitting the unwritten
3682 * extent into multiple initialized/unwritten extents (up to three)
3683 * There are three possibilities:
3684 * a> There is no split required: Entire extent should be unwritten
3685 * b> Splits in two extents: Write is happening at either end of the extent
3686 * c> Splits in three extents: Someone is writing in the middle of the extent
3688 * This works the same way in the case of initialized -> unwritten conversion.
3690 * One or more index blocks may be needed if the extent tree grows after
3691 * the unwritten extent is split. To prevent ENOSPC from occurring at IO
3692 * completion, we need to split the unwritten extent before DIO submits
3693 * the IO. The unwritten extent will be split at this time
3694 * into three unwritten extents (at most). After the IO completes, the part
3695 * being filled will be converted to initialized by the end_io callback function
3696 * via ext4_convert_unwritten_extents().
3698 * The size of unwritten extent to be written is passed to the caller via the
3699 * allocated pointer. Return an extent path pointer on success, or an error
3700 * pointer on failure.
3702 static struct ext4_ext_path
*ext4_split_convert_extents(handle_t
*handle
,
3703 struct inode
*inode
,
3704 struct ext4_map_blocks
*map
,
3705 struct ext4_ext_path
*path
,
3706 int flags
, unsigned int *allocated
)
3708 ext4_lblk_t eof_block
;
3709 ext4_lblk_t ee_block
;
3710 struct ext4_extent
*ex
;
3711 unsigned int ee_len
;
3712 int split_flag
= 0, depth
;
3714 ext_debug(inode
, "logical block %llu, max_blocks %u\n",
3715 (unsigned long long)map
->m_lblk
, map
->m_len
);
3717 eof_block
= (EXT4_I(inode
)->i_disksize
+ inode
->i_sb
->s_blocksize
- 1)
3718 >> inode
->i_sb
->s_blocksize_bits
;
3719 if (eof_block
< map
->m_lblk
+ map
->m_len
)
3720 eof_block
= map
->m_lblk
+ map
->m_len
;
3722 * It is safe to convert extent to initialized via explicit
3723 * zeroout only if extent is fully inside i_size or new_size.
3725 depth
= ext_depth(inode
);
3726 ex
= path
[depth
].p_ext
;
3727 ee_block
= le32_to_cpu(ex
->ee_block
);
3728 ee_len
= ext4_ext_get_actual_len(ex
);
3730 /* Convert to unwritten */
3731 if (flags
& EXT4_GET_BLOCKS_CONVERT_UNWRITTEN
) {
3732 split_flag
|= EXT4_EXT_DATA_VALID1
;
3733 /* Convert to initialized */
3734 } else if (flags
& EXT4_GET_BLOCKS_CONVERT
) {
3735 split_flag
|= ee_block
+ ee_len
<= eof_block
?
3736 EXT4_EXT_MAY_ZEROOUT
: 0;
3737 split_flag
|= (EXT4_EXT_MARK_UNWRIT2
| EXT4_EXT_DATA_VALID2
);
3739 flags
|= EXT4_GET_BLOCKS_PRE_IO
;
3740 return ext4_split_extent(handle
, inode
, path
, map
, split_flag
, flags
,
3744 static struct ext4_ext_path
*
3745 ext4_convert_unwritten_extents_endio(handle_t
*handle
, struct inode
*inode
,
3746 struct ext4_map_blocks
*map
,
3747 struct ext4_ext_path
*path
)
3749 struct ext4_extent
*ex
;
3750 ext4_lblk_t ee_block
;
3751 unsigned int ee_len
;
3755 depth
= ext_depth(inode
);
3756 ex
= path
[depth
].p_ext
;
3757 ee_block
= le32_to_cpu(ex
->ee_block
);
3758 ee_len
= ext4_ext_get_actual_len(ex
);
3760 ext_debug(inode
, "logical block %llu, max_blocks %u\n",
3761 (unsigned long long)ee_block
, ee_len
);
3763 /* If extent is larger than requested it is a clear sign that we still
3764 * have some extent state machine issues left. So extent_split is still
3766 * TODO: Once all related issues will be fixed this situation should be
3769 if (ee_block
!= map
->m_lblk
|| ee_len
> map
->m_len
) {
3770 #ifdef CONFIG_EXT4_DEBUG
3771 ext4_warning(inode
->i_sb
, "Inode (%ld) finished: extent logical block %llu,"
3772 " len %u; IO logical block %llu, len %u",
3773 inode
->i_ino
, (unsigned long long)ee_block
, ee_len
,
3774 (unsigned long long)map
->m_lblk
, map
->m_len
);
3776 path
= ext4_split_convert_extents(handle
, inode
, map
, path
,
3777 EXT4_GET_BLOCKS_CONVERT
, NULL
);
3781 path
= ext4_find_extent(inode
, map
->m_lblk
, path
, 0);
3784 depth
= ext_depth(inode
);
3785 ex
= path
[depth
].p_ext
;
3788 err
= ext4_ext_get_access(handle
, inode
, path
+ depth
);
3791 /* first mark the extent as initialized */
3792 ext4_ext_mark_initialized(ex
);
3794 /* note: ext4_ext_correct_indexes() isn't needed here because
3795 * borders are not changed
3797 ext4_ext_try_to_merge(handle
, inode
, path
, ex
);
3799 /* Mark modified extent as dirty */
3800 err
= ext4_ext_dirty(handle
, inode
, path
+ path
->p_depth
);
3804 ext4_ext_show_leaf(inode
, path
);
3808 ext4_free_ext_path(path
);
3809 return ERR_PTR(err
);
3812 static struct ext4_ext_path
*
3813 convert_initialized_extent(handle_t
*handle
, struct inode
*inode
,
3814 struct ext4_map_blocks
*map
,
3815 struct ext4_ext_path
*path
,
3816 unsigned int *allocated
)
3818 struct ext4_extent
*ex
;
3819 ext4_lblk_t ee_block
;
3820 unsigned int ee_len
;
3825 * Make sure that the extent is no bigger than we support with
3828 if (map
->m_len
> EXT_UNWRITTEN_MAX_LEN
)
3829 map
->m_len
= EXT_UNWRITTEN_MAX_LEN
/ 2;
3831 depth
= ext_depth(inode
);
3832 ex
= path
[depth
].p_ext
;
3833 ee_block
= le32_to_cpu(ex
->ee_block
);
3834 ee_len
= ext4_ext_get_actual_len(ex
);
3836 ext_debug(inode
, "logical block %llu, max_blocks %u\n",
3837 (unsigned long long)ee_block
, ee_len
);
3839 if (ee_block
!= map
->m_lblk
|| ee_len
> map
->m_len
) {
3840 path
= ext4_split_convert_extents(handle
, inode
, map
, path
,
3841 EXT4_GET_BLOCKS_CONVERT_UNWRITTEN
, NULL
);
3845 path
= ext4_find_extent(inode
, map
->m_lblk
, path
, 0);
3848 depth
= ext_depth(inode
);
3849 ex
= path
[depth
].p_ext
;
3851 EXT4_ERROR_INODE(inode
, "unexpected hole at %lu",
3852 (unsigned long) map
->m_lblk
);
3853 err
= -EFSCORRUPTED
;
3858 err
= ext4_ext_get_access(handle
, inode
, path
+ depth
);
3861 /* first mark the extent as unwritten */
3862 ext4_ext_mark_unwritten(ex
);
3864 /* note: ext4_ext_correct_indexes() isn't needed here because
3865 * borders are not changed
3867 ext4_ext_try_to_merge(handle
, inode
, path
, ex
);
3869 /* Mark modified extent as dirty */
3870 err
= ext4_ext_dirty(handle
, inode
, path
+ path
->p_depth
);
3873 ext4_ext_show_leaf(inode
, path
);
3875 ext4_update_inode_fsync_trans(handle
, inode
, 1);
3877 map
->m_flags
|= EXT4_MAP_UNWRITTEN
;
3878 if (*allocated
> map
->m_len
)
3879 *allocated
= map
->m_len
;
3880 map
->m_len
= *allocated
;
3884 ext4_free_ext_path(path
);
3885 return ERR_PTR(err
);
3888 static struct ext4_ext_path
*
3889 ext4_ext_handle_unwritten_extents(handle_t
*handle
, struct inode
*inode
,
3890 struct ext4_map_blocks
*map
,
3891 struct ext4_ext_path
*path
, int flags
,
3892 unsigned int *allocated
, ext4_fsblk_t newblock
)
3896 ext_debug(inode
, "logical block %llu, max_blocks %u, flags 0x%x, allocated %u\n",
3897 (unsigned long long)map
->m_lblk
, map
->m_len
, flags
,
3899 ext4_ext_show_leaf(inode
, path
);
3902 * When writing into unwritten space, we should not fail to
3903 * allocate metadata blocks for the new extent block if needed.
3905 flags
|= EXT4_GET_BLOCKS_METADATA_NOFAIL
;
3907 trace_ext4_ext_handle_unwritten_extents(inode
, map
, flags
,
3908 *allocated
, newblock
);
3910 /* get_block() before submitting IO, split the extent */
3911 if (flags
& EXT4_GET_BLOCKS_PRE_IO
) {
3912 path
= ext4_split_convert_extents(handle
, inode
, map
, path
,
3913 flags
| EXT4_GET_BLOCKS_CONVERT
, allocated
);
3917 * shouldn't get a 0 allocated when splitting an extent unless
3918 * m_len is 0 (bug) or extent has been corrupted
3920 if (unlikely(*allocated
== 0)) {
3921 EXT4_ERROR_INODE(inode
,
3922 "unexpected allocated == 0, m_len = %u",
3924 err
= -EFSCORRUPTED
;
3927 map
->m_flags
|= EXT4_MAP_UNWRITTEN
;
3930 /* IO end_io complete, convert the filled extent to written */
3931 if (flags
& EXT4_GET_BLOCKS_CONVERT
) {
3932 path
= ext4_convert_unwritten_extents_endio(handle
, inode
,
3936 ext4_update_inode_fsync_trans(handle
, inode
, 1);
3939 /* buffered IO cases */
3941 * repeat fallocate creation request
3942 * we already have an unwritten extent
3944 if (flags
& EXT4_GET_BLOCKS_UNWRIT_EXT
) {
3945 map
->m_flags
|= EXT4_MAP_UNWRITTEN
;
3949 /* buffered READ or buffered write_begin() lookup */
3950 if ((flags
& EXT4_GET_BLOCKS_CREATE
) == 0) {
3952 * We have blocks reserved already. We
3953 * return allocated blocks so that delalloc
3954 * won't do block reservation for us. But
3955 * the buffer head will be unmapped so that
3956 * a read from the block returns 0s.
3958 map
->m_flags
|= EXT4_MAP_UNWRITTEN
;
3963 * Default case when (flags & EXT4_GET_BLOCKS_CREATE) == 1.
3964 * For buffered writes, at writepage time, etc. Convert a
3965 * discovered unwritten extent to written.
3967 path
= ext4_ext_convert_to_initialized(handle
, inode
, map
, path
,
3971 ext4_update_inode_fsync_trans(handle
, inode
, 1);
3973 * shouldn't get a 0 allocated when converting an unwritten extent
3974 * unless m_len is 0 (bug) or extent has been corrupted
3976 if (unlikely(*allocated
== 0)) {
3977 EXT4_ERROR_INODE(inode
, "unexpected allocated == 0, m_len = %u",
3979 err
= -EFSCORRUPTED
;
3984 map
->m_flags
|= EXT4_MAP_NEW
;
3986 map
->m_flags
|= EXT4_MAP_MAPPED
;
3988 map
->m_pblk
= newblock
;
3989 if (*allocated
> map
->m_len
)
3990 *allocated
= map
->m_len
;
3991 map
->m_len
= *allocated
;
3992 ext4_ext_show_leaf(inode
, path
);
3996 ext4_free_ext_path(path
);
3997 return ERR_PTR(err
);
4001 * get_implied_cluster_alloc - check to see if the requested
4002 * allocation (in the map structure) overlaps with a cluster already
4003 * allocated in an extent.
4004 * @sb The filesystem superblock structure
4005 * @map The requested lblk->pblk mapping
4006 * @ex The extent structure which might contain an implied
4007 * cluster allocation
4009 * This function is called by ext4_ext_map_blocks() after we failed to
4010 * find blocks that were already in the inode's extent tree. Hence,
4011 * we know that the beginning of the requested region cannot overlap
4012 * the extent from the inode's extent tree. There are three cases we
4013 * want to catch. The first is this case:
4015 * |--- cluster # N--|
4016 * |--- extent ---| |---- requested region ---|
4019 * The second case that we need to test for is this one:
4021 * |--------- cluster # N ----------------|
4022 * |--- requested region --| |------- extent ----|
4023 * |=======================|
4025 * The third case is when the requested region lies between two extents
4026 * within the same cluster:
4027 * |------------- cluster # N-------------|
4028 * |----- ex -----| |---- ex_right ----|
4029 * |------ requested region ------|
4030 * |================|
4032 * In each of the above cases, we need to set the map->m_pblk and
4033 * map->m_len so that they correspond to the extent labelled as
4034 * "|====|" from cluster #N, since it is already in use for data in
4035 * cluster EXT4_B2C(sbi, map->m_lblk). We will then return 1 to
4036 * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
4037 * as a new "allocated" block region. Otherwise, we will return 0 and
4038 * ext4_ext_map_blocks() will then allocate one or more new clusters
4039 * by calling ext4_mb_new_blocks().
4041 static int get_implied_cluster_alloc(struct super_block
*sb
,
4042 struct ext4_map_blocks
*map
,
4043 struct ext4_extent
*ex
,
4044 struct ext4_ext_path
*path
)
4046 struct ext4_sb_info
*sbi
= EXT4_SB(sb
);
4047 ext4_lblk_t c_offset
= EXT4_LBLK_COFF(sbi
, map
->m_lblk
);
4048 ext4_lblk_t ex_cluster_start
, ex_cluster_end
;
4049 ext4_lblk_t rr_cluster_start
;
4050 ext4_lblk_t ee_block
= le32_to_cpu(ex
->ee_block
);
4051 ext4_fsblk_t ee_start
= ext4_ext_pblock(ex
);
4052 unsigned short ee_len
= ext4_ext_get_actual_len(ex
);
4054 /* The extent passed in that we are trying to match */
4055 ex_cluster_start
= EXT4_B2C(sbi
, ee_block
);
4056 ex_cluster_end
= EXT4_B2C(sbi
, ee_block
+ ee_len
- 1);
4058 /* The requested region passed into ext4_map_blocks() */
4059 rr_cluster_start
= EXT4_B2C(sbi
, map
->m_lblk
);
4061 if ((rr_cluster_start
== ex_cluster_end
) ||
4062 (rr_cluster_start
== ex_cluster_start
)) {
4063 if (rr_cluster_start
== ex_cluster_end
)
4064 ee_start
+= ee_len
- 1;
4065 map
->m_pblk
= EXT4_PBLK_CMASK(sbi
, ee_start
) + c_offset
;
4066 map
->m_len
= min(map
->m_len
,
4067 (unsigned) sbi
->s_cluster_ratio
- c_offset
);
4069 * Check for and handle this case:
4071 * |--------- cluster # N-------------|
4072 * |------- extent ----|
4073 * |--- requested region ---|
4077 if (map
->m_lblk
< ee_block
)
4078 map
->m_len
= min(map
->m_len
, ee_block
- map
->m_lblk
);
4081 * Check for the case where there is already another allocated
4082 * block to the right of 'ex' but before the end of the cluster.
4084 * |------------- cluster # N-------------|
4085 * |----- ex -----| |---- ex_right ----|
4086 * |------ requested region ------|
4087 * |================|
4089 if (map
->m_lblk
> ee_block
) {
4090 ext4_lblk_t next
= ext4_ext_next_allocated_block(path
);
4091 map
->m_len
= min(map
->m_len
, next
- map
->m_lblk
);
4094 trace_ext4_get_implied_cluster_alloc_exit(sb
, map
, 1);
4098 trace_ext4_get_implied_cluster_alloc_exit(sb
, map
, 0);
4103 * Determine hole length around the given logical block, first try to
4104 * locate and expand the hole from the given @path, and then adjust it
4105 * if it's partially or completely converted to delayed extents, insert
4106 * it into the extent cache tree if it's indeed a hole, finally return
4107 * the length of the determined extent.
4109 static ext4_lblk_t
ext4_ext_determine_insert_hole(struct inode
*inode
,
4110 struct ext4_ext_path
*path
,
4113 ext4_lblk_t hole_start
, len
;
4114 struct extent_status es
;
4117 len
= ext4_ext_find_hole(inode
, path
, &hole_start
);
4119 ext4_es_find_extent_range(inode
, &ext4_es_is_delayed
, hole_start
,
4120 hole_start
+ len
- 1, &es
);
4125 * There's a delalloc extent in the hole, handle it if the delalloc
4126 * extent is in front of, behind, or straddles the queried range.
4128 if (lblk
>= es
.es_lblk
+ es
.es_len
) {
4130 * The delalloc extent is in front of the queried range,
4131 * find again from the queried start block.
4133 len
-= lblk
- hole_start
;
4136 } else if (in_range(lblk
, es
.es_lblk
, es
.es_len
)) {
4138 * The delalloc extent contains lblk; it must have been
4139 * added after ext4_map_blocks() checked the extent status
4140 * tree so we are not holding i_rwsem and delalloc info is
4141 * only stabilized by i_data_sem we are going to release
4142 * soon. Don't modify the extent status tree and report
4143 * extent as a hole, just adjust the length to the delalloc
4144 * extent's after lblk.
4146 len
= es
.es_lblk
+ es
.es_len
- lblk
;
4150 * The delalloc extent is partially or completely behind
4151 * the queried range, update hole length until the
4152 * beginning of the delalloc extent.
4154 len
= min(es
.es_lblk
- hole_start
, len
);
4158 /* Put just found gap into cache to speed up subsequent requests */
4159 ext_debug(inode
, " -> %u:%u\n", hole_start
, len
);
4160 ext4_es_insert_extent(inode
, hole_start
, len
, ~0,
4161 EXTENT_STATUS_HOLE
, false);
4163 /* Update hole_len to reflect hole size after lblk */
4164 if (hole_start
!= lblk
)
4165 len
-= lblk
- hole_start
;
4171 * Block allocation/map/preallocation routine for extents based files
4174 * Need to be called with
4175 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
4176 * (ie, flags is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
4178 * return > 0, number of blocks already mapped/allocated
4179 * if flags doesn't contain EXT4_GET_BLOCKS_CREATE and these are pre-allocated blocks
4180 * buffer head is unmapped
4181 * otherwise blocks are mapped
4183 * return = 0, if plain look up failed (blocks have not been allocated)
4184 * buffer head is unmapped
4186 * return < 0, error case.
4188 int ext4_ext_map_blocks(handle_t
*handle
, struct inode
*inode
,
4189 struct ext4_map_blocks
*map
, int flags
)
4191 struct ext4_ext_path
*path
= NULL
;
4192 struct ext4_extent newex
, *ex
, ex2
;
4193 struct ext4_sb_info
*sbi
= EXT4_SB(inode
->i_sb
);
4194 ext4_fsblk_t newblock
= 0, pblk
;
4196 unsigned int allocated
= 0, offset
= 0;
4197 unsigned int allocated_clusters
= 0;
4198 struct ext4_allocation_request ar
;
4199 ext4_lblk_t cluster_offset
;
4201 ext_debug(inode
, "blocks %u/%u requested\n", map
->m_lblk
, map
->m_len
);
4202 trace_ext4_ext_map_blocks_enter(inode
, map
->m_lblk
, map
->m_len
, flags
);
4204 /* find extent for this block */
4205 path
= ext4_find_extent(inode
, map
->m_lblk
, NULL
, 0);
4207 err
= PTR_ERR(path
);
4211 depth
= ext_depth(inode
);
4214 * consistent leaf must not be empty;
4215 * this situation is possible, though, _during_ tree modification;
4216 * this is why assert can't be put in ext4_find_extent()
4218 if (unlikely(path
[depth
].p_ext
== NULL
&& depth
!= 0)) {
4219 EXT4_ERROR_INODE(inode
, "bad extent address "
4220 "lblock: %lu, depth: %d pblock %lld",
4221 (unsigned long) map
->m_lblk
, depth
,
4222 path
[depth
].p_block
);
4223 err
= -EFSCORRUPTED
;
4227 ex
= path
[depth
].p_ext
;
4229 ext4_lblk_t ee_block
= le32_to_cpu(ex
->ee_block
);
4230 ext4_fsblk_t ee_start
= ext4_ext_pblock(ex
);
4231 unsigned short ee_len
;
4235 * unwritten extents are treated as holes, except that
4236 * we split out initialized portions during a write.
4238 ee_len
= ext4_ext_get_actual_len(ex
);
4240 trace_ext4_ext_show_extent(inode
, ee_block
, ee_start
, ee_len
);
4242 /* if found extent covers block, simply return it */
4243 if (in_range(map
->m_lblk
, ee_block
, ee_len
)) {
4244 newblock
= map
->m_lblk
- ee_block
+ ee_start
;
4245 /* number of remaining blocks in the extent */
4246 allocated
= ee_len
- (map
->m_lblk
- ee_block
);
4247 ext_debug(inode
, "%u fit into %u:%d -> %llu\n",
4248 map
->m_lblk
, ee_block
, ee_len
, newblock
);
4251 * If the extent is initialized check whether the
4252 * caller wants to convert it to unwritten.
4254 if ((!ext4_ext_is_unwritten(ex
)) &&
4255 (flags
& EXT4_GET_BLOCKS_CONVERT_UNWRITTEN
)) {
4256 path
= convert_initialized_extent(handle
,
4257 inode
, map
, path
, &allocated
);
4259 err
= PTR_ERR(path
);
4261 } else if (!ext4_ext_is_unwritten(ex
)) {
4262 map
->m_flags
|= EXT4_MAP_MAPPED
;
4263 map
->m_pblk
= newblock
;
4264 if (allocated
> map
->m_len
)
4265 allocated
= map
->m_len
;
4266 map
->m_len
= allocated
;
4267 ext4_ext_show_leaf(inode
, path
);
4271 path
= ext4_ext_handle_unwritten_extents(
4272 handle
, inode
, map
, path
, flags
,
4273 &allocated
, newblock
);
4275 err
= PTR_ERR(path
);
4281 * requested block isn't allocated yet;
4282 * we can't try to create a block if flags doesn't contain EXT4_GET_BLOCKS_CREATE
4284 if ((flags
& EXT4_GET_BLOCKS_CREATE
) == 0) {
4287 len
= ext4_ext_determine_insert_hole(inode
, path
, map
->m_lblk
);
4290 map
->m_len
= min_t(unsigned int, map
->m_len
, len
);
4295 * Okay, we need to do block allocation.
4297 newex
.ee_block
= cpu_to_le32(map
->m_lblk
);
4298 cluster_offset
= EXT4_LBLK_COFF(sbi
, map
->m_lblk
);
4301 * If we are doing bigalloc, check to see if the extent returned
4302 * by ext4_find_extent() implies a cluster we can use.
4304 if (cluster_offset
&& ex
&&
4305 get_implied_cluster_alloc(inode
->i_sb
, map
, ex
, path
)) {
4306 ar
.len
= allocated
= map
->m_len
;
4307 newblock
= map
->m_pblk
;
4308 goto got_allocated_blocks
;
4311 /* find neighbour allocated blocks */
4312 ar
.lleft
= map
->m_lblk
;
4313 err
= ext4_ext_search_left(inode
, path
, &ar
.lleft
, &ar
.pleft
);
4316 ar
.lright
= map
->m_lblk
;
4317 err
= ext4_ext_search_right(inode
, path
, &ar
.lright
, &ar
.pright
, &ex2
);
4321 /* Check if the extent after searching to the right implies a
4322 * cluster we can use. */
4323 if ((sbi
->s_cluster_ratio
> 1) && err
&&
4324 get_implied_cluster_alloc(inode
->i_sb
, map
, &ex2
, path
)) {
4325 ar
.len
= allocated
= map
->m_len
;
4326 newblock
= map
->m_pblk
;
4328 goto got_allocated_blocks
;
4332 * See if request is beyond maximum number of blocks we can have in
4333 * a single extent. For an initialized extent this limit is
4334 * EXT_INIT_MAX_LEN and for an unwritten extent this limit is
4335 * EXT_UNWRITTEN_MAX_LEN.
4337 if (map
->m_len
> EXT_INIT_MAX_LEN
&&
4338 !(flags
& EXT4_GET_BLOCKS_UNWRIT_EXT
))
4339 map
->m_len
= EXT_INIT_MAX_LEN
;
4340 else if (map
->m_len
> EXT_UNWRITTEN_MAX_LEN
&&
4341 (flags
& EXT4_GET_BLOCKS_UNWRIT_EXT
))
4342 map
->m_len
= EXT_UNWRITTEN_MAX_LEN
;
4344 /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
4345 newex
.ee_len
= cpu_to_le16(map
->m_len
);
4346 err
= ext4_ext_check_overlap(sbi
, inode
, &newex
, path
);
4348 allocated
= ext4_ext_get_actual_len(&newex
);
4350 allocated
= map
->m_len
;
4352 /* allocate new block */
4354 ar
.goal
= ext4_ext_find_goal(inode
, path
, map
->m_lblk
);
4355 ar
.logical
= map
->m_lblk
;
4357 * We calculate the offset from the beginning of the cluster
4358 * for the logical block number, since when we allocate a
4359 * physical cluster, the physical block should start at the
4360 * same offset from the beginning of the cluster. This is
4361 * needed so that future calls to get_implied_cluster_alloc()
4364 offset
= EXT4_LBLK_COFF(sbi
, map
->m_lblk
);
4365 ar
.len
= EXT4_NUM_B2C(sbi
, offset
+allocated
);
4367 ar
.logical
-= offset
;
4368 if (S_ISREG(inode
->i_mode
))
4369 ar
.flags
= EXT4_MB_HINT_DATA
;
4371 /* disable in-core preallocation for non-regular files */
4373 if (flags
& EXT4_GET_BLOCKS_NO_NORMALIZE
)
4374 ar
.flags
|= EXT4_MB_HINT_NOPREALLOC
;
4375 if (flags
& EXT4_GET_BLOCKS_DELALLOC_RESERVE
)
4376 ar
.flags
|= EXT4_MB_DELALLOC_RESERVED
;
4377 if (flags
& EXT4_GET_BLOCKS_METADATA_NOFAIL
)
4378 ar
.flags
|= EXT4_MB_USE_RESERVED
;
4379 newblock
= ext4_mb_new_blocks(handle
, &ar
, &err
);
4382 allocated_clusters
= ar
.len
;
4383 ar
.len
= EXT4_C2B(sbi
, ar
.len
) - offset
;
4384 ext_debug(inode
, "allocate new block: goal %llu, found %llu/%u, requested %u\n",
4385 ar
.goal
, newblock
, ar
.len
, allocated
);
4386 if (ar
.len
> allocated
)
4389 got_allocated_blocks
:
4390 /* try to insert new extent into found leaf and return */
4391 pblk
= newblock
+ offset
;
4392 ext4_ext_store_pblock(&newex
, pblk
);
4393 newex
.ee_len
= cpu_to_le16(ar
.len
);
4394 /* Mark unwritten */
4395 if (flags
& EXT4_GET_BLOCKS_UNWRIT_EXT
) {
4396 ext4_ext_mark_unwritten(&newex
);
4397 map
->m_flags
|= EXT4_MAP_UNWRITTEN
;
4400 path
= ext4_ext_insert_extent(handle
, inode
, path
, &newex
, flags
);
4402 err
= PTR_ERR(path
);
4403 if (allocated_clusters
) {
4407 * free data blocks we just allocated.
4408 * not a good idea to call discard here directly,
4409 * but otherwise we'd need to call it every free().
4411 ext4_discard_preallocations(inode
);
4412 if (flags
& EXT4_GET_BLOCKS_DELALLOC_RESERVE
)
4413 fb_flags
= EXT4_FREE_BLOCKS_NO_QUOT_UPDATE
;
4414 ext4_free_blocks(handle
, inode
, NULL
, newblock
,
4415 EXT4_C2B(sbi
, allocated_clusters
),
4422 * Cache the extent and update transaction to commit on fdatasync only
4423 * when it is _not_ an unwritten extent.
4425 if ((flags
& EXT4_GET_BLOCKS_UNWRIT_EXT
) == 0)
4426 ext4_update_inode_fsync_trans(handle
, inode
, 1);
4428 ext4_update_inode_fsync_trans(handle
, inode
, 0);
4430 map
->m_flags
|= (EXT4_MAP_NEW
| EXT4_MAP_MAPPED
);
4432 map
->m_len
= ar
.len
;
4433 allocated
= map
->m_len
;
4434 ext4_ext_show_leaf(inode
, path
);
4436 ext4_free_ext_path(path
);
4438 trace_ext4_ext_map_blocks_exit(inode
, flags
, map
,
4439 err
? err
: allocated
);
4440 return err
? err
: allocated
;
4443 int ext4_ext_truncate(handle_t
*handle
, struct inode
*inode
)
4445 struct super_block
*sb
= inode
->i_sb
;
4446 ext4_lblk_t last_block
;
4450 * TODO: optimization is possible here.
4451 * Probably we need not scan at all,
4452 * because page truncation is enough.
4455 /* we have to know where to truncate from in crash case */
4456 EXT4_I(inode
)->i_disksize
= inode
->i_size
;
4457 err
= ext4_mark_inode_dirty(handle
, inode
);
4461 last_block
= (inode
->i_size
+ sb
->s_blocksize
- 1)
4462 >> EXT4_BLOCK_SIZE_BITS(sb
);
4463 ext4_es_remove_extent(inode
, last_block
, EXT_MAX_BLOCKS
- last_block
);
4466 err
= ext4_ext_remove_space(inode
, last_block
, EXT_MAX_BLOCKS
- 1);
4467 if (err
== -ENOMEM
) {
4468 memalloc_retry_wait(GFP_ATOMIC
);
4469 goto retry_remove_space
;
4474 static int ext4_alloc_file_blocks(struct file
*file
, ext4_lblk_t offset
,
4475 ext4_lblk_t len
, loff_t new_size
,
4478 struct inode
*inode
= file_inode(file
);
4480 int ret
= 0, ret2
= 0, ret3
= 0;
4483 struct ext4_map_blocks map
;
4484 unsigned int credits
;
4485 loff_t epos
, old_size
= i_size_read(inode
);
4487 BUG_ON(!ext4_test_inode_flag(inode
, EXT4_INODE_EXTENTS
));
4488 map
.m_lblk
= offset
;
4491 * Don't normalize the request if it can fit in one extent so
4492 * that it doesn't get unnecessarily split into multiple
4495 if (len
<= EXT_UNWRITTEN_MAX_LEN
)
4496 flags
|= EXT4_GET_BLOCKS_NO_NORMALIZE
;
4499 * credits to insert 1 extent into extent tree
4501 credits
= ext4_chunk_trans_blocks(inode
, len
);
4502 depth
= ext_depth(inode
);
4507 * Recalculate credits when extent tree depth changes.
4509 if (depth
!= ext_depth(inode
)) {
4510 credits
= ext4_chunk_trans_blocks(inode
, len
);
4511 depth
= ext_depth(inode
);
4514 handle
= ext4_journal_start(inode
, EXT4_HT_MAP_BLOCKS
,
4516 if (IS_ERR(handle
)) {
4517 ret
= PTR_ERR(handle
);
4520 ret
= ext4_map_blocks(handle
, inode
, &map
, flags
);
4522 ext4_debug("inode #%lu: block %u: len %u: "
4523 "ext4_ext_map_blocks returned %d",
4524 inode
->i_ino
, map
.m_lblk
,
4526 ext4_mark_inode_dirty(handle
, inode
);
4527 ext4_journal_stop(handle
);
4531 * allow a full retry cycle for any remaining allocations
4535 map
.m_len
= len
= len
- ret
;
4536 epos
= (loff_t
)map
.m_lblk
<< inode
->i_blkbits
;
4537 inode_set_ctime_current(inode
);
4539 if (epos
> new_size
)
4541 if (ext4_update_inode_size(inode
, epos
) & 0x1)
4542 inode_set_mtime_to_ts(inode
,
4543 inode_get_ctime(inode
));
4544 if (epos
> old_size
) {
4545 pagecache_isize_extended(inode
, old_size
, epos
);
4546 ext4_zero_partial_blocks(handle
, inode
,
4547 old_size
, epos
- old_size
);
4550 ret2
= ext4_mark_inode_dirty(handle
, inode
);
4551 ext4_update_inode_fsync_trans(handle
, inode
, 1);
4552 ret3
= ext4_journal_stop(handle
);
4553 ret2
= ret3
? ret3
: ret2
;
4557 if (ret
== -ENOSPC
&& ext4_should_retry_alloc(inode
->i_sb
, &retries
))
4560 return ret
> 0 ? ret2
: ret
;
4563 static int ext4_collapse_range(struct file
*file
, loff_t offset
, loff_t len
);
4565 static int ext4_insert_range(struct file
*file
, loff_t offset
, loff_t len
);
4567 static long ext4_zero_range(struct file
*file
, loff_t offset
,
4568 loff_t len
, int mode
)
4570 struct inode
*inode
= file_inode(file
);
4571 struct address_space
*mapping
= file
->f_mapping
;
4572 handle_t
*handle
= NULL
;
4573 unsigned int max_blocks
;
4574 loff_t new_size
= 0;
4578 int partial_begin
, partial_end
;
4581 unsigned int blkbits
= inode
->i_blkbits
;
4583 trace_ext4_zero_range(inode
, offset
, len
, mode
);
4586 * Round up offset. This is not fallocate, we need to zero out
4587 * blocks, so convert interior block aligned part of the range to
4588 * unwritten and possibly manually zero out unaligned parts of the
4589 * range. Here, start and partial_begin are inclusive, end and
4590 * partial_end are exclusive.
4592 start
= round_up(offset
, 1 << blkbits
);
4593 end
= round_down((offset
+ len
), 1 << blkbits
);
4595 if (start
< offset
|| end
> offset
+ len
)
4597 partial_begin
= offset
& ((1 << blkbits
) - 1);
4598 partial_end
= (offset
+ len
) & ((1 << blkbits
) - 1);
4600 lblk
= start
>> blkbits
;
4601 max_blocks
= (end
>> blkbits
);
4602 if (max_blocks
< lblk
)
4610 * Indirect files do not support unwritten extents
4612 if (!(ext4_test_inode_flag(inode
, EXT4_INODE_EXTENTS
))) {
4617 if (!(mode
& FALLOC_FL_KEEP_SIZE
) &&
4618 (offset
+ len
> inode
->i_size
||
4619 offset
+ len
> EXT4_I(inode
)->i_disksize
)) {
4620 new_size
= offset
+ len
;
4621 ret
= inode_newsize_ok(inode
, new_size
);
4626 flags
= EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT
;
4628 /* Wait for all existing dio workers, newcomers will block on i_rwsem */
4629 inode_dio_wait(inode
);
4631 ret
= file_modified(file
);
4635 /* Preallocate the range including the unaligned edges */
4636 if (partial_begin
|| partial_end
) {
4637 ret
= ext4_alloc_file_blocks(file
,
4638 round_down(offset
, 1 << blkbits
) >> blkbits
,
4639 (round_up((offset
+ len
), 1 << blkbits
) -
4640 round_down(offset
, 1 << blkbits
)) >> blkbits
,
4647 /* Zero range excluding the unaligned edges */
4648 if (max_blocks
> 0) {
4649 flags
|= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN
|
4653 * Prevent page faults from reinstantiating pages we have
4654 * released from page cache.
4656 filemap_invalidate_lock(mapping
);
4658 ret
= ext4_break_layouts(inode
);
4660 filemap_invalidate_unlock(mapping
);
4664 ret
= ext4_update_disksize_before_punch(inode
, offset
, len
);
4666 filemap_invalidate_unlock(mapping
);
4671 * For journalled data we need to write (and checkpoint) pages
4672 * before discarding page cache to avoid inconsistent data on
4673 * disk in case of crash before zeroing trans is committed.
4675 if (ext4_should_journal_data(inode
)) {
4676 ret
= filemap_write_and_wait_range(mapping
, start
,
4679 filemap_invalidate_unlock(mapping
);
4684 /* Now release the pages and zero block aligned part of pages */
4685 truncate_pagecache_range(inode
, start
, end
- 1);
4686 inode_set_mtime_to_ts(inode
, inode_set_ctime_current(inode
));
4688 ret
= ext4_alloc_file_blocks(file
, lblk
, max_blocks
, new_size
,
4690 filemap_invalidate_unlock(mapping
);
4694 if (!partial_begin
&& !partial_end
)
4698 * In worst case we have to writeout two nonadjacent unwritten
4699 * blocks and update the inode
4701 credits
= (2 * ext4_ext_index_trans_blocks(inode
, 2)) + 1;
4702 if (ext4_should_journal_data(inode
))
4704 handle
= ext4_journal_start(inode
, EXT4_HT_MISC
, credits
);
4705 if (IS_ERR(handle
)) {
4706 ret
= PTR_ERR(handle
);
4707 ext4_std_error(inode
->i_sb
, ret
);
4711 inode_set_mtime_to_ts(inode
, inode_set_ctime_current(inode
));
4713 ext4_update_inode_size(inode
, new_size
);
4714 ret
= ext4_mark_inode_dirty(handle
, inode
);
4717 /* Zero out partial block at the edges of the range */
4718 ret
= ext4_zero_partial_blocks(handle
, inode
, offset
, len
);
4720 ext4_update_inode_fsync_trans(handle
, inode
, 1);
4722 if (file
->f_flags
& O_SYNC
)
4723 ext4_handle_sync(handle
);
4726 ext4_journal_stop(handle
);
4728 inode_unlock(inode
);
4733 * preallocate space for a file. This implements ext4's fallocate file
4734 * operation, which gets called from sys_fallocate system call.
4735 * For block-mapped files, posix_fallocate should fall back to the method
4736 * of writing zeroes to the required new blocks (the same behavior which is
4737 * expected for file systems which do not support fallocate() system call).
4739 long ext4_fallocate(struct file
*file
, int mode
, loff_t offset
, loff_t len
)
4741 struct inode
*inode
= file_inode(file
);
4742 loff_t new_size
= 0;
4743 unsigned int max_blocks
;
4747 unsigned int blkbits
= inode
->i_blkbits
;
4750 * Encrypted inodes can't handle collapse range or insert
4751 * range since we would need to re-encrypt blocks with a
4752 * different IV or XTS tweak (which are based on the logical
4755 if (IS_ENCRYPTED(inode
) &&
4756 (mode
& (FALLOC_FL_COLLAPSE_RANGE
| FALLOC_FL_INSERT_RANGE
)))
4759 /* Return error if mode is not supported */
4760 if (mode
& ~(FALLOC_FL_KEEP_SIZE
| FALLOC_FL_PUNCH_HOLE
|
4761 FALLOC_FL_COLLAPSE_RANGE
| FALLOC_FL_ZERO_RANGE
|
4762 FALLOC_FL_INSERT_RANGE
))
4766 ret
= ext4_convert_inline_data(inode
);
4767 inode_unlock(inode
);
4771 if (mode
& FALLOC_FL_PUNCH_HOLE
) {
4772 ret
= ext4_punch_hole(file
, offset
, len
);
4776 if (mode
& FALLOC_FL_COLLAPSE_RANGE
) {
4777 ret
= ext4_collapse_range(file
, offset
, len
);
4781 if (mode
& FALLOC_FL_INSERT_RANGE
) {
4782 ret
= ext4_insert_range(file
, offset
, len
);
4786 if (mode
& FALLOC_FL_ZERO_RANGE
) {
4787 ret
= ext4_zero_range(file
, offset
, len
, mode
);
4790 trace_ext4_fallocate_enter(inode
, offset
, len
, mode
);
4791 lblk
= offset
>> blkbits
;
4793 max_blocks
= EXT4_MAX_BLOCKS(len
, offset
, blkbits
);
4794 flags
= EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT
;
4799 * We support preallocation for extent-based files only
4801 if (!(ext4_test_inode_flag(inode
, EXT4_INODE_EXTENTS
))) {
4806 if (!(mode
& FALLOC_FL_KEEP_SIZE
) &&
4807 (offset
+ len
> inode
->i_size
||
4808 offset
+ len
> EXT4_I(inode
)->i_disksize
)) {
4809 new_size
= offset
+ len
;
4810 ret
= inode_newsize_ok(inode
, new_size
);
4815 /* Wait all existing dio workers, newcomers will block on i_rwsem */
4816 inode_dio_wait(inode
);
4818 ret
= file_modified(file
);
4822 ret
= ext4_alloc_file_blocks(file
, lblk
, max_blocks
, new_size
, flags
);
4826 if (file
->f_flags
& O_SYNC
&& EXT4_SB(inode
->i_sb
)->s_journal
) {
4827 ret
= ext4_fc_commit(EXT4_SB(inode
->i_sb
)->s_journal
,
4828 EXT4_I(inode
)->i_sync_tid
);
4831 inode_unlock(inode
);
4832 trace_ext4_fallocate_exit(inode
, offset
, max_blocks
, ret
);
4838 * This function converts a range of blocks to written extents
4839 * The caller of this function will pass the start offset and the size.
4840 * all unwritten extents within this range will be converted to
4843 * This function is called from the direct IO end io call back
4844 * function, to convert the fallocated extents after IO is completed.
4845 * Returns 0 on success.
4847 int ext4_convert_unwritten_extents(handle_t
*handle
, struct inode
*inode
,
4848 loff_t offset
, ssize_t len
)
4850 unsigned int max_blocks
;
4851 int ret
= 0, ret2
= 0, ret3
= 0;
4852 struct ext4_map_blocks map
;
4853 unsigned int blkbits
= inode
->i_blkbits
;
4854 unsigned int credits
= 0;
4856 map
.m_lblk
= offset
>> blkbits
;
4857 max_blocks
= EXT4_MAX_BLOCKS(len
, offset
, blkbits
);
4861 * credits to insert 1 extent into extent tree
4863 credits
= ext4_chunk_trans_blocks(inode
, max_blocks
);
4865 while (ret
>= 0 && ret
< max_blocks
) {
4867 map
.m_len
= (max_blocks
-= ret
);
4869 handle
= ext4_journal_start(inode
, EXT4_HT_MAP_BLOCKS
,
4871 if (IS_ERR(handle
)) {
4872 ret
= PTR_ERR(handle
);
4876 ret
= ext4_map_blocks(handle
, inode
, &map
,
4877 EXT4_GET_BLOCKS_IO_CONVERT_EXT
);
4879 ext4_warning(inode
->i_sb
,
4880 "inode #%lu: block %u: len %u: "
4881 "ext4_ext_map_blocks returned %d",
4882 inode
->i_ino
, map
.m_lblk
,
4884 ret2
= ext4_mark_inode_dirty(handle
, inode
);
4886 ret3
= ext4_journal_stop(handle
);
4891 if (ret
<= 0 || ret2
)
4894 return ret
> 0 ? ret2
: ret
;
4897 int ext4_convert_unwritten_io_end_vec(handle_t
*handle
, ext4_io_end_t
*io_end
)
4899 int ret
= 0, err
= 0;
4900 struct ext4_io_end_vec
*io_end_vec
;
4903 * This is somewhat ugly but the idea is clear: When transaction is
4904 * reserved, everything goes into it. Otherwise we rather start several
4905 * smaller transactions for conversion of each extent separately.
4908 handle
= ext4_journal_start_reserved(handle
,
4909 EXT4_HT_EXT_CONVERT
);
4911 return PTR_ERR(handle
);
4914 list_for_each_entry(io_end_vec
, &io_end
->list_vec
, list
) {
4915 ret
= ext4_convert_unwritten_extents(handle
, io_end
->inode
,
4923 err
= ext4_journal_stop(handle
);
4925 return ret
< 0 ? ret
: err
;
4928 static int ext4_iomap_xattr_fiemap(struct inode
*inode
, struct iomap
*iomap
)
4932 int blockbits
= inode
->i_sb
->s_blocksize_bits
;
4937 if (ext4_test_inode_state(inode
, EXT4_STATE_XATTR
)) {
4938 struct ext4_iloc iloc
;
4939 int offset
; /* offset of xattr in inode */
4941 error
= ext4_get_inode_loc(inode
, &iloc
);
4944 physical
= (__u64
)iloc
.bh
->b_blocknr
<< blockbits
;
4945 offset
= EXT4_GOOD_OLD_INODE_SIZE
+
4946 EXT4_I(inode
)->i_extra_isize
;
4948 length
= EXT4_SB(inode
->i_sb
)->s_inode_size
- offset
;
4950 iomap_type
= IOMAP_INLINE
;
4951 } else if (EXT4_I(inode
)->i_file_acl
) { /* external block */
4952 physical
= (__u64
)EXT4_I(inode
)->i_file_acl
<< blockbits
;
4953 length
= inode
->i_sb
->s_blocksize
;
4954 iomap_type
= IOMAP_MAPPED
;
4956 /* no in-inode or external block for xattr, so return -ENOENT */
4961 iomap
->addr
= physical
;
4963 iomap
->length
= length
;
4964 iomap
->type
= iomap_type
;
4970 static int ext4_iomap_xattr_begin(struct inode
*inode
, loff_t offset
,
4971 loff_t length
, unsigned flags
,
4972 struct iomap
*iomap
, struct iomap
*srcmap
)
4976 error
= ext4_iomap_xattr_fiemap(inode
, iomap
);
4977 if (error
== 0 && (offset
>= iomap
->length
))
4982 static const struct iomap_ops ext4_iomap_xattr_ops
= {
4983 .iomap_begin
= ext4_iomap_xattr_begin
,
4986 static int ext4_fiemap_check_ranges(struct inode
*inode
, u64 start
, u64
*len
)
4990 if (ext4_test_inode_flag(inode
, EXT4_INODE_EXTENTS
))
4991 maxbytes
= inode
->i_sb
->s_maxbytes
;
4993 maxbytes
= EXT4_SB(inode
->i_sb
)->s_bitmap_maxbytes
;
4997 if (start
> maxbytes
)
5001 * Shrink request scope to what the fs can actually handle.
5003 if (*len
> maxbytes
|| (maxbytes
- *len
) < start
)
5004 *len
= maxbytes
- start
;
5008 int ext4_fiemap(struct inode
*inode
, struct fiemap_extent_info
*fieinfo
,
5013 if (fieinfo
->fi_flags
& FIEMAP_FLAG_CACHE
) {
5014 error
= ext4_ext_precache(inode
);
5017 fieinfo
->fi_flags
&= ~FIEMAP_FLAG_CACHE
;
5021 * For bitmap files the maximum size limit could be smaller than
5022 * s_maxbytes, so check len here manually instead of just relying on the
5025 error
= ext4_fiemap_check_ranges(inode
, start
, &len
);
5029 if (fieinfo
->fi_flags
& FIEMAP_FLAG_XATTR
) {
5030 fieinfo
->fi_flags
&= ~FIEMAP_FLAG_XATTR
;
5031 return iomap_fiemap(inode
, fieinfo
, start
, len
,
5032 &ext4_iomap_xattr_ops
);
5035 return iomap_fiemap(inode
, fieinfo
, start
, len
, &ext4_iomap_report_ops
);
5038 int ext4_get_es_cache(struct inode
*inode
, struct fiemap_extent_info
*fieinfo
,
5039 __u64 start
, __u64 len
)
5041 ext4_lblk_t start_blk
, len_blks
;
5045 if (ext4_has_inline_data(inode
)) {
5048 down_read(&EXT4_I(inode
)->xattr_sem
);
5049 has_inline
= ext4_has_inline_data(inode
);
5050 up_read(&EXT4_I(inode
)->xattr_sem
);
5055 if (fieinfo
->fi_flags
& FIEMAP_FLAG_CACHE
) {
5056 error
= ext4_ext_precache(inode
);
5059 fieinfo
->fi_flags
&= ~FIEMAP_FLAG_CACHE
;
5062 error
= fiemap_prep(inode
, fieinfo
, start
, &len
, 0);
5066 error
= ext4_fiemap_check_ranges(inode
, start
, &len
);
5070 start_blk
= start
>> inode
->i_sb
->s_blocksize_bits
;
5071 last_blk
= (start
+ len
- 1) >> inode
->i_sb
->s_blocksize_bits
;
5072 if (last_blk
>= EXT_MAX_BLOCKS
)
5073 last_blk
= EXT_MAX_BLOCKS
-1;
5074 len_blks
= ((ext4_lblk_t
) last_blk
) - start_blk
+ 1;
5077 * Walk the extent tree gathering extent information
5078 * and pushing extents back to the user.
5080 return ext4_fill_es_cache_info(inode
, start_blk
, len_blks
, fieinfo
);
5084 * ext4_ext_shift_path_extents:
5085 * Shift the extents of a path structure lying between path[depth].p_ext
5086 * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells
5087 * if it is right shift or left shift operation.
5090 ext4_ext_shift_path_extents(struct ext4_ext_path
*path
, ext4_lblk_t shift
,
5091 struct inode
*inode
, handle_t
*handle
,
5092 enum SHIFT_DIRECTION SHIFT
)
5095 struct ext4_extent
*ex_start
, *ex_last
;
5096 bool update
= false;
5097 int credits
, restart_credits
;
5098 depth
= path
->p_depth
;
5100 while (depth
>= 0) {
5101 if (depth
== path
->p_depth
) {
5102 ex_start
= path
[depth
].p_ext
;
5104 return -EFSCORRUPTED
;
5106 ex_last
= EXT_LAST_EXTENT(path
[depth
].p_hdr
);
5107 /* leaf + sb + inode */
5109 if (ex_start
== EXT_FIRST_EXTENT(path
[depth
].p_hdr
)) {
5111 /* extent tree + sb + inode */
5112 credits
= depth
+ 2;
5115 restart_credits
= ext4_writepage_trans_blocks(inode
);
5116 err
= ext4_datasem_ensure_credits(handle
, inode
, credits
,
5117 restart_credits
, 0);
5124 err
= ext4_ext_get_access(handle
, inode
, path
+ depth
);
5128 while (ex_start
<= ex_last
) {
5129 if (SHIFT
== SHIFT_LEFT
) {
5130 le32_add_cpu(&ex_start
->ee_block
,
5132 /* Try to merge to the left. */
5134 EXT_FIRST_EXTENT(path
[depth
].p_hdr
))
5136 ext4_ext_try_to_merge_right(inode
,
5137 path
, ex_start
- 1))
5142 le32_add_cpu(&ex_last
->ee_block
, shift
);
5143 ext4_ext_try_to_merge_right(inode
, path
,
5148 err
= ext4_ext_dirty(handle
, inode
, path
+ depth
);
5152 if (--depth
< 0 || !update
)
5156 /* Update index too */
5157 err
= ext4_ext_get_access(handle
, inode
, path
+ depth
);
5161 if (SHIFT
== SHIFT_LEFT
)
5162 le32_add_cpu(&path
[depth
].p_idx
->ei_block
, -shift
);
5164 le32_add_cpu(&path
[depth
].p_idx
->ei_block
, shift
);
5165 err
= ext4_ext_dirty(handle
, inode
, path
+ depth
);
5169 /* we are done if current index is not a starting index */
5170 if (path
[depth
].p_idx
!= EXT_FIRST_INDEX(path
[depth
].p_hdr
))
5181 * ext4_ext_shift_extents:
5182 * All the extents which lie in the range from @start to the last allocated
5183 * block for the @inode are shifted either towards left or right (depending
5184 * upon @SHIFT) by @shift blocks.
5185 * On success, 0 is returned, error otherwise.
5188 ext4_ext_shift_extents(struct inode
*inode
, handle_t
*handle
,
5189 ext4_lblk_t start
, ext4_lblk_t shift
,
5190 enum SHIFT_DIRECTION SHIFT
)
5192 struct ext4_ext_path
*path
;
5194 struct ext4_extent
*extent
;
5195 ext4_lblk_t stop
, *iterator
, ex_start
, ex_end
;
5196 ext4_lblk_t tmp
= EXT_MAX_BLOCKS
;
5198 /* Let path point to the last extent */
5199 path
= ext4_find_extent(inode
, EXT_MAX_BLOCKS
- 1, NULL
,
5202 return PTR_ERR(path
);
5204 depth
= path
->p_depth
;
5205 extent
= path
[depth
].p_ext
;
5209 stop
= le32_to_cpu(extent
->ee_block
);
5212 * For left shifts, make sure the hole on the left is big enough to
5213 * accommodate the shift. For right shifts, make sure the last extent
5214 * won't be shifted beyond EXT_MAX_BLOCKS.
5216 if (SHIFT
== SHIFT_LEFT
) {
5217 path
= ext4_find_extent(inode
, start
- 1, path
,
5220 return PTR_ERR(path
);
5221 depth
= path
->p_depth
;
5222 extent
= path
[depth
].p_ext
;
5224 ex_start
= le32_to_cpu(extent
->ee_block
);
5225 ex_end
= le32_to_cpu(extent
->ee_block
) +
5226 ext4_ext_get_actual_len(extent
);
5232 if ((start
== ex_start
&& shift
> ex_start
) ||
5233 (shift
> start
- ex_end
)) {
5238 if (shift
> EXT_MAX_BLOCKS
-
5239 (stop
+ ext4_ext_get_actual_len(extent
))) {
5246 * In case of left shift, iterator points to start and it is increased
5247 * till we reach stop. In case of right shift, iterator points to stop
5248 * and it is decreased till we reach start.
5252 if (SHIFT
== SHIFT_LEFT
)
5257 if (tmp
!= EXT_MAX_BLOCKS
)
5261 * It's safe to start updating extents. Start and stop are unsigned, so
5262 * in case of right shift if extent with 0 block is reached, iterator
5263 * becomes NULL to indicate the end of the loop.
5265 while (iterator
&& start
<= stop
) {
5266 path
= ext4_find_extent(inode
, *iterator
, path
,
5269 return PTR_ERR(path
);
5270 depth
= path
->p_depth
;
5271 extent
= path
[depth
].p_ext
;
5273 EXT4_ERROR_INODE(inode
, "unexpected hole at %lu",
5274 (unsigned long) *iterator
);
5275 return -EFSCORRUPTED
;
5277 if (SHIFT
== SHIFT_LEFT
&& *iterator
>
5278 le32_to_cpu(extent
->ee_block
)) {
5279 /* Hole, move to the next extent */
5280 if (extent
< EXT_LAST_EXTENT(path
[depth
].p_hdr
)) {
5281 path
[depth
].p_ext
++;
5283 *iterator
= ext4_ext_next_allocated_block(path
);
5289 if (SHIFT
== SHIFT_LEFT
) {
5290 extent
= EXT_LAST_EXTENT(path
[depth
].p_hdr
);
5291 *iterator
= le32_to_cpu(extent
->ee_block
) +
5292 ext4_ext_get_actual_len(extent
);
5294 extent
= EXT_FIRST_EXTENT(path
[depth
].p_hdr
);
5295 if (le32_to_cpu(extent
->ee_block
) > start
)
5296 *iterator
= le32_to_cpu(extent
->ee_block
) - 1;
5297 else if (le32_to_cpu(extent
->ee_block
) == start
)
5300 extent
= EXT_LAST_EXTENT(path
[depth
].p_hdr
);
5301 while (le32_to_cpu(extent
->ee_block
) >= start
)
5304 if (extent
== EXT_LAST_EXTENT(path
[depth
].p_hdr
))
5310 path
[depth
].p_ext
= extent
;
5312 ret
= ext4_ext_shift_path_extents(path
, shift
, inode
,
5314 /* iterator can be NULL which means we should break */
5321 ext4_free_ext_path(path
);
5326 * ext4_collapse_range:
5327 * This implements the fallocate's collapse range functionality for ext4
5328 * Returns: 0 on success and non-zero on error.
5330 static int ext4_collapse_range(struct file
*file
, loff_t offset
, loff_t len
)
5332 struct inode
*inode
= file_inode(file
);
5333 struct super_block
*sb
= inode
->i_sb
;
5334 struct address_space
*mapping
= inode
->i_mapping
;
5335 ext4_lblk_t punch_start
, punch_stop
;
5337 unsigned int credits
;
5338 loff_t new_size
, ioffset
;
5342 * We need to test this early because xfstests assumes that a
5343 * collapse range of (0, 1) will return EOPNOTSUPP if the file
5344 * system does not support collapse range.
5346 if (!ext4_test_inode_flag(inode
, EXT4_INODE_EXTENTS
))
5349 /* Collapse range works only on fs cluster size aligned regions. */
5350 if (!IS_ALIGNED(offset
| len
, EXT4_CLUSTER_SIZE(sb
)))
5353 trace_ext4_collapse_range(inode
, offset
, len
);
5355 punch_start
= offset
>> EXT4_BLOCK_SIZE_BITS(sb
);
5356 punch_stop
= (offset
+ len
) >> EXT4_BLOCK_SIZE_BITS(sb
);
5360 * There is no need to overlap collapse range with EOF, in which case
5361 * it is effectively a truncate operation
5363 if (offset
+ len
>= inode
->i_size
) {
5368 /* Currently just for extent based files */
5369 if (!ext4_test_inode_flag(inode
, EXT4_INODE_EXTENTS
)) {
5374 /* Wait for existing dio to complete */
5375 inode_dio_wait(inode
);
5377 ret
= file_modified(file
);
5382 * Prevent page faults from reinstantiating pages we have released from
5385 filemap_invalidate_lock(mapping
);
5387 ret
= ext4_break_layouts(inode
);
5392 * Need to round down offset to be aligned with page size boundary
5393 * for page size > block size.
5395 ioffset
= round_down(offset
, PAGE_SIZE
);
5397 * Write tail of the last page before removed range since it will get
5398 * removed from the page cache below.
5400 ret
= filemap_write_and_wait_range(mapping
, ioffset
, offset
);
5404 * Write data that will be shifted to preserve them when discarding
5405 * page cache below. We are also protected from pages becoming dirty
5406 * by i_rwsem and invalidate_lock.
5408 ret
= filemap_write_and_wait_range(mapping
, offset
+ len
,
5412 truncate_pagecache(inode
, ioffset
);
5414 credits
= ext4_writepage_trans_blocks(inode
);
5415 handle
= ext4_journal_start(inode
, EXT4_HT_TRUNCATE
, credits
);
5416 if (IS_ERR(handle
)) {
5417 ret
= PTR_ERR(handle
);
5420 ext4_fc_mark_ineligible(sb
, EXT4_FC_REASON_FALLOC_RANGE
, handle
);
5422 down_write(&EXT4_I(inode
)->i_data_sem
);
5423 ext4_discard_preallocations(inode
);
5424 ext4_es_remove_extent(inode
, punch_start
, EXT_MAX_BLOCKS
- punch_start
);
5426 ret
= ext4_ext_remove_space(inode
, punch_start
, punch_stop
- 1);
5428 up_write(&EXT4_I(inode
)->i_data_sem
);
5431 ext4_discard_preallocations(inode
);
5433 ret
= ext4_ext_shift_extents(inode
, handle
, punch_stop
,
5434 punch_stop
- punch_start
, SHIFT_LEFT
);
5436 up_write(&EXT4_I(inode
)->i_data_sem
);
5440 new_size
= inode
->i_size
- len
;
5441 i_size_write(inode
, new_size
);
5442 EXT4_I(inode
)->i_disksize
= new_size
;
5444 up_write(&EXT4_I(inode
)->i_data_sem
);
5446 ext4_handle_sync(handle
);
5447 inode_set_mtime_to_ts(inode
, inode_set_ctime_current(inode
));
5448 ret
= ext4_mark_inode_dirty(handle
, inode
);
5449 ext4_update_inode_fsync_trans(handle
, inode
, 1);
5452 ext4_journal_stop(handle
);
5454 filemap_invalidate_unlock(mapping
);
5456 inode_unlock(inode
);
5461 * ext4_insert_range:
5462 * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate.
5463 * The data blocks starting from @offset to the EOF are shifted by @len
5464 * towards right to create a hole in the @inode. Inode size is increased
5466 * Returns 0 on success, error otherwise.
5468 static int ext4_insert_range(struct file
*file
, loff_t offset
, loff_t len
)
5470 struct inode
*inode
= file_inode(file
);
5471 struct super_block
*sb
= inode
->i_sb
;
5472 struct address_space
*mapping
= inode
->i_mapping
;
5474 struct ext4_ext_path
*path
;
5475 struct ext4_extent
*extent
;
5476 ext4_lblk_t offset_lblk
, len_lblk
, ee_start_lblk
= 0;
5477 unsigned int credits
, ee_len
;
5478 int ret
= 0, depth
, split_flag
= 0;
5482 * We need to test this early because xfstests assumes that an
5483 * insert range of (0, 1) will return EOPNOTSUPP if the file
5484 * system does not support insert range.
5486 if (!ext4_test_inode_flag(inode
, EXT4_INODE_EXTENTS
))
5489 /* Insert range works only on fs cluster size aligned regions. */
5490 if (!IS_ALIGNED(offset
| len
, EXT4_CLUSTER_SIZE(sb
)))
5493 trace_ext4_insert_range(inode
, offset
, len
);
5495 offset_lblk
= offset
>> EXT4_BLOCK_SIZE_BITS(sb
);
5496 len_lblk
= len
>> EXT4_BLOCK_SIZE_BITS(sb
);
5499 /* Currently just for extent based files */
5500 if (!ext4_test_inode_flag(inode
, EXT4_INODE_EXTENTS
)) {
5505 /* Check whether the maximum file size would be exceeded */
5506 if (len
> inode
->i_sb
->s_maxbytes
- inode
->i_size
) {
5511 /* Offset must be less than i_size */
5512 if (offset
>= inode
->i_size
) {
5517 /* Wait for existing dio to complete */
5518 inode_dio_wait(inode
);
5520 ret
= file_modified(file
);
5525 * Prevent page faults from reinstantiating pages we have released from
5528 filemap_invalidate_lock(mapping
);
5530 ret
= ext4_break_layouts(inode
);
5535 * Need to round down to align start offset to page size boundary
5536 * for page size > block size.
5538 ioffset
= round_down(offset
, PAGE_SIZE
);
5539 /* Write out all dirty pages */
5540 ret
= filemap_write_and_wait_range(inode
->i_mapping
, ioffset
,
5544 truncate_pagecache(inode
, ioffset
);
5546 credits
= ext4_writepage_trans_blocks(inode
);
5547 handle
= ext4_journal_start(inode
, EXT4_HT_TRUNCATE
, credits
);
5548 if (IS_ERR(handle
)) {
5549 ret
= PTR_ERR(handle
);
5552 ext4_fc_mark_ineligible(sb
, EXT4_FC_REASON_FALLOC_RANGE
, handle
);
5554 /* Expand file to avoid data loss if there is error while shifting */
5555 inode
->i_size
+= len
;
5556 EXT4_I(inode
)->i_disksize
+= len
;
5557 inode_set_mtime_to_ts(inode
, inode_set_ctime_current(inode
));
5558 ret
= ext4_mark_inode_dirty(handle
, inode
);
5562 down_write(&EXT4_I(inode
)->i_data_sem
);
5563 ext4_discard_preallocations(inode
);
5565 path
= ext4_find_extent(inode
, offset_lblk
, NULL
, 0);
5567 up_write(&EXT4_I(inode
)->i_data_sem
);
5568 ret
= PTR_ERR(path
);
5572 depth
= ext_depth(inode
);
5573 extent
= path
[depth
].p_ext
;
5575 ee_start_lblk
= le32_to_cpu(extent
->ee_block
);
5576 ee_len
= ext4_ext_get_actual_len(extent
);
5579 * If offset_lblk is not the starting block of extent, split
5580 * the extent @offset_lblk
5582 if ((offset_lblk
> ee_start_lblk
) &&
5583 (offset_lblk
< (ee_start_lblk
+ ee_len
))) {
5584 if (ext4_ext_is_unwritten(extent
))
5585 split_flag
= EXT4_EXT_MARK_UNWRIT1
|
5586 EXT4_EXT_MARK_UNWRIT2
;
5587 path
= ext4_split_extent_at(handle
, inode
, path
,
5588 offset_lblk
, split_flag
,
5590 EXT4_GET_BLOCKS_PRE_IO
|
5591 EXT4_GET_BLOCKS_METADATA_NOFAIL
);
5595 up_write(&EXT4_I(inode
)->i_data_sem
);
5596 ret
= PTR_ERR(path
);
5601 ext4_free_ext_path(path
);
5602 ext4_es_remove_extent(inode
, offset_lblk
, EXT_MAX_BLOCKS
- offset_lblk
);
5605 * if offset_lblk lies in a hole which is at start of file, use
5606 * ee_start_lblk to shift extents
5608 ret
= ext4_ext_shift_extents(inode
, handle
,
5609 max(ee_start_lblk
, offset_lblk
), len_lblk
, SHIFT_RIGHT
);
5611 up_write(&EXT4_I(inode
)->i_data_sem
);
5613 ext4_handle_sync(handle
);
5615 ext4_update_inode_fsync_trans(handle
, inode
, 1);
5618 ext4_journal_stop(handle
);
5620 filemap_invalidate_unlock(mapping
);
5622 inode_unlock(inode
);
5627 * ext4_swap_extents() - Swap extents between two inodes
5628 * @handle: handle for this transaction
5629 * @inode1: First inode
5630 * @inode2: Second inode
5631 * @lblk1: Start block for first inode
5632 * @lblk2: Start block for second inode
5633 * @count: Number of blocks to swap
5634 * @unwritten: Mark second inode's extents as unwritten after swap
5635 * @erp: Pointer to save error value
5637 * This helper routine does exactly what it promises: "swap extents". All other
5638 * stuff such as page-cache locking consistency, bh mapping consistency or
5639 * extent's data copying must be performed by caller.
5641 * i_rwsem is held for both inodes
5642 * i_data_sem is locked for write for both inodes
5644 * All pages from requested range are locked for both inodes
5647 ext4_swap_extents(handle_t
*handle
, struct inode
*inode1
,
5648 struct inode
*inode2
, ext4_lblk_t lblk1
, ext4_lblk_t lblk2
,
5649 ext4_lblk_t count
, int unwritten
, int *erp
)
5651 struct ext4_ext_path
*path1
= NULL
;
5652 struct ext4_ext_path
*path2
= NULL
;
5653 int replaced_count
= 0;
5655 BUG_ON(!rwsem_is_locked(&EXT4_I(inode1
)->i_data_sem
));
5656 BUG_ON(!rwsem_is_locked(&EXT4_I(inode2
)->i_data_sem
));
5657 BUG_ON(!inode_is_locked(inode1
));
5658 BUG_ON(!inode_is_locked(inode2
));
5660 ext4_es_remove_extent(inode1
, lblk1
, count
);
5661 ext4_es_remove_extent(inode2
, lblk2
, count
);
5664 struct ext4_extent
*ex1
, *ex2
, tmp_ex
;
5665 ext4_lblk_t e1_blk
, e2_blk
;
5666 int e1_len
, e2_len
, len
;
5669 path1
= ext4_find_extent(inode1
, lblk1
, path1
, EXT4_EX_NOCACHE
);
5670 if (IS_ERR(path1
)) {
5671 *erp
= PTR_ERR(path1
);
5674 path2
= ext4_find_extent(inode2
, lblk2
, path2
, EXT4_EX_NOCACHE
);
5675 if (IS_ERR(path2
)) {
5676 *erp
= PTR_ERR(path2
);
5679 ex1
= path1
[path1
->p_depth
].p_ext
;
5680 ex2
= path2
[path2
->p_depth
].p_ext
;
5681 /* Do we have something to swap ? */
5682 if (unlikely(!ex2
|| !ex1
))
5685 e1_blk
= le32_to_cpu(ex1
->ee_block
);
5686 e2_blk
= le32_to_cpu(ex2
->ee_block
);
5687 e1_len
= ext4_ext_get_actual_len(ex1
);
5688 e2_len
= ext4_ext_get_actual_len(ex2
);
5691 if (!in_range(lblk1
, e1_blk
, e1_len
) ||
5692 !in_range(lblk2
, e2_blk
, e2_len
)) {
5693 ext4_lblk_t next1
, next2
;
5695 /* if hole after extent, then go to next extent */
5696 next1
= ext4_ext_next_allocated_block(path1
);
5697 next2
= ext4_ext_next_allocated_block(path2
);
5698 /* If hole before extent, then shift to that extent */
5703 /* Do we have something to swap */
5704 if (next1
== EXT_MAX_BLOCKS
|| next2
== EXT_MAX_BLOCKS
)
5706 /* Move to the rightmost boundary */
5707 len
= next1
- lblk1
;
5708 if (len
< next2
- lblk2
)
5709 len
= next2
- lblk2
;
5718 /* Prepare left boundary */
5719 if (e1_blk
< lblk1
) {
5721 path1
= ext4_force_split_extent_at(handle
, inode1
,
5723 if (IS_ERR(path1
)) {
5724 *erp
= PTR_ERR(path1
);
5728 if (e2_blk
< lblk2
) {
5730 path2
= ext4_force_split_extent_at(handle
, inode2
,
5732 if (IS_ERR(path2
)) {
5733 *erp
= PTR_ERR(path2
);
5737 /* ext4_split_extent_at() may result in leaf extent split,
5738 * path must be revalidated. */
5742 /* Prepare right boundary */
5744 if (len
> e1_blk
+ e1_len
- lblk1
)
5745 len
= e1_blk
+ e1_len
- lblk1
;
5746 if (len
> e2_blk
+ e2_len
- lblk2
)
5747 len
= e2_blk
+ e2_len
- lblk2
;
5749 if (len
!= e1_len
) {
5751 path1
= ext4_force_split_extent_at(handle
, inode1
,
5752 path1
, lblk1
+ len
, 0);
5753 if (IS_ERR(path1
)) {
5754 *erp
= PTR_ERR(path1
);
5758 if (len
!= e2_len
) {
5760 path2
= ext4_force_split_extent_at(handle
, inode2
,
5761 path2
, lblk2
+ len
, 0);
5762 if (IS_ERR(path2
)) {
5763 *erp
= PTR_ERR(path2
);
5767 /* ext4_split_extent_at() may result in leaf extent split,
5768 * path must be revalidated. */
5772 BUG_ON(e2_len
!= e1_len
);
5773 *erp
= ext4_ext_get_access(handle
, inode1
, path1
+ path1
->p_depth
);
5776 *erp
= ext4_ext_get_access(handle
, inode2
, path2
+ path2
->p_depth
);
5780 /* Both extents are fully inside boundaries. Swap it now */
5782 ext4_ext_store_pblock(ex1
, ext4_ext_pblock(ex2
));
5783 ext4_ext_store_pblock(ex2
, ext4_ext_pblock(&tmp_ex
));
5784 ex1
->ee_len
= cpu_to_le16(e2_len
);
5785 ex2
->ee_len
= cpu_to_le16(e1_len
);
5787 ext4_ext_mark_unwritten(ex2
);
5788 if (ext4_ext_is_unwritten(&tmp_ex
))
5789 ext4_ext_mark_unwritten(ex1
);
5791 ext4_ext_try_to_merge(handle
, inode2
, path2
, ex2
);
5792 ext4_ext_try_to_merge(handle
, inode1
, path1
, ex1
);
5793 *erp
= ext4_ext_dirty(handle
, inode2
, path2
+
5797 *erp
= ext4_ext_dirty(handle
, inode1
, path1
+
5800 * Looks scary, eh? The second inode already points to new blocks,
5801 * and it was successfully dirtied. But luckily error may happen
5802 * only due to journal error, so full transaction will be
5810 replaced_count
+= len
;
5815 ext4_free_ext_path(path1
);
5816 ext4_free_ext_path(path2
);
5817 return replaced_count
;
5821 * ext4_clu_mapped - determine whether any block in a logical cluster has
5822 * been mapped to a physical cluster
5824 * @inode - file containing the logical cluster
5825 * @lclu - logical cluster of interest
5827 * Returns 1 if any block in the logical cluster is mapped, signifying
5828 * that a physical cluster has been allocated for it. Otherwise,
5829 * returns 0. Can also return negative error codes. Derived from
5830 * ext4_ext_map_blocks().
5832 int ext4_clu_mapped(struct inode
*inode
, ext4_lblk_t lclu
)
5834 struct ext4_sb_info
*sbi
= EXT4_SB(inode
->i_sb
);
5835 struct ext4_ext_path
*path
;
5836 int depth
, mapped
= 0, err
= 0;
5837 struct ext4_extent
*extent
;
5838 ext4_lblk_t first_lblk
, first_lclu
, last_lclu
;
5841 * if data can be stored inline, the logical cluster isn't
5842 * mapped - no physical clusters have been allocated, and the
5843 * file has no extents
5845 if (ext4_test_inode_state(inode
, EXT4_STATE_MAY_INLINE_DATA
) ||
5846 ext4_has_inline_data(inode
))
5849 /* search for the extent closest to the first block in the cluster */
5850 path
= ext4_find_extent(inode
, EXT4_C2B(sbi
, lclu
), NULL
, 0);
5852 return PTR_ERR(path
);
5854 depth
= ext_depth(inode
);
5857 * A consistent leaf must not be empty. This situation is possible,
5858 * though, _during_ tree modification, and it's why an assert can't
5859 * be put in ext4_find_extent().
5861 if (unlikely(path
[depth
].p_ext
== NULL
&& depth
!= 0)) {
5862 EXT4_ERROR_INODE(inode
,
5863 "bad extent address - lblock: %lu, depth: %d, pblock: %lld",
5864 (unsigned long) EXT4_C2B(sbi
, lclu
),
5865 depth
, path
[depth
].p_block
);
5866 err
= -EFSCORRUPTED
;
5870 extent
= path
[depth
].p_ext
;
5872 /* can't be mapped if the extent tree is empty */
5876 first_lblk
= le32_to_cpu(extent
->ee_block
);
5877 first_lclu
= EXT4_B2C(sbi
, first_lblk
);
5880 * Three possible outcomes at this point - found extent spanning
5881 * the target cluster, to the left of the target cluster, or to the
5882 * right of the target cluster. The first two cases are handled here.
5883 * The last case indicates the target cluster is not mapped.
5885 if (lclu
>= first_lclu
) {
5886 last_lclu
= EXT4_B2C(sbi
, first_lblk
+
5887 ext4_ext_get_actual_len(extent
) - 1);
5888 if (lclu
<= last_lclu
) {
5891 first_lblk
= ext4_ext_next_allocated_block(path
);
5892 first_lclu
= EXT4_B2C(sbi
, first_lblk
);
5893 if (lclu
== first_lclu
)
5899 ext4_free_ext_path(path
);
5901 return err
? err
: mapped
;
5905 * Updates physical block address and unwritten status of extent
5906 * starting at lblk start and of len. If such an extent doesn't exist,
5907 * this function splits the extent tree appropriately to create an
5908 * extent like this. This function is called in the fast commit
5909 * replay path. Returns 0 on success and error on failure.
5911 int ext4_ext_replay_update_ex(struct inode
*inode
, ext4_lblk_t start
,
5912 int len
, int unwritten
, ext4_fsblk_t pblk
)
5914 struct ext4_ext_path
*path
;
5915 struct ext4_extent
*ex
;
5918 path
= ext4_find_extent(inode
, start
, NULL
, 0);
5920 return PTR_ERR(path
);
5921 ex
= path
[path
->p_depth
].p_ext
;
5923 ret
= -EFSCORRUPTED
;
5927 if (le32_to_cpu(ex
->ee_block
) != start
||
5928 ext4_ext_get_actual_len(ex
) != len
) {
5929 /* We need to split this extent to match our extent first */
5930 down_write(&EXT4_I(inode
)->i_data_sem
);
5931 path
= ext4_force_split_extent_at(NULL
, inode
, path
, start
, 1);
5932 up_write(&EXT4_I(inode
)->i_data_sem
);
5934 ret
= PTR_ERR(path
);
5938 path
= ext4_find_extent(inode
, start
, path
, 0);
5940 return PTR_ERR(path
);
5942 ex
= path
[path
->p_depth
].p_ext
;
5943 WARN_ON(le32_to_cpu(ex
->ee_block
) != start
);
5945 if (ext4_ext_get_actual_len(ex
) != len
) {
5946 down_write(&EXT4_I(inode
)->i_data_sem
);
5947 path
= ext4_force_split_extent_at(NULL
, inode
, path
,
5949 up_write(&EXT4_I(inode
)->i_data_sem
);
5951 ret
= PTR_ERR(path
);
5955 path
= ext4_find_extent(inode
, start
, path
, 0);
5957 return PTR_ERR(path
);
5958 ex
= path
[path
->p_depth
].p_ext
;
5962 ext4_ext_mark_unwritten(ex
);
5964 ext4_ext_mark_initialized(ex
);
5965 ext4_ext_store_pblock(ex
, pblk
);
5966 down_write(&EXT4_I(inode
)->i_data_sem
);
5967 ret
= ext4_ext_dirty(NULL
, inode
, &path
[path
->p_depth
]);
5968 up_write(&EXT4_I(inode
)->i_data_sem
);
5970 ext4_free_ext_path(path
);
5971 ext4_mark_inode_dirty(NULL
, inode
);
5975 /* Try to shrink the extent tree */
5976 void ext4_ext_replay_shrink_inode(struct inode
*inode
, ext4_lblk_t end
)
5978 struct ext4_ext_path
*path
= NULL
;
5979 struct ext4_extent
*ex
;
5980 ext4_lblk_t old_cur
, cur
= 0;
5983 path
= ext4_find_extent(inode
, cur
, NULL
, 0);
5986 ex
= path
[path
->p_depth
].p_ext
;
5988 ext4_free_ext_path(path
);
5989 ext4_mark_inode_dirty(NULL
, inode
);
5993 cur
= le32_to_cpu(ex
->ee_block
) + ext4_ext_get_actual_len(ex
);
5996 ext4_ext_try_to_merge(NULL
, inode
, path
, ex
);
5997 down_write(&EXT4_I(inode
)->i_data_sem
);
5998 ext4_ext_dirty(NULL
, inode
, &path
[path
->p_depth
]);
5999 up_write(&EXT4_I(inode
)->i_data_sem
);
6000 ext4_mark_inode_dirty(NULL
, inode
);
6001 ext4_free_ext_path(path
);
6005 /* Check if *cur is a hole and if it is, skip it */
6006 static int skip_hole(struct inode
*inode
, ext4_lblk_t
*cur
)
6009 struct ext4_map_blocks map
;
6012 map
.m_len
= ((inode
->i_size
) >> inode
->i_sb
->s_blocksize_bits
) - *cur
;
6014 ret
= ext4_map_blocks(NULL
, inode
, &map
, 0);
6019 *cur
= *cur
+ map
.m_len
;
6023 /* Count number of blocks used by this inode and update i_blocks */
6024 int ext4_ext_replay_set_iblocks(struct inode
*inode
)
6026 struct ext4_ext_path
*path
= NULL
, *path2
= NULL
;
6027 struct ext4_extent
*ex
;
6028 ext4_lblk_t cur
= 0, end
;
6029 int numblks
= 0, i
, ret
= 0;
6030 ext4_fsblk_t cmp1
, cmp2
;
6031 struct ext4_map_blocks map
;
6033 /* Determine the size of the file first */
6034 path
= ext4_find_extent(inode
, EXT_MAX_BLOCKS
- 1, NULL
,
6037 return PTR_ERR(path
);
6038 ex
= path
[path
->p_depth
].p_ext
;
6041 end
= le32_to_cpu(ex
->ee_block
) + ext4_ext_get_actual_len(ex
);
6043 /* Count the number of data blocks */
6047 map
.m_len
= end
- cur
;
6048 ret
= ext4_map_blocks(NULL
, inode
, &map
, 0);
6053 cur
= cur
+ map
.m_len
;
6057 * Count the number of extent tree blocks. We do it by looking up
6058 * two successive extents and determining the difference between
6059 * their paths. When path is different for 2 successive extents
6060 * we compare the blocks in the path at each level and increment
6061 * iblocks by total number of differences found.
6064 ret
= skip_hole(inode
, &cur
);
6067 path
= ext4_find_extent(inode
, cur
, path
, 0);
6070 numblks
+= path
->p_depth
;
6072 path
= ext4_find_extent(inode
, cur
, path
, 0);
6075 ex
= path
[path
->p_depth
].p_ext
;
6079 cur
= max(cur
+ 1, le32_to_cpu(ex
->ee_block
) +
6080 ext4_ext_get_actual_len(ex
));
6081 ret
= skip_hole(inode
, &cur
);
6085 path2
= ext4_find_extent(inode
, cur
, path2
, 0);
6089 for (i
= 0; i
<= max(path
->p_depth
, path2
->p_depth
); i
++) {
6091 if (i
<= path
->p_depth
)
6092 cmp1
= path
[i
].p_bh
?
6093 path
[i
].p_bh
->b_blocknr
: 0;
6094 if (i
<= path2
->p_depth
)
6095 cmp2
= path2
[i
].p_bh
?
6096 path2
[i
].p_bh
->b_blocknr
: 0;
6097 if (cmp1
!= cmp2
&& cmp2
!= 0)
6103 inode
->i_blocks
= numblks
<< (inode
->i_sb
->s_blocksize_bits
- 9);
6104 ext4_mark_inode_dirty(NULL
, inode
);
6106 ext4_free_ext_path(path
);
6107 ext4_free_ext_path(path2
);
6111 int ext4_ext_clear_bb(struct inode
*inode
)
6113 struct ext4_ext_path
*path
= NULL
;
6114 struct ext4_extent
*ex
;
6115 ext4_lblk_t cur
= 0, end
;
6117 struct ext4_map_blocks map
;
6119 if (ext4_test_inode_flag(inode
, EXT4_INODE_INLINE_DATA
))
6122 /* Determine the size of the file first */
6123 path
= ext4_find_extent(inode
, EXT_MAX_BLOCKS
- 1, NULL
,
6126 return PTR_ERR(path
);
6127 ex
= path
[path
->p_depth
].p_ext
;
6130 end
= le32_to_cpu(ex
->ee_block
) + ext4_ext_get_actual_len(ex
);
6135 map
.m_len
= end
- cur
;
6136 ret
= ext4_map_blocks(NULL
, inode
, &map
, 0);
6140 path
= ext4_find_extent(inode
, map
.m_lblk
, path
, 0);
6141 if (!IS_ERR(path
)) {
6142 for (j
= 0; j
< path
->p_depth
; j
++) {
6143 ext4_mb_mark_bb(inode
->i_sb
,
6144 path
[j
].p_block
, 1, false);
6145 ext4_fc_record_regions(inode
->i_sb
, inode
->i_ino
,
6146 0, path
[j
].p_block
, 1, 1);
6151 ext4_mb_mark_bb(inode
->i_sb
, map
.m_pblk
, map
.m_len
, false);
6152 ext4_fc_record_regions(inode
->i_sb
, inode
->i_ino
,
6153 map
.m_lblk
, map
.m_pblk
, map
.m_len
, 1);
6155 cur
= cur
+ map
.m_len
;
6159 ext4_free_ext_path(path
);