// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
 * Written by Alex Tomas <alex@clusterfs.com>
 *
 * mballoc.c contains the multiblock allocation routines
 */

#include "ext4_jbd2.h"
#include <linux/log2.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/backing-dev.h>
#include <linux/freezer.h>
#include <trace/events/ext4.h>
#include <kunit/static_stub.h>
/*
 * MUSTDO:
 *   - test ext4_ext_search_left() and ext4_ext_search_right()
 *   - search for metadata in few groups
 *
 * TODO v4:
 *   - normalization should take into account whether file is still open
 *   - discard preallocations if no free space left (policy?)
 *   - don't normalize tails
 *   - reservation for superuser
 *
 * TODO v3:
 *   - bitmap read-ahead (proposed by Oleg Drokin aka green)
 *   - track min/max extents in each group for better group selection
 *   - mb_mark_used() may allocate chunk right after splitting buddy
 *   - tree of groups sorted by number of free blocks
 */
/*
 * An allocation request involves a request for multiple blocks near to
 * the goal (block) value specified.
 *
 * During the initialization phase of the allocator we decide whether to use
 * group preallocation or inode preallocation depending on the size of
 * the file. The size of the file could be the resulting file size we
 * would have after allocation, or the current file size, whichever
 * is larger. If the size is less than sbi->s_mb_stream_request we
 * select group preallocation. The default value of
 * s_mb_stream_request is 16 blocks. This can also be tuned via
 * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
 * terms of number of blocks.
 *
 * The main motivation for having small files use group preallocation is to
 * ensure that we keep small files close together on the disk.
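 *
 * As a rough sketch (illustrative only, not the exact kernel logic), the
 * decision boils down to:
 *
 *	size = max(size after this allocation, current i_size), in blocks;
 *	if (size < sbi->s_mb_stream_request)
 *		use the per-CPU locality group preallocation;
 *	else
 *		use the per-inode preallocation;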
 *
 * In the first stage the allocator looks at the inode prealloc list,
 * ext4_inode_info->i_prealloc_list, which contains the list of prealloc
 * spaces for this particular inode. An inode prealloc space is
 * represented as:
 *
 * pa_lstart -> the logical start block for this prealloc space
 * pa_pstart -> the physical start block for this prealloc space
 * pa_len    -> length for this prealloc space (in clusters)
 * pa_free   -> free space available in this prealloc space (in clusters)
 *
 * The inode preallocation space is used by looking at the _logical_ start
 * block. Only if the logical file block falls within the range of a prealloc
 * space do we consume that particular prealloc space. This makes sure that
 * we have contiguous physical blocks representing the file blocks.
 *
 * The important thing to note about inode prealloc space is that we don't
 * modify the values associated with it except pa_free.
 *
 * If we are not able to find blocks in the inode prealloc space and if we
 * have the group allocation flag set then we look at the locality group
 * prealloc space. These are per-CPU prealloc lists represented as:
 *
 * ext4_sb_info.s_locality_groups[smp_processor_id()]
 *
 * The reason for having a per-CPU locality group is to reduce the contention
 * between CPUs. It is possible to get scheduled at this point.
 *
 * The locality group prealloc space is used by looking at whether we have
 * enough free space (pa_free) within the prealloc space.
 *
 * If we can't allocate blocks via inode and/or locality group prealloc
 * then we look at the buddy cache. The buddy cache is represented by
 * ext4_sb_info.s_buddy_cache (a struct inode) whose file offsets get
 * mapped to the buddy and bitmap information regarding the different
 * groups. The buddy information is attached to the buddy cache inode so that
 * we can access it through the page cache. The information regarding
 * each group is loaded via ext4_mb_load_buddy. The information involved is
 * the block bitmap and the buddy information, stored in the inode as:
 *
 *  [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 *
 * one block each for bitmap and buddy information. So for each group we
 * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE /
 * blocksize) blocks, so it can hold information for groups_per_page
 * groups, which is blocks_per_page/2.
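 *
 * For example (illustrative numbers only): with a 4 KiB PAGE_SIZE and a
 * 1 KiB block size, blocks_per_page = 4, so one page of the buddy cache
 * holds the bitmap/buddy pairs of groups_per_page = 2 groups; with a
 * 4 KiB block size, blocks_per_page = 1, so each group's bitmap and buddy
 * end up on separate pages.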
 *
 * The buddy cache inode is not stored on disk. The inode is thrown
 * away when the filesystem is unmounted.
 *
 * We look for count number of blocks in the buddy cache. If we were able
 * to locate that many free blocks we return with additional information
 * regarding the rest of the contiguous physical blocks available.
 *
 * Before allocating blocks via the buddy cache we normalize the request
 * blocks. This ensures we ask for more blocks than we need. The extra
 * blocks that we get after allocation are added to the respective prealloc
 * list. In case of inode preallocation we follow a list of heuristics
 * based on file size. This can be found in ext4_mb_normalize_request. If
 * we are doing a group prealloc we try to normalize the request to
 * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is
 * dependent on the cluster size; for non-bigalloc file systems, it is
 * 512 blocks. This can be tuned via
 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
 * terms of number of blocks. If we have mounted the file system with the
 * -o stripe=<value> option, the group prealloc request is normalized to the
 * smallest multiple of the stripe value (sbi->s_stripe) which is
 * greater than the default mb_group_prealloc.
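 *
 * For example, with -o stripe=384 and the default mb_group_prealloc of 512,
 * the group prealloc request is normalized to 768, the smallest multiple of
 * 384 that is greater than 512.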
 *
 * If the "mb_optimize_scan" mount option is set, we maintain the in-memory
 * group info structures in two data structures:
 *
 * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders)
 *
 *    Locking: sbi->s_mb_largest_free_orders_locks (array of rwlocks)
 *
 *    This is an array of lists where the index in the array represents the
 *    largest free order in the buddy bitmap of the participating group infos
 *    of that list. So, there are exactly MB_NUM_ORDERS(sb) (i.e. the total
 *    number of buddy bitmap orders possible) lists. Group infos are placed
 *    in the appropriate lists.
 *
 * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size)
 *
 *    Locking: sbi->s_mb_avg_fragment_size_locks (array of rwlocks)
 *
 *    This is an array of lists where the i-th list contains groups with
 *    average fragment size >= 2^i and < 2^(i+1). The average fragment size
 *    is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments.
 *    Note that we don't bother with a special list for completely empty
 *    groups, so we only have MB_NUM_ORDERS(sb) lists.
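 *
 *    For example, per the rule above, a group with bb_free = 300 clusters
 *    spread over bb_fragments = 10 free extents has an average fragment
 *    size of 30 clusters and therefore lives on the list covering the
 *    range [16, 32).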
 *
 * When the "mb_optimize_scan" mount option is set, mballoc consults the above
 * data structures to decide the order in which groups are to be traversed for
 * fulfilling an allocation request.
 *
 * At CR_POWER2_ALIGNED, we look for groups which have a largest_free_order
 * >= the order of the request. We directly look at the largest free order list
 * in the data structure (1) above where largest_free_order = order of the
 * request. If that list is empty, we look at the remaining lists in increasing
 * order of largest_free_order. This allows us to perform CR_POWER2_ALIGNED
 * lookup in O(1) time.
 *
 * At CR_GOAL_LEN_FAST, we only consider groups where the
 * average fragment size >= request size. So, we look up a group which has an
 * average fragment size just above or equal to the request size using our
 * average fragment size group lists (data structure 2) in O(1) time.
 *
 * At CR_BEST_AVAIL_LEN, we aim to optimize allocations which can't be satisfied
 * in CR_GOAL_LEN_FAST. The fact that we couldn't find a group in
 * CR_GOAL_LEN_FAST suggests that there is no BG that has an average
 * fragment size > goal length. So before falling back to the slower
 * CR_GOAL_LEN_SLOW, in CR_BEST_AVAIL_LEN we proactively trim the goal length
 * and then use the same fragment lists as CR_GOAL_LEN_FAST to find a BG with a
 * big enough average fragment size. This increases the chances of finding a
 * suitable block group in O(1) time and results in faster allocation at the
 * cost of reduced size of allocation.
 *
 * If the "mb_optimize_scan" mount option is not set, mballoc traverses groups
 * in linear order, which requires O(N) search time for the CR_POWER2_ALIGNED
 * and CR_GOAL_LEN_FAST phases.
 *
 * The regular allocator (using the buddy cache) supports a few tunables.
 *
 * /sys/fs/ext4/<partition>/mb_min_to_scan
 * /sys/fs/ext4/<partition>/mb_max_to_scan
 * /sys/fs/ext4/<partition>/mb_order2_req
 * /sys/fs/ext4/<partition>/mb_linear_limit
 *
 * The regular allocator uses a buddy scan only if the request length is a
 * power of 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs.
 * The value of s_mb_order2_reqs can be tuned via
 * /sys/fs/ext4/<partition>/mb_order2_req. If the request length is equal to
 * the stripe size (sbi->s_stripe), we try to search for blocks contiguous
 * within the stripe size. This should result in better allocation on RAID
 * setups. If not, we search in the specific group using the bitmap for best
 * extents. The tunables min_to_scan and max_to_scan control the behaviour
 * here. min_to_scan indicates how long mballoc __must__ look for a best
 * extent and max_to_scan indicates how long mballoc __can__ look for a
 * best extent among the found extents. Searching for the blocks starts with
 * the group specified as the goal value in the allocation context via
 * ac_g_ex. Each group is first checked based on the criteria whether it
 * can be used for allocation. ext4_mb_good_group explains how the groups are
 * checked.
 *
 * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not
 * get traversed linearly. That may result in subsequent allocations not being
 * close to each other. And so, the underlying device may get filled up in a
 * non-linear fashion. While that may not matter on non-rotational devices, for
 * rotational devices that may result in higher seek times. "mb_linear_limit"
 * tells mballoc how many groups it should search linearly before consulting
 * the above data structures for more efficient lookups. For non-rotational
 * devices, this value defaults to 0 and for rotational devices it is set to
 * MB_DEFAULT_LINEAR_LIMIT.
 *
 * Both types of prealloc space are populated as described above. So for the
 * first request we will hit the buddy cache, which will result in the
 * prealloc space getting filled. The prealloc space is then later used for
 * subsequent requests.
 *
 * mballoc operates on the following data:
 *  - in-core buddy (actually includes buddy and bitmap)
 *  - preallocation descriptors (PAs)
 *
 * there are two types of preallocations:
 *  - inode
 *    assigned to a specific inode and can be used for this inode only.
 *    it describes part of the inode's space preallocated to specific
 *    physical blocks. any block from that preallocation can be used
 *    independently. the descriptor just tracks the number of blocks left
 *    unused. so, before taking some block from the descriptor, one must
 *    make sure the corresponding logical block isn't allocated yet. this
 *    also means that freeing any block within the descriptor's range
 *    must discard all preallocated blocks.
 *  - locality group
 *    assigned to a specific locality group which does not translate to a
 *    permanent set of inodes: an inode can join and leave a group. space
 *    from this type of preallocation can be used for any inode. thus
 *    it's consumed from the beginning to the end.
 *
 * relation between them can be expressed as:
 *    in-core buddy = on-disk bitmap + preallocation descriptors
 *
 * this means the blocks mballoc considers used are:
 *  - allocated blocks (persistent)
 *  - preallocated blocks (non-persistent)
 *
 * consistency in mballoc world means that at any time a block is either
 * free or used in ALL structures. notice: "any time" should not be read
 * literally -- time is discrete and delimited by locks.
 *
 * to keep it simple, we don't use block numbers, instead we count numbers of
 * blocks: how many blocks are marked used/free in the on-disk bitmap, buddy
 * and PA.
 *
 * all operations can be expressed as:
 *  - init buddy:			buddy = on-disk + PAs
 *  - new PA:				buddy += N; PA = N
 *  - use inode PA:			on-disk += N; PA -= N
 *  - discard inode PA:			buddy -= on-disk - PA; PA = 0
 *  - use locality group PA:		on-disk += N; PA -= N
 *  - discard locality group PA:	buddy -= PA; PA = 0
 *  note: 'buddy -= on-disk - PA' is used to show that the on-disk bitmap
 *        is used in the real operation because we can't know the actual used
 *        bits from the PA, only from the on-disk bitmap
 *
 * if we follow this strict logic, then all operations above should be atomic.
 * given some of them can block, we'd have to use something like semaphores,
 * killing performance on high-end SMP hardware. let's try to relax it using
 * the following knowledge:
 * 1) if buddy is referenced, it's already initialized
 * 2) while a block is used in buddy and the buddy is referenced,
 *    nobody can re-allocate that block
 * 3) we work on bitmaps and '+' actually means 'set bits'. if the on-disk
 *    bitmap has a bit set and a PA claims the same block, it's OK. IOW, one
 *    can set a bit in the on-disk bitmap if the buddy has the same bit set
 *    and/or a PA covers the corresponding block
 *
 * so, now we're building a concurrency table:
 *  - init buddy vs.
 *    - new PA
 *      blocks for PA are allocated in the buddy, buddy must be referenced
 *      until PA is linked to allocation group to avoid concurrent buddy init
 *    - use inode PA
 *      we need to make sure that either on-disk bitmap or PA has uptodate data
 *      given (3) we care that PA-=N operation doesn't interfere with init
 *    - discard inode PA
 *      the simplest way would be to have buddy initialized by the discard
 *    - use locality group PA
 *      again PA-=N must be serialized with init
 *    - discard locality group PA
 *      the simplest way would be to have buddy initialized by the discard
 *  - new PA vs.
 *    - use inode PA
 *      i_data_sem serializes them
 *    - discard inode PA
 *      discard process must wait until PA isn't used by another process
 *    - use locality group PA
 *      some mutex should serialize them
 *    - discard locality group PA
 *      discard process must wait until PA isn't used by another process
 *  - use inode PA vs.
 *    - use inode PA
 *      i_data_sem or another mutex should serialize them
 *    - discard inode PA
 *      discard process must wait until PA isn't used by another process
 *    - use locality group PA
 *      nothing wrong here -- they're different PAs covering different blocks
 *    - discard locality group PA
 *      discard process must wait until PA isn't used by another process
 *
 * now we're ready to draw a few conclusions:
 *  - while a PA is referenced, no discard of it is possible
 *  - a PA is referenced until its blocks are marked in the on-disk bitmap
 *  - a PA changes only after the on-disk bitmap does
 *  - discard must not compete with init. either init is done before
 *    any discard or they're serialized somehow
 *  - buddy init as a sum of the on-disk bitmap and PAs is done atomically
 *
 * a special case is when we've used a PA to emptiness. no need to modify the
 * buddy in this case, but we should care about concurrent init
 *
 * Logic in a few words:
 *
 *  - allocation:
 *    mark bits in on-disk bitmap
 *
 *  - use preallocation:
 *    find proper PA (per-inode or group)
 *    mark bits in on-disk bitmap
 *
 *  - free:
 *    mark bits in on-disk bitmap
 *
 *  - discard preallocations in group:
 *    move them onto local list
 *    load on-disk bitmap
 *    remove PA from object (inode or locality group)
 *    mark free blocks in-core
 *
 *  - discard inode's preallocations:
 *
 * Locking rules:
 *  - bitlock on a group	(group)
 *  - object (inode/locality)	(object)
 *  - cr_power2_aligned lists lock	(cr_power2_aligned)
 *  - cr_goal_len_fast lists lock	(cr_goal_len_fast)
 *
 * Paths:
 *  - release consumed pa:
 *  - generate in-core bitmap:
 *  - discard all for given object (inode, locality group):
 *  - discard all for given group:
 *  - allocation path (ext4_mb_regular_allocator)
 *    cr_power2_aligned/cr_goal_len_fast
 */

static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
static struct kmem_cache *ext4_free_data_cachep;

/* We create slab caches for groupinfo data structures based on the
 * superblock block size. There will be one per mounted filesystem for
 * each unique s_blocksize_bits */
#define NR_GRPINFO_CACHES 8
static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];

static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
        "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
        "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
        "ext4_groupinfo_64k", "ext4_groupinfo_128k"
};

static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                                        ext4_group_t group);
static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);

static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
                               ext4_group_t group, enum criteria cr);

static int ext4_try_to_trim_range(struct super_block *sb,
                struct ext4_buddy *e4b, ext4_grpblk_t start,
                ext4_grpblk_t max, ext4_grpblk_t minblocks);

/*
 * The algorithm using this percpu seq counter goes as follows:
 * 1. We sample the percpu discard_pa_seq counter before trying for block
 *    allocation in ext4_mb_new_blocks().
 * 2. We increment this percpu discard_pa_seq counter when we either allocate
 *    or free these blocks i.e. while marking those blocks as used/free in
 *    mb_mark_used()/mb_free_blocks().
 * 3. We also increment this percpu seq counter when we successfully identify
 *    that the bb_prealloc_list is not empty and hence proceed for discarding
 *    of those PAs inside ext4_mb_discard_group_preallocations().
 *
 * Now to make sure that the regular fast path of block allocation is not
 * affected, as a small optimization we only sample the percpu seq counter
 * on that cpu. Only when the block allocation fails and no freed blocks
 * were found do we sample the percpu seq counter for all cpus using the
 * function ext4_get_discard_pa_seq_sum() below. This happens after making
 * sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty.
 */
static DEFINE_PER_CPU(u64, discard_pa_seq);
static inline u64 ext4_get_discard_pa_seq_sum(void)
{
        int __cpu;
        u64 __seq = 0;

        for_each_possible_cpu(__cpu)
                __seq += per_cpu(discard_pa_seq, __cpu);
        return __seq;
}
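
/*
 * Illustrative sketch of how the counter above is meant to be used (a
 * simplified reconstruction based on the description, not a copy of the
 * actual retry logic in the allocation path):
 *
 *	u64 seq = this_cpu_read(discard_pa_seq);
 *	...try the allocation...
 *	if (allocation failed && freed == 0 &&
 *	    seq != ext4_get_discard_pa_seq_sum())
 *		// some PAs were discarded concurrently; a retry may succeed
 *		retry;
 */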

static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
{
#if BITS_PER_LONG == 64
        *bit += ((unsigned long) addr & 7UL) << 3;
        addr = (void *) ((unsigned long) addr & ~7UL);
#elif BITS_PER_LONG == 32
        *bit += ((unsigned long) addr & 3UL) << 3;
        addr = (void *) ((unsigned long) addr & ~3UL);
#else
#error "how many bits you are?!"
#endif
        return addr;
}
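
/*
 * Worked example (illustrative): on a 64-bit machine, asking for bit 2 at
 * address 0x1003 is rewritten to bit 2 + (0x1003 & 7) * 8 = 26 at the
 * long-aligned address 0x1000, so the ext4_*_bit() helpers below always
 * operate on aligned longs.
 */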

static inline int mb_test_bit(int bit, void *addr)
{
        /*
         * ext4_test_bit on architectures like powerpc
         * needs an unsigned long aligned address
         */
        addr = mb_correct_addr_and_bit(&bit, addr);
        return ext4_test_bit(bit, addr);
}

static inline void mb_set_bit(int bit, void *addr)
{
        addr = mb_correct_addr_and_bit(&bit, addr);
        ext4_set_bit(bit, addr);
}

static inline void mb_clear_bit(int bit, void *addr)
{
        addr = mb_correct_addr_and_bit(&bit, addr);
        ext4_clear_bit(bit, addr);
}

static inline int mb_test_and_clear_bit(int bit, void *addr)
{
        addr = mb_correct_addr_and_bit(&bit, addr);
        return ext4_test_and_clear_bit(bit, addr);
}

static inline int mb_find_next_zero_bit(void *addr, int max, int start)
{
        int fix = 0, ret, tmpmax;

        addr = mb_correct_addr_and_bit(&fix, addr);
        tmpmax = max + fix;
        start += fix;

        ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
        if (ret > max)
                return max;
        return ret;
}

static inline int mb_find_next_bit(void *addr, int max, int start)
{
        int fix = 0, ret, tmpmax;

        addr = mb_correct_addr_and_bit(&fix, addr);
        tmpmax = max + fix;
        start += fix;

        ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
        if (ret > max)
                return max;
        return ret;
}

static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
{
        char *bb;

        BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
        BUG_ON(max == NULL);

        if (order > e4b->bd_blkbits + 1) {
                *max = 0;
                return NULL;
        }

        /* at order 0 we see each particular block */
        if (order == 0) {
                *max = 1 << (e4b->bd_blkbits + 3);
                return e4b->bd_bitmap;
        }

        bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
        *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];

        return bb;
}
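
/*
 * Layout sketch (informational, for readers of the function above): order 0
 * simply aliases the block bitmap, with 1 << (bd_blkbits + 3) bits, i.e. one
 * bit per cluster of the group. For order >= 1 the buddy bitmaps are packed
 * consecutively inside bd_buddy, each order having half as many bits as the
 * order below it; s_mb_offsets[order] gives the byte offset of that order's
 * bitmap and s_mb_maxs[order] its bit count.
 */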

#ifdef DOUBLE_CHECK
static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
                                  int first, int count)
{
        int i;
        struct super_block *sb = e4b->bd_sb;

        if (unlikely(e4b->bd_info->bb_bitmap == NULL))
                return;
        assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
        for (i = 0; i < count; i++) {
                if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
                        ext4_fsblk_t blocknr;

                        blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
                        blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
                        ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
                                        EXT4_GROUP_INFO_BBITMAP_CORRUPT);
                        ext4_grp_locked_error(sb, e4b->bd_group,
                                              inode ? inode->i_ino : 0,
                                              blocknr,
                                              "freeing block already freed "
                                              "(bit %u)", first + i);
                }
                mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
        }
}

static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
{
        int i;

        if (unlikely(e4b->bd_info->bb_bitmap == NULL))
                return;
        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
        for (i = 0; i < count; i++) {
                BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
                mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
        }
}

static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
{
        if (unlikely(e4b->bd_info->bb_bitmap == NULL))
                return;
        if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
                unsigned char *b1, *b2;
                int i;

                b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
                b2 = (unsigned char *) bitmap;
                for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
                        if (b1[i] != b2[i]) {
                                ext4_msg(e4b->bd_sb, KERN_ERR,
                                         "corruption in group %u "
                                         "at byte %u(%u): %x in copy != %x "
                                         "on disk/prealloc",
                                         e4b->bd_group, i, i * 8, b1[i], b2[i]);
                        }
                }
        }
}

static void mb_group_bb_bitmap_alloc(struct super_block *sb,
                        struct ext4_group_info *grp, ext4_group_t group)
{
        struct buffer_head *bh;

        grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
        if (!grp->bb_bitmap)
                return;

        bh = ext4_read_block_bitmap(sb, group);
        if (IS_ERR_OR_NULL(bh)) {
                kfree(grp->bb_bitmap);
                grp->bb_bitmap = NULL;
                return;
        }

        memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize);
        put_bh(bh);
}

static void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
{
        kfree(grp->bb_bitmap);
}

#else
static inline void mb_free_blocks_double(struct inode *inode,
                struct ext4_buddy *e4b, int first, int count)
{
}

static inline void mb_mark_used_double(struct ext4_buddy *e4b,
                int first, int count)
{
}

static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
{
}

static inline void mb_group_bb_bitmap_alloc(struct super_block *sb,
                        struct ext4_group_info *grp, ext4_group_t group)
{
}

static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
{
}
#endif

#ifdef AGGRESSIVE_CHECK

#define MB_CHECK_ASSERT(assert)                                         \
do {                                                                    \
        if (!(assert)) {                                                \
                printk(KERN_EMERG                                       \
                        "Assertion failure in %s() at %s:%d: \"%s\"\n", \
                        function, file, line, # assert);                \
                BUG();                                                  \
        }                                                               \
} while (0)

static void __mb_check_buddy(struct ext4_buddy *e4b, char *file,
                                const char *function, int line)
{
        struct super_block *sb = e4b->bd_sb;
        int order = e4b->bd_blkbits + 1;
        int max;
        int max2;
        int i;
        int j;
        int k;
        int count;
        struct ext4_group_info *grp;
        int fragments = 0;
        int fstart;
        struct list_head *cur;
        void *buddy;
        void *buddy2;

        if (e4b->bd_info->bb_check_counter++ % 10)
                return;

        while (order > 1) {
                buddy = mb_find_buddy(e4b, order, &max);
                MB_CHECK_ASSERT(buddy);
                buddy2 = mb_find_buddy(e4b, order - 1, &max2);
                MB_CHECK_ASSERT(buddy2);
                MB_CHECK_ASSERT(buddy != buddy2);
                MB_CHECK_ASSERT(max * 2 == max2);

                count = 0;
                for (i = 0; i < max; i++) {

                        if (mb_test_bit(i, buddy)) {
                                /* only single bit in buddy2 may be 0 */
                                if (!mb_test_bit(i << 1, buddy2)) {
                                        MB_CHECK_ASSERT(
                                                mb_test_bit((i<<1)+1, buddy2));
                                }
                                continue;
                        }

                        /* both bits in buddy2 must be 1 */
                        MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
                        MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));

                        for (j = 0; j < (1 << order); j++) {
                                k = (i * (1 << order)) + j;
                                MB_CHECK_ASSERT(
                                        !mb_test_bit(k, e4b->bd_bitmap));
                        }
                        count++;
                }
                MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
                order--;
        }

        fstart = -1;
        buddy = mb_find_buddy(e4b, 0, &max);
        for (i = 0; i < max; i++) {
                if (!mb_test_bit(i, buddy)) {
                        MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
                        if (fstart == -1) {
                                fragments++;
                                fstart = i;
                        }
                        continue;
                }
                fstart = -1;
                /* check used bits only */
                for (j = 0; j < e4b->bd_blkbits + 1; j++) {
                        buddy2 = mb_find_buddy(e4b, j, &max2);
                        k = i >> j;
                        MB_CHECK_ASSERT(k < max2);
                        MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
                }
        }
        MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
        MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);

        grp = ext4_get_group_info(sb, e4b->bd_group);
        if (!grp)
                return;
        list_for_each(cur, &grp->bb_prealloc_list) {
                ext4_group_t groupnr;
                struct ext4_prealloc_space *pa;
                pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
                ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
                MB_CHECK_ASSERT(groupnr == e4b->bd_group);
                for (i = 0; i < pa->pa_len; i++)
                        MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
        }
}
#undef MB_CHECK_ASSERT
#define mb_check_buddy(e4b) __mb_check_buddy(e4b,       \
                                        __FILE__, __func__, __LINE__)
#else
#define mb_check_buddy(e4b)
#endif

/*
 * Divide blocks started from @first with length @len into
 * smaller chunks with power of 2 blocks.
 * Clear the bits in the bitmap which the blocks of the chunk(s) cover,
 * then increase bb_counters[] for the corresponding chunk size.
 */
static void ext4_mb_mark_free_simple(struct super_block *sb,
                                void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
                                struct ext4_group_info *grp)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_grpblk_t min;
        ext4_grpblk_t max;
        ext4_grpblk_t chunk;
        unsigned int border;

        BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));

        border = 2 << sb->s_blocksize_bits;

        while (len > 0) {
                /* find how many blocks can be covered since this position */
                max = ffs(first | border) - 1;

                /* find how many blocks of power 2 we need to mark */
                min = fls(len) - 1;

                if (max < min)
                        min = max;
                chunk = 1 << min;

                /* mark multiblock chunks only */
                grp->bb_counters[min]++;
                if (min > 0)
                        mb_clear_bit(first >> min,
                                     buddy + sbi->s_mb_offsets[min]);

                len -= chunk;
                first += chunk;
        }
}
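
/*
 * Worked example (illustrative): marking first = 5, len = 11 free splits
 * the range into power-of-2 chunks of 1 (at 5), 2 (at 6) and 8 (at 8):
 * alignment limits the first chunks (ffs(5) - 1 = 0, ffs(6) - 1 = 1),
 * after which a whole order-3 buddy at block 8 can be used, bumping
 * bb_counters[0], bb_counters[1] and bb_counters[3] by one each.
 */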

static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
{
        int order;

        /*
         * We don't bother with special lists for groups with only 1-block
         * free extents or for completely empty groups.
         */
        order = fls(len) - 2;
        if (order < 0)
                return 0;
        if (order == MB_NUM_ORDERS(sb))
                order--;
        if (WARN_ON_ONCE(order > MB_NUM_ORDERS(sb)))
                order = MB_NUM_ORDERS(sb) - 1;
        return order;
}
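
/*
 * For example, a group whose average free extent is 30 clusters long maps
 * to order fls(30) - 2 = 3, i.e. the list holding averages in [16, 32);
 * a 1-cluster average is clamped to order 0 along with the 2-3 cluster
 * averages.
 */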

/* Move group to appropriate avg_fragment_size list */
static void
mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int new_order;

        if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_fragments == 0)
                return;

        new_order = mb_avg_fragment_size_order(sb,
                                        grp->bb_free / grp->bb_fragments);
        if (new_order == grp->bb_avg_fragment_size_order)
                return;

        if (grp->bb_avg_fragment_size_order != -1) {
                write_lock(&sbi->s_mb_avg_fragment_size_locks[
                                        grp->bb_avg_fragment_size_order]);
                list_del(&grp->bb_avg_fragment_size_node);
                write_unlock(&sbi->s_mb_avg_fragment_size_locks[
                                        grp->bb_avg_fragment_size_order]);
        }
        grp->bb_avg_fragment_size_order = new_order;
        write_lock(&sbi->s_mb_avg_fragment_size_locks[
                                        grp->bb_avg_fragment_size_order]);
        list_add_tail(&grp->bb_avg_fragment_size_node,
                &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]);
        write_unlock(&sbi->s_mb_avg_fragment_size_locks[
                                        grp->bb_avg_fragment_size_order]);
}

/*
 * Choose next group by traversing largest_free_order lists. Updates *new_cr if
 * cr level needs an update.
 */
static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context *ac,
                        enum criteria *new_cr, ext4_group_t *group)
{
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        struct ext4_group_info *iter;
        int i;

        if (ac->ac_status == AC_STATUS_FOUND)
                return;

        if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED))
                atomic_inc(&sbi->s_bal_p2_aligned_bad_suggestions);

        for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
                if (list_empty(&sbi->s_mb_largest_free_orders[i]))
                        continue;
                read_lock(&sbi->s_mb_largest_free_orders_locks[i]);
                if (list_empty(&sbi->s_mb_largest_free_orders[i])) {
                        read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
                        continue;
                }
                list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i],
                                    bb_largest_free_order_node) {
                        if (sbi->s_mb_stats)
                                atomic64_inc(&sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]);
                        if (likely(ext4_mb_good_group(ac, iter->bb_group, CR_POWER2_ALIGNED))) {
                                *group = iter->bb_group;
                                ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED;
                                read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
                                return;
                        }
                }
                read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
        }

        /* Increment cr and search again if no group is found */
        *new_cr = CR_GOAL_LEN_FAST;
}

/*
 * Find a suitable group of given order from the average fragments list.
 */
static struct ext4_group_info *
ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int order)
{
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        struct list_head *frag_list = &sbi->s_mb_avg_fragment_size[order];
        rwlock_t *frag_list_lock = &sbi->s_mb_avg_fragment_size_locks[order];
        struct ext4_group_info *grp = NULL, *iter;
        enum criteria cr = ac->ac_criteria;

        if (list_empty(frag_list))
                return NULL;
        read_lock(frag_list_lock);
        if (list_empty(frag_list)) {
                read_unlock(frag_list_lock);
                return NULL;
        }
        list_for_each_entry(iter, frag_list, bb_avg_fragment_size_node) {
                if (sbi->s_mb_stats)
                        atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]);
                if (likely(ext4_mb_good_group(ac, iter->bb_group, cr))) {
                        grp = iter;
                        break;
                }
        }
        read_unlock(frag_list_lock);
        return grp;
}

/*
 * Choose next group by traversing average fragment size list of suitable
 * order. Updates *new_cr if cr level needs an update.
 */
static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *ac,
                enum criteria *new_cr, ext4_group_t *group)
{
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        struct ext4_group_info *grp = NULL;
        int i;

        if (unlikely(ac->ac_flags & EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED)) {
                if (sbi->s_mb_stats)
                        atomic_inc(&sbi->s_bal_goal_fast_bad_suggestions);
        }

        for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
             i < MB_NUM_ORDERS(ac->ac_sb); i++) {
                grp = ext4_mb_find_good_group_avg_frag_lists(ac, i);
                if (grp) {
                        *group = grp->bb_group;
                        ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED;
                        return;
                }
        }

        /*
         * CR_BEST_AVAIL_LEN works based on the concept that we have
         * a larger normalized goal len request which can be trimmed to
         * a smaller goal len such that it can still satisfy the original
         * request len. However, allocation requests for non-regular
         * files never get normalized.
         * See function ext4_mb_normalize_request() (EXT4_MB_HINT_DATA).
         */
        if (ac->ac_flags & EXT4_MB_HINT_DATA)
                *new_cr = CR_BEST_AVAIL_LEN;
        else
                *new_cr = CR_GOAL_LEN_SLOW;
}

/*
 * We couldn't find a group in CR_GOAL_LEN_FAST so try to find the highest free
 * fragment order we have and proactively trim the goal request length to that
 * order to find a suitable group faster.
 *
 * This optimizes allocation speed at the cost of slightly reduced
 * preallocations. However, we make sure that we don't trim the request too
 * much and fall back to CR_GOAL_LEN_SLOW in that case.
 */
static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context *ac,
                enum criteria *new_cr, ext4_group_t *group)
{
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        struct ext4_group_info *grp = NULL;
        int i, order, min_order;
        unsigned long num_stripe_clusters = 0;

        if (unlikely(ac->ac_flags & EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED)) {
                if (sbi->s_mb_stats)
                        atomic_inc(&sbi->s_bal_best_avail_bad_suggestions);
        }

        /*
         * mb_avg_fragment_size_order() returns order in a way that makes
         * retrieving back the length using (1 << order) inaccurate. Hence, use
         * fls() instead since we need to know the actual length while modifying
         * the goal length.
         */
        order = fls(ac->ac_g_ex.fe_len) - 1;
        if (WARN_ON_ONCE(order - 1 > MB_NUM_ORDERS(ac->ac_sb)))
                order = MB_NUM_ORDERS(ac->ac_sb);
        min_order = order - sbi->s_mb_best_avail_max_trim_order;
        if (min_order < 0)
                min_order = 0;

        if (sbi->s_stripe > 0) {
                /*
                 * We are assuming that stripe size is always a multiple of
                 * cluster ratio otherwise __ext4_fill_super exits early.
                 */
                num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe);
                if (1 << min_order < num_stripe_clusters)
                        /*
                         * We consider 1 order less because later we round
                         * up the goal len to num_stripe_clusters
                         */
                        min_order = fls(num_stripe_clusters) - 1;
        }

        if (1 << min_order < ac->ac_o_ex.fe_len)
                min_order = fls(ac->ac_o_ex.fe_len);

        for (i = order; i >= min_order; i--) {
                int frag_order;
                /*
                 * Scale down goal len to make sure we find something
                 * in the free fragments list. Basically, reduce
                 * preallocations.
                 */
                ac->ac_g_ex.fe_len = 1 << i;

                if (num_stripe_clusters > 0) {
                        /*
                         * Try to round up the adjusted goal length to
                         * stripe size (in cluster units) multiple for
                         * efficiency.
                         */
                        ac->ac_g_ex.fe_len = roundup(ac->ac_g_ex.fe_len,
                                                     num_stripe_clusters);
                }

                frag_order = mb_avg_fragment_size_order(ac->ac_sb,
                                                        ac->ac_g_ex.fe_len);

                grp = ext4_mb_find_good_group_avg_frag_lists(ac, frag_order);
                if (grp) {
                        *group = grp->bb_group;
                        ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED;
                        return;
                }
        }

        /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */
        ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
        *new_cr = CR_GOAL_LEN_SLOW;
}
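
/*
 * Worked example (illustrative values only): with a normalized goal of 512
 * clusters (order = fls(512) - 1 = 9) and s_mb_best_avail_max_trim_order = 3,
 * min_order starts at 6; if the original request was 100 clusters,
 * min_order is raised to fls(100) = 7, so candidate goal lengths of 512,
 * 256 and 128 clusters are tried against the average fragment size lists
 * before the goal is restored and we fall back to CR_GOAL_LEN_SLOW.
 */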

static inline int should_optimize_scan(struct ext4_allocation_context *ac)
{
        if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN)))
                return 0;
        if (ac->ac_criteria >= CR_GOAL_LEN_SLOW)
                return 0;
        if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))
                return 0;
        return 1;
}

/*
 * Return next linear group for allocation.
 */
static ext4_group_t
next_linear_group(ext4_group_t group, ext4_group_t ngroups)
{
        /*
         * Artificially restricted ngroups for non-extent
         * files makes group > ngroups possible on first loop.
         */
        return group + 1 >= ngroups ? 0 : group + 1;
}

/*
 * ext4_mb_choose_next_group: choose next group for allocation.
 *
 * @ac        Allocation Context
 * @new_cr    This is an output parameter. If there is no good group
 *            available at the current CR level, this field is updated to
 *            indicate the new cr level that should be used.
 * @group     This is an input / output parameter. As an input it indicates the
 *            next group that the allocator intends to use for allocation. As
 *            output, this field indicates the next group that should be used
 *            as determined by the optimization functions.
 * @ngroups   Total number of groups
 */
static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
                enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
{
        *new_cr = ac->ac_criteria;

        if (!should_optimize_scan(ac)) {
                *group = next_linear_group(*group, ngroups);
                return;
        }

        /*
         * Optimized scanning can return non-adjacent groups which can cause
         * seek overhead for rotational disks. So try a few linear groups
         * before trying the optimized scan.
         */
        if (ac->ac_groups_linear_remaining) {
                *group = next_linear_group(*group, ngroups);
                ac->ac_groups_linear_remaining--;
                return;
        }

        if (*new_cr == CR_POWER2_ALIGNED) {
                ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group);
        } else if (*new_cr == CR_GOAL_LEN_FAST) {
                ext4_mb_choose_next_group_goal_fast(ac, new_cr, group);
        } else if (*new_cr == CR_BEST_AVAIL_LEN) {
                ext4_mb_choose_next_group_best_avail(ac, new_cr, group);
        } else {
                /*
                 * TODO: For CR_GOAL_LEN_SLOW, we can arrange groups in an
                 * rb tree sorted by bb_free. But until that happens, we should
                 * never come here.
                 */
                WARN_ON(1);
        }
}

/*
 * Cache the order of the largest free extent we have available in this block
 * group.
 */
static void
mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int i;

        for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--)
                if (grp->bb_counters[i] > 0)
                        break;
        /* No need to move between order lists? */
        if (!test_opt2(sb, MB_OPTIMIZE_SCAN) ||
            i == grp->bb_largest_free_order) {
                grp->bb_largest_free_order = i;
                return;
        }

        if (grp->bb_largest_free_order >= 0) {
                write_lock(&sbi->s_mb_largest_free_orders_locks[
                                        grp->bb_largest_free_order]);
                list_del_init(&grp->bb_largest_free_order_node);
                write_unlock(&sbi->s_mb_largest_free_orders_locks[
                                        grp->bb_largest_free_order]);
        }
        grp->bb_largest_free_order = i;
        if (grp->bb_largest_free_order >= 0 && grp->bb_free) {
                write_lock(&sbi->s_mb_largest_free_orders_locks[
                                        grp->bb_largest_free_order]);
                list_add_tail(&grp->bb_largest_free_order_node,
                        &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]);
                write_unlock(&sbi->s_mb_largest_free_orders_locks[
                                        grp->bb_largest_free_order]);
        }
}

static noinline_for_stack
void ext4_mb_generate_buddy(struct super_block *sb,
                            void *buddy, void *bitmap, ext4_group_t group,
                            struct ext4_group_info *grp)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
        ext4_grpblk_t i = 0;
        ext4_grpblk_t first;
        ext4_grpblk_t len;
        unsigned free = 0;
        unsigned fragments = 0;
        unsigned long long period = get_cycles();

        /* initialize buddy from bitmap which is aggregation
         * of on-disk bitmap and preallocations */
        i = mb_find_next_zero_bit(bitmap, max, 0);
        grp->bb_first_free = i;
        while (i < max) {
                fragments++;
                first = i;
                i = mb_find_next_bit(bitmap, max, i);
                len = i - first;
                free += len;
                if (len > 1)
                        ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
                else
                        grp->bb_counters[0]++;
                if (i < max)
                        i = mb_find_next_zero_bit(bitmap, max, i);
        }
        grp->bb_fragments = fragments;

        if (free != grp->bb_free) {
                ext4_grp_locked_error(sb, group, 0, 0,
                                      "block bitmap and bg descriptor "
                                      "inconsistent: %u vs %u free clusters",
                                      free, grp->bb_free);
                /*
                 * If we intend to continue, we consider group descriptor
                 * corrupt and update bb_free using bitmap value
                 */
                grp->bb_free = free;
                ext4_mark_group_bitmap_corrupted(sb, group,
                                        EXT4_GROUP_INFO_BBITMAP_CORRUPT);
        }
        mb_set_largest_free_order(sb, grp);
        mb_update_avg_fragment_size(sb, grp);

        clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));

        period = get_cycles() - period;
        atomic_inc(&sbi->s_mb_buddies_generated);
        atomic64_add(period, &sbi->s_mb_generation_time);
}

static void mb_regenerate_buddy(struct ext4_buddy *e4b)
{
        int count;
        int order = 1;
        void *buddy;

        while ((buddy = mb_find_buddy(e4b, order++, &count)))
                mb_set_bits(buddy, 0, count);

        e4b->bd_info->bb_fragments = 0;
        memset(e4b->bd_info->bb_counters, 0,
                sizeof(*e4b->bd_info->bb_counters) *
                (e4b->bd_sb->s_blocksize_bits + 2));

        ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
                e4b->bd_bitmap, e4b->bd_group, e4b->bd_info);
}

/*
 * The buddy information is attached to the buddy cache inode for
 * convenience. The information regarding each group is loaded via
 * ext4_mb_load_buddy. The information involved is the block bitmap and
 * the buddy information, stored in the inode as:
 *
 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 *
 * one block each for bitmap and buddy information.
 * So for each group we take up 2 blocks. A page can
 * contain blocks_per_page (PAGE_SIZE / blocksize) blocks.
 * So it can have information regarding groups_per_page groups,
 * which is blocks_per_page/2.
 *
 * Locking note: This routine takes the block group lock of all groups
 * for this page; do not hold this lock when calling this routine!
 */
static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp)
{
        ext4_group_t ngroups;
        unsigned int blocksize;
        int blocks_per_page;
        int groups_per_page;
        int err = 0;
        int i;
        ext4_group_t first_group, group;
        int first_block;
        struct super_block *sb;
        struct buffer_head *bhs;
        struct buffer_head **bh = NULL;
        struct inode *inode;
        char *data;
        char *bitmap;
        struct ext4_group_info *grinfo;

        inode = folio->mapping->host;
        sb = inode->i_sb;
        ngroups = ext4_get_groups_count(sb);
        blocksize = i_blocksize(inode);
        blocks_per_page = PAGE_SIZE / blocksize;

        mb_debug(sb, "init folio %lu\n", folio->index);

        groups_per_page = blocks_per_page >> 1;
        if (groups_per_page == 0)
                groups_per_page = 1;

        /* allocate buffer_heads to read bitmaps */
        if (groups_per_page > 1) {
                i = sizeof(struct buffer_head *) * groups_per_page;
                bh = kzalloc(i, gfp);
                if (bh == NULL)
                        return -ENOMEM;
        } else
                bh = &bhs;

        first_group = folio->index * blocks_per_page / 2;

        /* read all groups the folio covers into the cache */
        for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
                if (group >= ngroups)
                        break;

                grinfo = ext4_get_group_info(sb, group);
                if (!grinfo)
                        continue;
                /*
                 * If page is uptodate then we came here after online resize
                 * which added some new uninitialized group info structs, so
                 * we must skip all initialized uptodate buddies on the folio,
                 * which may be currently in use by an allocating task.
                 */
                if (folio_test_uptodate(folio) &&
                    !EXT4_MB_GRP_NEED_INIT(grinfo)) {
                        bh[i] = NULL;
                        continue;
                }
                bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
                if (IS_ERR(bh[i])) {
                        err = PTR_ERR(bh[i]);
                        bh[i] = NULL;
                        goto out;
                }
                mb_debug(sb, "read bitmap for group %u\n", group);
        }

        /* wait for I/O completion */
        for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
                int err2;

                if (!bh[i])
                        continue;
                err2 = ext4_wait_block_bitmap(sb, group, bh[i]);
                if (!err)
                        err = err2;
        }

        first_block = folio->index * blocks_per_page;
        for (i = 0; i < blocks_per_page; i++) {
                group = (first_block + i) >> 1;
                if (group >= ngroups)
                        break;

                if (!bh[group - first_group])
                        /* skip initialized uptodate buddy */
                        continue;

                if (!buffer_verified(bh[group - first_group]))
                        /* Skip faulty bitmaps */
                        continue;
                err = 0;

                /*
                 * data carries information regarding this
                 * particular group in the format specified
                 * above
                 */
                data = folio_address(folio) + (i * blocksize);
                bitmap = bh[group - first_group]->b_data;

                /*
                 * We place the buddy block and bitmap block
                 * close together
                 */
                grinfo = ext4_get_group_info(sb, group);
                if (!grinfo) {
                        err = -EFSCORRUPTED;
                        goto out;
                }
                if ((first_block + i) & 1) {
                        /* this is block of buddy */
                        BUG_ON(incore == NULL);
                        mb_debug(sb, "put buddy for group %u in folio %lu/%x\n",
                                group, folio->index, i * blocksize);
                        trace_ext4_mb_buddy_bitmap_load(sb, group);
                        grinfo->bb_fragments = 0;
                        memset(grinfo->bb_counters, 0,
                               sizeof(*grinfo->bb_counters) *
                               (MB_NUM_ORDERS(sb)));
                        /*
                         * incore got set to the group block bitmap below
                         */
                        ext4_lock_group(sb, group);
                        /* init the buddy */
                        memset(data, 0xff, blocksize);
                        ext4_mb_generate_buddy(sb, data, incore, group, grinfo);
                        ext4_unlock_group(sb, group);
                        incore = NULL;
                } else {
                        /* this is block of bitmap */
                        BUG_ON(incore != NULL);
                        mb_debug(sb, "put bitmap for group %u in folio %lu/%x\n",
                                group, folio->index, i * blocksize);
                        trace_ext4_mb_bitmap_load(sb, group);

                        /* see comments in ext4_mb_put_pa() */
                        ext4_lock_group(sb, group);
                        memcpy(data, bitmap, blocksize);

                        /* mark all preallocated blks used in in-core bitmap */
                        ext4_mb_generate_from_pa(sb, data, group);
                        WARN_ON_ONCE(!RB_EMPTY_ROOT(&grinfo->bb_free_root));
                        ext4_unlock_group(sb, group);

                        /* set incore so that the buddy information can be
                         * generated using this
                         */
                        incore = data;
                }
        }
        folio_mark_uptodate(folio);

out:
        if (bh) {
                for (i = 0; i < groups_per_page; i++)
                        brelse(bh[i]);
                if (bh != &bhs)
                        kfree(bh);
        }
        return err;
}

/*
 * Lock the buddy and bitmap pages. This makes sure other parallel init_group
 * on the same buddy page doesn't happen while holding the buddy page lock.
 * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
 * are on the same page e4b->bd_buddy_folio is NULL and return value is 0.
 */
static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
                ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
{
        struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
        int block, pnum, poff;
        int blocks_per_page;
        struct folio *folio;

        e4b->bd_buddy_folio = NULL;
        e4b->bd_bitmap_folio = NULL;

        blocks_per_page = PAGE_SIZE / sb->s_blocksize;
        /*
         * the buddy cache inode stores the block bitmap
         * and buddy information in consecutive blocks.
         * So for each group we need two blocks.
         */
        block = group * 2;
        pnum = block / blocks_per_page;
        poff = block % blocks_per_page;
        folio = __filemap_get_folio(inode->i_mapping, pnum,
                        FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
        if (IS_ERR(folio))
                return PTR_ERR(folio);
        BUG_ON(folio->mapping != inode->i_mapping);
        e4b->bd_bitmap_folio = folio;
        e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize);

        if (blocks_per_page >= 2) {
                /* buddy and bitmap are on the same page */
                return 0;
        }

        /* blocks_per_page == 1, hence we need another page for the buddy */
        folio = __filemap_get_folio(inode->i_mapping, block + 1,
                        FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
        if (IS_ERR(folio))
                return PTR_ERR(folio);
        BUG_ON(folio->mapping != inode->i_mapping);
        e4b->bd_buddy_folio = folio;
        return 0;
}

static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
{
        if (e4b->bd_bitmap_folio) {
                folio_unlock(e4b->bd_bitmap_folio);
                folio_put(e4b->bd_bitmap_folio);
        }
        if (e4b->bd_buddy_folio) {
                folio_unlock(e4b->bd_buddy_folio);
                folio_put(e4b->bd_buddy_folio);
        }
}
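
/*
 * Worked example (illustrative): the buddy cache stores two blocks per
 * group (bitmap, then buddy), so group g's bitmap lives at block 2*g and
 * its buddy at block 2*g + 1. With 1 KiB blocks and 4 KiB pages
 * (blocks_per_page = 4), group 5's bitmap is block 10 -> page 2, offset 2,
 * and its buddy is block 11 -> page 2, offset 3, i.e. both on one page,
 * which is why bd_buddy_folio stays NULL in that case.
 */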

/*
 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
 * block group lock of all groups for this page; do not hold the BG lock when
 * calling this routine!
 */
static noinline_for_stack
int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
{
        struct ext4_group_info *this_grp;
        struct ext4_buddy e4b;
        struct folio *folio;
        int ret = 0;

        mb_debug(sb, "init group %u\n", group);
        this_grp = ext4_get_group_info(sb, group);
        if (!this_grp)
                return -EFSCORRUPTED;

        /*
         * This ensures that we don't reinit the buddy cache
         * pages which map to the group from which we are already
         * allocating. If we are looking at the buddy cache we would
         * have taken a reference using ext4_mb_load_buddy and that
         * would have pinned the buddy page to the page cache.
         * The call to ext4_mb_get_buddy_page_lock will mark the
         * page accessed.
         */
        ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp);
        if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
                /*
                 * somebody initialized the group
                 * return without doing anything
                 */
                goto err;
        }

        folio = e4b.bd_bitmap_folio;
        ret = ext4_mb_init_cache(folio, NULL, gfp);
        if (ret)
                goto err;
        if (!folio_test_uptodate(folio)) {
                ret = -EIO;
                goto err;
        }

        if (e4b.bd_buddy_folio == NULL) {
                /*
                 * If both the bitmap and buddy are in
                 * the same page we don't need to force
                 * init the buddy
                 */
                ret = 0;
                goto err;
        }
        /* init buddy cache */
        folio = e4b.bd_buddy_folio;
        ret = ext4_mb_init_cache(folio, e4b.bd_bitmap, gfp);
        if (ret)
                goto err;
        if (!folio_test_uptodate(folio)) {
                ret = -EIO;
                goto err;
        }
err:
        ext4_mb_put_buddy_page_lock(&e4b);
        return ret;
}

/*
 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
 * block group lock of all groups for this page; do not hold the BG lock when
 * calling this routine!
 */
static noinline_for_stack int
ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
                       struct ext4_buddy *e4b, gfp_t gfp)
{
        int blocks_per_page;
        int block;
        int pnum;
        int poff;
        struct folio *folio;
        int ret;
        struct ext4_group_info *grp;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct inode *inode = sbi->s_buddy_cache;

        mb_debug(sb, "load group %u\n", group);

        blocks_per_page = PAGE_SIZE / sb->s_blocksize;
        grp = ext4_get_group_info(sb, group);
        if (!grp)
                return -EFSCORRUPTED;

        e4b->bd_blkbits = sb->s_blocksize_bits;
        e4b->bd_info = grp;
        e4b->bd_sb = sb;
        e4b->bd_group = group;
        e4b->bd_buddy_folio = NULL;
        e4b->bd_bitmap_folio = NULL;

        if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
                /*
                 * we need full data about the group
                 * to make a good selection
                 */
                ret = ext4_mb_init_group(sb, group, gfp);
                if (ret)
                        return ret;
        }

        /*
         * the buddy cache inode stores the block bitmap
         * and buddy information in consecutive blocks.
         * So for each group we need two blocks.
         */
        block = group * 2;
        pnum = block / blocks_per_page;
        poff = block % blocks_per_page;

        /* Avoid locking the folio in the fast path ... */
        folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0);
        if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
                if (!IS_ERR(folio))
                        /*
                         * drop the folio reference and try
                         * to get the folio with lock. If we
                         * are not uptodate that implies
                         * somebody just created the folio but
                         * is yet to initialize it. So
                         * wait for it to initialize.
                         */
                        folio_put(folio);
                folio = __filemap_get_folio(inode->i_mapping, pnum,
                                FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
                if (!IS_ERR(folio)) {
                        if (WARN_RATELIMIT(folio->mapping != inode->i_mapping,
                        "ext4: bitmap's mapping != inode->i_mapping\n")) {
                                /* should never happen */
                                folio_unlock(folio);
                                ret = -EINVAL;
                                goto err;
                        }
                        if (!folio_test_uptodate(folio)) {
                                ret = ext4_mb_init_cache(folio, NULL, gfp);
                                if (ret) {
                                        folio_unlock(folio);
                                        goto err;
                                }
                                mb_cmp_bitmaps(e4b, folio_address(folio) +
                                               (poff * sb->s_blocksize));
                        }
                        folio_unlock(folio);
                }
        }
        if (IS_ERR(folio)) {
                ret = PTR_ERR(folio);
                goto err;
        }
        if (!folio_test_uptodate(folio)) {
                ret = -EIO;
                goto err;
        }

        /* Folios marked accessed already */
        e4b->bd_bitmap_folio = folio;
        e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize);

        block++;
        pnum = block / blocks_per_page;
        poff = block % blocks_per_page;

        folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0);
        if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
                if (!IS_ERR(folio))
                        folio_put(folio);
                folio = __filemap_get_folio(inode->i_mapping, pnum,
                                FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
                if (!IS_ERR(folio)) {
                        if (WARN_RATELIMIT(folio->mapping != inode->i_mapping,
                        "ext4: buddy bitmap's mapping != inode->i_mapping\n")) {
                                /* should never happen */
                                folio_unlock(folio);
                                ret = -EINVAL;
                                goto err;
                        }
                        if (!folio_test_uptodate(folio)) {
                                ret = ext4_mb_init_cache(folio, e4b->bd_bitmap,
                                                         gfp);
                                if (ret) {
                                        folio_unlock(folio);
                                        goto err;
                                }
                        }
                        folio_unlock(folio);
                }
        }
        if (IS_ERR(folio)) {
                ret = PTR_ERR(folio);
                goto err;
        }
        if (!folio_test_uptodate(folio)) {
                ret = -EIO;
                goto err;
        }

        /* Folios marked accessed already */
        e4b->bd_buddy_folio = folio;
        e4b->bd_buddy = folio_address(folio) + (poff * sb->s_blocksize);

        return 0;

err:
        if (!IS_ERR_OR_NULL(folio))
                folio_put(folio);
        if (e4b->bd_bitmap_folio)
                folio_put(e4b->bd_bitmap_folio);

        e4b->bd_buddy = NULL;
        e4b->bd_bitmap = NULL;
        return ret;
}

static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
                              struct ext4_buddy *e4b)
{
        return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS);
}

static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
{
        if (e4b->bd_bitmap_folio)
                folio_put(e4b->bd_bitmap_folio);
        if (e4b->bd_buddy_folio)
                folio_put(e4b->bd_buddy_folio);
}

static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
{
        int order = 1;
        int max;
        void *bb;

        BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
        BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));

        while (order <= e4b->bd_blkbits + 1) {
                bb = mb_find_buddy(e4b, order, &max);
                if (!mb_test_bit(block >> order, bb)) {
                        /* this block is part of buddy of order 'order' */
                        return order;
                }
                order++;
        }
        return 0;
}

static void mb_clear_bits(void *bm, int cur, int len)
{
        __u32 *addr;

        len = cur + len;
        while (cur < len) {
                if ((cur & 31) == 0 && (len - cur) >= 32) {
                        /* fast path: clear whole word at once */
                        addr = bm + (cur >> 3);
                        *addr = 0;
                        cur += 32;
                        continue;
                }
                mb_clear_bit(cur, bm);
                cur++;
        }
}

/* clear bits in given range
 * will return first found zero bit if any, -1 otherwise
 */
static int mb_test_and_clear_bits(void *bm, int cur, int len)
{
        __u32 *addr;
        int zero_bit = -1;

        len = cur + len;
        while (cur < len) {
                if ((cur & 31) == 0 && (len - cur) >= 32) {
                        /* fast path: clear whole word at once */
                        addr = bm + (cur >> 3);
                        if (*addr != (__u32)(-1) && zero_bit == -1)
                                zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
                        *addr = 0;
                        cur += 32;
                        continue;
                }
                if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
                        zero_bit = cur;
                cur++;
        }

        return zero_bit;
}

void mb_set_bits(void *bm, int cur, int len)
{
        __u32 *addr;

        len = cur + len;
        while (cur < len) {
                if ((cur & 31) == 0 && (len - cur) >= 32) {
                        /* fast path: set whole word at once */
                        addr = bm + (cur >> 3);
                        *addr = 0xffffffff;
                        cur += 32;
                        continue;
                }
                mb_set_bit(cur, bm);
                cur++;
        }
}

static inline int mb_buddy_adjust_border(int *bit, void *bitmap, int side)
{
        if (mb_test_bit(*bit + side, bitmap)) {
                mb_clear_bit(*bit, bitmap);
                (*bit) += side;
                return 1;
        } else {
                (*bit) += side;
                mb_set_bit(*bit, bitmap);
                return -1;
        }
}

static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
{
        int max;
        int order = 1;
        void *buddy = mb_find_buddy(e4b, order, &max);
        void *buddy2;

        /* Bits in range [first; last] are known to be set since
         * corresponding blocks were allocated. Bits in range
         * (first; last) will stay set because they form buddies on
         * upper layer. We just deal with borders if they don't
         * align with upper layer and then go up.
         * Releasing entire group is all about clearing
         * single bit of highest order buddy.
         */

        /* Example:
         * ---------------------------------
         * |   1   |   1   |   1   |   1   |
         * ---------------------------------
         * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
         * ---------------------------------
         *   0   1   2   3   4   5   6   7
         *      \_____________________/
         *
         * Neither [1] nor [6] is aligned to above layer.
         * Left neighbour [0] is free, so mark it busy,
         * decrease bb_counters and extend range to
         * [0; 6].
         * Right neighbour [7] is busy. It can't be coalesced with [6], so
         * mark [6] free, increase bb_counters and shrink range to
         * [0; 5].
         * Then shift range to [0; 2], go up and do the same.
         */

        while (1) {
                if (first & 1)
                        e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
                if (!(last & 1))
                        e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
                if (first > last)
                        break;
                order++;

                buddy2 = mb_find_buddy(e4b, order, &max);
                if (!buddy2) {
                        mb_clear_bits(buddy, first, last - first + 1);
                        e4b->bd_info->bb_counters[order - 1] += last - first + 1;
                        break;
                }
                first >>= 1;
                last >>= 1;
                buddy = buddy2;
        }
}

static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
                           int first, int count)
{
        int left_is_free = 0;
        int right_is_free = 0;
        int block;
        int last = first + count - 1;
        struct super_block *sb = e4b->bd_sb;

        if (WARN_ON(count == 0))
                return;
        BUG_ON(last >= (sb->s_blocksize << 3));
        assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
        /* Don't bother if the block group is corrupt. */
        if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
                return;

        mb_check_buddy(e4b);
        mb_free_blocks_double(inode, e4b, first, count);

        /* access memory sequentially: check left neighbour,
         * clear range and then check right neighbour
         */
        if (first != 0)
                left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
        block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
        if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
                right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);

        if (unlikely(block != -1)) {
                struct ext4_sb_info *sbi = EXT4_SB(sb);
                ext4_fsblk_t blocknr;

                /*
                 * Fastcommit replay can free already freed blocks which
                 * corrupts allocation info. Regenerate it.
                 */
                if (sbi->s_mount_state & EXT4_FC_REPLAY) {
                        mb_regenerate_buddy(e4b);
                        goto check;
                }

                blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
                blocknr += EXT4_C2B(sbi, block);
                ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
                                EXT4_GROUP_INFO_BBITMAP_CORRUPT);
                ext4_grp_locked_error(sb, e4b->bd_group,
                                      inode ? inode->i_ino : 0, blocknr,
                                      "freeing already freed block (bit %u); block bitmap corrupt.",
                                      block);
                return;
        }

        this_cpu_inc(discard_pa_seq);
        e4b->bd_info->bb_free += count;
        if (first < e4b->bd_info->bb_first_free)
                e4b->bd_info->bb_first_free = first;

        /* let's maintain fragments counter */
        if (left_is_free && right_is_free)
                e4b->bd_info->bb_fragments--;
        else if (!left_is_free && !right_is_free)
                e4b->bd_info->bb_fragments++;

        /* buddy[0] == bd_bitmap is a special case, so handle
         * it right away and let mb_buddy_mark_free stay free of
         * zero order checks.
         * Check if neighbours are to be coalesced,
         * adjust bitmap bb_counters and borders appropriately.
         */
        if (first & 1) {
                first += !left_is_free;
                e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
        }
        if (!(last & 1)) {
                last -= !right_is_free;
                e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
        }

        if (first <= last)
                mb_buddy_mark_free(e4b, first >> 1, last >> 1);

        mb_set_largest_free_order(sb, e4b->bd_info);
        mb_update_avg_fragment_size(sb, e4b->bd_info);
check:
        mb_check_buddy(e4b);
}

static int mb_find_extent(struct ext4_buddy *e4b, int block,
                          int needed, struct ext4_free_extent *ex)
{
        int max, order, next;
        void *buddy;

        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));

        buddy = mb_find_buddy(e4b, 0, &max);
        BUG_ON(buddy == NULL);
        BUG_ON(block >= max);
        if (mb_test_bit(block, buddy)) {
                ex->fe_len = 0;
                ex->fe_start = 0;
                ex->fe_group = 0;
                return 0;
        }

        /* find actual order */
        order = mb_find_order_for_block(e4b, block);

        ex->fe_len = (1 << order) - (block & ((1 << order) - 1));
        ex->fe_start = block;
        ex->fe_group = e4b->bd_group;

        block = block >> order;

        while (needed > ex->fe_len &&
               mb_find_buddy(e4b, order, &max)) {

                if (block + 1 >= max)
                        break;

                next = (block + 1) * (1 << order);
                if (mb_test_bit(next, e4b->bd_bitmap))
                        break;

                order = mb_find_order_for_block(e4b, next);

                block = next >> order;
                ex->fe_len += 1 << order;
        }

        if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) {
                /* Should never happen! (but apparently sometimes does?!?) */
                WARN_ON(1);
                ext4_grp_locked_error(e4b->bd_sb, e4b->bd_group, 0, 0,
                        "corruption or bug in mb_find_extent "
                        "block=%d, order=%d needed=%d ex=%u/%d/%d@%u",
                        block, order, needed, ex->fe_group, ex->fe_start,
                        ex->fe_len, ex->fe_logical);
                ex->fe_len = 0;
                ex->fe_start = 0;
                ex->fe_group = 0;
        }
        return ex->fe_len;
}
static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
{
	int ord;
	int mlen = 0;
	int max = 0;
	int start = ex->fe_start;
	int len = ex->fe_len;
	unsigned ret = 0;
	int len0 = len;
	void *buddy;
	int ord_start, ord_end;

	BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
	BUG_ON(e4b->bd_group != ex->fe_group);
	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
	mb_check_buddy(e4b);
	mb_mark_used_double(e4b, start, len);

	this_cpu_inc(discard_pa_seq);
	e4b->bd_info->bb_free -= len;
	if (e4b->bd_info->bb_first_free == start)
		e4b->bd_info->bb_first_free += len;

	/* let's maintain fragments counter */
	if (start != 0)
		mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
	if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
		max = !mb_test_bit(start + len, e4b->bd_bitmap);
	if (mlen && max)
		e4b->bd_info->bb_fragments++;
	else if (!mlen && !max)
		e4b->bd_info->bb_fragments--;

	/* let's maintain buddy itself */
	while (len) {
		ord = mb_find_order_for_block(e4b, start);

		if (((start >> ord) << ord) == start && len >= (1 << ord)) {
			/* the whole chunk may be allocated at once! */
			mlen = 1 << ord;
			buddy = mb_find_buddy(e4b, ord, &max);
			BUG_ON((start >> ord) >= max);
			mb_set_bit(start >> ord, buddy);
			e4b->bd_info->bb_counters[ord]--;
			start += mlen;
			len -= mlen;
			BUG_ON(len < 0);
			continue;
		}

		/* store for history */
		if (ret == 0)
			ret = len | (ord << 16);

		buddy = mb_find_buddy(e4b, ord, &max);
		mb_set_bit(start >> ord, buddy);
		e4b->bd_info->bb_counters[ord]--;

		ord_start = (start >> ord) << ord;
		ord_end = ord_start + (1 << ord);
		/* free chunk before the allocated range */
		if (start > ord_start)
			ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy,
						 ord_start, start - ord_start,
						 e4b->bd_info);

		/* free chunk after the allocated range */
		if (start + len < ord_end) {
			ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy,
						 start + len,
						 ord_end - (start + len),
						 e4b->bd_info);
			break;
		}
		len = start + len - ord_end;
		start = ord_end;
	}
	mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);

	mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info);
	mb_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
	mb_check_buddy(e4b);

	return ret;
}
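/*
 * Illustrative sketch (not part of mballoc): when mb_mark_used() above bites
 * into the middle of an order-`ord` buddy chunk, the chunk [ord_start, ord_end)
 * is split into an allocated middle and up to two free remainders. The helper
 * below is hypothetical and only spells out the boundary arithmetic used above.
 */
static inline void mb_sketch_split_chunk(int start, int len, int ord,
					 int *left_free, int *right_free)
{
	int ord_start = (start >> ord) << ord;
	int ord_end = ord_start + (1 << ord);

	/* free clusters before and after the allocated [start, start+len) range */
	*left_free = start - ord_start;
	*right_free = (start + len < ord_end) ? ord_end - (start + len) : 0;
}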
/*
 * Must be called under group lock!
 */
static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
					struct ext4_buddy *e4b)
{
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	int ret;

	BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
	BUG_ON(ac->ac_status == AC_STATUS_FOUND);

	ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
	ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
	ret = mb_mark_used(e4b, &ac->ac_b_ex);

	/* preallocation can change ac_b_ex, thus we store actually
	 * allocated blocks for history */
	ac->ac_f_ex = ac->ac_b_ex;

	ac->ac_status = AC_STATUS_FOUND;
	ac->ac_tail = ret & 0xffff;
	ac->ac_buddy = ret >> 16;

	/*
	 * take the folio reference. We want the folio to be pinned
	 * so that we don't get an ext4_mb_init_cache() call for this
	 * group until we update the bitmap. That would mean we
	 * double allocate blocks. The reference is dropped
	 * in ext4_mb_release_context().
	 */
	ac->ac_bitmap_folio = e4b->bd_bitmap_folio;
	folio_get(ac->ac_bitmap_folio);
	ac->ac_buddy_folio = e4b->bd_buddy_folio;
	folio_get(ac->ac_buddy_folio);
	/* store last allocated for subsequent stream allocation */
	if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
		spin_lock(&sbi->s_md_lock);
		sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
		sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
		spin_unlock(&sbi->s_md_lock);
	}
	/*
	 * As we've just preallocated more space than
	 * the user requested originally, we store the allocated
	 * space in a special descriptor.
	 */
	if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
		ext4_mb_new_preallocation(ac);
}
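/*
 * Illustrative sketch (not part of mballoc): mb_mark_used() packs the tail
 * length and the buddy order into a single int, which
 * ext4_mb_use_best_found() above unpacks into ac_tail and ac_buddy. The
 * helpers below are hypothetical and only restate that encoding.
 */
static inline int mb_sketch_pack_history(int len, int ord)
{
	return len | (ord << 16);
}

static inline void mb_sketch_unpack_history(int ret, int *tail, int *buddy_order)
{
	*tail = ret & 0xffff;		/* low 16 bits: tail length */
	*buddy_order = ret >> 16;	/* high bits: buddy order used */
}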
static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
					struct ext4_buddy *e4b,
					int finish_group)
{
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	struct ext4_free_extent *bex = &ac->ac_b_ex;
	struct ext4_free_extent *gex = &ac->ac_g_ex;

	if (ac->ac_status == AC_STATUS_FOUND)
		return;
	/*
	 * We don't want to scan for a whole year
	 */
	if (ac->ac_found > sbi->s_mb_max_to_scan &&
			!(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
		ac->ac_status = AC_STATUS_BREAK;
		return;
	}

	/*
	 * Haven't found a good chunk so far, let's continue
	 */
	if (bex->fe_len < gex->fe_len)
		return;

	if (finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
		ext4_mb_use_best_found(ac, e4b);
}
/*
 * The routine checks whether the found extent is good enough. If it is,
 * then the extent gets marked used and a flag is set in the context
 * to stop scanning. Otherwise, the extent is compared with the
 * previously found extent and, if the new one is better, it is stored
 * in the context. Later, the best found extent will be used if
 * mballoc can't find a good enough extent.
 *
 * The algorithm used is roughly as follows:
 *
 * * If the free extent found is exactly as big as the goal, then
 *   stop the scan and use it immediately
 *
 * * If the free extent found is smaller than the goal, then keep retrying
 *   up to a max of sbi->s_mb_max_to_scan times (default 200). After
 *   that stop scanning and use whatever we have.
 *
 * * If the free extent found is bigger than the goal, then keep retrying
 *   up to a max of sbi->s_mb_min_to_scan times (default 10) before
 *   stopping the scan and using the extent.
 *
 * FIXME: real allocation policy is to be designed yet!
 */
static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
					struct ext4_free_extent *ex,
					struct ext4_buddy *e4b)
{
	struct ext4_free_extent *bex = &ac->ac_b_ex;
	struct ext4_free_extent *gex = &ac->ac_g_ex;

	BUG_ON(ex->fe_len <= 0);
	BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
	BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
	BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);

	ac->ac_found++;
	ac->ac_cX_found[ac->ac_criteria]++;

	/*
	 * The special case - take what you catch first
	 */
	if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
		*bex = *ex;
		ext4_mb_use_best_found(ac, e4b);
		return;
	}

	/*
	 * Let's check whether the chunk is good enough
	 */
	if (ex->fe_len == gex->fe_len) {
		*bex = *ex;
		ext4_mb_use_best_found(ac, e4b);
		return;
	}

	/*
	 * If this is the first found extent, just store it in the context
	 */
	if (bex->fe_len == 0) {
		*bex = *ex;
		return;
	}

	/*
	 * If the newly found extent is better, store it in the context
	 */
	if (bex->fe_len < gex->fe_len) {
		/* if the request isn't satisfied, any found extent
		 * larger than the previous best one is better */
		if (ex->fe_len > bex->fe_len)
			*bex = *ex;
	} else if (ex->fe_len > gex->fe_len) {
		/* if the request is satisfied, then we try to find
		 * an extent that still satisfies the request, but is
		 * smaller than the previous one */
		if (ex->fe_len < bex->fe_len)
			*bex = *ex;
	}

	ext4_mb_check_limits(ac, e4b, 0);
}
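/*
 * Illustrative sketch (not part of mballoc): the selection policy implemented
 * by ext4_mb_measure_extent() above can be summarised as "prefer the smallest
 * extent that still satisfies the goal, otherwise the largest one found so
 * far". The helper below is hypothetical; it returns true when a candidate of
 * @ex_len clusters should replace the current best of @bex_len clusters for a
 * goal of @gex_len clusters.
 */
static inline bool mb_sketch_is_better_extent(int ex_len, int bex_len, int gex_len)
{
	if (bex_len == 0)
		return true;			/* first candidate seen */
	if (bex_len < gex_len)
		return ex_len > bex_len;	/* goal not met yet: bigger is better */
	if (ex_len > gex_len)
		return ex_len < bex_len;	/* goal met: smaller satisfying extent is better */
	return false;
}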
static noinline_for_stack
void ext4_mb_try_best_found(struct ext4_allocation_context *ac,
					struct ext4_buddy *e4b)
{
	struct ext4_free_extent ex = ac->ac_b_ex;
	ext4_group_t group = ex.fe_group;
	int max;
	int err;

	BUG_ON(ex.fe_len <= 0);
	err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
	if (err)
		return;

	ext4_lock_group(ac->ac_sb, group);
	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
		goto out;

	max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);

	if (max > 0) {
		ac->ac_b_ex = ex;
		ext4_mb_use_best_found(ac, e4b);
	}

out:
	ext4_unlock_group(ac->ac_sb, group);
	ext4_mb_unload_buddy(e4b);
}
static noinline_for_stack
int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
				struct ext4_buddy *e4b)
{
	ext4_group_t group = ac->ac_g_ex.fe_group;
	int max;
	int err;
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
	struct ext4_free_extent ex;

	if (!grp)
		return -EFSCORRUPTED;
	if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY)))
		return 0;
	if (grp->bb_free == 0)
		return 0;

	err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
	if (err)
		return err;

	ext4_lock_group(ac->ac_sb, group);
	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
		goto out;

	max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
			     ac->ac_g_ex.fe_len, &ex);
	ex.fe_logical = 0xDEADFA11; /* debug value */

	if (max >= ac->ac_g_ex.fe_len &&
	    ac->ac_g_ex.fe_len == EXT4_NUM_B2C(sbi, sbi->s_stripe)) {
		ext4_fsblk_t start;

		start = ext4_grp_offs_to_block(ac->ac_sb, &ex);
		/* use do_div to get remainder (would be 64-bit modulo) */
		if (do_div(start, sbi->s_stripe) == 0) {
			ac->ac_b_ex = ex;
			ext4_mb_use_best_found(ac, e4b);
		}
	} else if (max >= ac->ac_g_ex.fe_len) {
		BUG_ON(ex.fe_len <= 0);
		BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
		BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
		ac->ac_b_ex = ex;
		ext4_mb_use_best_found(ac, e4b);
	} else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
		/* Sometimes, the caller may want to merge even a small
		 * number of blocks to an existing extent */
		BUG_ON(ex.fe_len <= 0);
		BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
		BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
		ac->ac_b_ex = ex;
		ext4_mb_use_best_found(ac, e4b);
	}
out:
	ext4_unlock_group(ac->ac_sb, group);
	ext4_mb_unload_buddy(e4b);

	return 0;
}
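/*
 * Illustrative sketch (not part of mballoc): ext4_mb_find_by_goal() above uses
 * do_div() because the physical block number is 64-bit while the stripe size
 * is not; do_div() divides in place and evaluates to the remainder. The helper
 * below is hypothetical and only restates the alignment test.
 */
static inline bool mb_sketch_is_stripe_aligned(ext4_fsblk_t block,
					       unsigned int stripe)
{
	/* 'block' is a by-value copy, so modifying it via do_div() is harmless here */
	return do_div(block, stripe) == 0;
}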
/*
 * The routine scans buddy structures (not the bitmap!) from the given order
 * up to the max order and tries to find a big enough chunk to satisfy the request
 */
static noinline_for_stack
void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
					struct ext4_buddy *e4b)
{
	struct super_block *sb = ac->ac_sb;
	struct ext4_group_info *grp = e4b->bd_info;
	void *buddy;
	int i;
	int k;
	int max;

	BUG_ON(ac->ac_2order <= 0);
	for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) {
		if (grp->bb_counters[i] == 0)
			continue;

		buddy = mb_find_buddy(e4b, i, &max);
		if (WARN_RATELIMIT(buddy == NULL,
			"ext4: mb_simple_scan_group: mb_find_buddy failed, (%d)\n", i))
			continue;

		k = mb_find_next_zero_bit(buddy, max, 0);
		if (k >= max) {
			ext4_mark_group_bitmap_corrupted(ac->ac_sb,
					e4b->bd_group,
					EXT4_GROUP_INFO_BBITMAP_CORRUPT);
			ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0,
				"%d free clusters of order %d. But found 0",
				grp->bb_counters[i], i);
			break;
		}
		ac->ac_found++;
		ac->ac_cX_found[ac->ac_criteria]++;

		ac->ac_b_ex.fe_len = 1 << i;
		ac->ac_b_ex.fe_start = k << i;
		ac->ac_b_ex.fe_group = e4b->bd_group;

		ext4_mb_use_best_found(ac, e4b);

		BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len);

		if (EXT4_SB(sb)->s_mb_stats)
			atomic_inc(&EXT4_SB(sb)->s_bal_2orders);

		break;
	}
}
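/*
 * Illustrative sketch (not part of mballoc): a zero bit at position @k in the
 * order-@i buddy bitmap found by ext4_mb_simple_scan_group() above describes a
 * free chunk of 1 << i clusters starting at cluster k << i within the group.
 * The helper below is hypothetical and only makes that mapping explicit.
 */
static inline void mb_sketch_buddy_bit_to_extent(int k, int i,
						 int *fe_start, int *fe_len)
{
	*fe_start = k << i;	/* cluster offset within the group */
	*fe_len = 1 << i;	/* chunk length in clusters */
}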
/*
 * The routine scans the group and measures all found extents.
 * In order to optimize scanning, the caller must pass the number of
 * free blocks in the group, so the routine knows the upper limit.
 */
static noinline_for_stack
void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
					struct ext4_buddy *e4b)
{
	struct super_block *sb = ac->ac_sb;
	void *bitmap = e4b->bd_bitmap;
	struct ext4_free_extent ex;
	int i, j, freelen;
	int free;

	free = e4b->bd_info->bb_free;
	if (WARN_ON(free <= 0))
		return;

	i = e4b->bd_info->bb_first_free;

	while (free && ac->ac_status == AC_STATUS_CONTINUE) {
		i = mb_find_next_zero_bit(bitmap,
						EXT4_CLUSTERS_PER_GROUP(sb), i);
		if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
			/*
			 * If we have a corrupt bitmap, we won't find any
			 * free blocks even though the group info says we
			 * have free blocks
			 */
			ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
					EXT4_GROUP_INFO_BBITMAP_CORRUPT);
			ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
					"%d free clusters as per "
					"group info. But bitmap says 0",
					free);
			break;
		}

		if (!ext4_mb_cr_expensive(ac->ac_criteria)) {
			/*
			 * In CR_GOAL_LEN_FAST and CR_BEST_AVAIL_LEN, we are
			 * sure that this group will have a large enough
			 * continuous free extent, so skip over the smaller free
			 * extents
			 */
			j = mb_find_next_bit(bitmap,
						EXT4_CLUSTERS_PER_GROUP(sb), i);
			freelen = j - i;

			if (freelen < ac->ac_g_ex.fe_len) {
				i = j;
				free -= freelen;
				continue;
			}
		}

		mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
		if (WARN_ON(ex.fe_len <= 0))
			break;
		if (free < ex.fe_len) {
			ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
					EXT4_GROUP_INFO_BBITMAP_CORRUPT);
			ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
					"%d free clusters as per "
					"group info. But got %d blocks",
					free, ex.fe_len);
			/*
			 * The number of free blocks differs. This mostly
			 * indicates that the bitmap is corrupt. So exit
			 * without claiming the space.
			 */
			break;
		}
		ex.fe_logical = 0xDEADC0DE; /* debug value */
		ext4_mb_measure_extent(ac, &ex, e4b);

		i += ex.fe_len;
		free -= ex.fe_len;
	}

	ext4_mb_check_limits(ac, e4b, 1);
}
/*
 * This is a special case for storages like raid5:
 * we try to find stripe-aligned chunks for stripe-size-multiple requests
 */
static noinline_for_stack
void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
				 struct ext4_buddy *e4b)
{
	struct super_block *sb = ac->ac_sb;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	void *bitmap = e4b->bd_bitmap;
	struct ext4_free_extent ex;
	ext4_fsblk_t first_group_block;
	ext4_fsblk_t a;
	ext4_grpblk_t i, stripe;
	int max;

	BUG_ON(sbi->s_stripe == 0);

	/* find first stripe-aligned block in group */
	first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);

	a = first_group_block + sbi->s_stripe - 1;
	do_div(a, sbi->s_stripe);
	i = (a * sbi->s_stripe) - first_group_block;

	stripe = EXT4_NUM_B2C(sbi, sbi->s_stripe);
	i = EXT4_B2C(sbi, i);
	while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
		if (!mb_test_bit(i, bitmap)) {
			max = mb_find_extent(e4b, i, stripe, &ex);
			if (max >= stripe) {
				ac->ac_found++;
				ac->ac_cX_found[ac->ac_criteria]++;
				ex.fe_logical = 0xDEADF00D; /* debug value */
				ac->ac_b_ex = ex;
				ext4_mb_use_best_found(ac, e4b);
				break;
			}
		}
		i += stripe;
	}
}
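/*
 * Illustrative sketch (not part of mballoc): ext4_mb_scan_aligned() above
 * rounds the first block of the group up to the next stripe boundary before
 * walking the bitmap in stripe-sized steps. The helper below is a hypothetical
 * userspace-style restatement; the kernel code uses do_div() instead of '/'
 * because of 64-bit division on 32-bit hosts.
 */
static inline unsigned long long
mb_sketch_first_aligned_offset(unsigned long long first_group_block,
			       unsigned long long stripe)
{
	unsigned long long a = (first_group_block + stripe - 1) / stripe;

	/* offset of the first stripe-aligned block, relative to the group start */
	return a * stripe - first_group_block;
}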
/*
 * This is also called BEFORE we load the buddy bitmap.
 * Returns either 1 or 0 indicating that the group is either suitable
 * for the allocation or not.
 */
static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
				ext4_group_t group, enum criteria cr)
{
	ext4_grpblk_t free, fragments;
	int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);

	BUG_ON(cr < CR_POWER2_ALIGNED || cr >= EXT4_MB_NUM_CRS);

	if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
		return false;

	free = grp->bb_free;
	if (free == 0)
		return false;

	fragments = grp->bb_fragments;
	if (fragments == 0)
		return false;

	switch (cr) {
	case CR_POWER2_ALIGNED:
		BUG_ON(ac->ac_2order == 0);

		/* Avoid using the first bg of a flexgroup for data files */
		if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
		    (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
		    ((group % flex_size) == 0))
			return false;

		if (free < ac->ac_g_ex.fe_len)
			return false;

		if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb))
			return true;

		if (grp->bb_largest_free_order < ac->ac_2order)
			return false;

		return true;
	case CR_GOAL_LEN_FAST:
	case CR_BEST_AVAIL_LEN:
		if ((free / fragments) >= ac->ac_g_ex.fe_len)
			return true;
		break;
	case CR_GOAL_LEN_SLOW:
		if (free >= ac->ac_g_ex.fe_len)
			return true;
		break;
	case CR_ANY_FREE:
		return true;
	default:
		BUG();
	}

	return false;
}
/*
 * This could return a negative error code if something goes wrong
 * during ext4_mb_init_group(). This should not be called with
 * ext4_lock_group() held.
 *
 * Note: because we are conditionally operating with the group lock in
 * the EXT4_MB_STRICT_CHECK case, we need to fake out sparse in this
 * function using __acquire and __release. This means we need to be
 * super careful before messing with the error path handling via "goto
 * out"!
 */
static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
				     ext4_group_t group, enum criteria cr)
{
	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
	struct super_block *sb = ac->ac_sb;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK;
	ext4_grpblk_t free;
	int ret = 0;

	if (!grp)
		return -EFSCORRUPTED;
	if (sbi->s_mb_stats)
		atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]);
	if (should_lock) {
		ext4_lock_group(sb, group);
		__release(ext4_group_lock_ptr(sb, group));
	}
	free = grp->bb_free;
	if (free == 0)
		goto out;
	/*
	 * In all criteria except CR_ANY_FREE we try to avoid groups that
	 * can't possibly satisfy the full goal request due to insufficient
	 * free blocks.
	 */
	if (cr < CR_ANY_FREE && free < ac->ac_g_ex.fe_len)
		goto out;
	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
		goto out;
	if (should_lock) {
		__acquire(ext4_group_lock_ptr(sb, group));
		ext4_unlock_group(sb, group);
	}

	/* We only do this if the grp has never been initialized */
	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
		struct ext4_group_desc *gdp =
			ext4_get_group_desc(sb, group, NULL);
		int ret;

		/*
		 * CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic
		 * search to find large good chunks almost for free. If buddy
		 * data is not ready, then this optimization makes no sense. But
		 * we never skip the first block group in a flex_bg, since this
		 * gets used for metadata block allocation, and we want to make
		 * sure we locate metadata blocks in the first block group in
		 * the flex_bg if possible.
		 */
		if (!ext4_mb_cr_expensive(cr) &&
		    (!sbi->s_log_groups_per_flex ||
		     ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) &&
		    !(ext4_has_group_desc_csum(sb) &&
		      (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))))
			return 0;
		ret = ext4_mb_init_group(sb, group, GFP_NOFS);
		if (ret)
			return ret;
	}

	if (should_lock) {
		ext4_lock_group(sb, group);
		__release(ext4_group_lock_ptr(sb, group));
	}
	ret = ext4_mb_good_group(ac, group, cr);
out:
	if (should_lock) {
		__acquire(ext4_group_lock_ptr(sb, group));
		ext4_unlock_group(sb, group);
	}
	return ret;
}
/*
 * Start prefetching @nr block bitmaps starting at @group.
 * Return the next group which needs to be prefetched.
 */
ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
			      unsigned int nr, int *cnt)
{
	ext4_group_t ngroups = ext4_get_groups_count(sb);
	struct buffer_head *bh;
	struct blk_plug plug;

	blk_start_plug(&plug);
	while (nr-- > 0) {
		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
								  NULL);
		struct ext4_group_info *grp = ext4_get_group_info(sb, group);

		/*
		 * Prefetch block groups with free blocks; but don't
		 * bother if it is marked uninitialized on disk, since
		 * it won't require I/O to read. Also only try to
		 * prefetch once, so we avoid a getblk() call, which can
		 * be expensive.
		 */
		if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
		    EXT4_MB_GRP_NEED_INIT(grp) &&
		    ext4_free_group_clusters(sb, gdp) > 0) {
			bh = ext4_read_block_bitmap_nowait(sb, group, true);
			if (bh && !IS_ERR(bh)) {
				if (!buffer_uptodate(bh) && cnt)
					(*cnt)++;
				brelse(bh);
			}
		}
		if (++group >= ngroups)
			group = 0;
	}
	blk_finish_plug(&plug);
	return group;
}

/*
 * Prefetching reads the block bitmap into the buffer cache; but we
 * need to make sure that the buddy bitmap in the page cache has been
 * initialized. Note that ext4_mb_init_group() will block if the I/O
 * is not yet completed, or indeed if ext4_mb_prefetch did not start
 * the I/O at all.
 *
 * TODO: We should actually kick off the buddy bitmap setup in a work
 * queue when the buffer I/O is completed, so that we don't block
 * waiting for the block allocation bitmap read to finish when
 * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator().
 */
void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
			   unsigned int nr)
{
	struct ext4_group_desc *gdp;
	struct ext4_group_info *grp;

	while (nr-- > 0) {
		if (!group)
			group = ext4_get_groups_count(sb);
		group--;
		gdp = ext4_get_group_desc(sb, group, NULL);
		grp = ext4_get_group_info(sb, group);

		if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) &&
		    ext4_free_group_clusters(sb, gdp) > 0) {
			if (ext4_mb_init_group(sb, group, GFP_NOFS))
				break;
		}
	}
}
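/*
 * Illustrative sketch (not part of mballoc): with flex_bg enabled the caller
 * (ext4_mb_regular_allocator() below) trims the prefetch batch so that it does
 * not cross a flex group boundary. The helper below is hypothetical and
 * mirrors that computation for group @group with 2^@log_groups_per_flex
 * groups per flex group and a default batch of @prefetch_default.
 */
static inline unsigned int
mb_sketch_prefetch_batch(unsigned int group, unsigned int log_groups_per_flex,
			 unsigned int prefetch_default)
{
	unsigned int nr = 1U << log_groups_per_flex;

	nr -= group & (nr - 1);		/* groups left until the next flex boundary */
	return nr < prefetch_default ? nr : prefetch_default;
}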
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
	ext4_group_t prefetch_grp = 0, ngroups, group, i;
	enum criteria new_cr, cr = CR_GOAL_LEN_FAST;
	int err = 0, first_err = 0;
	unsigned int nr = 0, prefetch_ios = 0;
	struct ext4_sb_info *sbi;
	struct super_block *sb;
	struct ext4_buddy e4b;

	sb = ac->ac_sb;
	sbi = EXT4_SB(sb);
	ngroups = ext4_get_groups_count(sb);
	/* non-extent files are limited to low blocks/groups */
	if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
		ngroups = sbi->s_blockfile_groups;

	BUG_ON(ac->ac_status == AC_STATUS_FOUND);

	/* first, try the goal */
	err = ext4_mb_find_by_goal(ac, &e4b);
	if (err || ac->ac_status == AC_STATUS_FOUND)
		goto out;

	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
		goto out;

	/*
	 * ac->ac_2order is set only if the fe_len is a power of 2.
	 * If ac->ac_2order is set we also set the criteria to CR_POWER2_ALIGNED
	 * so that we try exact allocation using the buddy.
	 */
	i = fls(ac->ac_g_ex.fe_len);
	ac->ac_2order = 0;
	/*
	 * We search using buddy data only if the order of the request
	 * is greater than or equal to sbi->s_mb_order2_reqs.
	 * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req.
	 * We also support searching for power-of-two requests only for
	 * requests up to the maximum buddy size we have constructed.
	 */
	if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
		if (is_power_of_2(ac->ac_g_ex.fe_len))
			ac->ac_2order = array_index_nospec(i - 1,
							   MB_NUM_ORDERS(sb));
	}

	/* if stream allocation is enabled, use global goal */
	if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
		/* TBD: may be hot point */
		spin_lock(&sbi->s_md_lock);
		ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
		ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
		spin_unlock(&sbi->s_md_lock);
	}

	/*
	 * Let's just scan groups to find more-or-less suitable blocks. We
	 * start with CR_GOAL_LEN_FAST, unless the request is power of 2
	 * aligned, in which case let's do that faster approach first.
	 */
	if (ac->ac_2order)
		cr = CR_POWER2_ALIGNED;
repeat:
	for (; cr < EXT4_MB_NUM_CRS && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
		ac->ac_criteria = cr;
		/*
		 * searching for the right group start
		 * from the goal value specified
		 */
		group = ac->ac_g_ex.fe_group;
		ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups;
		prefetch_grp = group;
		nr = 0;

		for (i = 0, new_cr = cr; i < ngroups; i++,
		     ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) {
			int ret = 0;

			cond_resched();
			if (new_cr != cr) {
				cr = new_cr;
				goto repeat;
			}

			/*
			 * Batch reads of the block allocation bitmaps
			 * to get multiple READs in flight; limit
			 * prefetching at inexpensive CR, otherwise mballoc
			 * can spend a lot of time loading imperfect groups
			 */
			if ((prefetch_grp == group) &&
			    (ext4_mb_cr_expensive(cr) ||
			     prefetch_ios < sbi->s_mb_prefetch_limit)) {
				nr = sbi->s_mb_prefetch;
				if (ext4_has_feature_flex_bg(sb)) {
					nr = 1 << sbi->s_log_groups_per_flex;
					nr -= group & (nr - 1);
					nr = min(nr, sbi->s_mb_prefetch);
				}
				prefetch_grp = ext4_mb_prefetch(sb, group,
							nr, &prefetch_ios);
			}

			/* This now checks without needing the buddy page */
			ret = ext4_mb_good_group_nolock(ac, group, cr);
			if (ret <= 0) {
				if (!first_err)
					first_err = ret;
				continue;
			}

			err = ext4_mb_load_buddy(sb, group, &e4b);
			if (err)
				goto out;

			ext4_lock_group(sb, group);

			/*
			 * We need to check again after locking the
			 * block group
			 */
			ret = ext4_mb_good_group(ac, group, cr);
			if (ret == 0) {
				ext4_unlock_group(sb, group);
				ext4_mb_unload_buddy(&e4b);
				continue;
			}

			ac->ac_groups_scanned++;
			if (cr == CR_POWER2_ALIGNED)
				ext4_mb_simple_scan_group(ac, &e4b);
			else {
				bool is_stripe_aligned =
					(sbi->s_stripe >=
					 sbi->s_cluster_ratio) &&
					!(ac->ac_g_ex.fe_len %
					  EXT4_NUM_B2C(sbi, sbi->s_stripe));

				if ((cr == CR_GOAL_LEN_FAST ||
				     cr == CR_BEST_AVAIL_LEN) &&
				    is_stripe_aligned)
					ext4_mb_scan_aligned(ac, &e4b);

				if (ac->ac_status == AC_STATUS_CONTINUE)
					ext4_mb_complex_scan_group(ac, &e4b);
			}

			ext4_unlock_group(sb, group);
			ext4_mb_unload_buddy(&e4b);

			if (ac->ac_status != AC_STATUS_CONTINUE)
				break;
		}
		/* Processed all groups and haven't found blocks */
		if (sbi->s_mb_stats && i == ngroups)
			atomic64_inc(&sbi->s_bal_cX_failed[cr]);

		if (i == ngroups && ac->ac_criteria == CR_BEST_AVAIL_LEN)
			/* Reset goal length to original goal length before
			 * falling into CR_GOAL_LEN_SLOW */
			ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
	}

	if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
	    !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
		/*
		 * We've been searching too long. Let's try to allocate
		 * the best chunk we've found so far
		 */
		ext4_mb_try_best_found(ac, &e4b);
		if (ac->ac_status != AC_STATUS_FOUND) {
			int lost;

			/*
			 * Someone more lucky has already allocated it.
			 * The only thing we can do is just take the first
			 * found block(s)
			 */
			lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
			mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
				 ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
				 ac->ac_b_ex.fe_len, lost);

			ac->ac_b_ex.fe_group = 0;
			ac->ac_b_ex.fe_start = 0;
			ac->ac_b_ex.fe_len = 0;
			ac->ac_status = AC_STATUS_CONTINUE;
			ac->ac_flags |= EXT4_MB_HINT_FIRST;
			cr = CR_ANY_FREE;
			goto repeat;
		}
	}

	if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND)
		atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
out:
	if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
		err = first_err;

	mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
		 ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
		 ac->ac_flags, cr, err);

	ext4_mb_prefetch_fini(sb, prefetch_grp, nr);

	return err;
}
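/*
 * Illustrative sketch (not part of mballoc): ext4_mb_regular_allocator() above
 * only sets ac_2order when the goal length is an exact power of two, in which
 * case the order is simply fls(len) - 1. The helper below is hypothetical.
 */
static inline int mb_sketch_request_order(unsigned int goal_len)
{
	if (!is_power_of_2(goal_len))
		return 0;		/* not eligible for CR_POWER2_ALIGNED */
	return fls(goal_len) - 1;	/* e.g. goal_len == 16 clusters -> order 4 */
}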
3006 static void *ext4_mb_seq_groups_start(struct seq_file
*seq
, loff_t
*pos
)
3008 struct super_block
*sb
= pde_data(file_inode(seq
->file
));
3011 if (*pos
< 0 || *pos
>= ext4_get_groups_count(sb
))
3014 return (void *) ((unsigned long) group
);
3017 static void *ext4_mb_seq_groups_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
3019 struct super_block
*sb
= pde_data(file_inode(seq
->file
));
3023 if (*pos
< 0 || *pos
>= ext4_get_groups_count(sb
))
3026 return (void *) ((unsigned long) group
);
3029 static int ext4_mb_seq_groups_show(struct seq_file
*seq
, void *v
)
3031 struct super_block
*sb
= pde_data(file_inode(seq
->file
));
3032 ext4_group_t group
= (ext4_group_t
) ((unsigned long) v
);
3035 struct ext4_buddy e4b
;
3036 struct ext4_group_info
*grinfo
;
3037 unsigned char blocksize_bits
= min_t(unsigned char,
3038 sb
->s_blocksize_bits
,
3039 EXT4_MAX_BLOCK_LOG_SIZE
);
3041 struct ext4_group_info info
;
3042 ext4_grpblk_t counters
[EXT4_MAX_BLOCK_LOG_SIZE
+ 2];
3047 seq_puts(seq
, "#group: free frags first ["
3048 " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 "
3049 " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n");
3051 i
= (blocksize_bits
+ 2) * sizeof(sg
.info
.bb_counters
[0]) +
3052 sizeof(struct ext4_group_info
);
3054 grinfo
= ext4_get_group_info(sb
, group
);
3057 /* Load the group info in memory only if not already loaded. */
3058 if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo
))) {
3059 err
= ext4_mb_load_buddy(sb
, group
, &e4b
);
3061 seq_printf(seq
, "#%-5u: %s\n", group
, ext4_decode_error(NULL
, err
, nbuf
));
3064 ext4_mb_unload_buddy(&e4b
);
3068 * We care only about free space counters in the group info and
3069 * these are safe to access even after the buddy has been unloaded
3071 memcpy(&sg
, grinfo
, i
);
3072 seq_printf(seq
, "#%-5u: %-5u %-5u %-5u [", group
, sg
.info
.bb_free
,
3073 sg
.info
.bb_fragments
, sg
.info
.bb_first_free
);
3074 for (i
= 0; i
<= 13; i
++)
3075 seq_printf(seq
, " %-5u", i
<= blocksize_bits
+ 1 ?
3076 sg
.info
.bb_counters
[i
] : 0);
3077 seq_puts(seq
, " ]");
3078 if (EXT4_MB_GRP_BBITMAP_CORRUPT(&sg
.info
))
3079 seq_puts(seq
, " Block bitmap corrupted!");
3080 seq_putc(seq
, '\n');
3084 static void ext4_mb_seq_groups_stop(struct seq_file
*seq
, void *v
)
3088 const struct seq_operations ext4_mb_seq_groups_ops
= {
3089 .start
= ext4_mb_seq_groups_start
,
3090 .next
= ext4_mb_seq_groups_next
,
3091 .stop
= ext4_mb_seq_groups_stop
,
3092 .show
= ext4_mb_seq_groups_show
,
3095 int ext4_seq_mb_stats_show(struct seq_file
*seq
, void *offset
)
3097 struct super_block
*sb
= seq
->private;
3098 struct ext4_sb_info
*sbi
= EXT4_SB(sb
);
3100 seq_puts(seq
, "mballoc:\n");
3101 if (!sbi
->s_mb_stats
) {
3102 seq_puts(seq
, "\tmb stats collection turned off.\n");
3105 "\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
3108 seq_printf(seq
, "\treqs: %u\n", atomic_read(&sbi
->s_bal_reqs
));
3109 seq_printf(seq
, "\tsuccess: %u\n", atomic_read(&sbi
->s_bal_success
));
3111 seq_printf(seq
, "\tgroups_scanned: %u\n",
3112 atomic_read(&sbi
->s_bal_groups_scanned
));
3114 /* CR_POWER2_ALIGNED stats */
3115 seq_puts(seq
, "\tcr_p2_aligned_stats:\n");
3116 seq_printf(seq
, "\t\thits: %llu\n",
3117 atomic64_read(&sbi
->s_bal_cX_hits
[CR_POWER2_ALIGNED
]));
3119 seq
, "\t\tgroups_considered: %llu\n",
3121 &sbi
->s_bal_cX_groups_considered
[CR_POWER2_ALIGNED
]));
3122 seq_printf(seq
, "\t\textents_scanned: %u\n",
3123 atomic_read(&sbi
->s_bal_cX_ex_scanned
[CR_POWER2_ALIGNED
]));
3124 seq_printf(seq
, "\t\tuseless_loops: %llu\n",
3125 atomic64_read(&sbi
->s_bal_cX_failed
[CR_POWER2_ALIGNED
]));
3126 seq_printf(seq
, "\t\tbad_suggestions: %u\n",
3127 atomic_read(&sbi
->s_bal_p2_aligned_bad_suggestions
));
3129 /* CR_GOAL_LEN_FAST stats */
3130 seq_puts(seq
, "\tcr_goal_fast_stats:\n");
3131 seq_printf(seq
, "\t\thits: %llu\n",
3132 atomic64_read(&sbi
->s_bal_cX_hits
[CR_GOAL_LEN_FAST
]));
3133 seq_printf(seq
, "\t\tgroups_considered: %llu\n",
3135 &sbi
->s_bal_cX_groups_considered
[CR_GOAL_LEN_FAST
]));
3136 seq_printf(seq
, "\t\textents_scanned: %u\n",
3137 atomic_read(&sbi
->s_bal_cX_ex_scanned
[CR_GOAL_LEN_FAST
]));
3138 seq_printf(seq
, "\t\tuseless_loops: %llu\n",
3139 atomic64_read(&sbi
->s_bal_cX_failed
[CR_GOAL_LEN_FAST
]));
3140 seq_printf(seq
, "\t\tbad_suggestions: %u\n",
3141 atomic_read(&sbi
->s_bal_goal_fast_bad_suggestions
));
3143 /* CR_BEST_AVAIL_LEN stats */
3144 seq_puts(seq
, "\tcr_best_avail_stats:\n");
3145 seq_printf(seq
, "\t\thits: %llu\n",
3146 atomic64_read(&sbi
->s_bal_cX_hits
[CR_BEST_AVAIL_LEN
]));
3148 seq
, "\t\tgroups_considered: %llu\n",
3150 &sbi
->s_bal_cX_groups_considered
[CR_BEST_AVAIL_LEN
]));
3151 seq_printf(seq
, "\t\textents_scanned: %u\n",
3152 atomic_read(&sbi
->s_bal_cX_ex_scanned
[CR_BEST_AVAIL_LEN
]));
3153 seq_printf(seq
, "\t\tuseless_loops: %llu\n",
3154 atomic64_read(&sbi
->s_bal_cX_failed
[CR_BEST_AVAIL_LEN
]));
3155 seq_printf(seq
, "\t\tbad_suggestions: %u\n",
3156 atomic_read(&sbi
->s_bal_best_avail_bad_suggestions
));
3158 /* CR_GOAL_LEN_SLOW stats */
3159 seq_puts(seq
, "\tcr_goal_slow_stats:\n");
3160 seq_printf(seq
, "\t\thits: %llu\n",
3161 atomic64_read(&sbi
->s_bal_cX_hits
[CR_GOAL_LEN_SLOW
]));
3162 seq_printf(seq
, "\t\tgroups_considered: %llu\n",
3164 &sbi
->s_bal_cX_groups_considered
[CR_GOAL_LEN_SLOW
]));
3165 seq_printf(seq
, "\t\textents_scanned: %u\n",
3166 atomic_read(&sbi
->s_bal_cX_ex_scanned
[CR_GOAL_LEN_SLOW
]));
3167 seq_printf(seq
, "\t\tuseless_loops: %llu\n",
3168 atomic64_read(&sbi
->s_bal_cX_failed
[CR_GOAL_LEN_SLOW
]));
3170 /* CR_ANY_FREE stats */
3171 seq_puts(seq
, "\tcr_any_free_stats:\n");
3172 seq_printf(seq
, "\t\thits: %llu\n",
3173 atomic64_read(&sbi
->s_bal_cX_hits
[CR_ANY_FREE
]));
3175 seq
, "\t\tgroups_considered: %llu\n",
3176 atomic64_read(&sbi
->s_bal_cX_groups_considered
[CR_ANY_FREE
]));
3177 seq_printf(seq
, "\t\textents_scanned: %u\n",
3178 atomic_read(&sbi
->s_bal_cX_ex_scanned
[CR_ANY_FREE
]));
3179 seq_printf(seq
, "\t\tuseless_loops: %llu\n",
3180 atomic64_read(&sbi
->s_bal_cX_failed
[CR_ANY_FREE
]));
3183 seq_printf(seq
, "\textents_scanned: %u\n",
3184 atomic_read(&sbi
->s_bal_ex_scanned
));
3185 seq_printf(seq
, "\t\tgoal_hits: %u\n", atomic_read(&sbi
->s_bal_goals
));
3186 seq_printf(seq
, "\t\tlen_goal_hits: %u\n",
3187 atomic_read(&sbi
->s_bal_len_goals
));
3188 seq_printf(seq
, "\t\t2^n_hits: %u\n", atomic_read(&sbi
->s_bal_2orders
));
3189 seq_printf(seq
, "\t\tbreaks: %u\n", atomic_read(&sbi
->s_bal_breaks
));
3190 seq_printf(seq
, "\t\tlost: %u\n", atomic_read(&sbi
->s_mb_lost_chunks
));
3191 seq_printf(seq
, "\tbuddies_generated: %u/%u\n",
3192 atomic_read(&sbi
->s_mb_buddies_generated
),
3193 ext4_get_groups_count(sb
));
3194 seq_printf(seq
, "\tbuddies_time_used: %llu\n",
3195 atomic64_read(&sbi
->s_mb_generation_time
));
3196 seq_printf(seq
, "\tpreallocated: %u\n",
3197 atomic_read(&sbi
->s_mb_preallocated
));
3198 seq_printf(seq
, "\tdiscarded: %u\n", atomic_read(&sbi
->s_mb_discarded
));
3202 static void *ext4_mb_seq_structs_summary_start(struct seq_file
*seq
, loff_t
*pos
)
3204 struct super_block
*sb
= pde_data(file_inode(seq
->file
));
3205 unsigned long position
;
3207 if (*pos
< 0 || *pos
>= 2*MB_NUM_ORDERS(sb
))
3209 position
= *pos
+ 1;
3210 return (void *) ((unsigned long) position
);
3213 static void *ext4_mb_seq_structs_summary_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
3215 struct super_block
*sb
= pde_data(file_inode(seq
->file
));
3216 unsigned long position
;
3219 if (*pos
< 0 || *pos
>= 2*MB_NUM_ORDERS(sb
))
3221 position
= *pos
+ 1;
3222 return (void *) ((unsigned long) position
);
3225 static int ext4_mb_seq_structs_summary_show(struct seq_file
*seq
, void *v
)
3227 struct super_block
*sb
= pde_data(file_inode(seq
->file
));
3228 struct ext4_sb_info
*sbi
= EXT4_SB(sb
);
3229 unsigned long position
= ((unsigned long) v
);
3230 struct ext4_group_info
*grp
;
3234 if (position
>= MB_NUM_ORDERS(sb
)) {
3235 position
-= MB_NUM_ORDERS(sb
);
3237 seq_puts(seq
, "avg_fragment_size_lists:\n");
3240 read_lock(&sbi
->s_mb_avg_fragment_size_locks
[position
]);
3241 list_for_each_entry(grp
, &sbi
->s_mb_avg_fragment_size
[position
],
3242 bb_avg_fragment_size_node
)
3244 read_unlock(&sbi
->s_mb_avg_fragment_size_locks
[position
]);
3245 seq_printf(seq
, "\tlist_order_%u_groups: %u\n",
3246 (unsigned int)position
, count
);
3250 if (position
== 0) {
3251 seq_printf(seq
, "optimize_scan: %d\n",
3252 test_opt2(sb
, MB_OPTIMIZE_SCAN
) ? 1 : 0);
3253 seq_puts(seq
, "max_free_order_lists:\n");
3256 read_lock(&sbi
->s_mb_largest_free_orders_locks
[position
]);
3257 list_for_each_entry(grp
, &sbi
->s_mb_largest_free_orders
[position
],
3258 bb_largest_free_order_node
)
3260 read_unlock(&sbi
->s_mb_largest_free_orders_locks
[position
]);
3261 seq_printf(seq
, "\tlist_order_%u_groups: %u\n",
3262 (unsigned int)position
, count
);
3267 static void ext4_mb_seq_structs_summary_stop(struct seq_file
*seq
, void *v
)
3271 const struct seq_operations ext4_mb_seq_structs_summary_ops
= {
3272 .start
= ext4_mb_seq_structs_summary_start
,
3273 .next
= ext4_mb_seq_structs_summary_next
,
3274 .stop
= ext4_mb_seq_structs_summary_stop
,
3275 .show
= ext4_mb_seq_structs_summary_show
,
3278 static struct kmem_cache
*get_groupinfo_cache(int blocksize_bits
)
3280 int cache_index
= blocksize_bits
- EXT4_MIN_BLOCK_LOG_SIZE
;
3281 struct kmem_cache
*cachep
= ext4_groupinfo_caches
[cache_index
];
3288 * Allocate the top-level s_group_info array for the specified number
3291 int ext4_mb_alloc_groupinfo(struct super_block
*sb
, ext4_group_t ngroups
)
3293 struct ext4_sb_info
*sbi
= EXT4_SB(sb
);
3295 struct ext4_group_info
***old_groupinfo
, ***new_groupinfo
;
3297 size
= (ngroups
+ EXT4_DESC_PER_BLOCK(sb
) - 1) >>
3298 EXT4_DESC_PER_BLOCK_BITS(sb
);
3299 if (size
<= sbi
->s_group_info_size
)
3302 size
= roundup_pow_of_two(sizeof(*sbi
->s_group_info
) * size
);
3303 new_groupinfo
= kvzalloc(size
, GFP_KERNEL
);
3304 if (!new_groupinfo
) {
3305 ext4_msg(sb
, KERN_ERR
, "can't allocate buddy meta group");
3309 old_groupinfo
= rcu_dereference(sbi
->s_group_info
);
3311 memcpy(new_groupinfo
, old_groupinfo
,
3312 sbi
->s_group_info_size
* sizeof(*sbi
->s_group_info
));
3314 rcu_assign_pointer(sbi
->s_group_info
, new_groupinfo
);
3315 sbi
->s_group_info_size
= size
/ sizeof(*sbi
->s_group_info
);
3317 ext4_kvfree_array_rcu(old_groupinfo
);
3318 ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
3319 sbi
->s_group_info_size
);
3323 /* Create and initialize ext4_group_info data for the given group. */
3324 int ext4_mb_add_groupinfo(struct super_block
*sb
, ext4_group_t group
,
3325 struct ext4_group_desc
*desc
)
3329 int idx
= group
>> EXT4_DESC_PER_BLOCK_BITS(sb
);
3330 struct ext4_sb_info
*sbi
= EXT4_SB(sb
);
3331 struct ext4_group_info
**meta_group_info
;
3332 struct kmem_cache
*cachep
= get_groupinfo_cache(sb
->s_blocksize_bits
);
3335 * First check if this group is the first of a reserved block.
3336 * If it's true, we have to allocate a new table of pointers
3337 * to ext4_group_info structures
3339 if (group
% EXT4_DESC_PER_BLOCK(sb
) == 0) {
3340 metalen
= sizeof(*meta_group_info
) <<
3341 EXT4_DESC_PER_BLOCK_BITS(sb
);
3342 meta_group_info
= kmalloc(metalen
, GFP_NOFS
);
3343 if (meta_group_info
== NULL
) {
3344 ext4_msg(sb
, KERN_ERR
, "can't allocate mem "
3345 "for a buddy group");
3349 rcu_dereference(sbi
->s_group_info
)[idx
] = meta_group_info
;
3353 meta_group_info
= sbi_array_rcu_deref(sbi
, s_group_info
, idx
);
3354 i
= group
& (EXT4_DESC_PER_BLOCK(sb
) - 1);
3356 meta_group_info
[i
] = kmem_cache_zalloc(cachep
, GFP_NOFS
);
3357 if (meta_group_info
[i
] == NULL
) {
3358 ext4_msg(sb
, KERN_ERR
, "can't allocate buddy mem");
3359 goto exit_group_info
;
3361 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT
,
3362 &(meta_group_info
[i
]->bb_state
));
3365 * initialize bb_free to be able to skip
3366 * empty groups without initialization
3368 if (ext4_has_group_desc_csum(sb
) &&
3369 (desc
->bg_flags
& cpu_to_le16(EXT4_BG_BLOCK_UNINIT
))) {
3370 meta_group_info
[i
]->bb_free
=
3371 ext4_free_clusters_after_init(sb
, group
, desc
);
3373 meta_group_info
[i
]->bb_free
=
3374 ext4_free_group_clusters(sb
, desc
);
3377 INIT_LIST_HEAD(&meta_group_info
[i
]->bb_prealloc_list
);
3378 init_rwsem(&meta_group_info
[i
]->alloc_sem
);
3379 meta_group_info
[i
]->bb_free_root
= RB_ROOT
;
3380 INIT_LIST_HEAD(&meta_group_info
[i
]->bb_largest_free_order_node
);
3381 INIT_LIST_HEAD(&meta_group_info
[i
]->bb_avg_fragment_size_node
);
3382 meta_group_info
[i
]->bb_largest_free_order
= -1; /* uninit */
3383 meta_group_info
[i
]->bb_avg_fragment_size_order
= -1; /* uninit */
3384 meta_group_info
[i
]->bb_group
= group
;
3386 mb_group_bb_bitmap_alloc(sb
, meta_group_info
[i
], group
);
3390 /* If a meta_group_info table has been allocated, release it now */
3391 if (group
% EXT4_DESC_PER_BLOCK(sb
) == 0) {
3392 struct ext4_group_info
***group_info
;
3395 group_info
= rcu_dereference(sbi
->s_group_info
);
3396 kfree(group_info
[idx
]);
3397 group_info
[idx
] = NULL
;
3401 } /* ext4_mb_add_groupinfo */
3403 static int ext4_mb_init_backend(struct super_block
*sb
)
3405 ext4_group_t ngroups
= ext4_get_groups_count(sb
);
3407 struct ext4_sb_info
*sbi
= EXT4_SB(sb
);
3409 struct ext4_group_desc
*desc
;
3410 struct ext4_group_info
***group_info
;
3411 struct kmem_cache
*cachep
;
3413 err
= ext4_mb_alloc_groupinfo(sb
, ngroups
);
3417 sbi
->s_buddy_cache
= new_inode(sb
);
3418 if (sbi
->s_buddy_cache
== NULL
) {
3419 ext4_msg(sb
, KERN_ERR
, "can't get new inode");
	/* To avoid potentially colliding with a valid on-disk inode number,
	 * use EXT4_BAD_INO for the buddy cache inode number. This inode is
	 * not in the inode hash, so it should never be found by iget(), but
	 * this will avoid confusion if it ever shows up during debugging. */
3426 sbi
->s_buddy_cache
->i_ino
= EXT4_BAD_INO
;
3427 EXT4_I(sbi
->s_buddy_cache
)->i_disksize
= 0;
3428 for (i
= 0; i
< ngroups
; i
++) {
3430 desc
= ext4_get_group_desc(sb
, i
, NULL
);
3432 ext4_msg(sb
, KERN_ERR
, "can't read descriptor %u", i
);
3435 if (ext4_mb_add_groupinfo(sb
, i
, desc
) != 0)
3439 if (ext4_has_feature_flex_bg(sb
)) {
3440 /* a single flex group is supposed to be read by a single IO.
3441 * 2 ^ s_log_groups_per_flex != UINT_MAX as s_mb_prefetch is
3442 * unsigned integer, so the maximum shift is 32.
3444 if (sbi
->s_es
->s_log_groups_per_flex
>= 32) {
3445 ext4_msg(sb
, KERN_ERR
, "too many log groups per flexible block group");
3448 sbi
->s_mb_prefetch
= min_t(uint
, 1 << sbi
->s_es
->s_log_groups_per_flex
,
3449 BLK_MAX_SEGMENT_SIZE
>> (sb
->s_blocksize_bits
- 9));
3450 sbi
->s_mb_prefetch
*= 8; /* 8 prefetch IOs in flight at most */
3452 sbi
->s_mb_prefetch
= 32;
3454 if (sbi
->s_mb_prefetch
> ext4_get_groups_count(sb
))
3455 sbi
->s_mb_prefetch
= ext4_get_groups_count(sb
);
	/*
	 * Now decide how many real IOs to prefetch within a single allocation
	 * at CR_POWER2_ALIGNED. Given that CR_POWER2_ALIGNED is a CPU-related
	 * optimization, we shouldn't try to load too many groups; at some
	 * point we should start to use what we've got in memory.
	 * With an average random access time of 5ms, it'd take a second to get
	 * 200 groups (* N with flex_bg), so let's make this limit 4
	 */
3464 sbi
->s_mb_prefetch_limit
= sbi
->s_mb_prefetch
* 4;
3465 if (sbi
->s_mb_prefetch_limit
> ext4_get_groups_count(sb
))
3466 sbi
->s_mb_prefetch_limit
= ext4_get_groups_count(sb
);
3471 cachep
= get_groupinfo_cache(sb
->s_blocksize_bits
);
3473 struct ext4_group_info
*grp
= ext4_get_group_info(sb
, i
);
3476 kmem_cache_free(cachep
, grp
);
3478 i
= sbi
->s_group_info_size
;
3480 group_info
= rcu_dereference(sbi
->s_group_info
);
3482 kfree(group_info
[i
]);
3484 iput(sbi
->s_buddy_cache
);
3487 kvfree(rcu_dereference(sbi
->s_group_info
));
3492 static void ext4_groupinfo_destroy_slabs(void)
3496 for (i
= 0; i
< NR_GRPINFO_CACHES
; i
++) {
3497 kmem_cache_destroy(ext4_groupinfo_caches
[i
]);
3498 ext4_groupinfo_caches
[i
] = NULL
;
3502 static int ext4_groupinfo_create_slab(size_t size
)
3504 static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex
);
3506 int blocksize_bits
= order_base_2(size
);
3507 int cache_index
= blocksize_bits
- EXT4_MIN_BLOCK_LOG_SIZE
;
3508 struct kmem_cache
*cachep
;
3510 if (cache_index
>= NR_GRPINFO_CACHES
)
3513 if (unlikely(cache_index
< 0))
3516 mutex_lock(&ext4_grpinfo_slab_create_mutex
);
3517 if (ext4_groupinfo_caches
[cache_index
]) {
3518 mutex_unlock(&ext4_grpinfo_slab_create_mutex
);
3519 return 0; /* Already created */
3522 slab_size
= offsetof(struct ext4_group_info
,
3523 bb_counters
[blocksize_bits
+ 2]);
3525 cachep
= kmem_cache_create(ext4_groupinfo_slab_names
[cache_index
],
3526 slab_size
, 0, SLAB_RECLAIM_ACCOUNT
,
3529 ext4_groupinfo_caches
[cache_index
] = cachep
;
3531 mutex_unlock(&ext4_grpinfo_slab_create_mutex
);
3534 "EXT4-fs: no memory for groupinfo slab cache\n");
3541 static void ext4_discard_work(struct work_struct
*work
)
3543 struct ext4_sb_info
*sbi
= container_of(work
,
3544 struct ext4_sb_info
, s_discard_work
);
3545 struct super_block
*sb
= sbi
->s_sb
;
3546 struct ext4_free_data
*fd
, *nfd
;
3547 struct ext4_buddy e4b
;
3548 LIST_HEAD(discard_list
);
3549 ext4_group_t grp
, load_grp
;
3552 spin_lock(&sbi
->s_md_lock
);
3553 list_splice_init(&sbi
->s_discard_list
, &discard_list
);
3554 spin_unlock(&sbi
->s_md_lock
);
3556 load_grp
= UINT_MAX
;
3557 list_for_each_entry_safe(fd
, nfd
, &discard_list
, efd_list
) {
3559 * If filesystem is umounting or no memory or suffering
3560 * from no space, give up the discard
3562 if ((sb
->s_flags
& SB_ACTIVE
) && !err
&&
3563 !atomic_read(&sbi
->s_retry_alloc_pending
)) {
3564 grp
= fd
->efd_group
;
3565 if (grp
!= load_grp
) {
3566 if (load_grp
!= UINT_MAX
)
3567 ext4_mb_unload_buddy(&e4b
);
3569 err
= ext4_mb_load_buddy(sb
, grp
, &e4b
);
3571 kmem_cache_free(ext4_free_data_cachep
, fd
);
3572 load_grp
= UINT_MAX
;
3579 ext4_lock_group(sb
, grp
);
3580 ext4_try_to_trim_range(sb
, &e4b
, fd
->efd_start_cluster
,
3581 fd
->efd_start_cluster
+ fd
->efd_count
- 1, 1);
3582 ext4_unlock_group(sb
, grp
);
3584 kmem_cache_free(ext4_free_data_cachep
, fd
);
3587 if (load_grp
!= UINT_MAX
)
3588 ext4_mb_unload_buddy(&e4b
);
3591 int ext4_mb_init(struct super_block
*sb
)
3593 struct ext4_sb_info
*sbi
= EXT4_SB(sb
);
3595 unsigned offset
, offset_incr
;
3599 i
= MB_NUM_ORDERS(sb
) * sizeof(*sbi
->s_mb_offsets
);
3601 sbi
->s_mb_offsets
= kmalloc(i
, GFP_KERNEL
);
3602 if (sbi
->s_mb_offsets
== NULL
) {
3607 i
= MB_NUM_ORDERS(sb
) * sizeof(*sbi
->s_mb_maxs
);
3608 sbi
->s_mb_maxs
= kmalloc(i
, GFP_KERNEL
);
3609 if (sbi
->s_mb_maxs
== NULL
) {
3614 ret
= ext4_groupinfo_create_slab(sb
->s_blocksize
);
3618 /* order 0 is regular bitmap */
3619 sbi
->s_mb_maxs
[0] = sb
->s_blocksize
<< 3;
3620 sbi
->s_mb_offsets
[0] = 0;
3624 offset_incr
= 1 << (sb
->s_blocksize_bits
- 1);
3625 max
= sb
->s_blocksize
<< 2;
3627 sbi
->s_mb_offsets
[i
] = offset
;
3628 sbi
->s_mb_maxs
[i
] = max
;
3629 offset
+= offset_incr
;
3630 offset_incr
= offset_incr
>> 1;
3633 } while (i
< MB_NUM_ORDERS(sb
));
3635 sbi
->s_mb_avg_fragment_size
=
3636 kmalloc_array(MB_NUM_ORDERS(sb
), sizeof(struct list_head
),
3638 if (!sbi
->s_mb_avg_fragment_size
) {
3642 sbi
->s_mb_avg_fragment_size_locks
=
3643 kmalloc_array(MB_NUM_ORDERS(sb
), sizeof(rwlock_t
),
3645 if (!sbi
->s_mb_avg_fragment_size_locks
) {
3649 for (i
= 0; i
< MB_NUM_ORDERS(sb
); i
++) {
3650 INIT_LIST_HEAD(&sbi
->s_mb_avg_fragment_size
[i
]);
3651 rwlock_init(&sbi
->s_mb_avg_fragment_size_locks
[i
]);
3653 sbi
->s_mb_largest_free_orders
=
3654 kmalloc_array(MB_NUM_ORDERS(sb
), sizeof(struct list_head
),
3656 if (!sbi
->s_mb_largest_free_orders
) {
3660 sbi
->s_mb_largest_free_orders_locks
=
3661 kmalloc_array(MB_NUM_ORDERS(sb
), sizeof(rwlock_t
),
3663 if (!sbi
->s_mb_largest_free_orders_locks
) {
3667 for (i
= 0; i
< MB_NUM_ORDERS(sb
); i
++) {
3668 INIT_LIST_HEAD(&sbi
->s_mb_largest_free_orders
[i
]);
3669 rwlock_init(&sbi
->s_mb_largest_free_orders_locks
[i
]);
3672 spin_lock_init(&sbi
->s_md_lock
);
3673 sbi
->s_mb_free_pending
= 0;
3674 INIT_LIST_HEAD(&sbi
->s_freed_data_list
[0]);
3675 INIT_LIST_HEAD(&sbi
->s_freed_data_list
[1]);
3676 INIT_LIST_HEAD(&sbi
->s_discard_list
);
3677 INIT_WORK(&sbi
->s_discard_work
, ext4_discard_work
);
3678 atomic_set(&sbi
->s_retry_alloc_pending
, 0);
3680 sbi
->s_mb_max_to_scan
= MB_DEFAULT_MAX_TO_SCAN
;
3681 sbi
->s_mb_min_to_scan
= MB_DEFAULT_MIN_TO_SCAN
;
3682 sbi
->s_mb_stats
= MB_DEFAULT_STATS
;
3683 sbi
->s_mb_stream_request
= MB_DEFAULT_STREAM_THRESHOLD
;
3684 sbi
->s_mb_order2_reqs
= MB_DEFAULT_ORDER2_REQS
;
3685 sbi
->s_mb_best_avail_max_trim_order
= MB_DEFAULT_BEST_AVAIL_TRIM_ORDER
;
	/*
	 * The default group preallocation is 512, which for 4k block
	 * sizes translates to 2 megabytes. However for bigalloc file
	 * systems, this is probably too big (i.e., if the cluster size
	 * is 1 megabyte, then the group preallocation size becomes half a
	 * gigabyte!). As a default, we will keep a two megabyte
	 * group prealloc size for cluster sizes up to 64k, and after
	 * that, we will force a minimum group preallocation size of
	 * 32 clusters. This translates to 8 megs when the cluster
	 * size is 256k, and 32 megs when the cluster size is 1 meg,
	 * which seems reasonable as a default.
	 */
	sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
				       sbi->s_cluster_bits, 32);
	/*
	 * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
	 * to the lowest multiple of s_stripe which is bigger than
	 * the s_mb_group_prealloc as determined above. We want
	 * the preallocation size to be an exact multiple of the
	 * RAID stripe size so that preallocations don't fragment
	 * the stripes.
	 */
	if (sbi->s_stripe > 1) {
		sbi->s_mb_group_prealloc = roundup(
			sbi->s_mb_group_prealloc, EXT4_NUM_B2C(sbi, sbi->s_stripe));
	}
3714 sbi
->s_locality_groups
= alloc_percpu(struct ext4_locality_group
);
3715 if (sbi
->s_locality_groups
== NULL
) {
3719 for_each_possible_cpu(i
) {
3720 struct ext4_locality_group
*lg
;
3721 lg
= per_cpu_ptr(sbi
->s_locality_groups
, i
);
3722 mutex_init(&lg
->lg_mutex
);
3723 for (j
= 0; j
< PREALLOC_TB_SIZE
; j
++)
3724 INIT_LIST_HEAD(&lg
->lg_prealloc_list
[j
]);
3725 spin_lock_init(&lg
->lg_prealloc_lock
);
3728 if (bdev_nonrot(sb
->s_bdev
))
3729 sbi
->s_mb_max_linear_groups
= 0;
3731 sbi
->s_mb_max_linear_groups
= MB_DEFAULT_LINEAR_LIMIT
;
3732 /* init file for buddy data */
3733 ret
= ext4_mb_init_backend(sb
);
3735 goto out_free_locality_groups
;
3739 out_free_locality_groups
:
3740 free_percpu(sbi
->s_locality_groups
);
3741 sbi
->s_locality_groups
= NULL
;
3743 kfree(sbi
->s_mb_avg_fragment_size
);
3744 kfree(sbi
->s_mb_avg_fragment_size_locks
);
3745 kfree(sbi
->s_mb_largest_free_orders
);
3746 kfree(sbi
->s_mb_largest_free_orders_locks
);
3747 kfree(sbi
->s_mb_offsets
);
3748 sbi
->s_mb_offsets
= NULL
;
3749 kfree(sbi
->s_mb_maxs
);
3750 sbi
->s_mb_maxs
= NULL
;
/* needs to be called with the ext4 group lock held */
3755 static int ext4_mb_cleanup_pa(struct ext4_group_info
*grp
)
3757 struct ext4_prealloc_space
*pa
;
3758 struct list_head
*cur
, *tmp
;
3761 list_for_each_safe(cur
, tmp
, &grp
->bb_prealloc_list
) {
3762 pa
= list_entry(cur
, struct ext4_prealloc_space
, pa_group_list
);
3763 list_del(&pa
->pa_group_list
);
3765 kmem_cache_free(ext4_pspace_cachep
, pa
);
3770 void ext4_mb_release(struct super_block
*sb
)
3772 ext4_group_t ngroups
= ext4_get_groups_count(sb
);
3774 int num_meta_group_infos
;
3775 struct ext4_group_info
*grinfo
, ***group_info
;
3776 struct ext4_sb_info
*sbi
= EXT4_SB(sb
);
3777 struct kmem_cache
*cachep
= get_groupinfo_cache(sb
->s_blocksize_bits
);
3780 if (test_opt(sb
, DISCARD
)) {
3782 * wait the discard work to drain all of ext4_free_data
3784 flush_work(&sbi
->s_discard_work
);
3785 WARN_ON_ONCE(!list_empty(&sbi
->s_discard_list
));
3788 if (sbi
->s_group_info
) {
3789 for (i
= 0; i
< ngroups
; i
++) {
3791 grinfo
= ext4_get_group_info(sb
, i
);
3794 mb_group_bb_bitmap_free(grinfo
);
3795 ext4_lock_group(sb
, i
);
3796 count
= ext4_mb_cleanup_pa(grinfo
);
3798 mb_debug(sb
, "mballoc: %d PAs left\n",
3800 ext4_unlock_group(sb
, i
);
3801 kmem_cache_free(cachep
, grinfo
);
3803 num_meta_group_infos
= (ngroups
+
3804 EXT4_DESC_PER_BLOCK(sb
) - 1) >>
3805 EXT4_DESC_PER_BLOCK_BITS(sb
);
3807 group_info
= rcu_dereference(sbi
->s_group_info
);
3808 for (i
= 0; i
< num_meta_group_infos
; i
++)
3809 kfree(group_info
[i
]);
3813 kfree(sbi
->s_mb_avg_fragment_size
);
3814 kfree(sbi
->s_mb_avg_fragment_size_locks
);
3815 kfree(sbi
->s_mb_largest_free_orders
);
3816 kfree(sbi
->s_mb_largest_free_orders_locks
);
3817 kfree(sbi
->s_mb_offsets
);
3818 kfree(sbi
->s_mb_maxs
);
3819 iput(sbi
->s_buddy_cache
);
3820 if (sbi
->s_mb_stats
) {
3821 ext4_msg(sb
, KERN_INFO
,
3822 "mballoc: %u blocks %u reqs (%u success)",
3823 atomic_read(&sbi
->s_bal_allocated
),
3824 atomic_read(&sbi
->s_bal_reqs
),
3825 atomic_read(&sbi
->s_bal_success
));
3826 ext4_msg(sb
, KERN_INFO
,
3827 "mballoc: %u extents scanned, %u groups scanned, %u goal hits, "
3828 "%u 2^N hits, %u breaks, %u lost",
3829 atomic_read(&sbi
->s_bal_ex_scanned
),
3830 atomic_read(&sbi
->s_bal_groups_scanned
),
3831 atomic_read(&sbi
->s_bal_goals
),
3832 atomic_read(&sbi
->s_bal_2orders
),
3833 atomic_read(&sbi
->s_bal_breaks
),
3834 atomic_read(&sbi
->s_mb_lost_chunks
));
3835 ext4_msg(sb
, KERN_INFO
,
3836 "mballoc: %u generated and it took %llu",
3837 atomic_read(&sbi
->s_mb_buddies_generated
),
3838 atomic64_read(&sbi
->s_mb_generation_time
));
3839 ext4_msg(sb
, KERN_INFO
,
3840 "mballoc: %u preallocated, %u discarded",
3841 atomic_read(&sbi
->s_mb_preallocated
),
3842 atomic_read(&sbi
->s_mb_discarded
));
3845 free_percpu(sbi
->s_locality_groups
);
3848 static inline int ext4_issue_discard(struct super_block
*sb
,
3849 ext4_group_t block_group
, ext4_grpblk_t cluster
, int count
)
3851 ext4_fsblk_t discard_block
;
3853 discard_block
= (EXT4_C2B(EXT4_SB(sb
), cluster
) +
3854 ext4_group_first_block_no(sb
, block_group
));
3855 count
= EXT4_C2B(EXT4_SB(sb
), count
);
3856 trace_ext4_discard_blocks(sb
,
3857 (unsigned long long) discard_block
, count
);
3859 return sb_issue_discard(sb
, discard_block
, count
, GFP_NOFS
, 0);
3862 static void ext4_free_data_in_buddy(struct super_block
*sb
,
3863 struct ext4_free_data
*entry
)
3865 struct ext4_buddy e4b
;
3866 struct ext4_group_info
*db
;
3869 mb_debug(sb
, "gonna free %u blocks in group %u (0x%p):",
3870 entry
->efd_count
, entry
->efd_group
, entry
);
3872 err
= ext4_mb_load_buddy(sb
, entry
->efd_group
, &e4b
);
3873 /* we expect to find existing buddy because it's pinned */
3876 spin_lock(&EXT4_SB(sb
)->s_md_lock
);
3877 EXT4_SB(sb
)->s_mb_free_pending
-= entry
->efd_count
;
3878 spin_unlock(&EXT4_SB(sb
)->s_md_lock
);
3881 /* there are blocks to put in buddy to make them really free */
3882 count
+= entry
->efd_count
;
3883 ext4_lock_group(sb
, entry
->efd_group
);
3884 /* Take it out of per group rb tree */
3885 rb_erase(&entry
->efd_node
, &(db
->bb_free_root
));
3886 mb_free_blocks(NULL
, &e4b
, entry
->efd_start_cluster
, entry
->efd_count
);
3889 * Clear the trimmed flag for the group so that the next
3890 * ext4_trim_fs can trim it.
3892 EXT4_MB_GRP_CLEAR_TRIMMED(db
);
3894 if (!db
->bb_free_root
.rb_node
) {
3895 /* No more items in the per group rb tree
3896 * balance refcounts from ext4_mb_free_metadata()
3898 folio_put(e4b
.bd_buddy_folio
);
3899 folio_put(e4b
.bd_bitmap_folio
);
3901 ext4_unlock_group(sb
, entry
->efd_group
);
3902 ext4_mb_unload_buddy(&e4b
);
3904 mb_debug(sb
, "freed %d blocks in 1 structures\n", count
);
3908 * This function is called by the jbd2 layer once the commit has finished,
3909 * so we know we can free the blocks that were released with that commit.
3911 void ext4_process_freed_data(struct super_block
*sb
, tid_t commit_tid
)
3913 struct ext4_sb_info
*sbi
= EXT4_SB(sb
);
3914 struct ext4_free_data
*entry
, *tmp
;
3915 LIST_HEAD(freed_data_list
);
3916 struct list_head
*s_freed_head
= &sbi
->s_freed_data_list
[commit_tid
& 1];
3919 list_replace_init(s_freed_head
, &freed_data_list
);
3921 list_for_each_entry(entry
, &freed_data_list
, efd_list
)
3922 ext4_free_data_in_buddy(sb
, entry
);
3924 if (test_opt(sb
, DISCARD
)) {
3925 spin_lock(&sbi
->s_md_lock
);
3926 wake
= list_empty(&sbi
->s_discard_list
);
3927 list_splice_tail(&freed_data_list
, &sbi
->s_discard_list
);
3928 spin_unlock(&sbi
->s_md_lock
);
3930 queue_work(system_unbound_wq
, &sbi
->s_discard_work
);
3932 list_for_each_entry_safe(entry
, tmp
, &freed_data_list
, efd_list
)
3933 kmem_cache_free(ext4_free_data_cachep
, entry
);
3937 int __init
ext4_init_mballoc(void)
3939 ext4_pspace_cachep
= KMEM_CACHE(ext4_prealloc_space
,
3940 SLAB_RECLAIM_ACCOUNT
);
3941 if (ext4_pspace_cachep
== NULL
)
3944 ext4_ac_cachep
= KMEM_CACHE(ext4_allocation_context
,
3945 SLAB_RECLAIM_ACCOUNT
);
3946 if (ext4_ac_cachep
== NULL
)
3949 ext4_free_data_cachep
= KMEM_CACHE(ext4_free_data
,
3950 SLAB_RECLAIM_ACCOUNT
);
3951 if (ext4_free_data_cachep
== NULL
)
3957 kmem_cache_destroy(ext4_ac_cachep
);
3959 kmem_cache_destroy(ext4_pspace_cachep
);
3964 void ext4_exit_mballoc(void)
3967 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
3968 * before destroying the slab cache.
3971 kmem_cache_destroy(ext4_pspace_cachep
);
3972 kmem_cache_destroy(ext4_ac_cachep
);
3973 kmem_cache_destroy(ext4_free_data_cachep
);
3974 ext4_groupinfo_destroy_slabs();
3977 #define EXT4_MB_BITMAP_MARKED_CHECK 0x0001
3978 #define EXT4_MB_SYNC_UPDATE 0x0002
3980 ext4_mb_mark_context(handle_t
*handle
, struct super_block
*sb
, bool state
,
3981 ext4_group_t group
, ext4_grpblk_t blkoff
,
3982 ext4_grpblk_t len
, int flags
, ext4_grpblk_t
*ret_changed
)
3984 struct ext4_sb_info
*sbi
= EXT4_SB(sb
);
3985 struct buffer_head
*bitmap_bh
= NULL
;
3986 struct ext4_group_desc
*gdp
;
3987 struct buffer_head
*gdp_bh
;
3989 unsigned int i
, already
, changed
= len
;
3991 KUNIT_STATIC_STUB_REDIRECT(ext4_mb_mark_context
,
3992 handle
, sb
, state
, group
, blkoff
, len
,
3993 flags
, ret_changed
);
3997 bitmap_bh
= ext4_read_block_bitmap(sb
, group
);
3998 if (IS_ERR(bitmap_bh
))
3999 return PTR_ERR(bitmap_bh
);
4002 BUFFER_TRACE(bitmap_bh
, "getting write access");
4003 err
= ext4_journal_get_write_access(handle
, sb
, bitmap_bh
,
4010 gdp
= ext4_get_group_desc(sb
, group
, &gdp_bh
);
4015 BUFFER_TRACE(gdp_bh
, "get_write_access");
4016 err
= ext4_journal_get_write_access(handle
, sb
, gdp_bh
,
4022 ext4_lock_group(sb
, group
);
4023 if (ext4_has_group_desc_csum(sb
) &&
4024 (gdp
->bg_flags
& cpu_to_le16(EXT4_BG_BLOCK_UNINIT
))) {
4025 gdp
->bg_flags
&= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT
);
4026 ext4_free_group_clusters_set(sb
, gdp
,
4027 ext4_free_clusters_after_init(sb
, group
, gdp
));
4030 if (flags
& EXT4_MB_BITMAP_MARKED_CHECK
) {
4032 for (i
= 0; i
< len
; i
++)
4033 if (mb_test_bit(blkoff
+ i
, bitmap_bh
->b_data
) ==
4036 changed
= len
- already
;
4040 mb_set_bits(bitmap_bh
->b_data
, blkoff
, len
);
4041 ext4_free_group_clusters_set(sb
, gdp
,
4042 ext4_free_group_clusters(sb
, gdp
) - changed
);
4044 mb_clear_bits(bitmap_bh
->b_data
, blkoff
, len
);
4045 ext4_free_group_clusters_set(sb
, gdp
,
4046 ext4_free_group_clusters(sb
, gdp
) + changed
);
4049 ext4_block_bitmap_csum_set(sb
, gdp
, bitmap_bh
);
4050 ext4_group_desc_csum_set(sb
, group
, gdp
);
4051 ext4_unlock_group(sb
, group
);
4053 *ret_changed
= changed
;
4055 if (sbi
->s_log_groups_per_flex
) {
4056 ext4_group_t flex_group
= ext4_flex_group(sbi
, group
);
4057 struct flex_groups
*fg
= sbi_array_rcu_deref(sbi
,
4058 s_flex_groups
, flex_group
);
4061 atomic64_sub(changed
, &fg
->free_clusters
);
4063 atomic64_add(changed
, &fg
->free_clusters
);
4066 err
= ext4_handle_dirty_metadata(handle
, NULL
, bitmap_bh
);
4069 err
= ext4_handle_dirty_metadata(handle
, NULL
, gdp_bh
);
	if (flags & EXT4_MB_SYNC_UPDATE) {
		sync_dirty_buffer(bitmap_bh);
		sync_dirty_buffer(gdp_bh);
	}

out_err:
	brelse(bitmap_bh);
	return err;
}
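/*
 * When EXT4_MB_BITMAP_MARKED_CHECK is passed, ext4_mb_mark_context() above
 * only accounts for bits that actually flip: it counts how many of the len
 * bits are already in the requested state and reports changed = len - already.
 * A stand-alone sketch of that accounting (user-space C, hypothetical names):
 *
 *	static int count_changed(const unsigned char *bitmap, int blkoff,
 *				 int len, int state)
 *	{
 *		int i, already = 0;
 *
 *		for (i = 0; i < len; i++) {
 *			int bit = (bitmap[(blkoff + i) / 8] >>
 *				   ((blkoff + i) % 8)) & 1;
 *			if (bit == state)
 *				already++;	// this block needs no change
 *		}
 *		return len - already;		// bits that will really flip
 *	}
 */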
4084 * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
4085 * Returns 0 if success or error code
4087 static noinline_for_stack
int
4088 ext4_mb_mark_diskspace_used(struct ext4_allocation_context
*ac
,
4089 handle_t
*handle
, unsigned int reserv_clstrs
)
4091 struct ext4_group_desc
*gdp
;
4092 struct ext4_sb_info
*sbi
;
4093 struct super_block
*sb
;
4097 ext4_grpblk_t changed
;
4099 BUG_ON(ac
->ac_status
!= AC_STATUS_FOUND
);
4100 BUG_ON(ac
->ac_b_ex
.fe_len
<= 0);
4105 gdp
= ext4_get_group_desc(sb
, ac
->ac_b_ex
.fe_group
, NULL
);
4108 ext4_debug("using block group %u(%d)\n", ac
->ac_b_ex
.fe_group
,
4109 ext4_free_group_clusters(sb
, gdp
));
4111 block
= ext4_grp_offs_to_block(sb
, &ac
->ac_b_ex
);
4112 len
= EXT4_C2B(sbi
, ac
->ac_b_ex
.fe_len
);
4113 if (!ext4_inode_block_valid(ac
->ac_inode
, block
, len
)) {
4114 ext4_error(sb
, "Allocating blocks %llu-%llu which overlap "
4115 "fs metadata", block
, block
+len
);
4116 /* File system mounted not to panic on error
4117 * Fix the bitmap and return EFSCORRUPTED
4118 * We leak some of the blocks here.
4120 err
= ext4_mb_mark_context(handle
, sb
, true,
4121 ac
->ac_b_ex
.fe_group
,
4122 ac
->ac_b_ex
.fe_start
,
4126 err
= -EFSCORRUPTED
;
4130 #ifdef AGGRESSIVE_CHECK
4131 flags
|= EXT4_MB_BITMAP_MARKED_CHECK
;
4133 err
= ext4_mb_mark_context(handle
, sb
, true, ac
->ac_b_ex
.fe_group
,
4134 ac
->ac_b_ex
.fe_start
, ac
->ac_b_ex
.fe_len
,
4137 if (err
&& changed
== 0)
4140 #ifdef AGGRESSIVE_CHECK
4141 BUG_ON(changed
!= ac
->ac_b_ex
.fe_len
);
4143 percpu_counter_sub(&sbi
->s_freeclusters_counter
, ac
->ac_b_ex
.fe_len
);
4145 * Now reduce the dirty block count also. Should not go negative
4147 if (!(ac
->ac_flags
& EXT4_MB_DELALLOC_RESERVED
))
4148 /* release all the reserved blocks if non delalloc */
4149 percpu_counter_sub(&sbi
->s_dirtyclusters_counter
,
4156 * Idempotent helper for Ext4 fast commit replay path to set the state of
4157 * blocks in bitmaps and update counters.
4159 void ext4_mb_mark_bb(struct super_block
*sb
, ext4_fsblk_t block
,
4160 int len
, bool state
)
4162 struct ext4_sb_info
*sbi
= EXT4_SB(sb
);
4164 ext4_grpblk_t blkoff
;
4166 unsigned int clen
, thisgrp_len
;
4169 ext4_get_group_no_and_offset(sb
, block
, &group
, &blkoff
);
4172 * Check to see if we are freeing blocks across a group
4174 * In case of flex_bg, this can happen that (block, len) may
4175 * span across more than one group. In that case we need to
4176 * get the corresponding group metadata to work with.
4177 * For this we have goto again loop.
4179 thisgrp_len
= min_t(unsigned int, (unsigned int)len
,
4180 EXT4_BLOCKS_PER_GROUP(sb
) - EXT4_C2B(sbi
, blkoff
));
4181 clen
= EXT4_NUM_B2C(sbi
, thisgrp_len
);
4183 if (!ext4_sb_block_valid(sb
, NULL
, block
, thisgrp_len
)) {
4184 ext4_error(sb
, "Marking blocks in system zone - "
4185 "Block = %llu, len = %u",
4186 block
, thisgrp_len
);
4190 err
= ext4_mb_mark_context(NULL
, sb
, state
,
4191 group
, blkoff
, clen
,
4192 EXT4_MB_BITMAP_MARKED_CHECK
|
4193 EXT4_MB_SYNC_UPDATE
,
4198 block
+= thisgrp_len
;
/*
 * here we normalize request for locality group
 * Group requests are normalized to s_mb_group_prealloc, which goes to
 * s_strip if we set the same via mount option.
 * s_mb_group_prealloc can be configured via
 * /sys/fs/ext4/<partition>/mb_group_prealloc
 *
 * XXX: should we try to preallocate more than the group has now?
 */
static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
{
	struct super_block *sb = ac->ac_sb;
	struct ext4_locality_group *lg = ac->ac_lg;

	BUG_ON(lg == NULL);
	ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
	mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len);
}
/*
 * This function returns the next element to look at during inode
 * PA rbtree walk. We assume that we have held the inode PA rbtree lock
 * (ei->i_prealloc_lock)
 *
 * new_start	The start of the range we want to compare
 * cur_start	The existing start that we are comparing against
 * node		The node of the rb_tree
 */
static inline struct rb_node *
ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_node *node)
{
	if (new_start < cur_start)
		return node->rb_left;
	else
		return node->rb_right;
}
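/*
 * The helper above is the classic binary-search-tree descent step keyed on
 * pa_lstart.  A stand-alone equivalent on a plain struct, for illustration
 * only (user-space C, hypothetical names):
 *
 *	struct node {
 *		unsigned int start;
 *		struct node *left, *right;
 *	};
 *
 *	static struct node *lookup(struct node *root, unsigned int new_start)
 *	{
 *		struct node *n = root;
 *
 *		while (n && n->start != new_start)
 *			n = (new_start < n->start) ? n->left : n->right;
 *		return n;	// NULL when no node has that start
 *	}
 */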
static inline void
ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac,
			  ext4_lblk_t start, loff_t end)
{
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
	struct ext4_prealloc_space *tmp_pa;
	ext4_lblk_t tmp_pa_start;
	loff_t tmp_pa_end;
	struct rb_node *iter;

	read_lock(&ei->i_prealloc_lock);
	for (iter = ei->i_prealloc_node.rb_node; iter;
	     iter = ext4_mb_pa_rb_next_iter(start, tmp_pa_start, iter)) {
		tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
				  pa_node.inode_node);
		tmp_pa_start = tmp_pa->pa_lstart;
		tmp_pa_end = pa_logical_end(sbi, tmp_pa);

		spin_lock(&tmp_pa->pa_lock);
		if (tmp_pa->pa_deleted == 0)
			BUG_ON(!(start >= tmp_pa_end || end <= tmp_pa_start));
		spin_unlock(&tmp_pa->pa_lock);
	}
	read_unlock(&ei->i_prealloc_lock);
}
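/*
 * The BUG_ON() above is the usual half-open interval test: two ranges
 * [s1, e1) and [s2, e2) are disjoint exactly when s1 >= e2 || e1 <= s2.
 * Stand-alone check (user-space C, hypothetical names):
 *
 *	static int ranges_overlap(unsigned long long s1, unsigned long long e1,
 *				  unsigned long long s2, unsigned long long e2)
 *	{
 *		return !(s1 >= e2 || e1 <= s2);
 *	}
 *
 * e.g. ranges_overlap(10, 20, 20, 30) == 0 but ranges_overlap(10, 21, 20, 30) == 1.
 */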
4269 * Given an allocation context "ac" and a range "start", "end", check
4270 * and adjust boundaries if the range overlaps with any of the existing
4271 * preallocatoins stored in the corresponding inode of the allocation context.
4274 * ac allocation context
4275 * start start of the new range
4276 * end end of the new range
4279 ext4_mb_pa_adjust_overlap(struct ext4_allocation_context
*ac
,
4280 ext4_lblk_t
*start
, loff_t
*end
)
4282 struct ext4_inode_info
*ei
= EXT4_I(ac
->ac_inode
);
4283 struct ext4_sb_info
*sbi
= EXT4_SB(ac
->ac_sb
);
4284 struct ext4_prealloc_space
*tmp_pa
= NULL
, *left_pa
= NULL
, *right_pa
= NULL
;
4285 struct rb_node
*iter
;
4286 ext4_lblk_t new_start
, tmp_pa_start
, right_pa_start
= -1;
4287 loff_t new_end
, tmp_pa_end
, left_pa_end
= -1;
4293 * Adjust the normalized range so that it doesn't overlap with any
4294 * existing preallocated blocks(PAs). Make sure to hold the rbtree lock
4295 * so it doesn't change underneath us.
4297 read_lock(&ei
->i_prealloc_lock
);
4299 /* Step 1: find any one immediate neighboring PA of the normalized range */
4300 for (iter
= ei
->i_prealloc_node
.rb_node
; iter
;
4301 iter
= ext4_mb_pa_rb_next_iter(ac
->ac_o_ex
.fe_logical
,
4302 tmp_pa_start
, iter
)) {
4303 tmp_pa
= rb_entry(iter
, struct ext4_prealloc_space
,
4304 pa_node
.inode_node
);
4305 tmp_pa_start
= tmp_pa
->pa_lstart
;
4306 tmp_pa_end
= pa_logical_end(sbi
, tmp_pa
);
4308 /* PA must not overlap original request */
4309 spin_lock(&tmp_pa
->pa_lock
);
4310 if (tmp_pa
->pa_deleted
== 0)
4311 BUG_ON(!(ac
->ac_o_ex
.fe_logical
>= tmp_pa_end
||
4312 ac
->ac_o_ex
.fe_logical
< tmp_pa_start
));
4313 spin_unlock(&tmp_pa
->pa_lock
);
4317 * Step 2: check if the found PA is left or right neighbor and
4318 * get the other neighbor
4321 if (tmp_pa
->pa_lstart
< ac
->ac_o_ex
.fe_logical
) {
4322 struct rb_node
*tmp
;
4325 tmp
= rb_next(&left_pa
->pa_node
.inode_node
);
4327 right_pa
= rb_entry(tmp
,
4328 struct ext4_prealloc_space
,
4329 pa_node
.inode_node
);
4332 struct rb_node
*tmp
;
4335 tmp
= rb_prev(&right_pa
->pa_node
.inode_node
);
4337 left_pa
= rb_entry(tmp
,
4338 struct ext4_prealloc_space
,
4339 pa_node
.inode_node
);
4344 /* Step 3: get the non deleted neighbors */
4346 for (iter
= &left_pa
->pa_node
.inode_node
;;
4347 iter
= rb_prev(iter
)) {
4353 tmp_pa
= rb_entry(iter
, struct ext4_prealloc_space
,
4354 pa_node
.inode_node
);
4356 spin_lock(&tmp_pa
->pa_lock
);
4357 if (tmp_pa
->pa_deleted
== 0) {
4358 spin_unlock(&tmp_pa
->pa_lock
);
4361 spin_unlock(&tmp_pa
->pa_lock
);
4366 for (iter
= &right_pa
->pa_node
.inode_node
;;
4367 iter
= rb_next(iter
)) {
4373 tmp_pa
= rb_entry(iter
, struct ext4_prealloc_space
,
4374 pa_node
.inode_node
);
4376 spin_lock(&tmp_pa
->pa_lock
);
4377 if (tmp_pa
->pa_deleted
== 0) {
4378 spin_unlock(&tmp_pa
->pa_lock
);
4381 spin_unlock(&tmp_pa
->pa_lock
);
4386 left_pa_end
= pa_logical_end(sbi
, left_pa
);
4387 BUG_ON(left_pa_end
> ac
->ac_o_ex
.fe_logical
);
4391 right_pa_start
= right_pa
->pa_lstart
;
4392 BUG_ON(right_pa_start
<= ac
->ac_o_ex
.fe_logical
);
4395 /* Step 4: trim our normalized range to not overlap with the neighbors */
4397 if (left_pa_end
> new_start
)
4398 new_start
= left_pa_end
;
4402 if (right_pa_start
< new_end
)
4403 new_end
= right_pa_start
;
4405 read_unlock(&ei
->i_prealloc_lock
);
4407 /* XXX: extra loop to check we really don't overlap preallocations */
4408 ext4_mb_pa_assert_overlap(ac
, new_start
, new_end
);
4415 * Normalization means making request better in terms of
4416 * size and alignment
4418 static noinline_for_stack
void
4419 ext4_mb_normalize_request(struct ext4_allocation_context
*ac
,
4420 struct ext4_allocation_request
*ar
)
4422 struct ext4_sb_info
*sbi
= EXT4_SB(ac
->ac_sb
);
4423 struct ext4_super_block
*es
= sbi
->s_es
;
4425 loff_t size
, start_off
, end
;
4426 loff_t orig_size __maybe_unused
;
4429 /* do normalize only data requests, metadata requests
4430 do not need preallocation */
4431 if (!(ac
->ac_flags
& EXT4_MB_HINT_DATA
))
4434 /* sometime caller may want exact blocks */
4435 if (unlikely(ac
->ac_flags
& EXT4_MB_HINT_GOAL_ONLY
))
4438 /* caller may indicate that preallocation isn't
4439 * required (it's a tail, for example) */
4440 if (ac
->ac_flags
& EXT4_MB_HINT_NOPREALLOC
)
4443 if (ac
->ac_flags
& EXT4_MB_HINT_GROUP_ALLOC
) {
4444 ext4_mb_normalize_group_request(ac
);
4448 bsbits
= ac
->ac_sb
->s_blocksize_bits
;
4450 /* first, let's learn actual file size
4451 * given current request is allocated */
4452 size
= extent_logical_end(sbi
, &ac
->ac_o_ex
);
4453 size
= size
<< bsbits
;
4454 if (size
< i_size_read(ac
->ac_inode
))
4455 size
= i_size_read(ac
->ac_inode
);
4458 /* max size of free chunks */
4461 #define NRL_CHECK_SIZE(req, size, max, chunk_size) \
4462 (req <= (size) || max <= (chunk_size))
4464 /* first, try to predict filesize */
4465 /* XXX: should this table be tunable? */
4467 if (size
<= 16 * 1024) {
4469 } else if (size
<= 32 * 1024) {
4471 } else if (size
<= 64 * 1024) {
4473 } else if (size
<= 128 * 1024) {
4475 } else if (size
<= 256 * 1024) {
4477 } else if (size
<= 512 * 1024) {
4479 } else if (size
<= 1024 * 1024) {
4481 } else if (NRL_CHECK_SIZE(size
, 4 * 1024 * 1024, max
, 2 * 1024)) {
4482 start_off
= ((loff_t
)ac
->ac_o_ex
.fe_logical
>>
4483 (21 - bsbits
)) << 21;
4484 size
= 2 * 1024 * 1024;
4485 } else if (NRL_CHECK_SIZE(size
, 8 * 1024 * 1024, max
, 4 * 1024)) {
4486 start_off
= ((loff_t
)ac
->ac_o_ex
.fe_logical
>>
4487 (22 - bsbits
)) << 22;
4488 size
= 4 * 1024 * 1024;
4489 } else if (NRL_CHECK_SIZE(EXT4_C2B(sbi
, ac
->ac_o_ex
.fe_len
),
4490 (8<<20)>>bsbits
, max
, 8 * 1024)) {
4491 start_off
= ((loff_t
)ac
->ac_o_ex
.fe_logical
>>
4492 (23 - bsbits
)) << 23;
4493 size
= 8 * 1024 * 1024;
4495 start_off
= (loff_t
) ac
->ac_o_ex
.fe_logical
<< bsbits
;
4496 size
= (loff_t
) EXT4_C2B(sbi
,
4497 ac
->ac_o_ex
.fe_len
) << bsbits
;
4499 size
= size
>> bsbits
;
4500 start
= start_off
>> bsbits
;
4503 * For tiny groups (smaller than 8MB) the chosen allocation
4504 * alignment may be larger than group size. Make sure the
4505 * alignment does not move allocation to a different group which
4506 * makes mballoc fail assertions later.
4508 start
= max(start
, rounddown(ac
->ac_o_ex
.fe_logical
,
4509 (ext4_lblk_t
)EXT4_BLOCKS_PER_GROUP(ac
->ac_sb
)));
4511 /* avoid unnecessary preallocation that may trigger assertions */
4512 if (start
+ size
> EXT_MAX_BLOCKS
)
4513 size
= EXT_MAX_BLOCKS
- start
;
4515 /* don't cover already allocated blocks in selected range */
4516 if (ar
->pleft
&& start
<= ar
->lleft
) {
4517 size
-= ar
->lleft
+ 1 - start
;
4518 start
= ar
->lleft
+ 1;
4520 if (ar
->pright
&& start
+ size
- 1 >= ar
->lright
)
4521 size
-= start
+ size
- ar
->lright
;
4524 * Trim allocation request for filesystems with artificially small
4527 if (size
> EXT4_BLOCKS_PER_GROUP(ac
->ac_sb
))
4528 size
= EXT4_BLOCKS_PER_GROUP(ac
->ac_sb
);
4532 ext4_mb_pa_adjust_overlap(ac
, &start
, &end
);
4537 * In this function "start" and "size" are normalized for better
4538 * alignment and length such that we could preallocate more blocks.
4539 * This normalization is done such that original request of
4540 * ac->ac_o_ex.fe_logical & fe_len should always lie within "start" and
4541 * "size" boundaries.
4542 * (Note fe_len can be relaxed since FS block allocation API does not
4543 * provide gurantee on number of contiguous blocks allocation since that
4544 * depends upon free space left, etc).
4545 * In case of inode pa, later we use the allocated blocks
4546 * [pa_pstart + fe_logical - pa_lstart, fe_len/size] from the preallocated
4547 * range of goal/best blocks [start, size] to put it at the
4548 * ac_o_ex.fe_logical extent of this inode.
4549 * (See ext4_mb_use_inode_pa() for more details)
4551 if (start
+ size
<= ac
->ac_o_ex
.fe_logical
||
4552 start
> ac
->ac_o_ex
.fe_logical
) {
4553 ext4_msg(ac
->ac_sb
, KERN_ERR
,
4554 "start %lu, size %lu, fe_logical %lu",
4555 (unsigned long) start
, (unsigned long) size
,
4556 (unsigned long) ac
->ac_o_ex
.fe_logical
);
4559 BUG_ON(size
<= 0 || size
> EXT4_BLOCKS_PER_GROUP(ac
->ac_sb
));
4561 /* now prepare goal request */
4563 /* XXX: is it better to align blocks WRT to logical
4564 * placement or satisfy big request as is */
4565 ac
->ac_g_ex
.fe_logical
= start
;
4566 ac
->ac_g_ex
.fe_len
= EXT4_NUM_B2C(sbi
, size
);
4567 ac
->ac_orig_goal_len
= ac
->ac_g_ex
.fe_len
;
4569 /* define goal start in order to merge */
4570 if (ar
->pright
&& (ar
->lright
== (start
+ size
)) &&
4571 ar
->pright
>= size
&&
4572 ar
->pright
- size
>= le32_to_cpu(es
->s_first_data_block
)) {
4573 /* merge to the right */
4574 ext4_get_group_no_and_offset(ac
->ac_sb
, ar
->pright
- size
,
4575 &ac
->ac_g_ex
.fe_group
,
4576 &ac
->ac_g_ex
.fe_start
);
4577 ac
->ac_flags
|= EXT4_MB_HINT_TRY_GOAL
;
4579 if (ar
->pleft
&& (ar
->lleft
+ 1 == start
) &&
4580 ar
->pleft
+ 1 < ext4_blocks_count(es
)) {
4581 /* merge to the left */
4582 ext4_get_group_no_and_offset(ac
->ac_sb
, ar
->pleft
+ 1,
4583 &ac
->ac_g_ex
.fe_group
,
4584 &ac
->ac_g_ex
.fe_start
);
4585 ac
->ac_flags
|= EXT4_MB_HINT_TRY_GOAL
;
	mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size,
		 orig_size, start);
}
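/*
 * The size table in ext4_mb_normalize_request() above rounds the predicted
 * file size up to one of a few fixed preallocation windows and aligns the
 * logical start to that window.  A simplified stand-alone model of the
 * small-file part of that table (user-space C, hypothetical helper; only the
 * first few thresholds are shown, larger sizes go through NRL_CHECK_SIZE()):
 *
 *	static unsigned long prealloc_window(unsigned long predicted_bytes)
 *	{
 *		if (predicted_bytes <= 16 * 1024)
 *			return 16 * 1024;
 *		if (predicted_bytes <= 32 * 1024)
 *			return 32 * 1024;
 *		if (predicted_bytes <= 64 * 1024)
 *			return 64 * 1024;
 *		if (predicted_bytes <= 128 * 1024)
 *			return 128 * 1024;
 *		return 0;	// fall back to the larger-size rules above
 *	}
 */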
4592 static void ext4_mb_collect_stats(struct ext4_allocation_context
*ac
)
4594 struct ext4_sb_info
*sbi
= EXT4_SB(ac
->ac_sb
);
4596 if (sbi
->s_mb_stats
&& ac
->ac_g_ex
.fe_len
>= 1) {
4597 atomic_inc(&sbi
->s_bal_reqs
);
4598 atomic_add(ac
->ac_b_ex
.fe_len
, &sbi
->s_bal_allocated
);
4599 if (ac
->ac_b_ex
.fe_len
>= ac
->ac_o_ex
.fe_len
)
4600 atomic_inc(&sbi
->s_bal_success
);
4602 atomic_add(ac
->ac_found
, &sbi
->s_bal_ex_scanned
);
4603 for (int i
=0; i
<EXT4_MB_NUM_CRS
; i
++) {
4604 atomic_add(ac
->ac_cX_found
[i
], &sbi
->s_bal_cX_ex_scanned
[i
]);
4607 atomic_add(ac
->ac_groups_scanned
, &sbi
->s_bal_groups_scanned
);
4608 if (ac
->ac_g_ex
.fe_start
== ac
->ac_b_ex
.fe_start
&&
4609 ac
->ac_g_ex
.fe_group
== ac
->ac_b_ex
.fe_group
)
4610 atomic_inc(&sbi
->s_bal_goals
);
4611 /* did we allocate as much as normalizer originally wanted? */
4612 if (ac
->ac_f_ex
.fe_len
== ac
->ac_orig_goal_len
)
4613 atomic_inc(&sbi
->s_bal_len_goals
);
4615 if (ac
->ac_found
> sbi
->s_mb_max_to_scan
)
4616 atomic_inc(&sbi
->s_bal_breaks
);
4619 if (ac
->ac_op
== EXT4_MB_HISTORY_ALLOC
)
4620 trace_ext4_mballoc_alloc(ac
);
4622 trace_ext4_mballoc_prealloc(ac
);
4626 * Called on failure; free up any blocks from the inode PA for this
4627 * context. We don't need this for MB_GROUP_PA because we only change
4628 * pa_free in ext4_mb_release_context(), but on failure, we've already
4629 * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
4631 static void ext4_discard_allocated_blocks(struct ext4_allocation_context
*ac
)
4633 struct ext4_prealloc_space
*pa
= ac
->ac_pa
;
4634 struct ext4_buddy e4b
;
4638 if (ac
->ac_f_ex
.fe_len
== 0)
4640 err
= ext4_mb_load_buddy(ac
->ac_sb
, ac
->ac_f_ex
.fe_group
, &e4b
);
4641 if (WARN_RATELIMIT(err
,
4642 "ext4: mb_load_buddy failed (%d)", err
))
4644 * This should never happen since we pin the
4645 * pages in the ext4_allocation_context so
4646 * ext4_mb_load_buddy() should never fail.
4649 ext4_lock_group(ac
->ac_sb
, ac
->ac_f_ex
.fe_group
);
4650 mb_free_blocks(ac
->ac_inode
, &e4b
, ac
->ac_f_ex
.fe_start
,
4651 ac
->ac_f_ex
.fe_len
);
4652 ext4_unlock_group(ac
->ac_sb
, ac
->ac_f_ex
.fe_group
);
4653 ext4_mb_unload_buddy(&e4b
);
4656 if (pa
->pa_type
== MB_INODE_PA
) {
4657 spin_lock(&pa
->pa_lock
);
4658 pa
->pa_free
+= ac
->ac_b_ex
.fe_len
;
4659 spin_unlock(&pa
->pa_lock
);
/*
 * use blocks preallocated to inode
 */
static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
				struct ext4_prealloc_space *pa)
{
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	ext4_fsblk_t start;
	ext4_fsblk_t end;
	int len;

	/* found preallocated blocks, use them */
	start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
	end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
		  start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
	len = EXT4_NUM_B2C(sbi, end - start);
	ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
					&ac->ac_b_ex.fe_start);
	ac->ac_b_ex.fe_len = len;
	ac->ac_status = AC_STATUS_FOUND;
	ac->ac_pa = pa;

	BUG_ON(start < pa->pa_pstart);
	BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
	BUG_ON(pa->pa_free < len);
	BUG_ON(ac->ac_b_ex.fe_len <= 0);
	pa->pa_free -= len;

	mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa);
}
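/*
 * The start/end/len arithmetic above maps a logical offset into the
 * preallocated physical range: the physical start is pa_pstart shifted by
 * the same distance the request's logical block sits past pa_lstart, and the
 * length is clamped by both the PA size and the request size.  Stand-alone
 * sketch (user-space C, hypothetical names, lengths in blocks):
 *
 *	static unsigned long long use_pa(unsigned long long pa_pstart,
 *					 unsigned long long pa_lstart,
 *					 unsigned long pa_len,
 *					 unsigned long long req_lstart,
 *					 unsigned long req_len,
 *					 unsigned long *out_len)
 *	{
 *		unsigned long long start = pa_pstart + (req_lstart - pa_lstart);
 *		unsigned long long end = pa_pstart + pa_len;
 *
 *		if (end > start + req_len)
 *			end = start + req_len;	// don't take more than asked for
 *		*out_len = end - start;
 *		return start;
 *	}
 */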
/*
 * use blocks preallocated to locality group
 */
static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
				struct ext4_prealloc_space *pa)
{
	unsigned int len = ac->ac_o_ex.fe_len;

	ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
					&ac->ac_b_ex.fe_group,
					&ac->ac_b_ex.fe_start);
	ac->ac_b_ex.fe_len = len;
	ac->ac_status = AC_STATUS_FOUND;
	ac->ac_pa = pa;

	/* we don't correct pa_pstart or pa_len here to avoid
	 * possible race when the group is being loaded concurrently
	 * instead we correct pa later, after blocks are marked
	 * in on-disk bitmap -- see ext4_mb_release_context()
	 * Other CPUs are prevented from allocating from this pa by lg_mutex
	 */
	mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n",
		 pa->pa_lstart, len, pa);
}
/*
 * Return the prealloc space that has minimal distance
 * from the goal block. @cpa is the prealloc
 * space with the currently known minimal distance
 * from the goal block.
 */
static struct ext4_prealloc_space *
ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
			struct ext4_prealloc_space *pa,
			struct ext4_prealloc_space *cpa)
{
	ext4_fsblk_t cur_distance, new_distance;

	if (cpa == NULL) {
		atomic_inc(&pa->pa_count);
		return pa;
	}
	cur_distance = abs(goal_block - cpa->pa_pstart);
	new_distance = abs(goal_block - pa->pa_pstart);

	if (cur_distance <= new_distance)
		return cpa;

	/* drop the previous reference */
	atomic_dec(&cpa->pa_count);
	atomic_inc(&pa->pa_count);
	return pa;
}
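/*
 * The helper above keeps whichever candidate PA sits closest to the goal
 * block, taking a reference on the winner and dropping the reference on the
 * loser.  Ignoring the refcounting, the selection itself is just a running
 * minimum.  Stand-alone sketch (user-space C, hypothetical names):
 *
 *	static unsigned long long pick_closest(unsigned long long goal,
 *					       unsigned long long cur_best,
 *					       unsigned long long candidate)
 *	{
 *		unsigned long long dc = cur_best > goal ?
 *					cur_best - goal : goal - cur_best;
 *		unsigned long long dn = candidate > goal ?
 *					candidate - goal : goal - candidate;
 *
 *		return dc <= dn ? cur_best : candidate;	// ties keep the current best
 *	}
 */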
4749 * check if found pa meets EXT4_MB_HINT_GOAL_ONLY
4752 ext4_mb_pa_goal_check(struct ext4_allocation_context
*ac
,
4753 struct ext4_prealloc_space
*pa
)
4755 struct ext4_sb_info
*sbi
= EXT4_SB(ac
->ac_sb
);
4758 if (likely(!(ac
->ac_flags
& EXT4_MB_HINT_GOAL_ONLY
)))
4762 * If EXT4_MB_HINT_GOAL_ONLY is set, ac_g_ex will not be adjusted
4763 * in ext4_mb_normalize_request and will keep same with ac_o_ex
4764 * from ext4_mb_initialize_context. Choose ac_g_ex here to keep
4765 * consistent with ext4_mb_find_by_goal.
4767 start
= pa
->pa_pstart
+
4768 (ac
->ac_g_ex
.fe_logical
- pa
->pa_lstart
);
4769 if (ext4_grp_offs_to_block(ac
->ac_sb
, &ac
->ac_g_ex
) != start
)
4772 if (ac
->ac_g_ex
.fe_len
> pa
->pa_len
-
4773 EXT4_B2C(sbi
, ac
->ac_g_ex
.fe_logical
- pa
->pa_lstart
))
4780 * search goal blocks in preallocated space
4782 static noinline_for_stack
bool
4783 ext4_mb_use_preallocated(struct ext4_allocation_context
*ac
)
4785 struct ext4_sb_info
*sbi
= EXT4_SB(ac
->ac_sb
);
4787 struct ext4_inode_info
*ei
= EXT4_I(ac
->ac_inode
);
4788 struct ext4_locality_group
*lg
;
4789 struct ext4_prealloc_space
*tmp_pa
= NULL
, *cpa
= NULL
;
4790 struct rb_node
*iter
;
4791 ext4_fsblk_t goal_block
;
4793 /* only data can be preallocated */
4794 if (!(ac
->ac_flags
& EXT4_MB_HINT_DATA
))
4798 * first, try per-file preallocation by searching the inode pa rbtree.
4800 * Here, we can't do a direct traversal of the tree because
4801 * ext4_mb_discard_group_preallocation() can paralelly mark the pa
4802 * deleted and that can cause direct traversal to skip some entries.
4804 read_lock(&ei
->i_prealloc_lock
);
4806 if (RB_EMPTY_ROOT(&ei
->i_prealloc_node
)) {
4811 * Step 1: Find a pa with logical start immediately adjacent to the
4812 * original logical start. This could be on the left or right.
4814 * (tmp_pa->pa_lstart never changes so we can skip locking for it).
4816 for (iter
= ei
->i_prealloc_node
.rb_node
; iter
;
4817 iter
= ext4_mb_pa_rb_next_iter(ac
->ac_o_ex
.fe_logical
,
4818 tmp_pa
->pa_lstart
, iter
)) {
4819 tmp_pa
= rb_entry(iter
, struct ext4_prealloc_space
,
4820 pa_node
.inode_node
);
4824 * Step 2: The adjacent pa might be to the right of logical start, find
4825 * the left adjacent pa. After this step we'd have a valid tmp_pa whose
4826 * logical start is towards the left of original request's logical start
4828 if (tmp_pa
->pa_lstart
> ac
->ac_o_ex
.fe_logical
) {
4829 struct rb_node
*tmp
;
4830 tmp
= rb_prev(&tmp_pa
->pa_node
.inode_node
);
4833 tmp_pa
= rb_entry(tmp
, struct ext4_prealloc_space
,
4834 pa_node
.inode_node
);
4837 * If there is no adjacent pa to the left then finding
4838 * an overlapping pa is not possible hence stop searching
4845 BUG_ON(!(tmp_pa
&& tmp_pa
->pa_lstart
<= ac
->ac_o_ex
.fe_logical
));
4848 * Step 3: If the left adjacent pa is deleted, keep moving left to find
4849 * the first non deleted adjacent pa. After this step we should have a
4850 * valid tmp_pa which is guaranteed to be non deleted.
4852 for (iter
= &tmp_pa
->pa_node
.inode_node
;; iter
= rb_prev(iter
)) {
4855 * no non deleted left adjacent pa, so stop searching
4860 tmp_pa
= rb_entry(iter
, struct ext4_prealloc_space
,
4861 pa_node
.inode_node
);
4862 spin_lock(&tmp_pa
->pa_lock
);
4863 if (tmp_pa
->pa_deleted
== 0) {
4865 * We will keep holding the pa_lock from
4866 * this point on because we don't want group discard
4867 * to delete this pa underneath us. Since group
4868 * discard is anyways an ENOSPC operation it
4869 * should be okay for it to wait a few more cycles.
4873 spin_unlock(&tmp_pa
->pa_lock
);
4877 BUG_ON(!(tmp_pa
&& tmp_pa
->pa_lstart
<= ac
->ac_o_ex
.fe_logical
));
4878 BUG_ON(tmp_pa
->pa_deleted
== 1);
4881 * Step 4: We now have the non deleted left adjacent pa. Only this
4882 * pa can possibly satisfy the request hence check if it overlaps
4883 * original logical start and stop searching if it doesn't.
4885 if (ac
->ac_o_ex
.fe_logical
>= pa_logical_end(sbi
, tmp_pa
)) {
4886 spin_unlock(&tmp_pa
->pa_lock
);
4890 /* non-extent files can't have physical blocks past 2^32 */
4891 if (!(ext4_test_inode_flag(ac
->ac_inode
, EXT4_INODE_EXTENTS
)) &&
4892 (tmp_pa
->pa_pstart
+ EXT4_C2B(sbi
, tmp_pa
->pa_len
) >
4893 EXT4_MAX_BLOCK_FILE_PHYS
)) {
4895 * Since PAs don't overlap, we won't find any other PA to
4898 spin_unlock(&tmp_pa
->pa_lock
);
4902 if (tmp_pa
->pa_free
&& likely(ext4_mb_pa_goal_check(ac
, tmp_pa
))) {
4903 atomic_inc(&tmp_pa
->pa_count
);
4904 ext4_mb_use_inode_pa(ac
, tmp_pa
);
4905 spin_unlock(&tmp_pa
->pa_lock
);
4906 read_unlock(&ei
->i_prealloc_lock
);
4910 * We found a valid overlapping pa but couldn't use it because
4911 * it had no free blocks. This should ideally never happen
4914 * 1. When a new inode pa is added to rbtree it must have
4915 * pa_free > 0 since otherwise we won't actually need
4918 * 2. An inode pa that is in the rbtree can only have it's
4919 * pa_free become zero when another thread calls:
4920 * ext4_mb_new_blocks
4921 * ext4_mb_use_preallocated
4922 * ext4_mb_use_inode_pa
4924 * 3. Further, after the above calls make pa_free == 0, we will
4925 * immediately remove it from the rbtree in:
4926 * ext4_mb_new_blocks
4927 * ext4_mb_release_context
4930 * 4. Since the pa_free becoming 0 and pa_free getting removed
4931 * from tree both happen in ext4_mb_new_blocks, which is always
4932 * called with i_data_sem held for data allocations, we can be
4933 * sure that another process will never see a pa in rbtree with
4936 WARN_ON_ONCE(tmp_pa
->pa_free
== 0);
4938 spin_unlock(&tmp_pa
->pa_lock
);
4940 read_unlock(&ei
->i_prealloc_lock
);
4942 /* can we use group allocation? */
4943 if (!(ac
->ac_flags
& EXT4_MB_HINT_GROUP_ALLOC
))
4946 /* inode may have no locality group for some reason */
4950 order
= fls(ac
->ac_o_ex
.fe_len
) - 1;
4951 if (order
> PREALLOC_TB_SIZE
- 1)
4952 /* The max size of hash table is PREALLOC_TB_SIZE */
4953 order
= PREALLOC_TB_SIZE
- 1;
4955 goal_block
= ext4_grp_offs_to_block(ac
->ac_sb
, &ac
->ac_g_ex
);
4957 * search for the prealloc space that is having
4958 * minimal distance from the goal block.
4960 for (i
= order
; i
< PREALLOC_TB_SIZE
; i
++) {
4962 list_for_each_entry_rcu(tmp_pa
, &lg
->lg_prealloc_list
[i
],
4964 spin_lock(&tmp_pa
->pa_lock
);
4965 if (tmp_pa
->pa_deleted
== 0 &&
4966 tmp_pa
->pa_free
>= ac
->ac_o_ex
.fe_len
) {
4968 cpa
= ext4_mb_check_group_pa(goal_block
,
4971 spin_unlock(&tmp_pa
->pa_lock
);
4976 ext4_mb_use_group_pa(ac
, cpa
);
4983 * the function goes through all preallocation in this group and marks them
4984 * used in in-core bitmap. buddy must be generated from this bitmap
4985 * Need to be called with ext4 group lock held
4987 static noinline_for_stack
4988 void ext4_mb_generate_from_pa(struct super_block
*sb
, void *bitmap
,
4991 struct ext4_group_info
*grp
= ext4_get_group_info(sb
, group
);
4992 struct ext4_prealloc_space
*pa
;
4993 struct list_head
*cur
;
4994 ext4_group_t groupnr
;
4995 ext4_grpblk_t start
;
4996 int preallocated
= 0;
5002 /* all form of preallocation discards first load group,
5003 * so the only competing code is preallocation use.
5004 * we don't need any locking here
5005 * notice we do NOT ignore preallocations with pa_deleted
5006 * otherwise we could leave used blocks available for
5007 * allocation in buddy when concurrent ext4_mb_put_pa()
5008 * is dropping preallocation
5010 list_for_each(cur
, &grp
->bb_prealloc_list
) {
5011 pa
= list_entry(cur
, struct ext4_prealloc_space
, pa_group_list
);
5012 spin_lock(&pa
->pa_lock
);
5013 ext4_get_group_no_and_offset(sb
, pa
->pa_pstart
,
5016 spin_unlock(&pa
->pa_lock
);
5017 if (unlikely(len
== 0))
5019 BUG_ON(groupnr
!= group
);
5020 mb_set_bits(bitmap
, start
, len
);
5021 preallocated
+= len
;
5023 mb_debug(sb
, "preallocated %d for group %u\n", preallocated
, group
);
5026 static void ext4_mb_mark_pa_deleted(struct super_block
*sb
,
5027 struct ext4_prealloc_space
*pa
)
5029 struct ext4_inode_info
*ei
;
5031 if (pa
->pa_deleted
) {
5032 ext4_warning(sb
, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
5033 pa
->pa_type
, pa
->pa_pstart
, pa
->pa_lstart
,
5040 if (pa
->pa_type
== MB_INODE_PA
) {
5041 ei
= EXT4_I(pa
->pa_inode
);
5042 atomic_dec(&ei
->i_prealloc_active
);
static inline void ext4_mb_pa_free(struct ext4_prealloc_space *pa)
{
	BUG_ON(!pa);
	BUG_ON(atomic_read(&pa->pa_count));
	BUG_ON(pa->pa_deleted == 0);
	kmem_cache_free(ext4_pspace_cachep, pa);
}

static void ext4_mb_pa_callback(struct rcu_head *head)
{
	struct ext4_prealloc_space *pa;

	pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
	ext4_mb_pa_free(pa);
}
5063 * drops a reference to preallocated space descriptor
5064 * if this was the last reference and the space is consumed
5066 static void ext4_mb_put_pa(struct ext4_allocation_context
*ac
,
5067 struct super_block
*sb
, struct ext4_prealloc_space
*pa
)
5070 ext4_fsblk_t grp_blk
;
5071 struct ext4_inode_info
*ei
= EXT4_I(ac
->ac_inode
);
5073 /* in this short window concurrent discard can set pa_deleted */
5074 spin_lock(&pa
->pa_lock
);
5075 if (!atomic_dec_and_test(&pa
->pa_count
) || pa
->pa_free
!= 0) {
5076 spin_unlock(&pa
->pa_lock
);
5080 if (pa
->pa_deleted
== 1) {
5081 spin_unlock(&pa
->pa_lock
);
5085 ext4_mb_mark_pa_deleted(sb
, pa
);
5086 spin_unlock(&pa
->pa_lock
);
5088 grp_blk
= pa
->pa_pstart
;
5090 * If doing group-based preallocation, pa_pstart may be in the
5091 * next group when pa is used up
5093 if (pa
->pa_type
== MB_GROUP_PA
)
5096 grp
= ext4_get_group_number(sb
, grp_blk
);
5101 * P1 (buddy init) P2 (regular allocation)
5102 * find block B in PA
5103 * copy on-disk bitmap to buddy
5104 * mark B in on-disk bitmap
5105 * drop PA from group
5106 * mark all PAs in buddy
5108 * thus, P1 initializes buddy with B available. to prevent this
5109 * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
5112 ext4_lock_group(sb
, grp
);
5113 list_del(&pa
->pa_group_list
);
5114 ext4_unlock_group(sb
, grp
);
5116 if (pa
->pa_type
== MB_INODE_PA
) {
5117 write_lock(pa
->pa_node_lock
.inode_lock
);
5118 rb_erase(&pa
->pa_node
.inode_node
, &ei
->i_prealloc_node
);
5119 write_unlock(pa
->pa_node_lock
.inode_lock
);
5120 ext4_mb_pa_free(pa
);
5122 spin_lock(pa
->pa_node_lock
.lg_lock
);
5123 list_del_rcu(&pa
->pa_node
.lg_list
);
5124 spin_unlock(pa
->pa_node_lock
.lg_lock
);
5125 call_rcu(&(pa
)->u
.pa_rcu
, ext4_mb_pa_callback
);
static void ext4_mb_pa_rb_insert(struct rb_root *root, struct rb_node *new)
{
	struct rb_node **iter = &root->rb_node, *parent = NULL;
	struct ext4_prealloc_space *iter_pa, *new_pa;
	ext4_lblk_t iter_start, new_start;

	while (*iter) {
		iter_pa = rb_entry(*iter, struct ext4_prealloc_space,
				   pa_node.inode_node);
		new_pa = rb_entry(new, struct ext4_prealloc_space,
				  pa_node.inode_node);
		iter_start = iter_pa->pa_lstart;
		new_start = new_pa->pa_lstart;

		parent = *iter;
		if (new_start < iter_start)
			iter = &((*iter)->rb_left);
		else
			iter = &((*iter)->rb_right);
	}

	rb_link_node(new, parent, iter);
	rb_insert_color(new, root);
}
5155 * creates new preallocated space for given inode
5157 static noinline_for_stack
void
5158 ext4_mb_new_inode_pa(struct ext4_allocation_context
*ac
)
5160 struct super_block
*sb
= ac
->ac_sb
;
5161 struct ext4_sb_info
*sbi
= EXT4_SB(sb
);
5162 struct ext4_prealloc_space
*pa
;
5163 struct ext4_group_info
*grp
;
5164 struct ext4_inode_info
*ei
;
5166 /* preallocate only when found space is larger then requested */
5167 BUG_ON(ac
->ac_o_ex
.fe_len
>= ac
->ac_b_ex
.fe_len
);
5168 BUG_ON(ac
->ac_status
!= AC_STATUS_FOUND
);
5169 BUG_ON(!S_ISREG(ac
->ac_inode
->i_mode
));
5170 BUG_ON(ac
->ac_pa
== NULL
);
5174 if (ac
->ac_b_ex
.fe_len
< ac
->ac_orig_goal_len
) {
5175 struct ext4_free_extent ex
= {
5176 .fe_logical
= ac
->ac_g_ex
.fe_logical
,
5177 .fe_len
= ac
->ac_orig_goal_len
,
5179 loff_t orig_goal_end
= extent_logical_end(sbi
, &ex
);
5180 loff_t o_ex_end
= extent_logical_end(sbi
, &ac
->ac_o_ex
);
5183 * We can't allocate as much as normalizer wants, so we try
5184 * to get proper lstart to cover the original request, except
5185 * when the goal doesn't cover the original request as below:
5187 * orig_ex:2045/2055(10), isize:8417280 -> normalized:0/2048
5188 * best_ex:0/200(200) -> adjusted: 1848/2048(200)
5190 BUG_ON(ac
->ac_g_ex
.fe_logical
> ac
->ac_o_ex
.fe_logical
);
5191 BUG_ON(ac
->ac_g_ex
.fe_len
< ac
->ac_o_ex
.fe_len
);
5194 * Use the below logic for adjusting best extent as it keeps
5195 * fragmentation in check while ensuring logical range of best
5196 * extent doesn't overflow out of goal extent:
5198 * 1. Check if best ex can be kept at end of goal (before
5199 * cr_best_avail trimmed it) and still cover original start
5200 * 2. Else, check if best ex can be kept at start of goal and
5201 * still cover original end
5202 * 3. Else, keep the best ex at start of original request.
5204 ex
.fe_len
= ac
->ac_b_ex
.fe_len
;
5206 ex
.fe_logical
= orig_goal_end
- EXT4_C2B(sbi
, ex
.fe_len
);
5207 if (ac
->ac_o_ex
.fe_logical
>= ex
.fe_logical
)
5210 ex
.fe_logical
= ac
->ac_g_ex
.fe_logical
;
5211 if (o_ex_end
<= extent_logical_end(sbi
, &ex
))
5214 ex
.fe_logical
= ac
->ac_o_ex
.fe_logical
;
5216 ac
->ac_b_ex
.fe_logical
= ex
.fe_logical
;
5218 BUG_ON(ac
->ac_o_ex
.fe_logical
< ac
->ac_b_ex
.fe_logical
);
5219 BUG_ON(extent_logical_end(sbi
, &ex
) > orig_goal_end
);
5222 pa
->pa_lstart
= ac
->ac_b_ex
.fe_logical
;
5223 pa
->pa_pstart
= ext4_grp_offs_to_block(sb
, &ac
->ac_b_ex
);
5224 pa
->pa_len
= ac
->ac_b_ex
.fe_len
;
5225 pa
->pa_free
= pa
->pa_len
;
5226 spin_lock_init(&pa
->pa_lock
);
5227 INIT_LIST_HEAD(&pa
->pa_group_list
);
5229 pa
->pa_type
= MB_INODE_PA
;
5231 mb_debug(sb
, "new inode pa %p: %llu/%d for %u\n", pa
, pa
->pa_pstart
,
5232 pa
->pa_len
, pa
->pa_lstart
);
5233 trace_ext4_mb_new_inode_pa(ac
, pa
);
5235 atomic_add(pa
->pa_free
, &sbi
->s_mb_preallocated
);
5236 ext4_mb_use_inode_pa(ac
, pa
);
5238 ei
= EXT4_I(ac
->ac_inode
);
5239 grp
= ext4_get_group_info(sb
, ac
->ac_b_ex
.fe_group
);
5243 pa
->pa_node_lock
.inode_lock
= &ei
->i_prealloc_lock
;
5244 pa
->pa_inode
= ac
->ac_inode
;
5246 list_add(&pa
->pa_group_list
, &grp
->bb_prealloc_list
);
5248 write_lock(pa
->pa_node_lock
.inode_lock
);
5249 ext4_mb_pa_rb_insert(&ei
->i_prealloc_node
, &pa
->pa_node
.inode_node
);
5250 write_unlock(pa
->pa_node_lock
.inode_lock
);
5251 atomic_inc(&ei
->i_prealloc_active
);
5255 * creates new preallocated space for locality group inodes belongs to
5257 static noinline_for_stack
void
5258 ext4_mb_new_group_pa(struct ext4_allocation_context
*ac
)
5260 struct super_block
*sb
= ac
->ac_sb
;
5261 struct ext4_locality_group
*lg
;
5262 struct ext4_prealloc_space
*pa
;
5263 struct ext4_group_info
*grp
;
5265 /* preallocate only when found space is larger then requested */
5266 BUG_ON(ac
->ac_o_ex
.fe_len
>= ac
->ac_b_ex
.fe_len
);
5267 BUG_ON(ac
->ac_status
!= AC_STATUS_FOUND
);
5268 BUG_ON(!S_ISREG(ac
->ac_inode
->i_mode
));
5269 BUG_ON(ac
->ac_pa
== NULL
);
5273 pa
->pa_pstart
= ext4_grp_offs_to_block(sb
, &ac
->ac_b_ex
);
5274 pa
->pa_lstart
= pa
->pa_pstart
;
5275 pa
->pa_len
= ac
->ac_b_ex
.fe_len
;
5276 pa
->pa_free
= pa
->pa_len
;
5277 spin_lock_init(&pa
->pa_lock
);
5278 INIT_LIST_HEAD(&pa
->pa_node
.lg_list
);
5279 INIT_LIST_HEAD(&pa
->pa_group_list
);
5281 pa
->pa_type
= MB_GROUP_PA
;
5283 mb_debug(sb
, "new group pa %p: %llu/%d for %u\n", pa
, pa
->pa_pstart
,
5284 pa
->pa_len
, pa
->pa_lstart
);
5285 trace_ext4_mb_new_group_pa(ac
, pa
);
5287 ext4_mb_use_group_pa(ac
, pa
);
5288 atomic_add(pa
->pa_free
, &EXT4_SB(sb
)->s_mb_preallocated
);
5290 grp
= ext4_get_group_info(sb
, ac
->ac_b_ex
.fe_group
);
5296 pa
->pa_node_lock
.lg_lock
= &lg
->lg_prealloc_lock
;
5297 pa
->pa_inode
= NULL
;
5299 list_add(&pa
->pa_group_list
, &grp
->bb_prealloc_list
);
5302 * We will later add the new pa to the right bucket
5303 * after updating the pa_free in ext4_mb_release_context
5307 static void ext4_mb_new_preallocation(struct ext4_allocation_context
*ac
)
5309 if (ac
->ac_flags
& EXT4_MB_HINT_GROUP_ALLOC
)
5310 ext4_mb_new_group_pa(ac
);
5312 ext4_mb_new_inode_pa(ac
);
5316 * finds all unused blocks in on-disk bitmap, frees them in
5317 * in-core bitmap and buddy.
5318 * @pa must be unlinked from inode and group lists, so that
5319 * nobody else can find/use it.
5320 * the caller MUST hold group/inode locks.
5321 * TODO: optimize the case when there are no in-core structures yet
5323 static noinline_for_stack
void
5324 ext4_mb_release_inode_pa(struct ext4_buddy
*e4b
, struct buffer_head
*bitmap_bh
,
5325 struct ext4_prealloc_space
*pa
)
5327 struct super_block
*sb
= e4b
->bd_sb
;
5328 struct ext4_sb_info
*sbi
= EXT4_SB(sb
);
5333 unsigned long long grp_blk_start
;
5336 BUG_ON(pa
->pa_deleted
== 0);
5337 ext4_get_group_no_and_offset(sb
, pa
->pa_pstart
, &group
, &bit
);
5338 grp_blk_start
= pa
->pa_pstart
- EXT4_C2B(sbi
, bit
);
5339 BUG_ON(group
!= e4b
->bd_group
&& pa
->pa_len
!= 0);
5340 end
= bit
+ pa
->pa_len
;
5343 bit
= mb_find_next_zero_bit(bitmap_bh
->b_data
, end
, bit
);
5346 next
= mb_find_next_bit(bitmap_bh
->b_data
, end
, bit
);
5347 mb_debug(sb
, "free preallocated %u/%u in group %u\n",
5348 (unsigned) ext4_group_first_block_no(sb
, group
) + bit
,
5349 (unsigned) next
- bit
, (unsigned) group
);
5352 trace_ext4_mballoc_discard(sb
, NULL
, group
, bit
, next
- bit
);
5353 trace_ext4_mb_release_inode_pa(pa
, (grp_blk_start
+
5354 EXT4_C2B(sbi
, bit
)),
5356 mb_free_blocks(pa
->pa_inode
, e4b
, bit
, next
- bit
);
5359 if (free
!= pa
->pa_free
) {
5360 ext4_msg(e4b
->bd_sb
, KERN_CRIT
,
5361 "pa %p: logic %lu, phys. %lu, len %d",
5362 pa
, (unsigned long) pa
->pa_lstart
,
5363 (unsigned long) pa
->pa_pstart
,
5365 ext4_grp_locked_error(sb
, group
, 0, 0, "free %u, pa_free %u",
5368 * pa is already deleted so we use the value obtained
5369 * from the bitmap and continue.
5372 atomic_add(free
, &sbi
->s_mb_discarded
);
5375 static noinline_for_stack
void
5376 ext4_mb_release_group_pa(struct ext4_buddy
*e4b
,
5377 struct ext4_prealloc_space
*pa
)
5379 struct super_block
*sb
= e4b
->bd_sb
;
5383 trace_ext4_mb_release_group_pa(sb
, pa
);
5384 BUG_ON(pa
->pa_deleted
== 0);
5385 ext4_get_group_no_and_offset(sb
, pa
->pa_pstart
, &group
, &bit
);
5386 if (unlikely(group
!= e4b
->bd_group
&& pa
->pa_len
!= 0)) {
5387 ext4_warning(sb
, "bad group: expected %u, group %u, pa_start %llu",
5388 e4b
->bd_group
, group
, pa
->pa_pstart
);
5391 mb_free_blocks(pa
->pa_inode
, e4b
, bit
, pa
->pa_len
);
5392 atomic_add(pa
->pa_len
, &EXT4_SB(sb
)->s_mb_discarded
);
5393 trace_ext4_mballoc_discard(sb
, NULL
, group
, bit
, pa
->pa_len
);
5397 * releases all preallocations in given group
5399 * first, we need to decide discard policy:
5400 * - when do we discard
5402 * - how many do we discard
5403 * 1) how many requested
5405 static noinline_for_stack
int
5406 ext4_mb_discard_group_preallocations(struct super_block
*sb
,
5407 ext4_group_t group
, int *busy
)
5409 struct ext4_group_info
*grp
= ext4_get_group_info(sb
, group
);
5410 struct buffer_head
*bitmap_bh
= NULL
;
5411 struct ext4_prealloc_space
*pa
, *tmp
;
5413 struct ext4_buddy e4b
;
5414 struct ext4_inode_info
*ei
;
5420 mb_debug(sb
, "discard preallocation for group %u\n", group
);
5421 if (list_empty(&grp
->bb_prealloc_list
))
5424 bitmap_bh
= ext4_read_block_bitmap(sb
, group
);
5425 if (IS_ERR(bitmap_bh
)) {
5426 err
= PTR_ERR(bitmap_bh
);
5427 ext4_error_err(sb
, -err
,
5428 "Error %d reading block bitmap for %u",
5433 err
= ext4_mb_load_buddy(sb
, group
, &e4b
);
5435 ext4_warning(sb
, "Error %d loading buddy information for %u",
5441 ext4_lock_group(sb
, group
);
5442 list_for_each_entry_safe(pa
, tmp
,
5443 &grp
->bb_prealloc_list
, pa_group_list
) {
5444 spin_lock(&pa
->pa_lock
);
5445 if (atomic_read(&pa
->pa_count
)) {
5446 spin_unlock(&pa
->pa_lock
);
5450 if (pa
->pa_deleted
) {
5451 spin_unlock(&pa
->pa_lock
);
5455 /* seems this one can be freed ... */
5456 ext4_mb_mark_pa_deleted(sb
, pa
);
5459 this_cpu_inc(discard_pa_seq
);
5461 /* we can trust pa_free ... */
5462 free
+= pa
->pa_free
;
5464 spin_unlock(&pa
->pa_lock
);
5466 list_del(&pa
->pa_group_list
);
5467 list_add(&pa
->u
.pa_tmp_list
, &list
);
5470 /* now free all selected PAs */
5471 list_for_each_entry_safe(pa
, tmp
, &list
, u
.pa_tmp_list
) {
5473 /* remove from object (inode or locality group) */
5474 if (pa
->pa_type
== MB_GROUP_PA
) {
5475 spin_lock(pa
->pa_node_lock
.lg_lock
);
5476 list_del_rcu(&pa
->pa_node
.lg_list
);
5477 spin_unlock(pa
->pa_node_lock
.lg_lock
);
5479 write_lock(pa
->pa_node_lock
.inode_lock
);
5480 ei
= EXT4_I(pa
->pa_inode
);
5481 rb_erase(&pa
->pa_node
.inode_node
, &ei
->i_prealloc_node
);
5482 write_unlock(pa
->pa_node_lock
.inode_lock
);
5485 list_del(&pa
->u
.pa_tmp_list
);
5487 if (pa
->pa_type
== MB_GROUP_PA
) {
5488 ext4_mb_release_group_pa(&e4b
, pa
);
5489 call_rcu(&(pa
)->u
.pa_rcu
, ext4_mb_pa_callback
);
5491 ext4_mb_release_inode_pa(&e4b
, bitmap_bh
, pa
);
5492 ext4_mb_pa_free(pa
);
5496 ext4_unlock_group(sb
, group
);
5497 ext4_mb_unload_buddy(&e4b
);
5500 mb_debug(sb
, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n",
5501 free
, group
, grp
->bb_free
);
5506 * releases all non-used preallocated blocks for given inode
5508 * It's important to discard preallocations under i_data_sem
5509 * We don't want another block to be served from the prealloc
5510 * space when we are discarding the inode prealloc space.
5512 * FIXME!! Make sure it is valid at all the call sites
5514 void ext4_discard_preallocations(struct inode
*inode
)
5516 struct ext4_inode_info
*ei
= EXT4_I(inode
);
5517 struct super_block
*sb
= inode
->i_sb
;
5518 struct buffer_head
*bitmap_bh
= NULL
;
5519 struct ext4_prealloc_space
*pa
, *tmp
;
5520 ext4_group_t group
= 0;
5522 struct ext4_buddy e4b
;
5523 struct rb_node
*iter
;
5526 if (!S_ISREG(inode
->i_mode
))
5529 if (EXT4_SB(sb
)->s_mount_state
& EXT4_FC_REPLAY
)
5532 mb_debug(sb
, "discard preallocation for inode %lu\n",
5534 trace_ext4_discard_preallocations(inode
,
5535 atomic_read(&ei
->i_prealloc_active
));
5538 /* first, collect all pa's in the inode */
5539 write_lock(&ei
->i_prealloc_lock
);
5540 for (iter
= rb_first(&ei
->i_prealloc_node
); iter
;
5541 iter
= rb_next(iter
)) {
5542 pa
= rb_entry(iter
, struct ext4_prealloc_space
,
5543 pa_node
.inode_node
);
5544 BUG_ON(pa
->pa_node_lock
.inode_lock
!= &ei
->i_prealloc_lock
);
5546 spin_lock(&pa
->pa_lock
);
5547 if (atomic_read(&pa
->pa_count
)) {
5548 /* this shouldn't happen often - nobody should
5549 * use preallocation while we're discarding it */
5550 spin_unlock(&pa
->pa_lock
);
5551 write_unlock(&ei
->i_prealloc_lock
);
5552 ext4_msg(sb
, KERN_ERR
,
5553 "uh-oh! used pa while discarding");
5555 schedule_timeout_uninterruptible(HZ
);
5559 if (pa
->pa_deleted
== 0) {
5560 ext4_mb_mark_pa_deleted(sb
, pa
);
5561 spin_unlock(&pa
->pa_lock
);
5562 rb_erase(&pa
->pa_node
.inode_node
, &ei
->i_prealloc_node
);
5563 list_add(&pa
->u
.pa_tmp_list
, &list
);
5567 /* someone is deleting pa right now */
5568 spin_unlock(&pa
->pa_lock
);
5569 write_unlock(&ei
->i_prealloc_lock
);
5571 /* we have to wait here because pa_deleted
5572 * doesn't mean pa is already unlinked from
5573 * the list. as we might be called from
5574 * ->clear_inode() the inode will get freed
5575 * and concurrent thread which is unlinking
5576 * pa from inode's list may access already
5577 * freed memory, bad-bad-bad */
5579 /* XXX: if this happens too often, we can
5580 * add a flag to force wait only in case
5581 * of ->clear_inode(), but not in case of
5582 * regular truncate */
5583 schedule_timeout_uninterruptible(HZ
);
5586 write_unlock(&ei
->i_prealloc_lock
);
5588 list_for_each_entry_safe(pa
, tmp
, &list
, u
.pa_tmp_list
) {
5589 BUG_ON(pa
->pa_type
!= MB_INODE_PA
);
5590 group
= ext4_get_group_number(sb
, pa
->pa_pstart
);
5592 err
= ext4_mb_load_buddy_gfp(sb
, group
, &e4b
,
5593 GFP_NOFS
|__GFP_NOFAIL
);
5595 ext4_error_err(sb
, -err
, "Error %d loading buddy information for %u",
5600 bitmap_bh
= ext4_read_block_bitmap(sb
, group
);
5601 if (IS_ERR(bitmap_bh
)) {
5602 err
= PTR_ERR(bitmap_bh
);
5603 ext4_error_err(sb
, -err
, "Error %d reading block bitmap for %u",
5605 ext4_mb_unload_buddy(&e4b
);
5609 ext4_lock_group(sb
, group
);
5610 list_del(&pa
->pa_group_list
);
5611 ext4_mb_release_inode_pa(&e4b
, bitmap_bh
, pa
);
5612 ext4_unlock_group(sb
, group
);
5614 ext4_mb_unload_buddy(&e4b
);
5617 list_del(&pa
->u
.pa_tmp_list
);
5618 ext4_mb_pa_free(pa
);
static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac)
{
	struct ext4_prealloc_space *pa;

	BUG_ON(ext4_pspace_cachep == NULL);
	pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS);
	if (!pa)
		return -ENOMEM;
	atomic_set(&pa->pa_count, 1);
	ac->ac_pa = pa;
	return 0;
}

static void ext4_mb_pa_put_free(struct ext4_allocation_context *ac)
{
	struct ext4_prealloc_space *pa = ac->ac_pa;

	BUG_ON(!pa);
	ac->ac_pa = NULL;
	WARN_ON(!atomic_dec_and_test(&pa->pa_count));
	/*
	 * current function is only called due to an error or due to
	 * len of found blocks < len of requested blocks hence the PA has not
	 * been added to grp->bb_prealloc_list. So we don't need to lock it
	 */
	pa->pa_deleted = 1;
	ext4_mb_pa_free(pa);
}
5651 #ifdef CONFIG_EXT4_DEBUG
5652 static inline void ext4_mb_show_pa(struct super_block
*sb
)
5654 ext4_group_t i
, ngroups
;
5656 if (ext4_forced_shutdown(sb
))
5659 ngroups
= ext4_get_groups_count(sb
);
5660 mb_debug(sb
, "groups: ");
5661 for (i
= 0; i
< ngroups
; i
++) {
5662 struct ext4_group_info
*grp
= ext4_get_group_info(sb
, i
);
5663 struct ext4_prealloc_space
*pa
;
5664 ext4_grpblk_t start
;
5665 struct list_head
*cur
;
5669 ext4_lock_group(sb
, i
);
5670 list_for_each(cur
, &grp
->bb_prealloc_list
) {
5671 pa
= list_entry(cur
, struct ext4_prealloc_space
,
5673 spin_lock(&pa
->pa_lock
);
5674 ext4_get_group_no_and_offset(sb
, pa
->pa_pstart
,
5676 spin_unlock(&pa
->pa_lock
);
5677 mb_debug(sb
, "PA:%u:%d:%d\n", i
, start
,
5680 ext4_unlock_group(sb
, i
);
5681 mb_debug(sb
, "%u: %d/%d\n", i
, grp
->bb_free
,
5686 static void ext4_mb_show_ac(struct ext4_allocation_context
*ac
)
5688 struct super_block
*sb
= ac
->ac_sb
;
5690 if (ext4_forced_shutdown(sb
))
5693 mb_debug(sb
, "Can't allocate:"
5694 " Allocation context details:");
5695 mb_debug(sb
, "status %u flags 0x%x",
5696 ac
->ac_status
, ac
->ac_flags
);
5697 mb_debug(sb
, "orig %lu/%lu/%lu@%lu, "
5698 "goal %lu/%lu/%lu@%lu, "
5699 "best %lu/%lu/%lu@%lu cr %d",
5700 (unsigned long)ac
->ac_o_ex
.fe_group
,
5701 (unsigned long)ac
->ac_o_ex
.fe_start
,
5702 (unsigned long)ac
->ac_o_ex
.fe_len
,
5703 (unsigned long)ac
->ac_o_ex
.fe_logical
,
5704 (unsigned long)ac
->ac_g_ex
.fe_group
,
5705 (unsigned long)ac
->ac_g_ex
.fe_start
,
5706 (unsigned long)ac
->ac_g_ex
.fe_len
,
5707 (unsigned long)ac
->ac_g_ex
.fe_logical
,
5708 (unsigned long)ac
->ac_b_ex
.fe_group
,
5709 (unsigned long)ac
->ac_b_ex
.fe_start
,
5710 (unsigned long)ac
->ac_b_ex
.fe_len
,
5711 (unsigned long)ac
->ac_b_ex
.fe_logical
,
5712 (int)ac
->ac_criteria
);
5713 mb_debug(sb
, "%u found", ac
->ac_found
);
5714 mb_debug(sb
, "used pa: %s, ", str_yes_no(ac
->ac_pa
));
5716 mb_debug(sb
, "pa_type %s\n", ac
->ac_pa
->pa_type
== MB_GROUP_PA
?
5717 "group pa" : "inode pa");
5718 ext4_mb_show_pa(sb
);
5721 static inline void ext4_mb_show_pa(struct super_block
*sb
)
5724 static inline void ext4_mb_show_ac(struct ext4_allocation_context
*ac
)
5726 ext4_mb_show_pa(ac
->ac_sb
);
5731 * We use locality group preallocation for small size file. The size of the
5732 * file is determined by the current size or the resulting size after
5733 * allocation which ever is larger
5735 * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
5737 static void ext4_mb_group_or_file(struct ext4_allocation_context
*ac
)
5739 struct ext4_sb_info
*sbi
= EXT4_SB(ac
->ac_sb
);
5740 int bsbits
= ac
->ac_sb
->s_blocksize_bits
;
5742 bool inode_pa_eligible
, group_pa_eligible
;
5744 if (!(ac
->ac_flags
& EXT4_MB_HINT_DATA
))
5747 if (unlikely(ac
->ac_flags
& EXT4_MB_HINT_GOAL_ONLY
))
5750 group_pa_eligible
= sbi
->s_mb_group_prealloc
> 0;
5751 inode_pa_eligible
= true;
5752 size
= extent_logical_end(sbi
, &ac
->ac_o_ex
);
5753 isize
= (i_size_read(ac
->ac_inode
) + ac
->ac_sb
->s_blocksize
- 1)
5756 /* No point in using inode preallocation for closed files */
5757 if ((size
== isize
) && !ext4_fs_is_busy(sbi
) &&
5758 !inode_is_open_for_write(ac
->ac_inode
))
5759 inode_pa_eligible
= false;
5761 size
= max(size
, isize
);
5762 /* Don't use group allocation for large files */
5763 if (size
> sbi
->s_mb_stream_request
)
5764 group_pa_eligible
= false;
5766 if (!group_pa_eligible
) {
5767 if (inode_pa_eligible
)
5768 ac
->ac_flags
|= EXT4_MB_STREAM_ALLOC
;
5770 ac
->ac_flags
|= EXT4_MB_HINT_NOPREALLOC
;
5774 BUG_ON(ac
->ac_lg
!= NULL
);
5776 * locality group prealloc space are per cpu. The reason for having
5777 * per cpu locality group is to reduce the contention between block
5778 * request from multiple CPUs.
5780 ac
->ac_lg
= raw_cpu_ptr(sbi
->s_locality_groups
);
5782 /* we're going to use group allocation */
5783 ac
->ac_flags
|= EXT4_MB_HINT_GROUP_ALLOC
;
5785 /* serialize all allocations in the group */
	mutex_lock(&ac->ac_lg->lg_mutex);
}
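/*
 * The policy above boils down to: small files (judged by the larger of the
 * current size and the size after this allocation) go through the per-CPU
 * locality group, larger ones use per-inode preallocation.  Stand-alone
 * sketch of just that decision (user-space C, hypothetical names; sizes and
 * the threshold are in the same unit, the code above works in blocks):
 *
 *	enum pa_policy { PA_GROUP, PA_INODE, PA_NONE };
 *
 *	static enum pa_policy pick_policy(unsigned long long size,
 *					  unsigned long long isize,
 *					  unsigned long long stream_req,
 *					  int group_pa_enabled,
 *					  int inode_pa_eligible)
 *	{
 *		if (size < isize)
 *			size = isize;		// larger of current and resulting size
 *		if (group_pa_enabled && size <= stream_req)
 *			return PA_GROUP;	// small file: locality group PA
 *		return inode_pa_eligible ? PA_INODE : PA_NONE;
 *	}
 */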
5789 static noinline_for_stack
void
5790 ext4_mb_initialize_context(struct ext4_allocation_context
*ac
,
5791 struct ext4_allocation_request
*ar
)
5793 struct super_block
*sb
= ar
->inode
->i_sb
;
5794 struct ext4_sb_info
*sbi
= EXT4_SB(sb
);
5795 struct ext4_super_block
*es
= sbi
->s_es
;
5799 ext4_grpblk_t block
;
5801 /* we can't allocate > group size */
5804 /* just a dirty hack to filter too big requests */
5805 if (len
>= EXT4_CLUSTERS_PER_GROUP(sb
))
5806 len
= EXT4_CLUSTERS_PER_GROUP(sb
);
5808 /* start searching from the goal */
5810 if (goal
< le32_to_cpu(es
->s_first_data_block
) ||
5811 goal
>= ext4_blocks_count(es
))
5812 goal
= le32_to_cpu(es
->s_first_data_block
);
5813 ext4_get_group_no_and_offset(sb
, goal
, &group
, &block
);
5815 /* set up allocation goals */
5816 ac
->ac_b_ex
.fe_logical
= EXT4_LBLK_CMASK(sbi
, ar
->logical
);
5817 ac
->ac_status
= AC_STATUS_CONTINUE
;
5819 ac
->ac_inode
= ar
->inode
;
5820 ac
->ac_o_ex
.fe_logical
= ac
->ac_b_ex
.fe_logical
;
5821 ac
->ac_o_ex
.fe_group
= group
;
5822 ac
->ac_o_ex
.fe_start
= block
;
5823 ac
->ac_o_ex
.fe_len
= len
;
5824 ac
->ac_g_ex
= ac
->ac_o_ex
;
5825 ac
->ac_orig_goal_len
= ac
->ac_g_ex
.fe_len
;
5826 ac
->ac_flags
= ar
->flags
;
5828 /* we have to define context: we'll work with a file or
5829 * locality group. this is a policy, actually */
5830 ext4_mb_group_or_file(ac
);
5832 mb_debug(sb
, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, "
5833 "left: %u/%u, right %u/%u to %swritable\n",
5834 (unsigned) ar
->len
, (unsigned) ar
->logical
,
5835 (unsigned) ar
->goal
, ac
->ac_flags
, ac
->ac_2order
,
5836 (unsigned) ar
->lleft
, (unsigned) ar
->pleft
,
5837 (unsigned) ar
->lright
, (unsigned) ar
->pright
,
5838 inode_is_open_for_write(ar
->inode
) ? "" : "non-");
5841 static noinline_for_stack
void
5842 ext4_mb_discard_lg_preallocations(struct super_block
*sb
,
5843 struct ext4_locality_group
*lg
,
5844 int order
, int total_entries
)
5846 ext4_group_t group
= 0;
5847 struct ext4_buddy e4b
;
5848 LIST_HEAD(discard_list
);
5849 struct ext4_prealloc_space
*pa
, *tmp
;
5851 mb_debug(sb
, "discard locality group preallocation\n");
5853 spin_lock(&lg
->lg_prealloc_lock
);
5854 list_for_each_entry_rcu(pa
, &lg
->lg_prealloc_list
[order
],
5856 lockdep_is_held(&lg
->lg_prealloc_lock
)) {
5857 spin_lock(&pa
->pa_lock
);
5858 if (atomic_read(&pa
->pa_count
)) {
5860 * This is the pa that we just used
5861 * for block allocation. So don't
5864 spin_unlock(&pa
->pa_lock
);
5867 if (pa
->pa_deleted
) {
5868 spin_unlock(&pa
->pa_lock
);
5871 /* only lg prealloc space */
5872 BUG_ON(pa
->pa_type
!= MB_GROUP_PA
);
5874 /* seems this one can be freed ... */
5875 ext4_mb_mark_pa_deleted(sb
, pa
);
5876 spin_unlock(&pa
->pa_lock
);
5878 list_del_rcu(&pa
->pa_node
.lg_list
);
5879 list_add(&pa
->u
.pa_tmp_list
, &discard_list
);
5882 if (total_entries
<= 5) {
5884 * we want to keep only 5 entries
5885 * allowing it to grow to 8. This
5886 * mak sure we don't call discard
5887 * soon for this list.
5892 spin_unlock(&lg
->lg_prealloc_lock
);
5894 list_for_each_entry_safe(pa
, tmp
, &discard_list
, u
.pa_tmp_list
) {
5897 group
= ext4_get_group_number(sb
, pa
->pa_pstart
);
5898 err
= ext4_mb_load_buddy_gfp(sb
, group
, &e4b
,
5899 GFP_NOFS
|__GFP_NOFAIL
);
5901 ext4_error_err(sb
, -err
, "Error %d loading buddy information for %u",
5905 ext4_lock_group(sb
, group
);
5906 list_del(&pa
->pa_group_list
);
5907 ext4_mb_release_group_pa(&e4b
, pa
);
5908 ext4_unlock_group(sb
, group
);
5910 ext4_mb_unload_buddy(&e4b
);
5911 list_del(&pa
->u
.pa_tmp_list
);
5912 call_rcu(&(pa
)->u
.pa_rcu
, ext4_mb_pa_callback
);
5917 * We have incremented pa_count. So it cannot be freed at this
5918 * point. Also we hold lg_mutex. So no parallel allocation is
5919 * possible from this lg. That means pa_free cannot be updated.
5921 * A parallel ext4_mb_discard_group_preallocations is possible.
5922 * which can cause the lg_prealloc_list to be updated.
5925 static void ext4_mb_add_n_trim(struct ext4_allocation_context
*ac
)
5927 int order
, added
= 0, lg_prealloc_count
= 1;
5928 struct super_block
*sb
= ac
->ac_sb
;
5929 struct ext4_locality_group
*lg
= ac
->ac_lg
;
5930 struct ext4_prealloc_space
*tmp_pa
, *pa
= ac
->ac_pa
;
5932 order
= fls(pa
->pa_free
) - 1;
5933 if (order
> PREALLOC_TB_SIZE
- 1)
5934 /* The max size of hash table is PREALLOC_TB_SIZE */
5935 order
= PREALLOC_TB_SIZE
- 1;
5936 /* Add the prealloc space to lg */
5937 spin_lock(&lg
->lg_prealloc_lock
);
5938 list_for_each_entry_rcu(tmp_pa
, &lg
->lg_prealloc_list
[order
],
5940 lockdep_is_held(&lg
->lg_prealloc_lock
)) {
5941 spin_lock(&tmp_pa
->pa_lock
);
5942 if (tmp_pa
->pa_deleted
) {
5943 spin_unlock(&tmp_pa
->pa_lock
);
5946 if (!added
&& pa
->pa_free
< tmp_pa
->pa_free
) {
5947 /* Add to the tail of the previous entry */
5948 list_add_tail_rcu(&pa
->pa_node
.lg_list
,
5949 &tmp_pa
->pa_node
.lg_list
);
5952 * we want to count the total
5953 * number of entries in the list
5956 spin_unlock(&tmp_pa
->pa_lock
);
5957 lg_prealloc_count
++;
5960 list_add_tail_rcu(&pa
->pa_node
.lg_list
,
5961 &lg
->lg_prealloc_list
[order
]);
5962 spin_unlock(&lg
->lg_prealloc_lock
);
5964 /* Now trim the list to be not more than 8 elements */
	if (lg_prealloc_count > 8)
		ext4_mb_discard_lg_preallocations(sb, lg,
						  order, lg_prealloc_count);
}
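/*
 * lg_prealloc_list is bucketed by the power-of-two order of pa_free, i.e.
 * order = fls(pa_free) - 1, clamped to PREALLOC_TB_SIZE - 1.  Stand-alone
 * sketch of that bucket computation (user-space C, hypothetical names):
 *
 *	static int pa_order(unsigned int pa_free, int nr_buckets)
 *	{
 *		int order = 0;
 *
 *		while (pa_free >>= 1)	// floor(log2(pa_free)), 0 for pa_free <= 1
 *			order++;
 *		if (order > nr_buckets - 1)
 *			order = nr_buckets - 1;
 *		return order;
 *	}
 *
 * e.g. pa_order(6, 10) == 2 and pa_order(4096, 10) == 9 (clamped).
 */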
5971 * release all resource we used in allocation
5973 static void ext4_mb_release_context(struct ext4_allocation_context
*ac
)
5975 struct ext4_sb_info
*sbi
= EXT4_SB(ac
->ac_sb
);
5976 struct ext4_prealloc_space
*pa
= ac
->ac_pa
;
5978 if (pa
->pa_type
== MB_GROUP_PA
) {
5979 /* see comment in ext4_mb_use_group_pa() */
5980 spin_lock(&pa
->pa_lock
);
5981 pa
->pa_pstart
+= EXT4_C2B(sbi
, ac
->ac_b_ex
.fe_len
);
5982 pa
->pa_lstart
+= EXT4_C2B(sbi
, ac
->ac_b_ex
.fe_len
);
5983 pa
->pa_free
-= ac
->ac_b_ex
.fe_len
;
5984 pa
->pa_len
-= ac
->ac_b_ex
.fe_len
;
5985 spin_unlock(&pa
->pa_lock
);
5988 * We want to add the pa to the right bucket.
5989 * Remove it from the list and while adding
5990 * make sure the list to which we are adding
5993 if (likely(pa
->pa_free
)) {
5994 spin_lock(pa
->pa_node_lock
.lg_lock
);
5995 list_del_rcu(&pa
->pa_node
.lg_list
);
5996 spin_unlock(pa
->pa_node_lock
.lg_lock
);
5997 ext4_mb_add_n_trim(ac
);
6001 ext4_mb_put_pa(ac
, ac
->ac_sb
, pa
);
6003 if (ac
->ac_bitmap_folio
)
6004 folio_put(ac
->ac_bitmap_folio
);
6005 if (ac
->ac_buddy_folio
)
6006 folio_put(ac
->ac_buddy_folio
);
6007 if (ac
->ac_flags
& EXT4_MB_HINT_GROUP_ALLOC
)
6008 mutex_unlock(&ac
->ac_lg
->lg_mutex
);
6009 ext4_mb_collect_stats(ac
);

static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
{
	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
	int ret;
	int freed = 0, busy = 0;
	int retry = 0;

	trace_ext4_mb_discard_preallocations(sb, needed);

	if (needed == 0)
		needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
 repeat:
	for (i = 0; i < ngroups && needed > 0; i++) {
		ret = ext4_mb_discard_group_preallocations(sb, i, &busy);
		freed += ret;
		needed -= ret;
		cond_resched();
	}

	if (needed > 0 && busy && ++retry < 3) {
		busy = 0;
		goto repeat;
	}

	return freed;
}

static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb,
			struct ext4_allocation_context *ac, u64 *seq)
{
	int freed;
	u64 seq_retry = 0;
	bool ret = false;

	freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
	if (freed) {
		ret = true;
		goto out_dbg;
	}
	seq_retry = ext4_get_discard_pa_seq_sum();
	if (!(ac->ac_flags & EXT4_MB_STRICT_CHECK) || seq_retry != *seq) {
		ac->ac_flags |= EXT4_MB_STRICT_CHECK;
		*seq = seq_retry;
		ret = true;
	}

out_dbg:
	mb_debug(sb, "freed %d, retry ? %s\n", freed, str_yes_no(ret));
	return ret;
}

/*
 * Simple allocator for Ext4 fast commit replay path. It searches for blocks
 * linearly starting at the goal block and also excludes the blocks which
 * are going to be in use after fast commit replay.
 */
static ext4_fsblk_t
ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp)
{
	struct buffer_head *bitmap_bh;
	struct super_block *sb = ar->inode->i_sb;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	ext4_group_t group, nr;
	ext4_grpblk_t blkoff;
	ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
	ext4_grpblk_t i = 0;
	ext4_fsblk_t goal, block;
	struct ext4_super_block *es = sbi->s_es;

	goal = ar->goal;
	if (goal < le32_to_cpu(es->s_first_data_block) ||
	    goal >= ext4_blocks_count(es))
		goal = le32_to_cpu(es->s_first_data_block);

	ar->len = 0;
	ext4_get_group_no_and_offset(sb, goal, &group, &blkoff);
	for (nr = ext4_get_groups_count(sb); nr > 0; nr--) {
		bitmap_bh = ext4_read_block_bitmap(sb, group);
		if (IS_ERR(bitmap_bh)) {
			*errp = PTR_ERR(bitmap_bh);
			pr_warn("Failed to read block bitmap\n");
			return 0;
		}

		while (1) {
			i = mb_find_next_zero_bit(bitmap_bh->b_data, max,
						  blkoff);
			if (i >= max)
				break;
			if (ext4_fc_replay_check_excluded(sb,
				ext4_group_first_block_no(sb, group) +
				EXT4_C2B(sbi, i))) {
				blkoff = i + 1;
			} else
				break;
		}
		brelse(bitmap_bh);
		if (i < max)
			break;

		if (++group >= ext4_get_groups_count(sb))
			group = 0;

		blkoff = 0;
	}

	if (i >= max) {
		/* No free blocks */
		*errp = -ENOSPC;
		return 0;
	}

	block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i);
	ext4_mb_mark_bb(sb, block, 1, true);
	ar->len = 1;
	*errp = 0;
	return block;
}

/*
 * Main entry point into mballoc to allocate blocks
 * it tries to use preallocation first, then falls back
 * to usual allocation
 */
ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
				struct ext4_allocation_request *ar, int *errp)
{
	struct ext4_allocation_context *ac = NULL;
	struct ext4_sb_info *sbi;
	struct super_block *sb;
	ext4_fsblk_t block = 0;
	unsigned int inquota = 0;
	unsigned int reserv_clstrs = 0;
	int retries = 0;
	u64 seq;

	might_sleep();
	sb = ar->inode->i_sb;
	sbi = EXT4_SB(sb);

	trace_ext4_request_blocks(ar);
	if (sbi->s_mount_state & EXT4_FC_REPLAY)
		return ext4_mb_new_blocks_simple(ar, errp);

	/* Allow to use superuser reservation for quota file */
	if (ext4_is_quota_file(ar->inode))
		ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;

	if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) {
		/* Without delayed allocation we need to verify
		 * there is enough free blocks to do block allocation
		 * and verify allocation doesn't exceed the quota limits.
		 */
		while (ar->len &&
			ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {

			/* let others free the space */
			cond_resched();
			ar->len = ar->len >> 1;
		}
		if (!ar->len) {
			ext4_mb_show_pa(sb);
			*errp = -ENOSPC;
			return 0;
		}
		reserv_clstrs = ar->len;
		if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
			dquot_alloc_block_nofail(ar->inode,
						 EXT4_C2B(sbi, ar->len));
		} else {
			while (ar->len &&
				dquot_alloc_block(ar->inode,
						  EXT4_C2B(sbi, ar->len))) {

				ar->flags |= EXT4_MB_HINT_NOPREALLOC;
				ar->len--;
			}
		}
		inquota = ar->len;
		if (ar->len == 0) {
			*errp = -EDQUOT;
			goto out;
		}
	}

	ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
	if (!ac) {
		ar->len = 0;
		*errp = -ENOMEM;
		goto out;
	}

	ext4_mb_initialize_context(ac, ar);

	ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
	seq = this_cpu_read(discard_pa_seq);
	if (!ext4_mb_use_preallocated(ac)) {
		ac->ac_op = EXT4_MB_HISTORY_ALLOC;
		ext4_mb_normalize_request(ac, ar);

		*errp = ext4_mb_pa_alloc(ac);
		if (*errp)
			goto errout;
repeat:
		/* allocate space in core */
		*errp = ext4_mb_regular_allocator(ac);
		/*
		 * pa allocated above is added to grp->bb_prealloc_list only
		 * when we were able to allocate some block i.e. when
		 * ac->ac_status == AC_STATUS_FOUND.
		 * And error from above mean ac->ac_status != AC_STATUS_FOUND
		 * So we have to free this pa here itself.
		 */
		if (*errp) {
			ext4_mb_pa_put_free(ac);
			ext4_discard_allocated_blocks(ac);
			goto errout;
		}
		if (ac->ac_status == AC_STATUS_FOUND &&
		    ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
			ext4_mb_pa_put_free(ac);
	}
	if (likely(ac->ac_status == AC_STATUS_FOUND)) {
		*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
		if (*errp) {
			ext4_discard_allocated_blocks(ac);
			goto errout;
		} else {
			block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
			ar->len = ac->ac_b_ex.fe_len;
		}
	} else {
		if (++retries < 3 &&
		    ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
			goto repeat;
		/*
		 * If block allocation fails then the pa allocated above
		 * needs to be freed here itself.
		 */
		ext4_mb_pa_put_free(ac);
		*errp = -ENOSPC;
	}

	if (*errp) {
errout:
		ac->ac_b_ex.fe_len = 0;
		ar->len = 0;
		ext4_mb_show_ac(ac);
	}
	ext4_mb_release_context(ac);
	kmem_cache_free(ext4_ac_cachep, ac);
out:
	if (inquota && ar->len < inquota)
		dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
	if (!ar->len) {
		if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0)
			/* release all the reserved blocks if non delalloc */
			percpu_counter_sub(&sbi->s_dirtyclusters_counter,
					   reserv_clstrs);
	}

	trace_ext4_allocate_blocks(ar, (unsigned long long)block);

	return block;
}
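
/*
 * Illustrative caller sketch (hypothetical call site, for orientation only;
 * the struct ext4_allocation_request fields and EXT4_MB_HINT_DATA are real,
 * but "goal_pblk", "map" and the surrounding code are made up):
 *
 *	struct ext4_allocation_request ar = {
 *		.inode   = inode,
 *		.goal    = goal_pblk,
 *		.logical = map->m_lblk,
 *		.len     = map->m_len,
 *		.flags   = EXT4_MB_HINT_DATA,
 *	};
 *	int err = 0;
 *	ext4_fsblk_t pblk = ext4_mb_new_blocks(handle, &ar, &err);
 *
 * On success ar.len holds the length actually allocated, which may be less
 * than requested.
 */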

/*
 * We can merge two free data extents only if the physical blocks
 * are contiguous, AND the extents were freed by the same transaction,
 * AND the blocks are associated with the same group.
 */
static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
					struct ext4_free_data *entry,
					struct ext4_free_data *new_entry,
					struct rb_root *entry_rb_root)
{
	if ((entry->efd_tid != new_entry->efd_tid) ||
	    (entry->efd_group != new_entry->efd_group))
		return;
	if (entry->efd_start_cluster + entry->efd_count ==
	    new_entry->efd_start_cluster) {
		new_entry->efd_start_cluster = entry->efd_start_cluster;
		new_entry->efd_count += entry->efd_count;
	} else if (new_entry->efd_start_cluster + new_entry->efd_count ==
		   entry->efd_start_cluster) {
		new_entry->efd_count += entry->efd_count;
	} else
		return;
	spin_lock(&sbi->s_md_lock);
	list_del(&entry->efd_list);
	spin_unlock(&sbi->s_md_lock);
	rb_erase(&entry->efd_node, entry_rb_root);
	kmem_cache_free(ext4_free_data_cachep, entry);
}
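
/*
 * Worked example: assuming the same efd_tid and efd_group, an existing
 * entry {efd_start_cluster = 100, efd_count = 50} (clusters 100..149) and a
 * new_entry {efd_start_cluster = 150, efd_count = 30} (clusters 150..179)
 * take the first branch above: new_entry becomes {100, 80}, covering
 * clusters 100..179, and the old entry is unlinked and freed.
 */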

static noinline_for_stack void
ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
		      struct ext4_free_data *new_entry)
{
	ext4_group_t group = e4b->bd_group;
	ext4_grpblk_t cluster;
	ext4_grpblk_t clusters = new_entry->efd_count;
	struct ext4_free_data *entry;
	struct ext4_group_info *db = e4b->bd_info;
	struct super_block *sb = e4b->bd_sb;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct rb_node **n = &db->bb_free_root.rb_node, *node;
	struct rb_node *parent = NULL, *new_node;

	BUG_ON(!ext4_handle_valid(handle));
	BUG_ON(e4b->bd_bitmap_folio == NULL);
	BUG_ON(e4b->bd_buddy_folio == NULL);

	new_node = &new_entry->efd_node;
	cluster = new_entry->efd_start_cluster;

	if (!*n) {
		/*
		 * First free block extent. We need to protect the buddy cache
		 * from being freed, otherwise we'll refresh it from the
		 * on-disk bitmap and lose not-yet-available blocks.
		 */
		folio_get(e4b->bd_buddy_folio);
		folio_get(e4b->bd_bitmap_folio);
	}
	while (*n) {
		parent = *n;
		entry = rb_entry(parent, struct ext4_free_data, efd_node);
		if (cluster < entry->efd_start_cluster)
			n = &(*n)->rb_left;
		else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
			n = &(*n)->rb_right;
		else {
			ext4_grp_locked_error(sb, group, 0,
				ext4_group_first_block_no(sb, group) +
				EXT4_C2B(sbi, cluster),
				"Block already on to-be-freed list");
			kmem_cache_free(ext4_free_data_cachep, new_entry);
			return;
		}
	}

	rb_link_node(new_node, parent, n);
	rb_insert_color(new_node, &db->bb_free_root);

	/* Now try to see if the extent can be merged to left and right */
	node = rb_prev(new_node);
	if (node) {
		entry = rb_entry(node, struct ext4_free_data, efd_node);
		ext4_try_merge_freed_extent(sbi, entry, new_entry,
					    &(db->bb_free_root));
	}

	node = rb_next(new_node);
	if (node) {
		entry = rb_entry(node, struct ext4_free_data, efd_node);
		ext4_try_merge_freed_extent(sbi, entry, new_entry,
					    &(db->bb_free_root));
	}

	spin_lock(&sbi->s_md_lock);
	list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list[new_entry->efd_tid & 1]);
	sbi->s_mb_free_pending += clusters;
	spin_unlock(&sbi->s_md_lock);
}

static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
				    unsigned long count)
{
	struct super_block *sb = inode->i_sb;
	ext4_group_t group;
	ext4_grpblk_t blkoff;

	ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
	ext4_mb_mark_context(NULL, sb, false, group, blkoff, count,
			     EXT4_MB_BITMAP_MARKED_CHECK |
			     EXT4_MB_SYNC_UPDATE,
			     NULL);
}

/**
 * ext4_mb_clear_bb() -- helper function for freeing blocks.
 *			Used by ext4_free_blocks()
 * @handle:		handle for this transaction
 * @inode:		inode
 * @block:		starting physical block to be freed
 * @count:		number of blocks to be freed
 * @flags:		flags used by ext4_free_blocks
 */
static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
			     ext4_fsblk_t block, unsigned long count,
			     int flags)
{
	struct super_block *sb = inode->i_sb;
	struct ext4_group_info *grp;
	unsigned int overflow;
	ext4_grpblk_t bit;
	ext4_group_t block_group;
	struct ext4_sb_info *sbi;
	struct ext4_buddy e4b;
	unsigned int count_clusters;
	int err = 0;
	int mark_flags = 0;
	ext4_grpblk_t changed;

	sbi = EXT4_SB(sb);

	if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
	    !ext4_inode_block_valid(inode, block, count)) {
		ext4_error(sb, "Freeing blocks in system zone - "
			   "Block = %llu, count = %lu", block, count);
		/* err = 0. ext4_std_error should be a no op */
		goto error_out;
	}
	flags |= EXT4_FREE_BLOCKS_VALIDATED;

do_more:
	overflow = 0;
	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);

	grp = ext4_get_group_info(sb, block_group);
	if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
		return;

	/*
	 * Check to see if we are freeing blocks across a group
	 * boundary.
	 */
	if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
		overflow = EXT4_C2B(sbi, bit) + count -
			EXT4_BLOCKS_PER_GROUP(sb);
		count -= overflow;
		/* The range changed so it's no longer validated */
		flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
	}
	count_clusters = EXT4_NUM_B2C(sbi, count);
	trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);

	/* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
	err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
				     GFP_NOFS|__GFP_NOFAIL);
	if (err)
		goto error_out;

	if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
	    !ext4_inode_block_valid(inode, block, count)) {
		ext4_error(sb, "Freeing blocks in system zone - "
			   "Block = %llu, count = %lu", block, count);
		/* err = 0. ext4_std_error should be a no op */
		goto error_clean;
	}

#ifdef AGGRESSIVE_CHECK
	mark_flags |= EXT4_MB_BITMAP_MARKED_CHECK;
#endif
	err = ext4_mb_mark_context(handle, sb, false, block_group, bit,
				   count_clusters, mark_flags, &changed);

	if (err && changed == 0)
		goto error_clean;

#ifdef AGGRESSIVE_CHECK
	BUG_ON(changed != count_clusters);
#endif

	/*
	 * We need to make sure we don't reuse the freed block until after the
	 * transaction is committed. We make an exception if the inode is to be
	 * written in writeback mode since writeback mode has weak data
	 * consistency guarantees.
	 */
	if (ext4_handle_valid(handle) &&
	    ((flags & EXT4_FREE_BLOCKS_METADATA) ||
	     !ext4_should_writeback_data(inode))) {
		struct ext4_free_data *new_entry;
		/*
		 * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
		 * to fail.
		 */
		new_entry = kmem_cache_alloc(ext4_free_data_cachep,
					     GFP_NOFS|__GFP_NOFAIL);
		new_entry->efd_start_cluster = bit;
		new_entry->efd_group = block_group;
		new_entry->efd_count = count_clusters;
		new_entry->efd_tid = handle->h_transaction->t_tid;

		ext4_lock_group(sb, block_group);
		ext4_mb_free_metadata(handle, &e4b, new_entry);
	} else {
		if (test_opt(sb, DISCARD)) {
			err = ext4_issue_discard(sb, block_group, bit,
						 count_clusters);
			/*
			 * Ignore EOPNOTSUPP error. This is consistent with
			 * what happens when using journal.
			 */
			if (err == -EOPNOTSUPP)
				err = 0;
			if (err)
				ext4_msg(sb, KERN_WARNING, "discard request in"
					 " group:%u block:%d count:%lu failed"
					 " with %d", block_group, bit, count,
					 err);
		}
		EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);

		ext4_lock_group(sb, block_group);
		mb_free_blocks(inode, &e4b, bit, count_clusters);
	}

	ext4_unlock_group(sb, block_group);

	/*
	 * on a bigalloc file system, defer the s_freeclusters_counter
	 * update to the caller (ext4_remove_space and friends) so they
	 * can determine if a cluster freed here should be rereserved
	 */
	if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) {
		if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
			dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
		percpu_counter_add(&sbi->s_freeclusters_counter,
				   count_clusters);
	}

	if (overflow && !err) {
		block += count;
		count = overflow;
		ext4_mb_unload_buddy(&e4b);
		/* The range changed so it's no longer validated */
		flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
		goto do_more;
	}

error_clean:
	ext4_mb_unload_buddy(&e4b);
error_out:
	ext4_std_error(sb, err);
}

/**
 * ext4_free_blocks() -- Free given blocks and update quota
 * @handle:		handle for this transaction
 * @inode:		inode
 * @bh:			optional buffer of the block to be freed
 * @block:		starting physical block to be freed
 * @count:		number of blocks to be freed
 * @flags:		flags used by ext4_free_blocks
 */
void ext4_free_blocks(handle_t *handle, struct inode *inode,
		      struct buffer_head *bh, ext4_fsblk_t block,
		      unsigned long count, int flags)
{
	struct super_block *sb = inode->i_sb;
	unsigned int overflow;
	struct ext4_sb_info *sbi;

	sbi = EXT4_SB(sb);

	if (bh) {
		if (block)
			BUG_ON(block != bh->b_blocknr);
		else
			block = bh->b_blocknr;
	}

	if (sbi->s_mount_state & EXT4_FC_REPLAY) {
		ext4_free_blocks_simple(inode, block, EXT4_NUM_B2C(sbi, count));
		return;
	}

	might_sleep();

	if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
	    !ext4_inode_block_valid(inode, block, count)) {
		ext4_error(sb, "Freeing blocks not in datazone - "
			   "block = %llu, count = %lu", block, count);
		return;
	}
	flags |= EXT4_FREE_BLOCKS_VALIDATED;

	ext4_debug("freeing block %llu\n", block);
	trace_ext4_free_blocks(inode, block, count, flags);

	if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
		BUG_ON(count > 1);

		ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
			    inode, bh, block);
	}

	/*
	 * If the extent to be freed does not begin on a cluster
	 * boundary, we need to deal with partial clusters at the
	 * beginning and end of the extent.  Normally we will free
	 * blocks at the beginning or the end unless we are explicitly
	 * requested to avoid doing so.
	 */
	overflow = EXT4_PBLK_COFF(sbi, block);
	if (overflow) {
		if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
			overflow = sbi->s_cluster_ratio - overflow;
			block += overflow;
			if (count > overflow)
				count -= overflow;
			else
				return;
		} else {
			block -= overflow;
			count += overflow;
		}
		/* The range changed so it's no longer validated */
		flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
	}
	overflow = EXT4_LBLK_COFF(sbi, count);
	if (overflow) {
		if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
			if (count > overflow)
				count -= overflow;
			else
				return;
		} else
			count += sbi->s_cluster_ratio - overflow;
		/* The range changed so it's no longer validated */
		flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
	}

	if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
		int i;
		int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;

		for (i = 0; i < count; i++) {
			cond_resched();
			if (is_metadata)
				bh = sb_find_get_block(inode->i_sb, block + i);
			ext4_forget(handle, is_metadata, inode, bh, block + i);
		}
	}

	ext4_mb_clear_bb(handle, inode, block, count, flags);
}
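
/*
 * Illustrative caller sketch (hypothetical call site; the flags are the real
 * ones typically used when dropping a metadata block such as an extent-tree
 * block, but "pblk" and the surrounding code are made up):
 *
 *	ext4_free_blocks(handle, inode, NULL, pblk, 1,
 *			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
 */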

/**
 * ext4_group_add_blocks() -- Add given blocks to an existing group
 * @handle:		handle to this transaction
 * @sb:			super block
 * @block:		start physical block to add to the block group
 * @count:		number of blocks to free
 *
 * This marks the blocks as free in the bitmap and buddy.
 */
int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
			  ext4_fsblk_t block, unsigned long count)
{
	ext4_group_t block_group;
	ext4_grpblk_t bit;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_buddy e4b;
	int err = 0;
	ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block);
	ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1);
	unsigned long cluster_count = last_cluster - first_cluster + 1;
	ext4_grpblk_t changed;

	ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);

	if (cluster_count == 0)
		return 0;

	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
	/*
	 * Check to see if we are freeing blocks across a group
	 * boundary.
	 */
	if (bit + cluster_count > EXT4_CLUSTERS_PER_GROUP(sb)) {
		ext4_warning(sb, "too many blocks added to group %u",
			     block_group);
		err = -EINVAL;
		goto error_out;
	}

	err = ext4_mb_load_buddy(sb, block_group, &e4b);
	if (err)
		goto error_out;

	if (!ext4_sb_block_valid(sb, NULL, block, count)) {
		ext4_error(sb, "Adding blocks in system zones - "
			   "Block = %llu, count = %lu",
			   block, count);
		err = -EINVAL;
		goto error_clean;
	}

	err = ext4_mb_mark_context(handle, sb, false, block_group, bit,
				   cluster_count, EXT4_MB_BITMAP_MARKED_CHECK,
				   &changed);
	if (err && changed == 0)
		goto error_clean;

	if (changed != cluster_count)
		ext4_error(sb, "bit already cleared in group %u", block_group);

	ext4_lock_group(sb, block_group);
	mb_free_blocks(NULL, &e4b, bit, cluster_count);
	ext4_unlock_group(sb, block_group);
	percpu_counter_add(&sbi->s_freeclusters_counter,
			   changed);

error_clean:
	ext4_mb_unload_buddy(&e4b);
error_out:
	ext4_std_error(sb, err);
	return err;
}

/**
 * ext4_trim_extent -- function to TRIM one single free extent in the group
 * @sb:		super block for the file system
 * @start:	starting block of the free extent in the alloc. group
 * @count:	number of blocks to TRIM
 * @e4b:	ext4 buddy for the group
 *
 * Trim "count" blocks starting at "start" in the "group". To assure that no
 * one will allocate those blocks, mark them as used in the buddy bitmap.
 * This must be called under the group lock.
 */
static int ext4_trim_extent(struct super_block *sb,
		int start, int count, struct ext4_buddy *e4b)
{
	struct ext4_free_extent ex;
	ext4_group_t group = e4b->bd_group;
	int ret = 0;

	trace_ext4_trim_extent(sb, group, start, count);

	assert_spin_locked(ext4_group_lock_ptr(sb, group));

	ex.fe_start = start;
	ex.fe_group = group;
	ex.fe_len = count;

	/*
	 * Mark blocks used, so no one can reuse them while
	 * being trimmed.
	 */
	mb_mark_used(e4b, &ex);
	ext4_unlock_group(sb, group);
	ret = ext4_issue_discard(sb, group, start, count);
	ext4_lock_group(sb, group);
	mb_free_blocks(NULL, e4b, start, ex.fe_len);
	return ret;
}

static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb,
					   ext4_group_t grp)
{
	unsigned long nr_clusters_in_group;

	if (grp < (ext4_get_groups_count(sb) - 1))
		nr_clusters_in_group = EXT4_CLUSTERS_PER_GROUP(sb);
	else
		nr_clusters_in_group = (ext4_blocks_count(EXT4_SB(sb)->s_es) -
					ext4_group_first_block_no(sb, grp))
				       >> EXT4_CLUSTER_BITS(sb);

	return nr_clusters_in_group - 1;
}
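
/*
 * Worked example: on a non-bigalloc filesystem (EXT4_CLUSTER_BITS() == 0)
 * with 100000 blocks total and 32768 blocks per group, the last group starts
 * at block 98304 and holds 100000 - 98304 = 1696 clusters, so this helper
 * returns 1695 for it; every earlier group simply gets
 * EXT4_CLUSTERS_PER_GROUP(sb) - 1.
 */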

static bool ext4_trim_interrupted(void)
{
	return fatal_signal_pending(current) || freezing(current);
}

static int ext4_try_to_trim_range(struct super_block *sb,
		struct ext4_buddy *e4b, ext4_grpblk_t start,
		ext4_grpblk_t max, ext4_grpblk_t minblocks)
__acquires(ext4_group_lock_ptr(sb, e4b->bd_group))
__releases(ext4_group_lock_ptr(sb, e4b->bd_group))
{
	ext4_grpblk_t next, count, free_count, last, origin_start;
	bool set_trimmed = false;
	void *bitmap;

	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
		return 0;

	last = ext4_last_grp_cluster(sb, e4b->bd_group);
	bitmap = e4b->bd_bitmap;
	if (start == 0 && max >= last)
		set_trimmed = true;
	origin_start = start;
	start = max(e4b->bd_info->bb_first_free, start);
	count = 0;
	free_count = 0;

	while (start <= max) {
		start = mb_find_next_zero_bit(bitmap, max + 1, start);
		if (start > max)
			break;

		next = mb_find_next_bit(bitmap, last + 1, start);
		if (origin_start == 0 && next >= last)
			set_trimmed = true;

		if ((next - start) >= minblocks) {
			int ret = ext4_trim_extent(sb, start, next - start, e4b);

			if (ret && ret != -EOPNOTSUPP)
				return count;
			count += next - start;
		}
		free_count += next - start;
		start = next + 1;

		if (ext4_trim_interrupted())
			return count;

		if (need_resched()) {
			ext4_unlock_group(sb, e4b->bd_group);
			cond_resched();
			ext4_lock_group(sb, e4b->bd_group);
		}

		if ((e4b->bd_info->bb_free - free_count) < minblocks)
			break;
	}

	if (set_trimmed)
		EXT4_MB_GRP_SET_TRIMMED(e4b->bd_info);

	return count;
}

/**
 * ext4_trim_all_free -- function to trim all free space in alloc. group
 * @sb:			super block for file system
 * @group:		group to be trimmed
 * @start:		first group block to examine
 * @max:		last group block to examine
 * @minblocks:		minimum extent block count
 *
 * ext4_trim_all_free walks through group's block bitmap searching for free
 * extents. When the free extent is found, mark it as used in group buddy
 * bitmap. Then issue a TRIM command on this extent and free the extent in
 * the group buddy bitmap.
 */
static ext4_grpblk_t
ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
		   ext4_grpblk_t start, ext4_grpblk_t max,
		   ext4_grpblk_t minblocks)
{
	struct ext4_buddy e4b;
	int ret;

	trace_ext4_trim_all_free(sb, group, start, max);

	ret = ext4_mb_load_buddy(sb, group, &e4b);
	if (ret) {
		ext4_warning(sb, "Error %d loading buddy information for %u",
			     ret, group);
		return ret;
	}

	ext4_lock_group(sb, group);

	if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) ||
	    minblocks < EXT4_SB(sb)->s_last_trim_minblks)
		ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks);
	else
		ret = 0;

	ext4_unlock_group(sb, group);
	ext4_mb_unload_buddy(&e4b);

	ext4_debug("trimmed %d blocks in the group %d\n",
		   ret, group);

	return ret;
}

/**
 * ext4_trim_fs() -- trim ioctl handle function
 * @sb:			superblock for filesystem
 * @range:		fstrim_range structure
 *
 * start:	First Byte to trim
 * len:		number of Bytes to trim from start
 * minlen:	minimum extent length in Bytes
 * ext4_trim_fs goes through all allocation groups containing Bytes from
 * start to start+len. For each such group the ext4_trim_all_free function
 * is invoked to trim all free space.
 */
int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
{
	unsigned int discard_granularity = bdev_discard_granularity(sb->s_bdev);
	struct ext4_group_info *grp;
	ext4_group_t group, first_group, last_group;
	ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
	uint64_t start, end, minlen, trimmed = 0;
	ext4_fsblk_t first_data_blk =
			le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
	ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
	int ret = 0;

	start = range->start >> sb->s_blocksize_bits;
	end = start + (range->len >> sb->s_blocksize_bits) - 1;
	minlen = EXT4_NUM_B2C(EXT4_SB(sb),
			      range->minlen >> sb->s_blocksize_bits);

	if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) ||
	    start >= max_blks ||
	    range->len < sb->s_blocksize)
		return -EINVAL;
	/* No point to try to trim less than discard granularity */
	if (range->minlen < discard_granularity) {
		minlen = EXT4_NUM_B2C(EXT4_SB(sb),
				discard_granularity >> sb->s_blocksize_bits);
		if (minlen > EXT4_CLUSTERS_PER_GROUP(sb))
			goto out;
	}
	if (end >= max_blks - 1)
		end = max_blks - 1;
	if (end <= first_data_blk)
		goto out;
	if (start < first_data_blk)
		start = first_data_blk;

	/* Determine first and last group to examine based on start and end */
	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
				     &first_group, &first_cluster);
	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
				     &last_group, &last_cluster);

	/* end now represents the last cluster to discard in this group */
	end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;

	for (group = first_group; group <= last_group; group++) {
		if (ext4_trim_interrupted())
			break;
		grp = ext4_get_group_info(sb, group);
		if (!grp)
			continue;
		/* We only do this if the grp has never been initialized */
		if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
			ret = ext4_mb_init_group(sb, group, GFP_NOFS);
			if (ret)
				break;
		}

		/*
		 * For all the groups except the last one, last cluster will
		 * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
		 * change it for the last group, note that last_cluster is
		 * already computed earlier by ext4_get_group_no_and_offset()
		 */
		if (group == last_group)
			end = last_cluster;
		if (grp->bb_free >= minlen) {
			cnt = ext4_trim_all_free(sb, group, first_cluster,
						 end, minlen);
			if (cnt < 0) {
				ret = cnt;
				break;
			}
			trimmed += cnt;
		}

		/*
		 * For every group except the first one, we are sure
		 * that the first cluster to discard will be cluster #0.
		 */
		first_cluster = 0;
	}

	if (!ret)
		EXT4_SB(sb)->s_last_trim_minblks = minlen;

out:
	range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
	return ret;
}
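
/*
 * Illustrative use (sketch): the FITRIM ioctl handler copies a struct
 * fstrim_range in from userspace and hands it to this function. A
 * "trim everything, 1 MiB minimum extent" request would look roughly like:
 *
 *	struct fstrim_range range = {
 *		.start  = 0,
 *		.len    = ULLONG_MAX,
 *		.minlen = 1024 * 1024,
 *	};
 *	ret = ext4_trim_fs(sb, &range);
 *
 * On return, range.len holds the number of bytes actually trimmed.
 */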

/* Iterate all the free extents in the group. */
int
ext4_mballoc_query_range(
	struct super_block		*sb,
	ext4_group_t			group,
	ext4_grpblk_t			first,
	ext4_grpblk_t			end,
	ext4_mballoc_query_range_fn	meta_formatter,
	ext4_mballoc_query_range_fn	formatter,
	void				*priv)
{
	void				*bitmap;
	ext4_grpblk_t			start, next;
	struct ext4_buddy		e4b;
	int				error;

	error = ext4_mb_load_buddy(sb, group, &e4b);
	if (error)
		return error;
	bitmap = e4b.bd_bitmap;

	ext4_lock_group(sb, group);

	start = max(e4b.bd_info->bb_first_free, first);
	if (end >= EXT4_CLUSTERS_PER_GROUP(sb))
		end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
	if (meta_formatter && start != first) {
		if (start > end)
			start = end;
		ext4_unlock_group(sb, group);
		error = meta_formatter(sb, group, first, start - first,
				       priv);
		if (error)
			goto out_unload;
		ext4_lock_group(sb, group);
	}
	while (start <= end) {
		start = mb_find_next_zero_bit(bitmap, end + 1, start);
		if (start > end)
			break;
		next = mb_find_next_bit(bitmap, end + 1, start);

		ext4_unlock_group(sb, group);
		error = formatter(sb, group, start, next - start, priv);
		if (error)
			goto out_unload;
		ext4_lock_group(sb, group);

		start = next + 1;
	}

	ext4_unlock_group(sb, group);
out_unload:
	ext4_mb_unload_buddy(&e4b);

	return error;
}

#ifdef CONFIG_EXT4_KUNIT_TESTS
#include "mballoc-test.c"
#endif