1 // SPDX-License-Identifier: GPL-2.0
3 #include "linux/spinlock.h"
4 #include <linux/minmax.h>
7 #include "space-info.h"
10 #include "free-space-cache.h"
11 #include "ordered-data.h"
12 #include "transaction.h"
13 #include "block-group.h"
15 #include "accessors.h"
16 #include "extent-tree.h"
19 * HOW DOES SPACE RESERVATION WORK
21 * If you want to know about delalloc specifically, there is a separate comment
22 * for that with the delalloc code. This comment is about how the whole system
27 * 1) space_info. This is the ultimate arbiter of how much space we can use.
28 * There's a description of the bytes_ fields with the struct declaration,
29 * refer to that for specifics on each field. Suffice it to say that for
30 * reservations we care about total_bytes - SUM(space_info->bytes_) when
31 * determining if there is space to make an allocation. There is a space_info
32 * for METADATA, SYSTEM, and DATA areas.
34 * 2) block_rsv's. These are basically buckets for every different type of
35 * metadata reservation we have. You can see the comment in the block_rsv
36 * code on the rules for each type, but generally block_rsv->reserved is how
37 * much space is accounted for in space_info->bytes_may_use.
39 * 3) btrfs_calc*_size. These are the worst case calculations we used based
40 * on the number of items we will want to modify. We have one for changing
41 * items, and one for inserting new items. Generally we use these helpers to
42 * determine the size of the block reserves, and then use the actual bytes
43 * values to adjust the space_info counters.
45 * MAKING RESERVATIONS, THE NORMAL CASE
47 * We call into either btrfs_reserve_data_bytes() or
48 * btrfs_reserve_metadata_bytes(), depending on which we're looking for, with
49 * num_bytes we want to reserve.
52 * space_info->bytes_may_use += num_bytes
55 * Call btrfs_add_reserved_bytes() which does
56 * space_info->bytes_may_use -= num_bytes
57 * space_info->bytes_reserved += extent_bytes
60 * Call btrfs_update_block_group() which does
61 * space_info->bytes_reserved -= extent_bytes
62 * space_info->bytes_used += extent_bytes
64 * MAKING RESERVATIONS, FLUSHING NORMALLY (non-priority)
66 * Assume we are unable to simply make the reservation because we do not have
70 * create a reserve_ticket with ->bytes set to our reservation, add it to
71 * the tail of space_info->tickets, kick async flush thread
73 * ->handle_reserve_ticket
74 * wait on ticket->wait for ->bytes to be reduced to 0, or ->error to be set
77 * -> btrfs_async_reclaim_metadata_space/btrfs_async_reclaim_data_space
78 * Flushes various things attempting to free up space.
80 * -> btrfs_try_granting_tickets()
81 * This is called by anything that either subtracts space from
82 * space_info->bytes_may_use, ->bytes_pinned, etc, or adds to the
83 * space_info->total_bytes. This loops through the ->priority_tickets and
84 * then the ->tickets list checking to see if the reservation can be
85 * completed. If it can the space is added to space_info->bytes_may_use and
86 * the ticket is woken up.
89 * Check if ->bytes == 0, if it does we got our reservation and we can carry
90 * on, if not return the appropriate error (ENOSPC, but can be EINTR if we
93 * MAKING RESERVATIONS, FLUSHING HIGH PRIORITY
95 * Same as the above, except we add ourselves to the
96 * space_info->priority_tickets, and we do not use ticket->wait, we simply
97 * call flush_space() ourselves for the states that are safe for us to call
98 * without deadlocking and hope for the best.
100 * THE FLUSHING STATES
102 * Generally speaking we will have two cases for each state, a "nice" state
103 * and an "ALL THE THINGS" state. In btrfs we delay a lot of work in order to
104 * reduce the locking overhead on the various trees, and even to keep from
105 * doing any work at all in the case of delayed refs. Each of these delayed
106 * things however hold reservations, and so letting them run allows us to
107 * reclaim space so we can make new reservations.
109 * FLUSH_DELAYED_ITEMS
110 * Every inode has a delayed item to update the inode. Take a simple write
111 * for example, we would update the inode item at write time to update the
112 * mtime, and then again at finish_ordered_io() time in order to update the
113 * isize or bytes. We keep these delayed items to coalesce these operations
114 * into a single operation done on demand. These are an easy way to reclaim
118 * Look at the delalloc comment to get an idea of how much space is reserved
119 * for delayed allocation. We can reclaim some of this space simply by
120 * running delalloc, but usually we need to wait for ordered extents to
121 * reclaim the bulk of this space.
124 * We have a block reserve for the outstanding delayed refs space, and every
125 * delayed ref operation holds a reservation. Running these is a quick way
126 * to reclaim space, but we want to hold this until the end because COW can
127 * churn a lot and we can avoid making some extent tree modifications if we
128 * are able to delay for as long as possible.
131 * We will skip this the first time through space reservation, because of
132 * overcommit and we don't want to have a lot of useless metadata space when
133 * our worst case reservations will likely never come true.
136 * If we're freeing inodes we're likely freeing checksums, file extent
137 * items, and extent tree items. Loads of space could be freed up by these
138 * operations, however they won't be usable until the transaction commits.
141 * This will commit the transaction. Historically we had a lot of logic
142 * surrounding whether or not we'd commit the transaction, but this was born
143 * out of a pre-tickets era where we could end up committing the transaction
144 * thousands of times in a row without making progress. Now thanks to our
145 * ticketing system we know if we're not making progress and can error
146 * everybody out after a few commits rather than burning the disk hoping for
147 * a different answer.
151 * Because we hold so many reservations for metadata we will allow you to
152 * reserve more space than is currently free in the currently allocated
153 * metadata space. This only happens with metadata, data does not allow
156 * You can see the current logic for when we allow overcommit in
157 * btrfs_can_overcommit(), but it only applies to unallocated space. If there
158 * is no unallocated space to be had, all reservations are kept within the
159 * free space in the allocated metadata chunks.
161 * Because of overcommitting, you generally want to use the
162 * btrfs_can_overcommit() logic for metadata allocations, as it does the right
163 * thing with or without extra unallocated space.
/*
 * Return the number of bytes of @s_info that are accounted as in use.
 *
 * This is the sum of bytes_used, bytes_reserved, bytes_pinned,
 * bytes_readonly and bytes_zone_unusable. bytes_may_use is added on top
 * only when @may_use_included is true.
 */
166 u64 __pure
btrfs_space_info_used(const struct btrfs_space_info
*s_info
,
167 bool may_use_included
)
170 return s_info
->bytes_used
+ s_info
->bytes_reserved
+
171 s_info
->bytes_pinned
+ s_info
->bytes_readonly
+
172 s_info
->bytes_zone_unusable
+
173 (may_use_included
? s_info
->bytes_may_use
: 0);
177 * after adding space to the filesystem, we need to clear the full flags
178 * on all the space infos.
180 void btrfs_clear_space_info_full(struct btrfs_fs_info
*info
)
182 struct list_head
*head
= &info
->space_info
;
183 struct btrfs_space_info
*found
;
185 list_for_each_entry(found
, head
, list
)
190 * Block groups with more than this value (percents) of unusable space will be
191 * scheduled for background reclaim.
193 #define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75)
195 #define BTRFS_UNALLOC_BLOCK_GROUP_TARGET (10ULL)
198 * Calculate chunk size depending on volume type (regular or zoned).
/*
 * Pick the chunk size for a space_info of type @flags.
 *
 * Zoned filesystems always use fs_info->zone_size. Otherwise the size
 * depends on the block group type: DATA uses BTRFS_MAX_DATA_CHUNK_SIZE,
 * SYSTEM has its own size, and METADATA depends on whether the writable
 * device capacity (total_rw_bytes) exceeds 50 * SZ_1G.
 */
200 static u64
calc_chunk_size(const struct btrfs_fs_info
*fs_info
, u64 flags
)
/* Zoned: the chunk size is the zone size, whatever the type. */
202 if (btrfs_is_zoned(fs_info
))
203 return fs_info
->zone_size
;
/* At least one block group type bit must be set in @flags. */
205 ASSERT(flags
& BTRFS_BLOCK_GROUP_TYPE_MASK
);
207 if (flags
& BTRFS_BLOCK_GROUP_DATA
)
208 return BTRFS_MAX_DATA_CHUNK_SIZE
;
209 else if (flags
& BTRFS_BLOCK_GROUP_SYSTEM
)
212 /* Handle BTRFS_BLOCK_GROUP_METADATA */
213 if (fs_info
->fs_devices
->total_rw_bytes
> 50ULL * SZ_1G
)
220 * Update default chunk size.
/*
 * Store @chunk_size into @space_info->chunk_size. The store goes through
 * WRITE_ONCE().
 */
222 void btrfs_update_space_info_chunk_size(struct btrfs_space_info
*space_info
,
225 WRITE_ONCE(space_info
->chunk_size
, chunk_size
);
228 static int create_space_info(struct btrfs_fs_info
*info
, u64 flags
)
231 struct btrfs_space_info
*space_info
;
235 space_info
= kzalloc(sizeof(*space_info
), GFP_NOFS
);
239 space_info
->fs_info
= info
;
240 for (i
= 0; i
< BTRFS_NR_RAID_TYPES
; i
++)
241 INIT_LIST_HEAD(&space_info
->block_groups
[i
]);
242 init_rwsem(&space_info
->groups_sem
);
243 spin_lock_init(&space_info
->lock
);
244 space_info
->flags
= flags
& BTRFS_BLOCK_GROUP_TYPE_MASK
;
245 space_info
->force_alloc
= CHUNK_ALLOC_NO_FORCE
;
246 INIT_LIST_HEAD(&space_info
->ro_bgs
);
247 INIT_LIST_HEAD(&space_info
->tickets
);
248 INIT_LIST_HEAD(&space_info
->priority_tickets
);
249 space_info
->clamp
= 1;
250 btrfs_update_space_info_chunk_size(space_info
, calc_chunk_size(info
, flags
));
252 if (btrfs_is_zoned(info
))
253 space_info
->bg_reclaim_threshold
= BTRFS_DEFAULT_ZONED_RECLAIM_THRESH
;
255 ret
= btrfs_sysfs_add_space_info_type(info
, space_info
);
259 list_add(&space_info
->list
, &info
->space_info
);
260 if (flags
& BTRFS_BLOCK_GROUP_DATA
)
261 info
->data_sinfo
= space_info
;
266 int btrfs_init_space_info(struct btrfs_fs_info
*fs_info
)
268 struct btrfs_super_block
*disk_super
;
274 disk_super
= fs_info
->super_copy
;
275 if (!btrfs_super_root(disk_super
))
278 features
= btrfs_super_incompat_flags(disk_super
);
279 if (features
& BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS
)
282 flags
= BTRFS_BLOCK_GROUP_SYSTEM
;
283 ret
= create_space_info(fs_info
, flags
);
288 flags
= BTRFS_BLOCK_GROUP_METADATA
| BTRFS_BLOCK_GROUP_DATA
;
289 ret
= create_space_info(fs_info
, flags
);
291 flags
= BTRFS_BLOCK_GROUP_METADATA
;
292 ret
= create_space_info(fs_info
, flags
);
296 flags
= BTRFS_BLOCK_GROUP_DATA
;
297 ret
= create_space_info(fs_info
, flags
);
303 void btrfs_add_bg_to_space_info(struct btrfs_fs_info
*info
,
304 struct btrfs_block_group
*block_group
)
306 struct btrfs_space_info
*found
;
309 factor
= btrfs_bg_type_to_factor(block_group
->flags
);
311 found
= btrfs_find_space_info(info
, block_group
->flags
);
313 spin_lock(&found
->lock
);
314 found
->total_bytes
+= block_group
->length
;
315 found
->disk_total
+= block_group
->length
* factor
;
316 found
->bytes_used
+= block_group
->used
;
317 found
->disk_used
+= block_group
->used
* factor
;
318 found
->bytes_readonly
+= block_group
->bytes_super
;
319 btrfs_space_info_update_bytes_zone_unusable(info
, found
, block_group
->zone_unusable
);
320 if (block_group
->length
> 0)
322 btrfs_try_granting_tickets(info
, found
);
323 spin_unlock(&found
->lock
);
325 block_group
->space_info
= found
;
327 index
= btrfs_bg_flags_to_raid_index(block_group
->flags
);
328 down_write(&found
->groups_sem
);
329 list_add_tail(&block_group
->list
, &found
->block_groups
[index
]);
330 up_write(&found
->groups_sem
);
/*
 * Look up the space_info that handles block groups of type @flags.
 *
 * @flags is first reduced to its BTRFS_BLOCK_GROUP_TYPE_MASK bits, then the
 * info->space_info list is walked looking for an entry whose flags share at
 * least one bit with it.
 *
 * NOTE(review): presumably returns the matching entry, or NULL when none
 * matches - confirm against the full function.
 */
333 struct btrfs_space_info
*btrfs_find_space_info(struct btrfs_fs_info
*info
,
336 struct list_head
*head
= &info
->space_info
;
337 struct btrfs_space_info
*found
;
/* Only the type bits take part in the match. */
339 flags
&= BTRFS_BLOCK_GROUP_TYPE_MASK
;
341 list_for_each_entry(found
, head
, list
) {
342 if (found
->flags
& flags
)
/*
 * Return the size a data chunk allocation is expected to have.
 *
 * On zoned filesystems this is the DATA space_info's chunk_size unchanged.
 * Otherwise it is that chunk_size capped by 10% of total_rw_bytes and by
 * SZ_1G, whichever is smallest.
 */
348 static u64
calc_effective_data_chunk_size(struct btrfs_fs_info
*fs_info
)
350 struct btrfs_space_info
*data_sinfo
;
354 * Calculate the data_chunk_size, space_info->chunk_size is the
355 * "optimal" chunk size based on the fs size. However when we actually
356 * allocate the chunk we will strip this down further, making it no
357 * more than 10% of the disk or 1G, whichever is smaller.
359 * On the zoned mode, we need to use zone_size (= data_sinfo->chunk_size)
362 data_sinfo
= btrfs_find_space_info(fs_info
, BTRFS_BLOCK_GROUP_DATA
);
363 if (btrfs_is_zoned(fs_info
))
364 return data_sinfo
->chunk_size
;
/* First cap: 10% of the writable device capacity. */
365 data_chunk_size
= min(data_sinfo
->chunk_size
,
366 mult_perc(fs_info
->fs_devices
->total_rw_bytes
, 10));
/* Second cap: never more than SZ_1G. */
367 return min_t(u64
, data_chunk_size
, SZ_1G
);
370 static u64
calc_available_free_space(struct btrfs_fs_info
*fs_info
,
371 const struct btrfs_space_info
*space_info
,
372 enum btrfs_reserve_flush_enum flush
)
379 if (space_info
->flags
& BTRFS_BLOCK_GROUP_SYSTEM
)
380 profile
= btrfs_system_alloc_profile(fs_info
);
382 profile
= btrfs_metadata_alloc_profile(fs_info
);
384 avail
= atomic64_read(&fs_info
->free_chunk_space
);
387 * If we have dup, raid1 or raid10 then only half of the free
388 * space is actually usable. For raid56, the space info used
389 * doesn't include the parity drive, so we don't have to
392 factor
= btrfs_bg_type_to_factor(profile
);
393 avail
= div_u64(avail
, factor
);
397 data_chunk_size
= calc_effective_data_chunk_size(fs_info
);
400 * Since data allocations immediately use block groups as part of the
401 * reservation, because we assume that data reservations will == actual
402 * usage, we could potentially overcommit and then immediately have that
403 * available space used by a data allocation, which could put us in a
404 * bind when we get close to filling the file system.
406 * To handle this simply remove the data_chunk_size from the available
407 * space. If we are relatively empty this won't affect our ability to
408 * overcommit much, and if we're very close to full it'll keep us from
409 * getting into a position where we've given ourselves very little
410 * metadata wiggle room.
412 if (avail
<= data_chunk_size
)
414 avail
-= data_chunk_size
;
417 * If we aren't flushing all things, let us overcommit up to
418 * 1/2 of the space. If we can flush, don't let us overcommit
419 * too much, let it overcommit up to 1/8 of the space.
421 if (flush
== BTRFS_RESERVE_FLUSH_ALL
)
427 * On the zoned mode, we always allocate one zone as one chunk.
428 * Returning non-zone size aligned bytes here will result in
429 * less pressure for the async metadata reclaim process, and it
430 * will over-commit too much leading to ENOSPC. Align down to the
431 * zone size to avoid that.
433 if (btrfs_is_zoned(fs_info
))
434 avail
= ALIGN_DOWN(avail
, fs_info
->zone_size
);
/*
 * Decide whether @bytes more can be reserved from @space_info when
 * overcommitting is taken into account.
 *
 * Space infos carrying the DATA flag are special-cased up front. For the
 * others, the current usage (including bytes_may_use) plus @bytes is
 * compared against total_bytes plus the allowance computed by
 * calc_available_free_space() for @flush.
 *
 * NOTE(review): presumably returns nonzero when the reservation fits and 0
 * otherwise (including for DATA) - the return statements should be checked
 * against the full function.
 */
439 int btrfs_can_overcommit(struct btrfs_fs_info
*fs_info
,
440 const struct btrfs_space_info
*space_info
, u64 bytes
,
441 enum btrfs_reserve_flush_enum flush
)
446 /* Don't overcommit when in mixed mode */
447 if (space_info
->flags
& BTRFS_BLOCK_GROUP_DATA
)
450 used
= btrfs_space_info_used(space_info
, true);
451 avail
= calc_available_free_space(fs_info
, space_info
, flush
);
/* Strictly less than: landing exactly on the limit does not qualify. */
453 if (used
+ bytes
< space_info
->total_bytes
+ avail
)
/*
 * Take @ticket off the list it is queued on and subtract its bytes from
 * @space_info->reclaim_size.
 *
 * Nothing happens if the ticket's list head is already empty, i.e. it is
 * not queued anywhere.
 */
458 static void remove_ticket(struct btrfs_space_info
*space_info
,
459 struct reserve_ticket
*ticket
)
461 if (!list_empty(&ticket
->list
)) {
/* list_del_init() leaves ticket->list empty, so a second call is a no-op. */
462 list_del_init(&ticket
->list
);
/* reclaim_size must cover this ticket, or the subtraction would wrap. */
463 ASSERT(space_info
->reclaim_size
>= ticket
->bytes
);
464 space_info
->reclaim_size
-= ticket
->bytes
;
469 * This is for space we already have accounted in space_info->bytes_may_use, so
470 * basically when we're returning space from block_rsv's.
472 void btrfs_try_granting_tickets(struct btrfs_fs_info
*fs_info
,
473 struct btrfs_space_info
*space_info
)
475 struct list_head
*head
;
476 enum btrfs_reserve_flush_enum flush
= BTRFS_RESERVE_NO_FLUSH
;
478 lockdep_assert_held(&space_info
->lock
);
480 head
= &space_info
->priority_tickets
;
482 while (!list_empty(head
)) {
483 struct reserve_ticket
*ticket
;
484 u64 used
= btrfs_space_info_used(space_info
, true);
486 ticket
= list_first_entry(head
, struct reserve_ticket
, list
);
488 /* Check and see if our ticket can be satisfied now. */
489 if ((used
+ ticket
->bytes
<= space_info
->total_bytes
) ||
490 btrfs_can_overcommit(fs_info
, space_info
, ticket
->bytes
,
492 btrfs_space_info_update_bytes_may_use(fs_info
,
495 remove_ticket(space_info
, ticket
);
497 space_info
->tickets_id
++;
498 wake_up(&ticket
->wait
);
504 if (head
== &space_info
->priority_tickets
) {
505 head
= &space_info
->tickets
;
506 flush
= BTRFS_RESERVE_FLUSH_ALL
;
/*
 * Log the size and reserved counters of the block reserve embedded in
 * @fs_info as member @rsv_name. Both values are read while holding that
 * reserve's spinlock, and the member name is used as the message prefix.
 */
511 #define DUMP_BLOCK_RSV(fs_info, rsv_name) \
513 struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \
514 spin_lock(&__rsv->lock); \
515 btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \
516 __rsv->size, __rsv->reserved); \
517 spin_unlock(&__rsv->lock); \
/*
 * Translate the flags of @space_info into a printable label.
 *
 * The switch handles SYSTEM, METADATA | DATA (mixed, "DATA+METADATA"),
 * DATA and METADATA; it matches on the exact flags value, not on
 * individual bits.
 */
520 static const char *space_info_flag_to_str(const struct btrfs_space_info
*space_info
)
522 switch (space_info
->flags
) {
523 case BTRFS_BLOCK_GROUP_SYSTEM
:
525 case BTRFS_BLOCK_GROUP_METADATA
| BTRFS_BLOCK_GROUP_DATA
:
526 return "DATA+METADATA";
527 case BTRFS_BLOCK_GROUP_DATA
:
529 case BTRFS_BLOCK_GROUP_METADATA
:
/*
 * Log size/reserved for five block reserves of @fs_info: global, trans,
 * chunk, delayed and delayed refs. Each line is produced by
 * DUMP_BLOCK_RSV(), which takes the respective reserve's lock.
 */
536 static void dump_global_block_rsv(struct btrfs_fs_info
*fs_info
)
538 DUMP_BLOCK_RSV(fs_info
, global_block_rsv
);
539 DUMP_BLOCK_RSV(fs_info
, trans_block_rsv
);
540 DUMP_BLOCK_RSV(fs_info
, chunk_block_rsv
);
541 DUMP_BLOCK_RSV(fs_info
, delayed_block_rsv
);
542 DUMP_BLOCK_RSV(fs_info
, delayed_refs_rsv
);
545 static void __btrfs_dump_space_info(const struct btrfs_fs_info
*fs_info
,
546 const struct btrfs_space_info
*info
)
548 const char *flag_str
= space_info_flag_to_str(info
);
549 lockdep_assert_held(&info
->lock
);
551 /* The free space could be negative in case of overcommit */
552 btrfs_info(fs_info
, "space_info %s has %lld free, is %sfull",
554 (s64
)(info
->total_bytes
- btrfs_space_info_used(info
, true)),
555 info
->full
? "" : "not ");
557 "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu",
558 info
->total_bytes
, info
->bytes_used
, info
->bytes_pinned
,
559 info
->bytes_reserved
, info
->bytes_may_use
,
560 info
->bytes_readonly
, info
->bytes_zone_unusable
);
563 void btrfs_dump_space_info(struct btrfs_fs_info
*fs_info
,
564 struct btrfs_space_info
*info
, u64 bytes
,
565 int dump_block_groups
)
567 struct btrfs_block_group
*cache
;
571 spin_lock(&info
->lock
);
572 __btrfs_dump_space_info(fs_info
, info
);
573 dump_global_block_rsv(fs_info
);
574 spin_unlock(&info
->lock
);
576 if (!dump_block_groups
)
579 down_read(&info
->groups_sem
);
581 list_for_each_entry(cache
, &info
->block_groups
[index
], list
) {
584 spin_lock(&cache
->lock
);
585 avail
= cache
->length
- cache
->used
- cache
->pinned
-
586 cache
->reserved
- cache
->bytes_super
- cache
->zone_unusable
;
588 "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu delalloc %llu super %llu zone_unusable (%llu bytes available) %s",
589 cache
->start
, cache
->length
, cache
->used
, cache
->pinned
,
590 cache
->reserved
, cache
->delalloc_bytes
,
591 cache
->bytes_super
, cache
->zone_unusable
,
592 avail
, cache
->ro
? "[readonly]" : "");
593 spin_unlock(&cache
->lock
);
594 btrfs_dump_free_space(cache
, bytes
);
595 total_avail
+= avail
;
597 if (++index
< BTRFS_NR_RAID_TYPES
)
599 up_read(&info
->groups_sem
);
601 btrfs_info(fs_info
, "%llu bytes available across all block groups", total_avail
);
/*
 * Convert a byte count to reclaim into a number of items: to_reclaim is
 * divided by the metadata size reserved for inserting a single item.
 */
604 static inline u64
calc_reclaim_items_nr(const struct btrfs_fs_info
*fs_info
,
/* Metadata bytes reserved for one item insertion. */
610 bytes
= btrfs_calc_insert_metadata_size(fs_info
, 1);
611 nr
= div64_u64(to_reclaim
, bytes
);
618 * shrink metadata reservation for delalloc
620 static void shrink_delalloc(struct btrfs_fs_info
*fs_info
,
621 struct btrfs_space_info
*space_info
,
622 u64 to_reclaim
, bool wait_ordered
,
625 struct btrfs_trans_handle
*trans
;
632 delalloc_bytes
= percpu_counter_sum_positive(&fs_info
->delalloc_bytes
);
633 ordered_bytes
= percpu_counter_sum_positive(&fs_info
->ordered_bytes
);
634 if (delalloc_bytes
== 0 && ordered_bytes
== 0)
637 /* Calc the number of the pages we need flush for space reservation */
638 if (to_reclaim
== U64_MAX
) {
642 * to_reclaim is set to however much metadata we need to
643 * reclaim, but reclaiming that much data doesn't really track
644 * exactly. What we really want to do is reclaim full inode's
645 * worth of reservations, however that's not available to us
646 * here. We will take a fraction of the delalloc bytes for our
647 * flushing loops and hope for the best. Delalloc will expand
648 * the amount we write to cover an entire dirty extent, which
649 * will reclaim the metadata reservation for that range. If
650 * it's not enough subsequent flush stages will be more
653 to_reclaim
= max(to_reclaim
, delalloc_bytes
>> 3);
654 items
= calc_reclaim_items_nr(fs_info
, to_reclaim
) * 2;
657 trans
= current
->journal_info
;
660 * If we are doing more ordered than delalloc we need to just wait on
661 * ordered extents, otherwise we'll waste time trying to flush delalloc
662 * that likely won't give us the space back we need.
664 if (ordered_bytes
> delalloc_bytes
&& !for_preempt
)
668 while ((delalloc_bytes
|| ordered_bytes
) && loops
< 3) {
669 u64 temp
= min(delalloc_bytes
, to_reclaim
) >> PAGE_SHIFT
;
670 long nr_pages
= min_t(u64
, temp
, LONG_MAX
);
673 btrfs_start_delalloc_roots(fs_info
, nr_pages
, true);
676 * We need to make sure any outstanding async pages are now
677 * processed before we continue. This is because things like
678 * sync_inode() try to be smart and skip writing if the inode is
679 * marked clean. We don't use filemap_fwrite for flushing
680 * because we want to control how many pages we write out at a
681 * time, thus this is the only safe way to make sure we've
682 * waited for outstanding compressed workers to have started
683 * their jobs and thus have ordered extents set up properly.
685 * This exists because we do not want to wait for each
686 * individual inode to finish its async work, we simply want to
687 * start the IO on everybody, and then come back here and wait
688 * for all of the async work to catch up. Once we're done with
689 * that we know we'll have ordered extents for everything and we
690 * can decide if we wait for that or not.
692 * If we choose to replace this in the future, make absolutely
693 * sure that the proper waiting is being done in the async case,
694 * as there have been bugs in that area before.
696 async_pages
= atomic_read(&fs_info
->async_delalloc_pages
);
701 * We don't want to wait forever, if we wrote fewer pages in this
702 * loop than we have outstanding, only wait for that number of
703 * pages, otherwise we can wait for all async pages to finish
706 if (async_pages
> nr_pages
)
707 async_pages
-= nr_pages
;
710 wait_event(fs_info
->async_submit_wait
,
711 atomic_read(&fs_info
->async_delalloc_pages
) <=
715 if (wait_ordered
&& !trans
) {
716 btrfs_wait_ordered_roots(fs_info
, items
, NULL
);
718 time_left
= schedule_timeout_killable(1);
724 * If we are for preemption we just want a one-shot of delalloc
725 * flushing so we can stop flushing if we decide we don't need
731 spin_lock(&space_info
->lock
);
732 if (list_empty(&space_info
->tickets
) &&
733 list_empty(&space_info
->priority_tickets
)) {
734 spin_unlock(&space_info
->lock
);
737 spin_unlock(&space_info
->lock
);
739 delalloc_bytes
= percpu_counter_sum_positive(
740 &fs_info
->delalloc_bytes
);
741 ordered_bytes
= percpu_counter_sum_positive(
742 &fs_info
->ordered_bytes
);
747 * Try to flush some data based on policy set by @state. This is only advisory
748 * and may fail for various reasons. The caller is supposed to examine the
749 * state of @space_info to detect the outcome.
751 static void flush_space(struct btrfs_fs_info
*fs_info
,
752 struct btrfs_space_info
*space_info
, u64 num_bytes
,
753 enum btrfs_flush_state state
, bool for_preempt
)
755 struct btrfs_root
*root
= fs_info
->tree_root
;
756 struct btrfs_trans_handle
*trans
;
761 case FLUSH_DELAYED_ITEMS_NR
:
762 case FLUSH_DELAYED_ITEMS
:
763 if (state
== FLUSH_DELAYED_ITEMS_NR
)
764 nr
= calc_reclaim_items_nr(fs_info
, num_bytes
) * 2;
768 trans
= btrfs_join_transaction_nostart(root
);
770 ret
= PTR_ERR(trans
);
775 ret
= btrfs_run_delayed_items_nr(trans
, nr
);
776 btrfs_end_transaction(trans
);
779 case FLUSH_DELALLOC_WAIT
:
780 case FLUSH_DELALLOC_FULL
:
781 if (state
== FLUSH_DELALLOC_FULL
)
783 shrink_delalloc(fs_info
, space_info
, num_bytes
,
784 state
!= FLUSH_DELALLOC
, for_preempt
);
786 case FLUSH_DELAYED_REFS_NR
:
787 case FLUSH_DELAYED_REFS
:
788 trans
= btrfs_join_transaction_nostart(root
);
790 ret
= PTR_ERR(trans
);
795 if (state
== FLUSH_DELAYED_REFS_NR
)
796 btrfs_run_delayed_refs(trans
, num_bytes
);
798 btrfs_run_delayed_refs(trans
, 0);
799 btrfs_end_transaction(trans
);
802 case ALLOC_CHUNK_FORCE
:
803 trans
= btrfs_join_transaction(root
);
805 ret
= PTR_ERR(trans
);
808 ret
= btrfs_chunk_alloc(trans
,
809 btrfs_get_alloc_profile(fs_info
, space_info
->flags
),
810 (state
== ALLOC_CHUNK
) ? CHUNK_ALLOC_NO_FORCE
:
812 btrfs_end_transaction(trans
);
814 if (ret
> 0 || ret
== -ENOSPC
)
817 case RUN_DELAYED_IPUTS
:
819 * If we have pending delayed iputs then we could free up a
820 * bunch of pinned space, so make sure we run the iputs before
821 * we do our pinned bytes check below.
823 btrfs_run_delayed_iputs(fs_info
);
824 btrfs_wait_on_delayed_iputs(fs_info
);
827 ASSERT(current
->journal_info
== NULL
);
829 * We don't want to start a new transaction, just attach to the
830 * current one or wait until it fully commits in case its commit is
831 * happening at the moment. Note: we don't use a nostart join
832 * because that does not wait for a transaction to fully commit
833 * (only for it to be unblocked, state TRANS_STATE_UNBLOCKED).
835 ret
= btrfs_commit_current_transaction(root
);
842 trace_btrfs_flush_space(fs_info
, space_info
->flags
, num_bytes
, state
,
/*
 * Work out how many bytes the flusher should try to reclaim for
 * @space_info.
 *
 * Starts from space_info->reclaim_size. If current usage (including
 * bytes_may_use) exceeds total_bytes plus the BTRFS_RESERVE_FLUSH_ALL
 * overcommit allowance, that excess is added on top.
 *
 * The caller must hold space_info->lock.
 *
 * NOTE(review): presumably returns to_reclaim - confirm against the full
 * function.
 */
847 static u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info
*fs_info
,
848 const struct btrfs_space_info
*space_info
)
852 u64 to_reclaim
= space_info
->reclaim_size
;
854 lockdep_assert_held(&space_info
->lock
);
856 avail
= calc_available_free_space(fs_info
, space_info
,
857 BTRFS_RESERVE_FLUSH_ALL
);
858 used
= btrfs_space_info_used(space_info
, true);
861 * We may be flushing because suddenly we have less space than we had
862 * before, and now we're well over-committed based on our current free
863 * space. If that's the case add in our overage so we make sure to put
864 * appropriate pressure on the flushing state machine.
866 if (space_info
->total_bytes
+ avail
< used
)
867 to_reclaim
+= used
- (space_info
->total_bytes
+ avail
);
872 static bool need_preemptive_reclaim(struct btrfs_fs_info
*fs_info
,
873 const struct btrfs_space_info
*space_info
)
875 const u64 global_rsv_size
= btrfs_block_rsv_reserved(&fs_info
->global_block_rsv
);
876 u64 ordered
, delalloc
;
880 thresh
= mult_perc(space_info
->total_bytes
, 90);
882 lockdep_assert_held(&space_info
->lock
);
884 /* If we're just plain full then async reclaim just slows us down. */
885 if ((space_info
->bytes_used
+ space_info
->bytes_reserved
+
886 global_rsv_size
) >= thresh
)
889 used
= space_info
->bytes_may_use
+ space_info
->bytes_pinned
;
891 /* The total flushable belongs to the global rsv, don't flush. */
892 if (global_rsv_size
>= used
)
896 * 128MiB is 1/4 of the maximum global rsv size. If we have less than
897 * that devoted to other reservations then there's no sense in flushing,
898 * we don't have a lot of things that need flushing.
900 if (used
- global_rsv_size
<= SZ_128M
)
904 * We have tickets queued, bail so we don't compete with the async
907 if (space_info
->reclaim_size
)
911 * If we have over half of the free space occupied by reservations or
912 * pinned then we want to start flushing.
914 * We do not do the traditional thing here, which is to say
916 * if (used >= ((total_bytes + avail) / 2))
919 * because this doesn't quite work how we want. If we had more than 50%
920 * of the space_info used by bytes_used and we had 0 available we'd just
921 * constantly run the background flusher. Instead we want it to kick in
922 * if our reclaimable space exceeds our clamped free space.
924 * Our clamping range is 2^1 -> 2^8. Practically speaking that means
927 * Amount of RAM Minimum threshold Maximum threshold
930 * 128GiB 512MiB 64GiB
935 * These are the range our thresholds will fall in, corresponding to how
936 * much delalloc we need for the background flusher to kick in.
939 thresh
= calc_available_free_space(fs_info
, space_info
,
940 BTRFS_RESERVE_FLUSH_ALL
);
941 used
= space_info
->bytes_used
+ space_info
->bytes_reserved
+
942 space_info
->bytes_readonly
+ global_rsv_size
;
943 if (used
< space_info
->total_bytes
)
944 thresh
+= space_info
->total_bytes
- used
;
945 thresh
>>= space_info
->clamp
;
947 used
= space_info
->bytes_pinned
;
950 * If we have more ordered bytes than delalloc bytes then we're either
951 * doing a lot of DIO, or we simply don't have a lot of delalloc waiting
952 * around. Preemptive flushing is only useful in that it can free up
953 * space before tickets need to wait for things to finish. In the case
954 * of ordered extents, preemptively waiting on ordered extents gets us
955 * nothing, if our reservations are tied up in ordered extents we'll
956 * simply have to slow down writers by forcing them to wait on ordered
959 * In the case that ordered is larger than delalloc, only include the
960 * block reserves that we would actually be able to directly reclaim
961 * from. In this case if we're heavy on metadata operations this will
962 * clearly be heavy enough to warrant preemptive flushing. In the case
963 * of heavy DIO or ordered reservations, preemptive flushing will just
964 * waste time and cause us to slow down.
966 * We want to make sure we truly are maxed out on ordered however, so
967 * cut ordered in half, and if it's still higher than delalloc then we
968 * can keep flushing. This is to avoid the case where we start
969 * flushing, and now delalloc == ordered and we stop preemptively
970 * flushing when we could still have several gigs of delalloc to flush.
972 ordered
= percpu_counter_read_positive(&fs_info
->ordered_bytes
) >> 1;
973 delalloc
= percpu_counter_read_positive(&fs_info
->delalloc_bytes
);
974 if (ordered
>= delalloc
)
975 used
+= btrfs_block_rsv_reserved(&fs_info
->delayed_refs_rsv
) +
976 btrfs_block_rsv_reserved(&fs_info
->delayed_block_rsv
);
978 used
+= space_info
->bytes_may_use
- global_rsv_size
;
980 return (used
>= thresh
&& !btrfs_fs_closing(fs_info
) &&
981 !test_bit(BTRFS_FS_STATE_REMOUNTING
, &fs_info
->fs_state
));
/*
 * Try to satisfy @ticket out of the global block reserve.
 *
 * This only applies when the global reserve is backed by @space_info. The
 * reserve must hold at least the ticket's bytes plus 10% of its own size;
 * otherwise nothing is taken. On success the bytes are subtracted from the
 * reserve, the ticket is dequeued and its waiter woken, tickets_id is
 * bumped, and the reserve's full flag is cleared if it dropped below its
 * size.
 *
 * NOTE(review): presumably returns true when the ticket was satisfied and
 * false otherwise - confirm against the full function.
 */
984 static bool steal_from_global_rsv(struct btrfs_fs_info
*fs_info
,
985 struct btrfs_space_info
*space_info
,
986 struct reserve_ticket
*ticket
)
988 struct btrfs_block_rsv
*global_rsv
= &fs_info
->global_block_rsv
;
/* The global reserve can only help tickets of its own space_info. */
994 if (global_rsv
->space_info
!= space_info
)
997 spin_lock(&global_rsv
->lock
);
/* Floor that must stay in the reserve: 10% of its size. */
998 min_bytes
= mult_perc(global_rsv
->size
, 10);
999 if (global_rsv
->reserved
< min_bytes
+ ticket
->bytes
) {
1000 spin_unlock(&global_rsv
->lock
);
1003 global_rsv
->reserved
-= ticket
->bytes
;
1004 remove_ticket(space_info
, ticket
);
1006 wake_up(&ticket
->wait
);
/* Record that a ticket was handled. */
1007 space_info
->tickets_id
++;
1008 if (global_rsv
->reserved
< global_rsv
->size
)
1009 global_rsv
->full
= 0;
1010 spin_unlock(&global_rsv
->lock
);
1016 * We've exhausted our flushing, start failing tickets.
1018 * @fs_info - fs_info for this fs
1019 * @space_info - the space info we were flushing
1021 * We call this when we've exhausted our flushing ability and haven't made
1022 * progress in satisfying tickets. The reservation code handles tickets in
1023 * order, so if there is a large ticket first and then smaller ones we could
1024 * very well satisfy the smaller tickets. This will attempt to wake up any
1025 * tickets in the list to catch this case.
1027 * This function returns true if it was able to make progress by clearing out
1028 * other tickets, or if it stumbles across a ticket that was smaller than the
1031 static bool maybe_fail_all_tickets(struct btrfs_fs_info
*fs_info
,
1032 struct btrfs_space_info
*space_info
)
1034 struct reserve_ticket
*ticket
;
1035 u64 tickets_id
= space_info
->tickets_id
;
1036 const bool aborted
= BTRFS_FS_ERROR(fs_info
);
1038 trace_btrfs_fail_all_tickets(fs_info
, space_info
);
1040 if (btrfs_test_opt(fs_info
, ENOSPC_DEBUG
)) {
1041 btrfs_info(fs_info
, "cannot satisfy tickets, dumping space info");
1042 __btrfs_dump_space_info(fs_info
, space_info
);
1045 while (!list_empty(&space_info
->tickets
) &&
1046 tickets_id
== space_info
->tickets_id
) {
1047 ticket
= list_first_entry(&space_info
->tickets
,
1048 struct reserve_ticket
, list
);
1050 if (!aborted
&& steal_from_global_rsv(fs_info
, space_info
, ticket
))
1053 if (!aborted
&& btrfs_test_opt(fs_info
, ENOSPC_DEBUG
))
1054 btrfs_info(fs_info
, "failing ticket with %llu bytes",
1057 remove_ticket(space_info
, ticket
);
1059 ticket
->error
= -EIO
;
1061 ticket
->error
= -ENOSPC
;
1062 wake_up(&ticket
->wait
);
1065 * We're just throwing tickets away, so more flushing may not
1066 * trip over btrfs_try_granting_tickets, so we need to call it
1067 * here to see if we can make progress with the next ticket in
1071 btrfs_try_granting_tickets(fs_info
, space_info
);
1073 return (tickets_id
!= space_info
->tickets_id
);
1077 * This is for normal flushers, we can wait all goddamned day if we want to. We
1078 * will loop and continuously try to flush as long as we are making progress.
1079 * We count progress as clearing off tickets each time we have to loop.
1081 static void btrfs_async_reclaim_metadata_space(struct work_struct
*work
)
1083 struct btrfs_fs_info
*fs_info
;
1084 struct btrfs_space_info
*space_info
;
1086 enum btrfs_flush_state flush_state
;
1087 int commit_cycles
= 0;
1088 u64 last_tickets_id
;
1090 fs_info
= container_of(work
, struct btrfs_fs_info
, async_reclaim_work
);
1091 space_info
= btrfs_find_space_info(fs_info
, BTRFS_BLOCK_GROUP_METADATA
);
1093 spin_lock(&space_info
->lock
);
1094 to_reclaim
= btrfs_calc_reclaim_metadata_size(fs_info
, space_info
);
1096 space_info
->flush
= 0;
1097 spin_unlock(&space_info
->lock
);
1100 last_tickets_id
= space_info
->tickets_id
;
1101 spin_unlock(&space_info
->lock
);
1103 flush_state
= FLUSH_DELAYED_ITEMS_NR
;
1105 flush_space(fs_info
, space_info
, to_reclaim
, flush_state
, false);
1106 spin_lock(&space_info
->lock
);
1107 if (list_empty(&space_info
->tickets
)) {
1108 space_info
->flush
= 0;
1109 spin_unlock(&space_info
->lock
);
1112 to_reclaim
= btrfs_calc_reclaim_metadata_size(fs_info
,
1114 if (last_tickets_id
== space_info
->tickets_id
) {
1117 last_tickets_id
= space_info
->tickets_id
;
1118 flush_state
= FLUSH_DELAYED_ITEMS_NR
;
1124 * We do not want to empty the system of delalloc unless we're
1125 * under heavy pressure, so allow one trip through the flushing
1126 * logic before we start doing a FLUSH_DELALLOC_FULL.
1128 if (flush_state
== FLUSH_DELALLOC_FULL
&& !commit_cycles
)
1132 * We don't want to force a chunk allocation until we've tried
1133 * pretty hard to reclaim space. Think of the case where we
1134 * freed up a bunch of space and so have a lot of pinned space
1135 * to reclaim. We would rather use that than possibly create a
1136 * underutilized metadata chunk. So if this is our first run
1137 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
1138 * commit the transaction. If nothing has changed the next go
1139 * around then we can force a chunk allocation.
1141 if (flush_state
== ALLOC_CHUNK_FORCE
&& !commit_cycles
)
1144 if (flush_state
> COMMIT_TRANS
) {
1146 if (commit_cycles
> 2) {
1147 if (maybe_fail_all_tickets(fs_info
, space_info
)) {
1148 flush_state
= FLUSH_DELAYED_ITEMS_NR
;
1151 space_info
->flush
= 0;
1154 flush_state
= FLUSH_DELAYED_ITEMS_NR
;
1157 spin_unlock(&space_info
->lock
);
1158 } while (flush_state
<= COMMIT_TRANS
);
1162 * This handles pre-flushing of metadata space before we get to the point that
1163 * we need to start blocking threads on tickets. The logic here is different
1164 * from the other flush paths because it doesn't rely on tickets to tell us how
1165 * much we need to flush, instead it attempts to keep us below the 80% full
1166 * watermark of space by flushing whichever reservation pool is currently the
1169 static void btrfs_preempt_reclaim_metadata_space(struct work_struct
*work
)
1171 struct btrfs_fs_info
*fs_info
;
1172 struct btrfs_space_info
*space_info
;
1173 struct btrfs_block_rsv
*delayed_block_rsv
;
1174 struct btrfs_block_rsv
*delayed_refs_rsv
;
1175 struct btrfs_block_rsv
*global_rsv
;
1176 struct btrfs_block_rsv
*trans_rsv
;
1179 fs_info
= container_of(work
, struct btrfs_fs_info
,
1180 preempt_reclaim_work
);
1181 space_info
= btrfs_find_space_info(fs_info
, BTRFS_BLOCK_GROUP_METADATA
);
1182 delayed_block_rsv
= &fs_info
->delayed_block_rsv
;
1183 delayed_refs_rsv
= &fs_info
->delayed_refs_rsv
;
1184 global_rsv
= &fs_info
->global_block_rsv
;
1185 trans_rsv
= &fs_info
->trans_block_rsv
;
1187 spin_lock(&space_info
->lock
);
1188 while (need_preemptive_reclaim(fs_info
, space_info
)) {
1189 enum btrfs_flush_state flush
;
1190 u64 delalloc_size
= 0;
1191 u64 to_reclaim
, block_rsv_size
;
1192 const u64 global_rsv_size
= btrfs_block_rsv_reserved(global_rsv
);
1197 * We don't have a precise counter for the metadata being
1198 * reserved for delalloc, so we'll approximate it by subtracting
1199 * out the block rsv's space from the bytes_may_use. If that
1200 * amount is higher than the individual reserves, then we can
1201 * assume it's tied up in delalloc reservations.
1203 block_rsv_size
= global_rsv_size
+
1204 btrfs_block_rsv_reserved(delayed_block_rsv
) +
1205 btrfs_block_rsv_reserved(delayed_refs_rsv
) +
1206 btrfs_block_rsv_reserved(trans_rsv
);
1207 if (block_rsv_size
< space_info
->bytes_may_use
)
1208 delalloc_size
= space_info
->bytes_may_use
- block_rsv_size
;
1211 * We don't want to include the global_rsv in our calculation,
1212 * because that's space we can't touch. Subtract it from the
1213 * block_rsv_size for the next checks.
1215 block_rsv_size
-= global_rsv_size
;
1218 * We really want to avoid flushing delalloc too much, as it
1219 * could result in poor allocation patterns, so only flush it if
1220 * it's larger than the rest of the pools combined.
1222 if (delalloc_size
> block_rsv_size
) {
1223 to_reclaim
= delalloc_size
;
1224 flush
= FLUSH_DELALLOC
;
1225 } else if (space_info
->bytes_pinned
>
1226 (btrfs_block_rsv_reserved(delayed_block_rsv
) +
1227 btrfs_block_rsv_reserved(delayed_refs_rsv
))) {
1228 to_reclaim
= space_info
->bytes_pinned
;
1229 flush
= COMMIT_TRANS
;
1230 } else if (btrfs_block_rsv_reserved(delayed_block_rsv
) >
1231 btrfs_block_rsv_reserved(delayed_refs_rsv
)) {
1232 to_reclaim
= btrfs_block_rsv_reserved(delayed_block_rsv
);
1233 flush
= FLUSH_DELAYED_ITEMS_NR
;
1235 to_reclaim
= btrfs_block_rsv_reserved(delayed_refs_rsv
);
1236 flush
= FLUSH_DELAYED_REFS_NR
;
1239 spin_unlock(&space_info
->lock
);
1242 * We don't want to reclaim everything, just a portion, so scale
1243 * down the to_reclaim by 1/4. If it takes us down to 0,
1244 * reclaim 1 items worth.
1248 to_reclaim
= btrfs_calc_insert_metadata_size(fs_info
, 1);
1249 flush_space(fs_info
, space_info
, to_reclaim
, flush
, true);
1251 spin_lock(&space_info
->lock
);
1254 /* We only went through once, back off our clamping. */
1255 if (loops
== 1 && !space_info
->reclaim_size
)
1256 space_info
->clamp
= max(1, space_info
->clamp
- 1);
1257 trace_btrfs_done_preemptive_reclaim(fs_info
, space_info
);
1258 spin_unlock(&space_info
->lock
);
1262 * FLUSH_DELALLOC_WAIT:
1263 * Space is freed from flushing delalloc in one of two ways.
1265 * 1) compression is on and we allocate less space than we reserved
1266 * 2) we are overwriting existing space
1268 * For #1 that extra space is reclaimed as soon as the delalloc pages are
1269 * COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent
1270 * length to ->bytes_reserved, and subtracts the reserved space from
1273 * For #2 this is trickier. Once the ordered extent runs we will drop the
1274 * extent in the range we are overwriting, which creates a delayed ref for
1275 * that freed extent. This however is not reclaimed until the transaction
1276 * commits, thus the next stages.
1279 * If we are freeing inodes, we want to make sure all delayed iputs have
1280 * completed, because they could have been on an inode with i_nlink == 0, and
1281 * thus have been truncated and freed up space. But again this space is not
1282 * immediately reusable, it comes in the form of a delayed ref, which must be
1283 * run and then the transaction must be committed.
1286 * This is where we reclaim all of the pinned space generated by running the
1290 * For data we start with alloc chunk force, however we could have been full
1291 * before, and then the transaction commit could have freed new block groups,
1292 * so if we now have space to allocate do the force chunk allocation.
1294 static const enum btrfs_flush_state data_flush_states
[] = {
1295 FLUSH_DELALLOC_FULL
,
1301 static void btrfs_async_reclaim_data_space(struct work_struct
*work
)
1303 struct btrfs_fs_info
*fs_info
;
1304 struct btrfs_space_info
*space_info
;
1305 u64 last_tickets_id
;
1306 enum btrfs_flush_state flush_state
= 0;
1308 fs_info
= container_of(work
, struct btrfs_fs_info
, async_data_reclaim_work
);
1309 space_info
= fs_info
->data_sinfo
;
1311 spin_lock(&space_info
->lock
);
1312 if (list_empty(&space_info
->tickets
)) {
1313 space_info
->flush
= 0;
1314 spin_unlock(&space_info
->lock
);
1317 last_tickets_id
= space_info
->tickets_id
;
1318 spin_unlock(&space_info
->lock
);
1320 while (!space_info
->full
) {
1321 flush_space(fs_info
, space_info
, U64_MAX
, ALLOC_CHUNK_FORCE
, false);
1322 spin_lock(&space_info
->lock
);
1323 if (list_empty(&space_info
->tickets
)) {
1324 space_info
->flush
= 0;
1325 spin_unlock(&space_info
->lock
);
1329 /* Something happened, fail everything and bail. */
1330 if (BTRFS_FS_ERROR(fs_info
))
1332 last_tickets_id
= space_info
->tickets_id
;
1333 spin_unlock(&space_info
->lock
);
1336 while (flush_state
< ARRAY_SIZE(data_flush_states
)) {
1337 flush_space(fs_info
, space_info
, U64_MAX
,
1338 data_flush_states
[flush_state
], false);
1339 spin_lock(&space_info
->lock
);
1340 if (list_empty(&space_info
->tickets
)) {
1341 space_info
->flush
= 0;
1342 spin_unlock(&space_info
->lock
);
1346 if (last_tickets_id
== space_info
->tickets_id
) {
1349 last_tickets_id
= space_info
->tickets_id
;
1353 if (flush_state
>= ARRAY_SIZE(data_flush_states
)) {
1354 if (space_info
->full
) {
1355 if (maybe_fail_all_tickets(fs_info
, space_info
))
1358 space_info
->flush
= 0;
1363 /* Something happened, fail everything and bail. */
1364 if (BTRFS_FS_ERROR(fs_info
))
1368 spin_unlock(&space_info
->lock
);
1373 maybe_fail_all_tickets(fs_info
, space_info
);
1374 space_info
->flush
= 0;
1375 spin_unlock(&space_info
->lock
);
1378 void btrfs_init_async_reclaim_work(struct btrfs_fs_info
*fs_info
)
1380 INIT_WORK(&fs_info
->async_reclaim_work
, btrfs_async_reclaim_metadata_space
);
1381 INIT_WORK(&fs_info
->async_data_reclaim_work
, btrfs_async_reclaim_data_space
);
1382 INIT_WORK(&fs_info
->preempt_reclaim_work
,
1383 btrfs_preempt_reclaim_metadata_space
);
1386 static const enum btrfs_flush_state priority_flush_states
[] = {
1387 FLUSH_DELAYED_ITEMS_NR
,
1388 FLUSH_DELAYED_ITEMS
,
1392 static const enum btrfs_flush_state evict_flush_states
[] = {
1393 FLUSH_DELAYED_ITEMS_NR
,
1394 FLUSH_DELAYED_ITEMS
,
1395 FLUSH_DELAYED_REFS_NR
,
1398 FLUSH_DELALLOC_WAIT
,
1399 FLUSH_DELALLOC_FULL
,
1404 static void priority_reclaim_metadata_space(struct btrfs_fs_info
*fs_info
,
1405 struct btrfs_space_info
*space_info
,
1406 struct reserve_ticket
*ticket
,
1407 const enum btrfs_flush_state
*states
,
1411 int flush_state
= 0;
1413 spin_lock(&space_info
->lock
);
1414 to_reclaim
= btrfs_calc_reclaim_metadata_size(fs_info
, space_info
);
1416 * This is the priority reclaim path, so to_reclaim could be >0 still
1417 * because we may have only satisfied the priority tickets and still
1418 * left non priority tickets on the list. We would then have
1419 * to_reclaim but ->bytes == 0.
1421 if (ticket
->bytes
== 0) {
1422 spin_unlock(&space_info
->lock
);
1426 while (flush_state
< states_nr
) {
1427 spin_unlock(&space_info
->lock
);
1428 flush_space(fs_info
, space_info
, to_reclaim
, states
[flush_state
],
1431 spin_lock(&space_info
->lock
);
1432 if (ticket
->bytes
== 0) {
1433 spin_unlock(&space_info
->lock
);
1439 * Attempt to steal from the global rsv if we can, except if the fs was
1440 * turned into error mode due to a transaction abort when flushing space
1441 * above, in that case fail with the abort error instead of returning
1442 * success to the caller if we can steal from the global rsv - this is
1443 * just to have caller fail immeditelly instead of later when trying to
1444 * modify the fs, making it easier to debug -ENOSPC problems.
1446 if (BTRFS_FS_ERROR(fs_info
)) {
1447 ticket
->error
= BTRFS_FS_ERROR(fs_info
);
1448 remove_ticket(space_info
, ticket
);
1449 } else if (!steal_from_global_rsv(fs_info
, space_info
, ticket
)) {
1450 ticket
->error
= -ENOSPC
;
1451 remove_ticket(space_info
, ticket
);
1455 * We must run try_granting_tickets here because we could be a large
1456 * ticket in front of a smaller ticket that can now be satisfied with
1457 * the available space.
1459 btrfs_try_granting_tickets(fs_info
, space_info
);
1460 spin_unlock(&space_info
->lock
);
1463 static void priority_reclaim_data_space(struct btrfs_fs_info
*fs_info
,
1464 struct btrfs_space_info
*space_info
,
1465 struct reserve_ticket
*ticket
)
1467 spin_lock(&space_info
->lock
);
1469 /* We could have been granted before we got here. */
1470 if (ticket
->bytes
== 0) {
1471 spin_unlock(&space_info
->lock
);
1475 while (!space_info
->full
) {
1476 spin_unlock(&space_info
->lock
);
1477 flush_space(fs_info
, space_info
, U64_MAX
, ALLOC_CHUNK_FORCE
, false);
1478 spin_lock(&space_info
->lock
);
1479 if (ticket
->bytes
== 0) {
1480 spin_unlock(&space_info
->lock
);
1485 ticket
->error
= -ENOSPC
;
1486 remove_ticket(space_info
, ticket
);
1487 btrfs_try_granting_tickets(fs_info
, space_info
);
1488 spin_unlock(&space_info
->lock
);
1491 static void wait_reserve_ticket(struct btrfs_space_info
*space_info
,
1492 struct reserve_ticket
*ticket
)
1498 spin_lock(&space_info
->lock
);
1499 while (ticket
->bytes
> 0 && ticket
->error
== 0) {
1500 ret
= prepare_to_wait_event(&ticket
->wait
, &wait
, TASK_KILLABLE
);
1503 * Delete us from the list. After we unlock the space
1504 * info, we don't want the async reclaim job to reserve
1505 * space for this ticket. If that would happen, then the
1506 * ticket's task would not known that space was reserved
1507 * despite getting an error, resulting in a space leak
1508 * (bytes_may_use counter of our space_info).
1510 remove_ticket(space_info
, ticket
);
1511 ticket
->error
= -EINTR
;
1514 spin_unlock(&space_info
->lock
);
1518 finish_wait(&ticket
->wait
, &wait
);
1519 spin_lock(&space_info
->lock
);
1521 spin_unlock(&space_info
->lock
);
1525 * Do the appropriate flushing and waiting for a ticket.
1527 * @fs_info: the filesystem
1528 * @space_info: space info for the reservation
1529 * @ticket: ticket for the reservation
1530 * @start_ns: timestamp when the reservation started
1531 * @orig_bytes: amount of bytes originally reserved
1532 * @flush: how much we can flush
1534 * This does the work of figuring out how to flush for the ticket, waiting for
1535 * the reservation, and returning the appropriate error if there is one.
1537 static int handle_reserve_ticket(struct btrfs_fs_info
*fs_info
,
1538 struct btrfs_space_info
*space_info
,
1539 struct reserve_ticket
*ticket
,
1540 u64 start_ns
, u64 orig_bytes
,
1541 enum btrfs_reserve_flush_enum flush
)
1546 case BTRFS_RESERVE_FLUSH_DATA
:
1547 case BTRFS_RESERVE_FLUSH_ALL
:
1548 case BTRFS_RESERVE_FLUSH_ALL_STEAL
:
1549 wait_reserve_ticket(space_info
, ticket
);
1551 case BTRFS_RESERVE_FLUSH_LIMIT
:
1552 priority_reclaim_metadata_space(fs_info
, space_info
, ticket
,
1553 priority_flush_states
,
1554 ARRAY_SIZE(priority_flush_states
));
1556 case BTRFS_RESERVE_FLUSH_EVICT
:
1557 priority_reclaim_metadata_space(fs_info
, space_info
, ticket
,
1559 ARRAY_SIZE(evict_flush_states
));
1561 case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE
:
1562 priority_reclaim_data_space(fs_info
, space_info
, ticket
);
1569 ret
= ticket
->error
;
1570 ASSERT(list_empty(&ticket
->list
));
1572 * Check that we can't have an error set if the reservation succeeded,
1573 * as that would confuse tasks and lead them to error out without
1574 * releasing reserved space (if an error happens the expectation is that
1575 * space wasn't reserved at all).
1577 ASSERT(!(ticket
->bytes
== 0 && ticket
->error
));
1578 trace_btrfs_reserve_ticket(fs_info
, space_info
->flags
, orig_bytes
,
1579 start_ns
, flush
, ticket
->error
);
1584 * This returns true if this flush state will go through the ordinary flushing
1587 static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush
)
1589 return (flush
== BTRFS_RESERVE_FLUSH_ALL
) ||
1590 (flush
== BTRFS_RESERVE_FLUSH_ALL_STEAL
);
1593 static inline void maybe_clamp_preempt(struct btrfs_fs_info
*fs_info
,
1594 struct btrfs_space_info
*space_info
)
1596 u64 ordered
= percpu_counter_sum_positive(&fs_info
->ordered_bytes
);
1597 u64 delalloc
= percpu_counter_sum_positive(&fs_info
->delalloc_bytes
);
1600 * If we're heavy on ordered operations then clamping won't help us. We
1601 * need to clamp specifically to keep up with dirty'ing buffered
1602 * writers, because there's not a 1:1 correlation of writing delalloc
1603 * and freeing space, like there is with flushing delayed refs or
1604 * delayed nodes. If we're already more ordered than delalloc then
1605 * we're keeping up, otherwise we aren't and should probably clamp.
1607 if (ordered
< delalloc
)
1608 space_info
->clamp
= min(space_info
->clamp
+ 1, 8);
1611 static inline bool can_steal(enum btrfs_reserve_flush_enum flush
)
1613 return (flush
== BTRFS_RESERVE_FLUSH_ALL_STEAL
||
1614 flush
== BTRFS_RESERVE_FLUSH_EVICT
);
1618 * NO_FLUSH and FLUSH_EMERGENCY don't want to create a ticket, they just want to
1619 * fail as quickly as possible.
1621 static inline bool can_ticket(enum btrfs_reserve_flush_enum flush
)
1623 return (flush
!= BTRFS_RESERVE_NO_FLUSH
&&
1624 flush
!= BTRFS_RESERVE_FLUSH_EMERGENCY
);
1628 * Try to reserve bytes from the block_rsv's space.
1630 * @fs_info: the filesystem
1631 * @space_info: space info we want to allocate from
1632 * @orig_bytes: number of bytes we want
1633 * @flush: whether or not we can flush to make our reservation
1635 * This will reserve orig_bytes number of bytes from the space info associated
1636 * with the block_rsv. If there is not enough space it will make an attempt to
1637 * flush out space to make room. It will do this by flushing delalloc if
1638 * possible or committing the transaction. If flush is 0 then no attempts to
1639 * regain reservations will be made and this will fail if there is not enough
1642 static int __reserve_bytes(struct btrfs_fs_info
*fs_info
,
1643 struct btrfs_space_info
*space_info
, u64 orig_bytes
,
1644 enum btrfs_reserve_flush_enum flush
)
1646 struct work_struct
*async_work
;
1647 struct reserve_ticket ticket
;
1651 bool pending_tickets
;
1655 * If have a transaction handle (current->journal_info != NULL), then
1656 * the flush method can not be neither BTRFS_RESERVE_FLUSH_ALL* nor
1657 * BTRFS_RESERVE_FLUSH_EVICT, as we could deadlock because those
1658 * flushing methods can trigger transaction commits.
1660 if (current
->journal_info
) {
1661 /* One assert per line for easier debugging. */
1662 ASSERT(flush
!= BTRFS_RESERVE_FLUSH_ALL
);
1663 ASSERT(flush
!= BTRFS_RESERVE_FLUSH_ALL_STEAL
);
1664 ASSERT(flush
!= BTRFS_RESERVE_FLUSH_EVICT
);
1667 if (flush
== BTRFS_RESERVE_FLUSH_DATA
)
1668 async_work
= &fs_info
->async_data_reclaim_work
;
1670 async_work
= &fs_info
->async_reclaim_work
;
1672 spin_lock(&space_info
->lock
);
1673 used
= btrfs_space_info_used(space_info
, true);
1676 * We don't want NO_FLUSH allocations to jump everybody, they can
1677 * generally handle ENOSPC in a different way, so treat them the same as
1678 * normal flushers when it comes to skipping pending tickets.
1680 if (is_normal_flushing(flush
) || (flush
== BTRFS_RESERVE_NO_FLUSH
))
1681 pending_tickets
= !list_empty(&space_info
->tickets
) ||
1682 !list_empty(&space_info
->priority_tickets
);
1684 pending_tickets
= !list_empty(&space_info
->priority_tickets
);
1687 * Carry on if we have enough space (short-circuit) OR call
1688 * can_overcommit() to ensure we can overcommit to continue.
1690 if (!pending_tickets
&&
1691 ((used
+ orig_bytes
<= space_info
->total_bytes
) ||
1692 btrfs_can_overcommit(fs_info
, space_info
, orig_bytes
, flush
))) {
1693 btrfs_space_info_update_bytes_may_use(fs_info
, space_info
,
1699 * Things are dire, we need to make a reservation so we don't abort. We
1700 * will let this reservation go through as long as we have actual space
1701 * left to allocate for the block.
1703 if (ret
&& unlikely(flush
== BTRFS_RESERVE_FLUSH_EMERGENCY
)) {
1704 used
= btrfs_space_info_used(space_info
, false);
1705 if (used
+ orig_bytes
<= space_info
->total_bytes
) {
1706 btrfs_space_info_update_bytes_may_use(fs_info
, space_info
,
1713 * If we couldn't make a reservation then setup our reservation ticket
1714 * and kick the async worker if it's not already running.
1716 * If we are a priority flusher then we just need to add our ticket to
1717 * the list and we will do our own flushing further down.
1719 if (ret
&& can_ticket(flush
)) {
1720 ticket
.bytes
= orig_bytes
;
1722 space_info
->reclaim_size
+= ticket
.bytes
;
1723 init_waitqueue_head(&ticket
.wait
);
1724 ticket
.steal
= can_steal(flush
);
1725 if (trace_btrfs_reserve_ticket_enabled())
1726 start_ns
= ktime_get_ns();
1728 if (flush
== BTRFS_RESERVE_FLUSH_ALL
||
1729 flush
== BTRFS_RESERVE_FLUSH_ALL_STEAL
||
1730 flush
== BTRFS_RESERVE_FLUSH_DATA
) {
1731 list_add_tail(&ticket
.list
, &space_info
->tickets
);
1732 if (!space_info
->flush
) {
1734 * We were forced to add a reserve ticket, so
1735 * our preemptive flushing is unable to keep
1736 * up. Clamp down on the threshold for the
1737 * preemptive flushing in order to keep up with
1740 maybe_clamp_preempt(fs_info
, space_info
);
1742 space_info
->flush
= 1;
1743 trace_btrfs_trigger_flush(fs_info
,
1747 queue_work(system_unbound_wq
, async_work
);
1750 list_add_tail(&ticket
.list
,
1751 &space_info
->priority_tickets
);
1753 } else if (!ret
&& space_info
->flags
& BTRFS_BLOCK_GROUP_METADATA
) {
1755 * We will do the space reservation dance during log replay,
1756 * which means we won't have fs_info->fs_root set, so don't do
1757 * the async reclaim as we will panic.
1759 if (!test_bit(BTRFS_FS_LOG_RECOVERING
, &fs_info
->flags
) &&
1760 !work_busy(&fs_info
->preempt_reclaim_work
) &&
1761 need_preemptive_reclaim(fs_info
, space_info
)) {
1762 trace_btrfs_trigger_flush(fs_info
, space_info
->flags
,
1763 orig_bytes
, flush
, "preempt");
1764 queue_work(system_unbound_wq
,
1765 &fs_info
->preempt_reclaim_work
);
1768 spin_unlock(&space_info
->lock
);
1769 if (!ret
|| !can_ticket(flush
))
1772 return handle_reserve_ticket(fs_info
, space_info
, &ticket
, start_ns
,
1777 * Try to reserve metadata bytes from the block_rsv's space.
1779 * @fs_info: the filesystem
1780 * @space_info: the space_info we're allocating for
1781 * @orig_bytes: number of bytes we want
1782 * @flush: whether or not we can flush to make our reservation
1784 * This will reserve orig_bytes number of bytes from the space info associated
1785 * with the block_rsv. If there is not enough space it will make an attempt to
1786 * flush out space to make room. It will do this by flushing delalloc if
1787 * possible or committing the transaction. If flush is 0 then no attempts to
1788 * regain reservations will be made and this will fail if there is not enough
1791 int btrfs_reserve_metadata_bytes(struct btrfs_fs_info
*fs_info
,
1792 struct btrfs_space_info
*space_info
,
1794 enum btrfs_reserve_flush_enum flush
)
1798 ret
= __reserve_bytes(fs_info
, space_info
, orig_bytes
, flush
);
1799 if (ret
== -ENOSPC
) {
1800 trace_btrfs_space_reservation(fs_info
, "space_info:enospc",
1801 space_info
->flags
, orig_bytes
, 1);
1803 if (btrfs_test_opt(fs_info
, ENOSPC_DEBUG
))
1804 btrfs_dump_space_info(fs_info
, space_info
, orig_bytes
, 0);
1810 * Try to reserve data bytes for an allocation.
1812 * @fs_info: the filesystem
1813 * @bytes: number of bytes we need
1814 * @flush: how we are allowed to flush
1816 * This will reserve bytes from the data space info. If there is not enough
1817 * space then we will attempt to flush space as specified by flush.
1819 int btrfs_reserve_data_bytes(struct btrfs_fs_info
*fs_info
, u64 bytes
,
1820 enum btrfs_reserve_flush_enum flush
)
1822 struct btrfs_space_info
*data_sinfo
= fs_info
->data_sinfo
;
1825 ASSERT(flush
== BTRFS_RESERVE_FLUSH_DATA
||
1826 flush
== BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE
||
1827 flush
== BTRFS_RESERVE_NO_FLUSH
);
1828 ASSERT(!current
->journal_info
|| flush
!= BTRFS_RESERVE_FLUSH_DATA
);
1830 ret
= __reserve_bytes(fs_info
, data_sinfo
, bytes
, flush
);
1831 if (ret
== -ENOSPC
) {
1832 trace_btrfs_space_reservation(fs_info
, "space_info:enospc",
1833 data_sinfo
->flags
, bytes
, 1);
1834 if (btrfs_test_opt(fs_info
, ENOSPC_DEBUG
))
1835 btrfs_dump_space_info(fs_info
, data_sinfo
, bytes
, 0);
1840 /* Dump all the space infos when we abort a transaction due to ENOSPC. */
1841 __cold
void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info
*fs_info
)
1843 struct btrfs_space_info
*space_info
;
1845 btrfs_info(fs_info
, "dumping space info:");
1846 list_for_each_entry(space_info
, &fs_info
->space_info
, list
) {
1847 spin_lock(&space_info
->lock
);
1848 __btrfs_dump_space_info(fs_info
, space_info
);
1849 spin_unlock(&space_info
->lock
);
1851 dump_global_block_rsv(fs_info
);
1855 * Account the unused space of all the readonly block group in the space_info.
1856 * takes mirrors into account.
1858 u64
btrfs_account_ro_block_groups_free_space(struct btrfs_space_info
*sinfo
)
1860 struct btrfs_block_group
*block_group
;
1864 /* It's df, we don't care if it's racy */
1865 if (list_empty(&sinfo
->ro_bgs
))
1868 spin_lock(&sinfo
->lock
);
1869 list_for_each_entry(block_group
, &sinfo
->ro_bgs
, ro_list
) {
1870 spin_lock(&block_group
->lock
);
1872 if (!block_group
->ro
) {
1873 spin_unlock(&block_group
->lock
);
1877 factor
= btrfs_bg_type_to_factor(block_group
->flags
);
1878 free_bytes
+= (block_group
->length
-
1879 block_group
->used
) * factor
;
1881 spin_unlock(&block_group
->lock
);
1883 spin_unlock(&sinfo
->lock
);
1888 static u64
calc_pct_ratio(u64 x
, u64 y
)
1895 err
= check_mul_overflow(100, x
, &x
);
1897 goto lose_precision
;
1898 return div64_u64(x
, y
);
1908 * A reasonable buffer for unallocated space is 10 data block_groups.
1909 * If we claw this back repeatedly, we can still achieve efficient
1910 * utilization when near full, and not do too much reclaim while
1911 * always maintaining a solid buffer for workloads that quickly
1912 * allocate and pressure the unallocated space.
1914 static u64
calc_unalloc_target(struct btrfs_fs_info
*fs_info
)
1916 u64 chunk_sz
= calc_effective_data_chunk_size(fs_info
);
1918 return BTRFS_UNALLOC_BLOCK_GROUP_TARGET
* chunk_sz
;
1922 * The fundamental goal of automatic reclaim is to protect the filesystem's
1923 * unallocated space and thus minimize the probability of the filesystem going
1924 * read only when a metadata allocation failure causes a transaction abort.
1926 * However, relocations happen into the space_info's unused space, therefore
1927 * automatic reclaim must also back off as that space runs low. There is no
1928 * value in doing trivial "relocations" of re-writing the same block group
1931 * Furthermore, we want to avoid doing too much reclaim even if there are good
1932 * candidates. This is because the allocator is pretty good at filling up the
1933 * holes with writes. So we want to do just enough reclaim to try and stay
1934 * safe from running out of unallocated space but not be wasteful about it.
1936 * Therefore, the dynamic reclaim threshold is calculated as follows:
1937 * - calculate a target unallocated amount of 5 block group sized chunks
1938 * - ratchet up the intensity of reclaim depending on how far we are from
1939 * that target by using a formula of unalloc / target to set the threshold.
1941 * Typically with 10 block groups as the target, the discrete values this comes
1942 * out to are 0, 10, 20, ... , 80, 90, and 99.
1944 static int calc_dynamic_reclaim_threshold(const struct btrfs_space_info
*space_info
)
1946 struct btrfs_fs_info
*fs_info
= space_info
->fs_info
;
1947 u64 unalloc
= atomic64_read(&fs_info
->free_chunk_space
);
1948 u64 target
= calc_unalloc_target(fs_info
);
1949 u64 alloc
= space_info
->total_bytes
;
1950 u64 used
= btrfs_space_info_used(space_info
, false);
1951 u64 unused
= alloc
- used
;
1952 u64 want
= target
> unalloc
? target
- unalloc
: 0;
1953 u64 data_chunk_size
= calc_effective_data_chunk_size(fs_info
);
1955 /* If we have no unused space, don't bother, it won't work anyway. */
1956 if (unused
< data_chunk_size
)
1959 /* Cast to int is OK because want <= target. */
1960 return calc_pct_ratio(want
, target
);
1963 int btrfs_calc_reclaim_threshold(const struct btrfs_space_info
*space_info
)
1965 lockdep_assert_held(&space_info
->lock
);
1967 if (READ_ONCE(space_info
->dynamic_reclaim
))
1968 return calc_dynamic_reclaim_threshold(space_info
);
1969 return READ_ONCE(space_info
->bg_reclaim_threshold
);
1973 * Under "urgent" reclaim, we will reclaim even fresh block groups that have
1974 * recently seen successful allocations, as we are desperate to reclaim
1975 * whatever we can to avoid ENOSPC in a transaction leading to a readonly fs.
1977 static bool is_reclaim_urgent(struct btrfs_space_info
*space_info
)
1979 struct btrfs_fs_info
*fs_info
= space_info
->fs_info
;
1980 u64 unalloc
= atomic64_read(&fs_info
->free_chunk_space
);
1981 u64 data_chunk_size
= calc_effective_data_chunk_size(fs_info
);
1983 return unalloc
< data_chunk_size
;
1986 static void do_reclaim_sweep(struct btrfs_space_info
*space_info
, int raid
)
1988 struct btrfs_block_group
*bg
;
1990 bool try_again
= true;
1993 spin_lock(&space_info
->lock
);
1994 urgent
= is_reclaim_urgent(space_info
);
1995 thresh_pct
= btrfs_calc_reclaim_threshold(space_info
);
1996 spin_unlock(&space_info
->lock
);
1998 down_read(&space_info
->groups_sem
);
2000 list_for_each_entry(bg
, &space_info
->block_groups
[raid
], list
) {
2002 bool reclaim
= false;
2004 btrfs_get_block_group(bg
);
2005 spin_lock(&bg
->lock
);
2006 thresh
= mult_perc(bg
->length
, thresh_pct
);
2007 if (bg
->used
< thresh
&& bg
->reclaim_mark
) {
2012 spin_unlock(&bg
->lock
);
2014 btrfs_mark_bg_to_reclaim(bg
);
2015 btrfs_put_block_group(bg
);
2019 * In situations where we are very motivated to reclaim (low unalloc)
2020 * use two passes to make the reclaim mark check best effort.
2022 * If we have any staler groups, we don't touch the fresher ones, but if we
2023 * really need a block group, do take a fresh one.
2025 if (try_again
&& urgent
) {
2030 up_read(&space_info
->groups_sem
);
2033 void btrfs_space_info_update_reclaimable(struct btrfs_space_info
*space_info
, s64 bytes
)
2035 u64 chunk_sz
= calc_effective_data_chunk_size(space_info
->fs_info
);
2037 lockdep_assert_held(&space_info
->lock
);
2038 space_info
->reclaimable_bytes
+= bytes
;
2040 if (space_info
->reclaimable_bytes
>= chunk_sz
)
2041 btrfs_set_periodic_reclaim_ready(space_info
, true);
2044 void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info
*space_info
, bool ready
)
2046 lockdep_assert_held(&space_info
->lock
);
2047 if (!READ_ONCE(space_info
->periodic_reclaim
))
2049 if (ready
!= space_info
->periodic_reclaim_ready
) {
2050 space_info
->periodic_reclaim_ready
= ready
;
2052 space_info
->reclaimable_bytes
= 0;
2056 bool btrfs_should_periodic_reclaim(struct btrfs_space_info
*space_info
)
2060 if (space_info
->flags
& BTRFS_BLOCK_GROUP_SYSTEM
)
2062 if (!READ_ONCE(space_info
->periodic_reclaim
))
2065 spin_lock(&space_info
->lock
);
2066 ret
= space_info
->periodic_reclaim_ready
;
2067 btrfs_set_periodic_reclaim_ready(space_info
, false);
2068 spin_unlock(&space_info
->lock
);
2073 void btrfs_reclaim_sweep(const struct btrfs_fs_info
*fs_info
)
2076 struct btrfs_space_info
*space_info
;
2078 list_for_each_entry(space_info
, &fs_info
->space_info
, list
) {
2079 if (!btrfs_should_periodic_reclaim(space_info
))
2081 for (raid
= 0; raid
< BTRFS_NR_RAID_TYPES
; raid
++)
2082 do_reclaim_sweep(space_info
, raid
);