// SPDX-License-Identifier: GPL-2.0

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/sched/mm.h>
#include <linux/atomic.h>
#include <linux/vmalloc.h>
#include "rcu-string.h"
#include "block-group.h"
#include "dev-replace.h"
#include "space-info.h"
#include "accessors.h"

/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES		4096
/* Invalid allocation pointer value for missing devices */
#define WP_MISSING_DEV			((u64)-1)
/* Pseudo write pointer value for conventional zone */
#define WP_CONVENTIONAL			((u64)-2)

/*
 * Location of the first zone of superblock logging zone pairs.
 *
 * - primary superblock: 0B (zone 0)
 * - first copy: 512G (zone starting at that offset)
 * - second copy: 4T (zone starting at that offset)
 */
#define BTRFS_SB_LOG_PRIMARY_OFFSET	(0ULL)
#define BTRFS_SB_LOG_FIRST_OFFSET	(512ULL * SZ_1G)
#define BTRFS_SB_LOG_SECOND_OFFSET	(4096ULL * SZ_1G)

#define BTRFS_SB_LOG_FIRST_SHIFT	const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
#define BTRFS_SB_LOG_SECOND_SHIFT	const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)

/* Number of superblock log zones */
#define BTRFS_NR_SB_LOG_ZONES		2

/*
 * Minimum number of active zones we need:
 *
 * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors
 * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group
 * - 1 zone for tree-log dedicated block group
 * - 1 zone for relocation
 */
#define BTRFS_MIN_ACTIVE_ZONES		(BTRFS_SUPER_MIRROR_MAX + 5)

/*
 * Minimum / maximum supported zone size. Currently, SMR disks have a zone
 * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range.
 * We do not expect the zone size to become larger than 8GiB or smaller than
 * 4MiB in the near future.
 */
#define BTRFS_MAX_ZONE_SIZE		SZ_8G
#define BTRFS_MIN_ZONE_SIZE		SZ_4M

#define SUPER_INFO_SECTORS	((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)
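
/*
 * Note: with the 4KiB btrfs superblock (BTRFS_SUPER_INFO_SIZE) this works out
 * to 8 sectors of 512 bytes; sb_zone_is_full() below uses it to decide whether
 * another superblock copy still fits in front of a zone's capacity.
 */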
static void wait_eb_writebacks(struct btrfs_block_group *block_group);
static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written);
static inline bool sb_zone_is_full(const struct blk_zone *zone)
{
	return (zone->cond == BLK_ZONE_COND_FULL) ||
		(zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity);
}
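
/*
 * sb_zone_is_full() above treats a zone as unusable for superblock logging
 * either when the device reports it FULL or when fewer than SUPER_INFO_SECTORS
 * remain between its write pointer and its capacity.
 */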
static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
{
	struct blk_zone *zones = data;

	memcpy(&zones[idx], zone, sizeof(*zone));

	return 0;
}
static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
			    u64 *wp_ret)
{
	bool empty[BTRFS_NR_SB_LOG_ZONES];
	bool full[BTRFS_NR_SB_LOG_ZONES];
	sector_t sector;

	for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
		ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
		empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
		full[i] = sb_zone_is_full(&zones[i]);
	}

	/*
	 * Possible states of log buffer zones
	 *
	 *           Empty[0]  In use[0]  Full[0]
	 * Empty[1]         *          0        1
	 * In use[1]        x          x        1
	 * Full[1]          0          0        C
	 *
	 * Log position:
	 *   *: Special case, no superblock is written
	 *   0: Use write pointer of zones[0]
	 *   1: Use write pointer of zones[1]
	 *   C: Compare super blocks from zones[0] and zones[1], use the latest
	 *      one determined by generation
	 *   x: Unexpected state
	 */

	if (empty[0] && empty[1]) {
		/* Special case to distinguish no superblock to read */
		*wp_ret = zones[0].start << SECTOR_SHIFT;
		return -ENOENT;
	} else if (full[0] && full[1]) {
		/* Compare two super blocks */
		struct address_space *mapping = bdev->bd_mapping;
		struct page *page[BTRFS_NR_SB_LOG_ZONES];
		struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];

		for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
			u64 zone_end = (zones[i].start + zones[i].capacity) << SECTOR_SHIFT;
			u64 bytenr = ALIGN_DOWN(zone_end, BTRFS_SUPER_INFO_SIZE) -
				     BTRFS_SUPER_INFO_SIZE;

			page[i] = read_cache_page_gfp(mapping,
						      bytenr >> PAGE_SHIFT, GFP_NOFS);
			if (IS_ERR(page[i])) {
				if (i == 1)
					btrfs_release_disk_super(super[0]);
				return PTR_ERR(page[i]);
			}
			super[i] = page_address(page[i]);
		}

		if (btrfs_super_generation(super[0]) >
		    btrfs_super_generation(super[1]))
			sector = zones[1].start;
		else
			sector = zones[0].start;

		for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
			btrfs_release_disk_super(super[i]);
	} else if (!full[0] && (empty[1] || full[1])) {
		sector = zones[0].wp;
	} else if (full[0]) {
		sector = zones[1].wp;
	} else {
		return -EUCLEAN;
	}

	*wp_ret = sector << SECTOR_SHIFT;

	return 0;
}
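
/*
 * Illustrative example (numbers not taken from this file): with a 256MiB zone
 * size the zone size shift is 28, so sb_zone_number(28, 1) below evaluates to
 * 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - 28) = 2048, i.e. the zone beginning at
 * 2048 * 256MiB = 512GiB, which matches BTRFS_SB_LOG_FIRST_OFFSET.
 */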
/*
 * Get the first zone number of the superblock mirror
 */
static inline u32 sb_zone_number(int shift, int mirror)
{
	u64 zone = U64_MAX;

	ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
	switch (mirror) {
	case 0: zone = 0; break;
	case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
	case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
	}

	ASSERT(zone <= U32_MAX);

	return (u32)zone;
}
static inline sector_t zone_start_sector(u32 zone_number,
					 struct block_device *bdev)
{
	return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev));
}
static inline u64 zone_start_physical(u32 zone_number,
				      struct btrfs_zoned_device_info *zone_info)
{
	return (u64)zone_number << zone_info->zone_size_shift;
}
/*
 * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block
 * device into static sized chunks and fakes a conventional zone on each of
 * them.
 */
static int emulate_report_zones(struct btrfs_device *device, u64 pos,
				struct blk_zone *zones, unsigned int nr_zones)
{
	const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT;
	sector_t bdev_size = bdev_nr_sectors(device->bdev);
	unsigned int i;

	pos >>= SECTOR_SHIFT;
	for (i = 0; i < nr_zones; i++) {
		zones[i].start = i * zone_sectors + pos;
		zones[i].len = zone_sectors;
		zones[i].capacity = zone_sectors;
		zones[i].wp = zones[i].start + zone_sectors;
		zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL;
		zones[i].cond = BLK_ZONE_COND_NOT_WP;

		if (zones[i].wp >= bdev_size) {
			i++;
			break;
		}
	}

	return i;
}
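
/*
 * The emulated zones above advertise BLK_ZONE_COND_NOT_WP and a write pointer
 * at the zone end, so callers treat them exactly like conventional zones
 * reported by a real zoned block device.
 */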
static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
			       struct blk_zone *zones, unsigned int *nr_zones)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	u32 zno;
	int ret;

	if (!*nr_zones)
		return 0;

	if (!bdev_is_zoned(device->bdev)) {
		ret = emulate_report_zones(device, pos, zones, *nr_zones);
		*nr_zones = ret;
		return 0;
	}

	if (zinfo->zone_cache) {
		unsigned int i;

		ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
		zno = pos >> zinfo->zone_size_shift;
		/*
		 * We cannot report zones beyond the zone end, so cap *nr_zones
		 * at the end of the device.
		 */
		*nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno);

		for (i = 0; i < *nr_zones; i++) {
			struct blk_zone *zone_info;

			zone_info = &zinfo->zone_cache[zno + i];
			if (!zone_info->len)
				break;
		}

		if (i == *nr_zones) {
			/* Cache hit on all the zones */
			memcpy(zones, zinfo->zone_cache + zno,
			       sizeof(*zinfo->zone_cache) * *nr_zones);
			return 0;
		}
	}

	ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
				  copy_zone_info_cb, zones);
	if (ret < 0) {
		btrfs_err_in_rcu(device->fs_info,
				 "zoned: failed to read zone %llu on %s (devid %llu)",
				 pos, rcu_str_deref(device->name),
				 device->devid);
		return ret;
	}
	*nr_zones = ret;
	if (!ret)
		return -EIO;

	if (zinfo->zone_cache) {
		u32 zno = pos >> zinfo->zone_size_shift;

		memcpy(zinfo->zone_cache + zno, zones,
		       sizeof(*zinfo->zone_cache) * *nr_zones);
	}

	return 0;
}
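
/*
 * The zone cache consulted above is best effort: an entry with a zero length
 * has not been populated yet, in which case we fall back to
 * blkdev_report_zones() and refresh the cached range afterwards.
 */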
/* The emulated zone size is determined from the size of a device extent. */
static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
{
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_dev_extent *dext;
	int ret;

	key.objectid = 1;
	key.type = BTRFS_DEV_EXTENT_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_leaf(root, path);
		if (ret < 0)
			return ret;
		/* No dev extents at all? Not good */
		if (ret > 0)
			return -EUCLEAN;
	}

	leaf = path->nodes[0];
	dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
	fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
	return 0;
}
int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int ret = 0;

	/* fs_info->zone_size might not be set yet. Use the incompat flag here. */
	if (!btrfs_fs_incompat(fs_info, ZONED))
		return 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		/* We can skip reading of zone info for missing devices */
		if (!device->bdev)
			continue;

		ret = btrfs_get_dev_zone_info(device, true);
		if (ret)
			break;
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}
int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_zoned_device_info *zone_info = NULL;
	struct block_device *bdev = device->bdev;
	unsigned int max_active_zones;
	unsigned int nactive;
	sector_t nr_sectors;
	sector_t sector = 0;
	struct blk_zone *zones = NULL;
	unsigned int i, nreported = 0, nr_zones;
	sector_t zone_sectors;
	char *model, *emulated;
	int ret;

	/*
	 * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
	 * be set yet.
	 */
	if (!btrfs_fs_incompat(fs_info, ZONED))
		return 0;

	if (device->zone_info)
		return 0;

	zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
	if (!zone_info)
		return -ENOMEM;

	device->zone_info = zone_info;

	if (!bdev_is_zoned(bdev)) {
		if (!fs_info->zone_size) {
			ret = calculate_emulated_zone_size(fs_info);
			if (ret)
				goto out;
		}

		ASSERT(fs_info->zone_size);
		zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
	} else {
		zone_sectors = bdev_zone_sectors(bdev);
	}

	ASSERT(is_power_of_two_u64(zone_sectors));
	zone_info->zone_size = zone_sectors << SECTOR_SHIFT;

	/* We reject devices with a zone size larger than 8GiB */
	if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
		btrfs_err_in_rcu(fs_info,
		"zoned: %s: zone size %llu larger than supported maximum %llu",
				 rcu_str_deref(device->name),
				 zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
		ret = -EINVAL;
		goto out;
	} else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) {
		btrfs_err_in_rcu(fs_info,
		"zoned: %s: zone size %llu smaller than supported minimum %u",
				 rcu_str_deref(device->name),
				 zone_info->zone_size, BTRFS_MIN_ZONE_SIZE);
		ret = -EINVAL;
		goto out;
	}

	nr_sectors = bdev_nr_sectors(bdev);
	zone_info->zone_size_shift = ilog2(zone_info->zone_size);
	zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
	if (!IS_ALIGNED(nr_sectors, zone_sectors))
		zone_info->nr_zones++;

	max_active_zones = bdev_max_active_zones(bdev);
	if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
		btrfs_err_in_rcu(fs_info,
"zoned: %s: max active zones %u is too small, need at least %u active zones",
				 rcu_str_deref(device->name), max_active_zones,
				 BTRFS_MIN_ACTIVE_ZONES);
		ret = -EINVAL;
		goto out;
	}
	zone_info->max_active_zones = max_active_zones;

	zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->seq_zones) {
		ret = -ENOMEM;
		goto out;
	}

	zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->empty_zones) {
		ret = -ENOMEM;
		goto out;
	}

	zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->active_zones) {
		ret = -ENOMEM;
		goto out;
	}

	zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
	if (!zones) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * Enable zone cache only for a zoned device. On a non-zoned device, we
	 * fill the zone info with emulated CONVENTIONAL zones, so no need to
	 * use the cache.
	 */
	if (populate_cache && bdev_is_zoned(device->bdev)) {
		zone_info->zone_cache = vcalloc(zone_info->nr_zones,
						sizeof(struct blk_zone));
		if (!zone_info->zone_cache) {
			btrfs_err_in_rcu(device->fs_info,
				"zoned: failed to allocate zone cache for %s",
				rcu_str_deref(device->name));
			ret = -ENOMEM;
			goto out;
		}
	}

	/* Record the zone types and count the zones that are already active. */
	nactive = 0;
	while (sector < nr_sectors) {
		nr_zones = BTRFS_REPORT_NR_ZONES;
		ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
					  &nr_zones);
		if (ret)
			goto out;

		for (i = 0; i < nr_zones; i++) {
			if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
				__set_bit(nreported, zone_info->seq_zones);
			switch (zones[i].cond) {
			case BLK_ZONE_COND_EMPTY:
				__set_bit(nreported, zone_info->empty_zones);
				break;
			case BLK_ZONE_COND_IMP_OPEN:
			case BLK_ZONE_COND_EXP_OPEN:
			case BLK_ZONE_COND_CLOSED:
				__set_bit(nreported, zone_info->active_zones);
				nactive++;
				break;
			}
			nreported++;
		}
		sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
	}

	if (nreported != zone_info->nr_zones) {
		btrfs_err_in_rcu(device->fs_info,
				 "inconsistent number of zones on %s (%u/%u)",
				 rcu_str_deref(device->name), nreported,
				 zone_info->nr_zones);
		ret = -EIO;
		goto out;
	}

	if (max_active_zones) {
		if (nactive > max_active_zones) {
			btrfs_err_in_rcu(device->fs_info,
			"zoned: %u active zones on %s exceeds max_active_zones %u",
					 nactive, rcu_str_deref(device->name),
					 max_active_zones);
			ret = -EIO;
			goto out;
		}
		atomic_set(&zone_info->active_zones_left,
			   max_active_zones - nactive);
		set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags);
	}

	/* Validate superblock log */
	nr_zones = BTRFS_NR_SB_LOG_ZONES;
	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		u32 sb_zone;
		u64 sb_wp;
		int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;

		sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
		if (sb_zone + 1 >= zone_info->nr_zones)
			continue;

		ret = btrfs_get_dev_zones(device,
					  zone_start_physical(sb_zone, zone_info),
					  &zone_info->sb_zones[sb_pos],
					  &nr_zones);
		if (ret)
			goto out;

		if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
			btrfs_err_in_rcu(device->fs_info,
	"zoned: failed to read super block log zone info at devid %llu zone %u",
					 device->devid, sb_zone);
			ret = -EUCLEAN;
			goto out;
		}

		/*
		 * If zones[0] is conventional, always use the beginning of the
		 * zone to record superblock. No need to validate in that case.
		 */
		if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
		    BLK_ZONE_TYPE_CONVENTIONAL)
			continue;

		ret = sb_write_pointer(device->bdev,
				       &zone_info->sb_zones[sb_pos], &sb_wp);
		if (ret != -ENOENT && ret) {
			btrfs_err_in_rcu(device->fs_info,
			"zoned: super block log zone corrupted devid %llu zone %u",
					 device->devid, sb_zone);
			ret = -EUCLEAN;
			goto out;
		}
	}

	kvfree(zones);

	if (bdev_is_zoned(bdev)) {
		model = "host-managed zoned";
		emulated = "";
	} else {
		model = "regular";
		emulated = "emulated ";
	}

	btrfs_info_in_rcu(fs_info,
			  "%s block device %s, %u %szones of %llu bytes",
			  model, rcu_str_deref(device->name), zone_info->nr_zones,
			  emulated, zone_info->zone_size);

	return 0;

out:
	kvfree(zones);
	btrfs_destroy_dev_zone_info(device);
	return ret;
}
void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
{
	struct btrfs_zoned_device_info *zone_info = device->zone_info;

	if (!zone_info)
		return;

	bitmap_free(zone_info->active_zones);
	bitmap_free(zone_info->seq_zones);
	bitmap_free(zone_info->empty_zones);
	vfree(zone_info->zone_cache);
	kfree(zone_info);
	device->zone_info = NULL;
}
struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev)
{
	struct btrfs_zoned_device_info *zone_info;

	zone_info = kmemdup(orig_dev->zone_info, sizeof(*zone_info), GFP_KERNEL);
	if (!zone_info)
		return NULL;

	zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->seq_zones)
		goto out;

	bitmap_copy(zone_info->seq_zones, orig_dev->zone_info->seq_zones,
		    zone_info->nr_zones);

	zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->empty_zones)
		goto out;

	bitmap_copy(zone_info->empty_zones, orig_dev->zone_info->empty_zones,
		    zone_info->nr_zones);

	zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->active_zones)
		goto out;

	bitmap_copy(zone_info->active_zones, orig_dev->zone_info->active_zones,
		    zone_info->nr_zones);
	zone_info->zone_cache = NULL;

	return zone_info;

out:
	bitmap_free(zone_info->seq_zones);
	bitmap_free(zone_info->empty_zones);
	bitmap_free(zone_info->active_zones);
	kfree(zone_info);
	return NULL;
}
static int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone)
{
	unsigned int nr_zones = 1;
	int ret;

	ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
	if (ret != 0 || !nr_zones)
		return ret ? ret : -EIO;

	return 0;
}
static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info)
{
	struct btrfs_device *device;

	list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
		if (device->bdev && bdev_is_zoned(device->bdev)) {
			btrfs_err(fs_info,
				  "zoned: mode not enabled but zoned device found: %pg",
				  device->bdev);
			return -EINVAL;
		}
	}

	return 0;
}
int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
{
	struct queue_limits *lim = &fs_info->limits;
	struct btrfs_device *device;
	u64 zone_size = 0;
	int ret;

	/*
	 * Host-Managed devices can't be used without the ZONED flag. With the
	 * ZONED flag, all devices can be used, using zone emulation if required.
	 */
	if (!btrfs_fs_incompat(fs_info, ZONED))
		return btrfs_check_for_zoned_device(fs_info);

	blk_set_stacking_limits(lim);

	list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
		struct btrfs_zoned_device_info *zone_info = device->zone_info;

		if (!device->bdev)
			continue;

		if (!zone_size) {
			zone_size = zone_info->zone_size;
		} else if (zone_info->zone_size != zone_size) {
			btrfs_err(fs_info,
		"zoned: unequal block device zone sizes: have %llu found %llu",
				  zone_info->zone_size, zone_size);
			return -EINVAL;
		}

		/*
		 * With zone emulation, we can have a non-zoned device in zoned
		 * mode. In this case, we don't have a valid max zone append
		 * size.
		 */
		if (bdev_is_zoned(device->bdev))
			blk_stack_limits(lim, bdev_limits(device->bdev), 0);
	}

	ret = blk_validate_limits(lim);
	if (ret) {
		btrfs_err(fs_info, "zoned: failed to validate queue limits");
		return ret;
	}

	/*
	 * stripe_size is always aligned to BTRFS_STRIPE_LEN in
	 * btrfs_create_chunk(). Since we want stripe_len == zone_size,
	 * check the alignment here.
	 */
	if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
		btrfs_err(fs_info,
			  "zoned: zone size %llu not aligned to stripe %u",
			  zone_size, BTRFS_STRIPE_LEN);
		return -EINVAL;
	}

	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		btrfs_err(fs_info, "zoned: mixed block groups not supported");
		return -EINVAL;
	}

	fs_info->zone_size = zone_size;
	/*
	 * Also limit max_zone_append_size by max_segments * PAGE_SIZE.
	 * Technically, we can have multiple pages per segment. But, since
	 * we add the pages one by one to a bio, and cannot increase the
	 * metadata reservation even if it increases the number of extents, it
	 * is safe to stick with the limit.
	 */
	fs_info->max_zone_append_size = ALIGN_DOWN(
		min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT,
		     (u64)lim->max_sectors << SECTOR_SHIFT,
		     (u64)lim->max_segments << PAGE_SHIFT),
		fs_info->sectorsize);
	fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
	if (fs_info->max_zone_append_size < fs_info->max_extent_size)
		fs_info->max_extent_size = fs_info->max_zone_append_size;

	/*
	 * Check mount options here, because we might change fs_info->zoned
	 * from fs_info->zone_size.
	 */
	ret = btrfs_check_mountopts_zoned(fs_info, &fs_info->mount_opt);
	if (ret)
		return ret;

	btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
	return 0;
}
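
/*
 * Illustrative example (numbers not taken from this file): with
 * max_zone_append_sectors = 1024 (512KiB), max_sectors = 2560 (1280KiB) and
 * max_segments = 128 (512KiB worth of 4KiB pages), max_zone_append_size above
 * becomes 512KiB, then aligned down to the filesystem sector size.
 */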
int btrfs_check_mountopts_zoned(const struct btrfs_fs_info *info,
				unsigned long long *mount_opt)
{
	if (!btrfs_is_zoned(info))
		return 0;

	/*
	 * Space cache writing is not COWed. Disable that to avoid write errors
	 * in sequential zones.
	 */
	if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) {
		btrfs_err(info, "zoned: space cache v1 is not supported");
		return -EINVAL;
	}

	if (btrfs_raw_test_opt(*mount_opt, NODATACOW)) {
		btrfs_err(info, "zoned: NODATACOW not supported");
		return -EINVAL;
	}

	if (btrfs_raw_test_opt(*mount_opt, DISCARD_ASYNC)) {
		btrfs_info(info,
			   "zoned: async discard ignored and disabled for zoned mode");
		btrfs_clear_opt(*mount_opt, DISCARD_ASYNC);
	}

	return 0;
}
static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
			   int rw, u64 *bytenr_ret)
{
	u64 wp;
	int ret;

	if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
		*bytenr_ret = zones[0].start << SECTOR_SHIFT;
		return 0;
	}

	ret = sb_write_pointer(bdev, zones, &wp);
	if (ret != -ENOENT && ret < 0)
		return ret;

	if (rw == WRITE) {
		struct blk_zone *reset = NULL;

		if (wp == zones[0].start << SECTOR_SHIFT)
			reset = &zones[0];
		else if (wp == zones[1].start << SECTOR_SHIFT)
			reset = &zones[1];

		if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
			unsigned int nofs_flags;

			ASSERT(sb_zone_is_full(reset));

			nofs_flags = memalloc_nofs_save();
			ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
					       reset->start, reset->len);
			memalloc_nofs_restore(nofs_flags);
			if (ret)
				return ret;

			reset->cond = BLK_ZONE_COND_EMPTY;
			reset->wp = reset->start;
		}
	} else if (ret != -ENOENT) {
		/*
		 * For READ, we want the previous one. Move the write pointer to
		 * the end of a zone, if it is at the head of a zone.
		 */
		u64 zone_end = 0;

		if (wp == zones[0].start << SECTOR_SHIFT)
			zone_end = zones[1].start + zones[1].capacity;
		else if (wp == zones[1].start << SECTOR_SHIFT)
			zone_end = zones[0].start + zones[0].capacity;

		if (zone_end)
			wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT,
					BTRFS_SUPER_INFO_SIZE);

		wp -= BTRFS_SUPER_INFO_SIZE;
	}

	*bytenr_ret = wp;

	return 0;
}
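
/*
 * sb_log_location() above returns, for WRITE, the byte offset at which the
 * next superblock copy should be appended (resetting a fully written zone
 * first), and for READ the offset of the most recently written copy in the
 * zone pair.
 */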
int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
			       u64 *bytenr_ret)
{
	struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
	sector_t zone_sectors;
	u32 sb_zone;
	int ret;
	u8 zone_sectors_shift;
	sector_t nr_sectors;
	u32 nr_zones;

	if (!bdev_is_zoned(bdev)) {
		*bytenr_ret = btrfs_sb_offset(mirror);
		return 0;
	}

	ASSERT(rw == READ || rw == WRITE);

	zone_sectors = bdev_zone_sectors(bdev);
	if (!is_power_of_2(zone_sectors))
		return -EINVAL;
	zone_sectors_shift = ilog2(zone_sectors);
	nr_sectors = bdev_nr_sectors(bdev);
	nr_zones = nr_sectors >> zone_sectors_shift;

	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
	if (sb_zone + 1 >= nr_zones)
		return -ENOENT;

	ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev),
				  BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
				  zones);
	if (ret < 0)
		return ret;
	if (ret != BTRFS_NR_SB_LOG_ZONES)
		return -EIO;

	return sb_log_location(bdev, zones, rw, bytenr_ret);
}
int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
			  u64 *bytenr_ret)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	u32 zone_num;

	/*
	 * For a zoned filesystem on a non-zoned block device, use the same
	 * super block locations as a regular filesystem. Doing so, the super
	 * block can always be retrieved and the zoned flag of the volume
	 * detected from the super block information.
	 */
	if (!bdev_is_zoned(device->bdev)) {
		*bytenr_ret = btrfs_sb_offset(mirror);
		return 0;
	}

	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
	if (zone_num + 1 >= zinfo->nr_zones)
		return -ENOENT;

	return sb_log_location(device->bdev,
			       &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
			       rw, bytenr_ret);
}
922 static inline bool is_sb_log_zone(struct btrfs_zoned_device_info
*zinfo
,
930 zone_num
= sb_zone_number(zinfo
->zone_size_shift
, mirror
);
931 if (zone_num
+ 1 >= zinfo
->nr_zones
)
934 if (!test_bit(zone_num
, zinfo
->seq_zones
))
940 int btrfs_advance_sb_log(struct btrfs_device
*device
, int mirror
)
942 struct btrfs_zoned_device_info
*zinfo
= device
->zone_info
;
943 struct blk_zone
*zone
;
946 if (!is_sb_log_zone(zinfo
, mirror
))
949 zone
= &zinfo
->sb_zones
[BTRFS_NR_SB_LOG_ZONES
* mirror
];
950 for (i
= 0; i
< BTRFS_NR_SB_LOG_ZONES
; i
++) {
951 /* Advance the next zone */
952 if (zone
->cond
== BLK_ZONE_COND_FULL
) {
957 if (zone
->cond
== BLK_ZONE_COND_EMPTY
)
958 zone
->cond
= BLK_ZONE_COND_IMP_OPEN
;
960 zone
->wp
+= SUPER_INFO_SECTORS
;
962 if (sb_zone_is_full(zone
)) {
964 * No room left to write new superblock. Since
965 * superblock is written with REQ_SYNC, it is safe to
966 * finish the zone now.
968 * If the write pointer is exactly at the capacity,
969 * explicit ZONE_FINISH is not necessary.
971 if (zone
->wp
!= zone
->start
+ zone
->capacity
) {
972 unsigned int nofs_flags
;
975 nofs_flags
= memalloc_nofs_save();
976 ret
= blkdev_zone_mgmt(device
->bdev
,
977 REQ_OP_ZONE_FINISH
, zone
->start
,
979 memalloc_nofs_restore(nofs_flags
);
984 zone
->wp
= zone
->start
+ zone
->len
;
985 zone
->cond
= BLK_ZONE_COND_FULL
;
990 /* All the zones are FULL. Should not reach here. */
995 int btrfs_reset_sb_log_zones(struct block_device
*bdev
, int mirror
)
997 unsigned int nofs_flags
;
998 sector_t zone_sectors
;
1000 u8 zone_sectors_shift
;
1005 zone_sectors
= bdev_zone_sectors(bdev
);
1006 zone_sectors_shift
= ilog2(zone_sectors
);
1007 nr_sectors
= bdev_nr_sectors(bdev
);
1008 nr_zones
= nr_sectors
>> zone_sectors_shift
;
1010 sb_zone
= sb_zone_number(zone_sectors_shift
+ SECTOR_SHIFT
, mirror
);
1011 if (sb_zone
+ 1 >= nr_zones
)
1014 nofs_flags
= memalloc_nofs_save();
1015 ret
= blkdev_zone_mgmt(bdev
, REQ_OP_ZONE_RESET
,
1016 zone_start_sector(sb_zone
, bdev
),
1017 zone_sectors
* BTRFS_NR_SB_LOG_ZONES
);
1018 memalloc_nofs_restore(nofs_flags
);
1023 * Find allocatable zones within a given region.
1025 * @device: the device to allocate a region on
1026 * @hole_start: the position of the hole to allocate the region
1027 * @num_bytes: size of wanted region
1028 * @hole_end: the end of the hole
1029 * @return: position of allocatable zones
1031 * Allocatable region should not contain any superblock locations.
1033 u64
btrfs_find_allocatable_zones(struct btrfs_device
*device
, u64 hole_start
,
1034 u64 hole_end
, u64 num_bytes
)
1036 struct btrfs_zoned_device_info
*zinfo
= device
->zone_info
;
1037 const u8 shift
= zinfo
->zone_size_shift
;
1038 u64 nzones
= num_bytes
>> shift
;
1039 u64 pos
= hole_start
;
1044 ASSERT(IS_ALIGNED(hole_start
, zinfo
->zone_size
));
1045 ASSERT(IS_ALIGNED(num_bytes
, zinfo
->zone_size
));
1047 while (pos
< hole_end
) {
1048 begin
= pos
>> shift
;
1049 end
= begin
+ nzones
;
1051 if (end
> zinfo
->nr_zones
)
1054 /* Check if zones in the region are all empty */
1055 if (btrfs_dev_is_sequential(device
, pos
) &&
1056 !bitmap_test_range_all_set(zinfo
->empty_zones
, begin
, nzones
)) {
1057 pos
+= zinfo
->zone_size
;
1062 for (i
= 0; i
< BTRFS_SUPER_MIRROR_MAX
; i
++) {
1066 sb_zone
= sb_zone_number(shift
, i
);
1067 if (!(end
<= sb_zone
||
1068 sb_zone
+ BTRFS_NR_SB_LOG_ZONES
<= begin
)) {
1070 pos
= zone_start_physical(
1071 sb_zone
+ BTRFS_NR_SB_LOG_ZONES
, zinfo
);
1075 /* We also need to exclude regular superblock positions */
1076 sb_pos
= btrfs_sb_offset(i
);
1077 if (!(pos
+ num_bytes
<= sb_pos
||
1078 sb_pos
+ BTRFS_SUPER_INFO_SIZE
<= pos
)) {
1080 pos
= ALIGN(sb_pos
+ BTRFS_SUPER_INFO_SIZE
,
1092 static bool btrfs_dev_set_active_zone(struct btrfs_device
*device
, u64 pos
)
1094 struct btrfs_zoned_device_info
*zone_info
= device
->zone_info
;
1095 unsigned int zno
= (pos
>> zone_info
->zone_size_shift
);
1097 /* We can use any number of zones */
1098 if (zone_info
->max_active_zones
== 0)
1101 if (!test_bit(zno
, zone_info
->active_zones
)) {
1102 /* Active zone left? */
1103 if (atomic_dec_if_positive(&zone_info
->active_zones_left
) < 0)
1105 if (test_and_set_bit(zno
, zone_info
->active_zones
)) {
1106 /* Someone already set the bit */
1107 atomic_inc(&zone_info
->active_zones_left
);
1114 static void btrfs_dev_clear_active_zone(struct btrfs_device
*device
, u64 pos
)
1116 struct btrfs_zoned_device_info
*zone_info
= device
->zone_info
;
1117 unsigned int zno
= (pos
>> zone_info
->zone_size_shift
);
1119 /* We can use any number of zones */
1120 if (zone_info
->max_active_zones
== 0)
1123 if (test_and_clear_bit(zno
, zone_info
->active_zones
))
1124 atomic_inc(&zone_info
->active_zones_left
);
1127 int btrfs_reset_device_zone(struct btrfs_device
*device
, u64 physical
,
1128 u64 length
, u64
*bytes
)
1130 unsigned int nofs_flags
;
1134 nofs_flags
= memalloc_nofs_save();
1135 ret
= blkdev_zone_mgmt(device
->bdev
, REQ_OP_ZONE_RESET
,
1136 physical
>> SECTOR_SHIFT
, length
>> SECTOR_SHIFT
);
1137 memalloc_nofs_restore(nofs_flags
);
1143 btrfs_dev_set_zone_empty(device
, physical
);
1144 btrfs_dev_clear_active_zone(device
, physical
);
1145 physical
+= device
->zone_info
->zone_size
;
1146 length
-= device
->zone_info
->zone_size
;
1152 int btrfs_ensure_empty_zones(struct btrfs_device
*device
, u64 start
, u64 size
)
1154 struct btrfs_zoned_device_info
*zinfo
= device
->zone_info
;
1155 const u8 shift
= zinfo
->zone_size_shift
;
1156 unsigned long begin
= start
>> shift
;
1157 unsigned long nbits
= size
>> shift
;
1161 ASSERT(IS_ALIGNED(start
, zinfo
->zone_size
));
1162 ASSERT(IS_ALIGNED(size
, zinfo
->zone_size
));
1164 if (begin
+ nbits
> zinfo
->nr_zones
)
1167 /* All the zones are conventional */
1168 if (bitmap_test_range_all_zero(zinfo
->seq_zones
, begin
, nbits
))
1171 /* All the zones are sequential and empty */
1172 if (bitmap_test_range_all_set(zinfo
->seq_zones
, begin
, nbits
) &&
1173 bitmap_test_range_all_set(zinfo
->empty_zones
, begin
, nbits
))
1176 for (pos
= start
; pos
< start
+ size
; pos
+= zinfo
->zone_size
) {
1179 if (!btrfs_dev_is_sequential(device
, pos
) ||
1180 btrfs_dev_is_empty_zone(device
, pos
))
1183 /* Free regions should be empty */
1186 "zoned: resetting device %s (devid %llu) zone %llu for allocation",
1187 rcu_str_deref(device
->name
), device
->devid
, pos
>> shift
);
1190 ret
= btrfs_reset_device_zone(device
, pos
, zinfo
->zone_size
,
1200 * Calculate an allocation pointer from the extent allocation information
1201 * for a block group consist of conventional zones. It is pointed to the
1202 * end of the highest addressed extent in the block group as an allocation
1205 static int calculate_alloc_pointer(struct btrfs_block_group
*cache
,
1206 u64
*offset_ret
, bool new)
1208 struct btrfs_fs_info
*fs_info
= cache
->fs_info
;
1209 struct btrfs_root
*root
;
1210 BTRFS_PATH_AUTO_FREE(path
);
1211 struct btrfs_key key
;
1212 struct btrfs_key found_key
;
1217 * Avoid tree lookups for a new block group, there's no use for it.
1218 * It must always be 0.
1220 * Also, we have a lock chain of extent buffer lock -> chunk mutex.
1221 * For new a block group, this function is called from
1222 * btrfs_make_block_group() which is already taking the chunk mutex.
1223 * Thus, we cannot call calculate_alloc_pointer() which takes extent
1224 * buffer locks to avoid deadlock.
1231 path
= btrfs_alloc_path();
1235 key
.objectid
= cache
->start
+ cache
->length
;
1239 root
= btrfs_extent_root(fs_info
, key
.objectid
);
1240 ret
= btrfs_search_slot(NULL
, root
, &key
, path
, 0, 0);
1241 /* We should not find the exact match */
1247 ret
= btrfs_previous_extent_item(root
, path
, cache
->start
);
1256 btrfs_item_key_to_cpu(path
->nodes
[0], &found_key
, path
->slots
[0]);
1258 if (found_key
.type
== BTRFS_EXTENT_ITEM_KEY
)
1259 length
= found_key
.offset
;
1261 length
= fs_info
->nodesize
;
1263 if (!(found_key
.objectid
>= cache
->start
&&
1264 found_key
.objectid
+ length
<= cache
->start
+ cache
->length
)) {
1267 *offset_ret
= found_key
.objectid
+ length
- cache
->start
;
1277 static int btrfs_load_zone_info(struct btrfs_fs_info
*fs_info
, int zone_idx
,
1278 struct zone_info
*info
, unsigned long *active
,
1279 struct btrfs_chunk_map
*map
)
1281 struct btrfs_dev_replace
*dev_replace
= &fs_info
->dev_replace
;
1282 struct btrfs_device
*device
;
1283 int dev_replace_is_ongoing
= 0;
1284 unsigned int nofs_flag
;
1285 struct blk_zone zone
;
1288 info
->physical
= map
->stripes
[zone_idx
].physical
;
1290 down_read(&dev_replace
->rwsem
);
1291 device
= map
->stripes
[zone_idx
].dev
;
1293 if (!device
->bdev
) {
1294 up_read(&dev_replace
->rwsem
);
1295 info
->alloc_offset
= WP_MISSING_DEV
;
1299 /* Consider a zone as active if we can allow any number of active zones. */
1300 if (!device
->zone_info
->max_active_zones
)
1301 __set_bit(zone_idx
, active
);
1303 if (!btrfs_dev_is_sequential(device
, info
->physical
)) {
1304 up_read(&dev_replace
->rwsem
);
1305 info
->alloc_offset
= WP_CONVENTIONAL
;
1309 /* This zone will be used for allocation, so mark this zone non-empty. */
1310 btrfs_dev_clear_zone_empty(device
, info
->physical
);
1312 dev_replace_is_ongoing
= btrfs_dev_replace_is_ongoing(dev_replace
);
1313 if (dev_replace_is_ongoing
&& dev_replace
->tgtdev
!= NULL
)
1314 btrfs_dev_clear_zone_empty(dev_replace
->tgtdev
, info
->physical
);
1317 * The group is mapped to a sequential zone. Get the zone write pointer
1318 * to determine the allocation offset within the zone.
1320 WARN_ON(!IS_ALIGNED(info
->physical
, fs_info
->zone_size
));
1321 nofs_flag
= memalloc_nofs_save();
1322 ret
= btrfs_get_dev_zone(device
, info
->physical
, &zone
);
1323 memalloc_nofs_restore(nofs_flag
);
1325 up_read(&dev_replace
->rwsem
);
1326 if (ret
!= -EIO
&& ret
!= -EOPNOTSUPP
)
1328 info
->alloc_offset
= WP_MISSING_DEV
;
1332 if (zone
.type
== BLK_ZONE_TYPE_CONVENTIONAL
) {
1333 btrfs_err_in_rcu(fs_info
,
1334 "zoned: unexpected conventional zone %llu on device %s (devid %llu)",
1335 zone
.start
<< SECTOR_SHIFT
, rcu_str_deref(device
->name
),
1337 up_read(&dev_replace
->rwsem
);
1341 info
->capacity
= (zone
.capacity
<< SECTOR_SHIFT
);
1343 switch (zone
.cond
) {
1344 case BLK_ZONE_COND_OFFLINE
:
1345 case BLK_ZONE_COND_READONLY
:
1346 btrfs_err_in_rcu(fs_info
,
1347 "zoned: offline/readonly zone %llu on device %s (devid %llu)",
1348 (info
->physical
>> device
->zone_info
->zone_size_shift
),
1349 rcu_str_deref(device
->name
), device
->devid
);
1350 info
->alloc_offset
= WP_MISSING_DEV
;
1352 case BLK_ZONE_COND_EMPTY
:
1353 info
->alloc_offset
= 0;
1355 case BLK_ZONE_COND_FULL
:
1356 info
->alloc_offset
= info
->capacity
;
1359 /* Partially used zone. */
1360 info
->alloc_offset
= ((zone
.wp
- zone
.start
) << SECTOR_SHIFT
);
1361 __set_bit(zone_idx
, active
);
1365 up_read(&dev_replace
->rwsem
);
1370 static int btrfs_load_block_group_single(struct btrfs_block_group
*bg
,
1371 struct zone_info
*info
,
1372 unsigned long *active
)
1374 if (info
->alloc_offset
== WP_MISSING_DEV
) {
1375 btrfs_err(bg
->fs_info
,
1376 "zoned: cannot recover write pointer for zone %llu",
1381 bg
->alloc_offset
= info
->alloc_offset
;
1382 bg
->zone_capacity
= info
->capacity
;
1383 if (test_bit(0, active
))
1384 set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE
, &bg
->runtime_flags
);
1388 static int btrfs_load_block_group_dup(struct btrfs_block_group
*bg
,
1389 struct btrfs_chunk_map
*map
,
1390 struct zone_info
*zone_info
,
1391 unsigned long *active
)
1393 struct btrfs_fs_info
*fs_info
= bg
->fs_info
;
1395 if ((map
->type
& BTRFS_BLOCK_GROUP_DATA
) && !fs_info
->stripe_root
) {
1396 btrfs_err(fs_info
, "zoned: data DUP profile needs raid-stripe-tree");
1400 bg
->zone_capacity
= min_not_zero(zone_info
[0].capacity
, zone_info
[1].capacity
);
1402 if (zone_info
[0].alloc_offset
== WP_MISSING_DEV
) {
1403 btrfs_err(bg
->fs_info
,
1404 "zoned: cannot recover write pointer for zone %llu",
1405 zone_info
[0].physical
);
1408 if (zone_info
[1].alloc_offset
== WP_MISSING_DEV
) {
1409 btrfs_err(bg
->fs_info
,
1410 "zoned: cannot recover write pointer for zone %llu",
1411 zone_info
[1].physical
);
1414 if (zone_info
[0].alloc_offset
!= zone_info
[1].alloc_offset
) {
1415 btrfs_err(bg
->fs_info
,
1416 "zoned: write pointer offset mismatch of zones in DUP profile");
1420 if (test_bit(0, active
) != test_bit(1, active
)) {
1421 if (!btrfs_zone_activate(bg
))
1423 } else if (test_bit(0, active
)) {
1424 set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE
, &bg
->runtime_flags
);
1427 bg
->alloc_offset
= zone_info
[0].alloc_offset
;
1431 static int btrfs_load_block_group_raid1(struct btrfs_block_group
*bg
,
1432 struct btrfs_chunk_map
*map
,
1433 struct zone_info
*zone_info
,
1434 unsigned long *active
)
1436 struct btrfs_fs_info
*fs_info
= bg
->fs_info
;
1439 if ((map
->type
& BTRFS_BLOCK_GROUP_DATA
) && !fs_info
->stripe_root
) {
1440 btrfs_err(fs_info
, "zoned: data %s needs raid-stripe-tree",
1441 btrfs_bg_type_to_raid_name(map
->type
));
1445 /* In case a device is missing we have a cap of 0, so don't use it. */
1446 bg
->zone_capacity
= min_not_zero(zone_info
[0].capacity
, zone_info
[1].capacity
);
1448 for (i
= 0; i
< map
->num_stripes
; i
++) {
1449 if (zone_info
[i
].alloc_offset
== WP_MISSING_DEV
||
1450 zone_info
[i
].alloc_offset
== WP_CONVENTIONAL
)
1453 if ((zone_info
[0].alloc_offset
!= zone_info
[i
].alloc_offset
) &&
1454 !btrfs_test_opt(fs_info
, DEGRADED
)) {
1456 "zoned: write pointer offset mismatch of zones in %s profile",
1457 btrfs_bg_type_to_raid_name(map
->type
));
1460 if (test_bit(0, active
) != test_bit(i
, active
)) {
1461 if (!btrfs_test_opt(fs_info
, DEGRADED
) &&
1462 !btrfs_zone_activate(bg
)) {
1466 if (test_bit(0, active
))
1467 set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE
, &bg
->runtime_flags
);
1471 if (zone_info
[0].alloc_offset
!= WP_MISSING_DEV
)
1472 bg
->alloc_offset
= zone_info
[0].alloc_offset
;
1474 bg
->alloc_offset
= zone_info
[i
- 1].alloc_offset
;
1479 static int btrfs_load_block_group_raid0(struct btrfs_block_group
*bg
,
1480 struct btrfs_chunk_map
*map
,
1481 struct zone_info
*zone_info
,
1482 unsigned long *active
)
1484 struct btrfs_fs_info
*fs_info
= bg
->fs_info
;
1486 if ((map
->type
& BTRFS_BLOCK_GROUP_DATA
) && !fs_info
->stripe_root
) {
1487 btrfs_err(fs_info
, "zoned: data %s needs raid-stripe-tree",
1488 btrfs_bg_type_to_raid_name(map
->type
));
1492 for (int i
= 0; i
< map
->num_stripes
; i
++) {
1493 if (zone_info
[i
].alloc_offset
== WP_MISSING_DEV
||
1494 zone_info
[i
].alloc_offset
== WP_CONVENTIONAL
)
1497 if (test_bit(0, active
) != test_bit(i
, active
)) {
1498 if (!btrfs_zone_activate(bg
))
1501 if (test_bit(0, active
))
1502 set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE
, &bg
->runtime_flags
);
1504 bg
->zone_capacity
+= zone_info
[i
].capacity
;
1505 bg
->alloc_offset
+= zone_info
[i
].alloc_offset
;
1511 static int btrfs_load_block_group_raid10(struct btrfs_block_group
*bg
,
1512 struct btrfs_chunk_map
*map
,
1513 struct zone_info
*zone_info
,
1514 unsigned long *active
)
1516 struct btrfs_fs_info
*fs_info
= bg
->fs_info
;
1518 if ((map
->type
& BTRFS_BLOCK_GROUP_DATA
) && !fs_info
->stripe_root
) {
1519 btrfs_err(fs_info
, "zoned: data %s needs raid-stripe-tree",
1520 btrfs_bg_type_to_raid_name(map
->type
));
1524 for (int i
= 0; i
< map
->num_stripes
; i
++) {
1525 if (zone_info
[i
].alloc_offset
== WP_MISSING_DEV
||
1526 zone_info
[i
].alloc_offset
== WP_CONVENTIONAL
)
1529 if (test_bit(0, active
) != test_bit(i
, active
)) {
1530 if (!btrfs_zone_activate(bg
))
1533 if (test_bit(0, active
))
1534 set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE
, &bg
->runtime_flags
);
1537 if ((i
% map
->sub_stripes
) == 0) {
1538 bg
->zone_capacity
+= zone_info
[i
].capacity
;
1539 bg
->alloc_offset
+= zone_info
[i
].alloc_offset
;
1546 int btrfs_load_block_group_zone_info(struct btrfs_block_group
*cache
, bool new)
1548 struct btrfs_fs_info
*fs_info
= cache
->fs_info
;
1549 struct btrfs_chunk_map
*map
;
1550 u64 logical
= cache
->start
;
1551 u64 length
= cache
->length
;
1552 struct zone_info
*zone_info
= NULL
;
1555 unsigned long *active
= NULL
;
1557 u32 num_sequential
= 0, num_conventional
= 0;
1560 if (!btrfs_is_zoned(fs_info
))
1564 if (!IS_ALIGNED(length
, fs_info
->zone_size
)) {
1566 "zoned: block group %llu len %llu unaligned to zone size %llu",
1567 logical
, length
, fs_info
->zone_size
);
1571 map
= btrfs_find_chunk_map(fs_info
, logical
, length
);
1575 cache
->physical_map
= map
;
1577 zone_info
= kcalloc(map
->num_stripes
, sizeof(*zone_info
), GFP_NOFS
);
1583 active
= bitmap_zalloc(map
->num_stripes
, GFP_NOFS
);
1589 for (i
= 0; i
< map
->num_stripes
; i
++) {
1590 ret
= btrfs_load_zone_info(fs_info
, i
, &zone_info
[i
], active
, map
);
1594 if (zone_info
[i
].alloc_offset
== WP_CONVENTIONAL
)
1600 if (num_sequential
> 0)
1601 set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE
, &cache
->runtime_flags
);
1603 if (num_conventional
> 0) {
1604 /* Zone capacity is always zone size in emulation */
1605 cache
->zone_capacity
= cache
->length
;
1606 ret
= calculate_alloc_pointer(cache
, &last_alloc
, new);
1609 "zoned: failed to determine allocation offset of bg %llu",
1612 } else if (map
->num_stripes
== num_conventional
) {
1613 cache
->alloc_offset
= last_alloc
;
1614 set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE
, &cache
->runtime_flags
);
1619 profile
= map
->type
& BTRFS_BLOCK_GROUP_PROFILE_MASK
;
1621 case 0: /* single */
1622 ret
= btrfs_load_block_group_single(cache
, &zone_info
[0], active
);
1624 case BTRFS_BLOCK_GROUP_DUP
:
1625 ret
= btrfs_load_block_group_dup(cache
, map
, zone_info
, active
);
1627 case BTRFS_BLOCK_GROUP_RAID1
:
1628 case BTRFS_BLOCK_GROUP_RAID1C3
:
1629 case BTRFS_BLOCK_GROUP_RAID1C4
:
1630 ret
= btrfs_load_block_group_raid1(cache
, map
, zone_info
, active
);
1632 case BTRFS_BLOCK_GROUP_RAID0
:
1633 ret
= btrfs_load_block_group_raid0(cache
, map
, zone_info
, active
);
1635 case BTRFS_BLOCK_GROUP_RAID10
:
1636 ret
= btrfs_load_block_group_raid10(cache
, map
, zone_info
, active
);
1638 case BTRFS_BLOCK_GROUP_RAID5
:
1639 case BTRFS_BLOCK_GROUP_RAID6
:
1641 btrfs_err(fs_info
, "zoned: profile %s not yet supported",
1642 btrfs_bg_type_to_raid_name(map
->type
));
1647 if (ret
== -EIO
&& profile
!= 0 && profile
!= BTRFS_BLOCK_GROUP_RAID0
&&
1648 profile
!= BTRFS_BLOCK_GROUP_RAID10
) {
1650 * Detected broken write pointer. Make this block group
1651 * unallocatable by setting the allocation pointer at the end of
1652 * allocatable region. Relocating this block group will fix the
1655 * Currently, we cannot handle RAID0 or RAID10 case like this
1656 * because we don't have a proper zone_capacity value. But,
1657 * reading from this block group won't work anyway by a missing
1660 cache
->alloc_offset
= cache
->zone_capacity
;
1665 /* Reject non SINGLE data profiles without RST */
1666 if ((map
->type
& BTRFS_BLOCK_GROUP_DATA
) &&
1667 (map
->type
& BTRFS_BLOCK_GROUP_PROFILE_MASK
) &&
1668 !fs_info
->stripe_root
) {
1669 btrfs_err(fs_info
, "zoned: data %s needs raid-stripe-tree",
1670 btrfs_bg_type_to_raid_name(map
->type
));
1674 if (cache
->alloc_offset
> cache
->zone_capacity
) {
1676 "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
1677 cache
->alloc_offset
, cache
->zone_capacity
,
1682 /* An extent is allocated after the write pointer */
1683 if (!ret
&& num_conventional
&& last_alloc
> cache
->alloc_offset
) {
1685 "zoned: got wrong write pointer in BG %llu: %llu > %llu",
1686 logical
, last_alloc
, cache
->alloc_offset
);
1691 cache
->meta_write_pointer
= cache
->alloc_offset
+ cache
->start
;
1692 if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE
, &cache
->runtime_flags
)) {
1693 btrfs_get_block_group(cache
);
1694 spin_lock(&fs_info
->zone_active_bgs_lock
);
1695 list_add_tail(&cache
->active_bg_list
,
1696 &fs_info
->zone_active_bgs
);
1697 spin_unlock(&fs_info
->zone_active_bgs_lock
);
1700 btrfs_free_chunk_map(cache
->physical_map
);
1701 cache
->physical_map
= NULL
;
1703 bitmap_free(active
);
1709 void btrfs_calc_zone_unusable(struct btrfs_block_group
*cache
)
1713 if (!btrfs_is_zoned(cache
->fs_info
))
1716 WARN_ON(cache
->bytes_super
!= 0);
1717 unusable
= (cache
->alloc_offset
- cache
->used
) +
1718 (cache
->length
- cache
->zone_capacity
);
1719 free
= cache
->zone_capacity
- cache
->alloc_offset
;
1721 /* We only need ->free_space in ALLOC_SEQ block groups */
1722 cache
->cached
= BTRFS_CACHE_FINISHED
;
1723 cache
->free_space_ctl
->free_space
= free
;
1724 cache
->zone_unusable
= unusable
;
1727 bool btrfs_use_zone_append(struct btrfs_bio
*bbio
)
1729 u64 start
= (bbio
->bio
.bi_iter
.bi_sector
<< SECTOR_SHIFT
);
1730 struct btrfs_inode
*inode
= bbio
->inode
;
1731 struct btrfs_fs_info
*fs_info
= bbio
->fs_info
;
1732 struct btrfs_block_group
*cache
;
1735 if (!btrfs_is_zoned(fs_info
))
1738 if (!inode
|| !is_data_inode(inode
))
1741 if (btrfs_op(&bbio
->bio
) != BTRFS_MAP_WRITE
)
1745 * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the
1746 * extent layout the relocation code has.
1747 * Furthermore we have set aside own block-group from which only the
1748 * relocation "process" can allocate and make sure only one process at a
1749 * time can add pages to an extent that gets relocated, so it's safe to
1750 * use regular REQ_OP_WRITE for this special case.
1752 if (btrfs_is_data_reloc_root(inode
->root
))
1755 cache
= btrfs_lookup_block_group(fs_info
, start
);
1760 ret
= !!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE
, &cache
->runtime_flags
);
1761 btrfs_put_block_group(cache
);
1766 void btrfs_record_physical_zoned(struct btrfs_bio
*bbio
)
1768 const u64 physical
= bbio
->bio
.bi_iter
.bi_sector
<< SECTOR_SHIFT
;
1769 struct btrfs_ordered_sum
*sum
= bbio
->sums
;
1771 if (physical
< bbio
->orig_physical
)
1772 sum
->logical
-= bbio
->orig_physical
- physical
;
1774 sum
->logical
+= physical
- bbio
->orig_physical
;
1777 static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent
*ordered
,
1780 struct extent_map_tree
*em_tree
= &ordered
->inode
->extent_tree
;
1781 struct extent_map
*em
;
1783 ordered
->disk_bytenr
= logical
;
1785 write_lock(&em_tree
->lock
);
1786 em
= search_extent_mapping(em_tree
, ordered
->file_offset
,
1787 ordered
->num_bytes
);
1788 /* The em should be a new COW extent, thus it should not have an offset. */
1789 ASSERT(em
->offset
== 0);
1790 em
->disk_bytenr
= logical
;
1791 free_extent_map(em
);
1792 write_unlock(&em_tree
->lock
);
1795 static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent
*ordered
,
1796 u64 logical
, u64 len
)
1798 struct btrfs_ordered_extent
*new;
1800 if (!test_bit(BTRFS_ORDERED_NOCOW
, &ordered
->flags
) &&
1801 split_extent_map(ordered
->inode
, ordered
->file_offset
,
1802 ordered
->num_bytes
, len
, logical
))
1805 new = btrfs_split_ordered_extent(ordered
, len
);
1808 new->disk_bytenr
= logical
;
1809 btrfs_finish_one_ordered(new);
1813 void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent
*ordered
)
1815 struct btrfs_inode
*inode
= ordered
->inode
;
1816 struct btrfs_fs_info
*fs_info
= inode
->root
->fs_info
;
1817 struct btrfs_ordered_sum
*sum
;
1821 * Write to pre-allocated region is for the data relocation, and so
1822 * it should use WRITE operation. No split/rewrite are necessary.
1824 if (test_bit(BTRFS_ORDERED_PREALLOC
, &ordered
->flags
))
1827 ASSERT(!list_empty(&ordered
->list
));
1828 /* The ordered->list can be empty in the above pre-alloc case. */
1829 sum
= list_first_entry(&ordered
->list
, struct btrfs_ordered_sum
, list
);
1830 logical
= sum
->logical
;
1833 while (len
< ordered
->disk_num_bytes
) {
1834 sum
= list_next_entry(sum
, list
);
1835 if (sum
->logical
== logical
+ len
) {
1839 if (!btrfs_zoned_split_ordered(ordered
, logical
, len
)) {
1840 set_bit(BTRFS_ORDERED_IOERR
, &ordered
->flags
);
1841 btrfs_err(fs_info
, "failed to split ordered extent");
1844 logical
= sum
->logical
;
1848 if (ordered
->disk_bytenr
!= logical
)
1849 btrfs_rewrite_logical_zoned(ordered
, logical
);
1853 * If we end up here for nodatasum I/O, the btrfs_ordered_sum structures
1854 * were allocated by btrfs_alloc_dummy_sum only to record the logical
1855 * addresses and don't contain actual checksums. We thus must free them
1856 * here so that we don't attempt to log the csums later.
1858 if ((inode
->flags
& BTRFS_INODE_NODATASUM
) ||
1859 test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS
, &fs_info
->fs_state
)) {
1860 while ((sum
= list_first_entry_or_null(&ordered
->list
,
1861 typeof(*sum
), list
))) {
1862 list_del(&sum
->list
);
1868 static bool check_bg_is_active(struct btrfs_eb_write_context
*ctx
,
1869 struct btrfs_block_group
**active_bg
)
1871 const struct writeback_control
*wbc
= ctx
->wbc
;
1872 struct btrfs_block_group
*block_group
= ctx
->zoned_bg
;
1873 struct btrfs_fs_info
*fs_info
= block_group
->fs_info
;
1875 if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE
, &block_group
->runtime_flags
))
1878 if (fs_info
->treelog_bg
== block_group
->start
) {
1879 if (!btrfs_zone_activate(block_group
)) {
1880 int ret_fin
= btrfs_zone_finish_one_bg(fs_info
);
1882 if (ret_fin
!= 1 || !btrfs_zone_activate(block_group
))
1885 } else if (*active_bg
!= block_group
) {
1886 struct btrfs_block_group
*tgt
= *active_bg
;
1888 /* zoned_meta_io_lock protects fs_info->active_{meta,system}_bg. */
1889 lockdep_assert_held(&fs_info
->zoned_meta_io_lock
);
1893 * If there is an unsent IO left in the allocated area,
1894 * we cannot wait for them as it may cause a deadlock.
1896 if (tgt
->meta_write_pointer
< tgt
->start
+ tgt
->alloc_offset
) {
1897 if (wbc
->sync_mode
== WB_SYNC_NONE
||
1898 (wbc
->sync_mode
== WB_SYNC_ALL
&& !wbc
->for_sync
))
1902 /* Pivot active metadata/system block group. */
1903 btrfs_zoned_meta_io_unlock(fs_info
);
1904 wait_eb_writebacks(tgt
);
1905 do_zone_finish(tgt
, true);
1906 btrfs_zoned_meta_io_lock(fs_info
);
1907 if (*active_bg
== tgt
) {
1908 btrfs_put_block_group(tgt
);
1912 if (!btrfs_zone_activate(block_group
))
1914 if (*active_bg
!= block_group
) {
1915 ASSERT(*active_bg
== NULL
);
1916 *active_bg
= block_group
;
1917 btrfs_get_block_group(block_group
);
1925 * Check if @ctx->eb is aligned to the write pointer.
1928 * 0: @ctx->eb is at the write pointer. You can write it.
1929 * -EAGAIN: There is a hole. The caller should handle the case.
1930 * -EBUSY: There is a hole, but the caller can just bail out.
1932 int btrfs_check_meta_write_pointer(struct btrfs_fs_info
*fs_info
,
1933 struct btrfs_eb_write_context
*ctx
)
1935 const struct writeback_control
*wbc
= ctx
->wbc
;
1936 const struct extent_buffer
*eb
= ctx
->eb
;
1937 struct btrfs_block_group
*block_group
= ctx
->zoned_bg
;
1939 if (!btrfs_is_zoned(fs_info
))
1943 if (block_group
->start
> eb
->start
||
1944 block_group
->start
+ block_group
->length
<= eb
->start
) {
1945 btrfs_put_block_group(block_group
);
1947 ctx
->zoned_bg
= NULL
;
1952 block_group
= btrfs_lookup_block_group(fs_info
, eb
->start
);
1955 ctx
->zoned_bg
= block_group
;
1958 if (block_group
->meta_write_pointer
== eb
->start
) {
1959 struct btrfs_block_group
**tgt
;
1961 if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING
, &fs_info
->flags
))
1964 if (block_group
->flags
& BTRFS_BLOCK_GROUP_SYSTEM
)
1965 tgt
= &fs_info
->active_system_bg
;
1967 tgt
= &fs_info
->active_meta_bg
;
1968 if (check_bg_is_active(ctx
, tgt
))
1973 * Since we may release fs_info->zoned_meta_io_lock, someone can already
1974 * start writing this eb. In that case, we can just bail out.
1976 if (block_group
->meta_write_pointer
> eb
->start
)
1979 /* If for_sync, this hole will be filled with transaction commit. */
1980 if (wbc
->sync_mode
== WB_SYNC_ALL
&& !wbc
->for_sync
)
1985 int btrfs_zoned_issue_zeroout(struct btrfs_device
*device
, u64 physical
, u64 length
)
1987 if (!btrfs_dev_is_sequential(device
, physical
))
1990 return blkdev_issue_zeroout(device
->bdev
, physical
>> SECTOR_SHIFT
,
1991 length
>> SECTOR_SHIFT
, GFP_NOFS
, 0);
1994 static int read_zone_info(struct btrfs_fs_info
*fs_info
, u64 logical
,
1995 struct blk_zone
*zone
)
1997 struct btrfs_io_context
*bioc
= NULL
;
1998 u64 mapped_length
= PAGE_SIZE
;
1999 unsigned int nofs_flag
;
2003 ret
= btrfs_map_block(fs_info
, BTRFS_MAP_GET_READ_MIRRORS
, logical
,
2004 &mapped_length
, &bioc
, NULL
, NULL
);
2005 if (ret
|| !bioc
|| mapped_length
< PAGE_SIZE
) {
2010 if (bioc
->map_type
& BTRFS_BLOCK_GROUP_RAID56_MASK
) {
2015 nofs_flag
= memalloc_nofs_save();
2016 nmirrors
= (int)bioc
->num_stripes
;
2017 for (i
= 0; i
< nmirrors
; i
++) {
2018 u64 physical
= bioc
->stripes
[i
].physical
;
2019 struct btrfs_device
*dev
= bioc
->stripes
[i
].dev
;
2021 /* Missing device */
2025 ret
= btrfs_get_dev_zone(dev
, physical
, zone
);
2026 /* Failing device */
2027 if (ret
== -EIO
|| ret
== -EOPNOTSUPP
)
2031 memalloc_nofs_restore(nofs_flag
);
2033 btrfs_put_bioc(bioc
);
2038 * Synchronize write pointer in a zone at @physical_start on @tgt_dev, by
2039 * filling zeros between @physical_pos to a write pointer of dev-replace
2042 int btrfs_sync_zone_write_pointer(struct btrfs_device
*tgt_dev
, u64 logical
,
2043 u64 physical_start
, u64 physical_pos
)
2045 struct btrfs_fs_info
*fs_info
= tgt_dev
->fs_info
;
2046 struct blk_zone zone
;
2051 if (!btrfs_dev_is_sequential(tgt_dev
, physical_pos
))
2054 ret
= read_zone_info(fs_info
, logical
, &zone
);
2058 wp
= physical_start
+ ((zone
.wp
- zone
.start
) << SECTOR_SHIFT
);
2060 if (physical_pos
== wp
)
2063 if (physical_pos
> wp
)
2066 length
= wp
- physical_pos
;
2067 return btrfs_zoned_issue_zeroout(tgt_dev
, physical_pos
, length
);
2071 * Activate block group and underlying device zones
2073 * @block_group: the block group to activate
2075 * Return: true on success, false otherwise
2077 bool btrfs_zone_activate(struct btrfs_block_group
*block_group
)
2079 struct btrfs_fs_info
*fs_info
= block_group
->fs_info
;
2080 struct btrfs_chunk_map
*map
;
2081 struct btrfs_device
*device
;
2083 const bool is_data
= (block_group
->flags
& BTRFS_BLOCK_GROUP_DATA
);
2087 if (!btrfs_is_zoned(block_group
->fs_info
))
2090 map
= block_group
->physical_map
;
2092 spin_lock(&fs_info
->zone_active_bgs_lock
);
2093 spin_lock(&block_group
->lock
);
2094 if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE
, &block_group
->runtime_flags
)) {
2100 if (btrfs_zoned_bg_is_full(block_group
)) {
2105 for (i
= 0; i
< map
->num_stripes
; i
++) {
2106 struct btrfs_zoned_device_info
*zinfo
;
2109 device
= map
->stripes
[i
].dev
;
2110 physical
= map
->stripes
[i
].physical
;
2111 zinfo
= device
->zone_info
;
2113 if (zinfo
->max_active_zones
== 0)
2117 reserved
= zinfo
->reserved_active_zones
;
2119 * For the data block group, leave active zones for one
2120 * metadata block group and one system block group.
2122 if (atomic_read(&zinfo
->active_zones_left
) <= reserved
) {
2127 if (!btrfs_dev_set_active_zone(device
, physical
)) {
2128 /* Cannot activate the zone */
2133 zinfo
->reserved_active_zones
--;
2136 /* Successfully activated all the zones */
2137 set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE
, &block_group
->runtime_flags
);
2138 spin_unlock(&block_group
->lock
);
2140 /* For the active block group list */
2141 btrfs_get_block_group(block_group
);
2142 list_add_tail(&block_group
->active_bg_list
, &fs_info
->zone_active_bgs
);
2143 spin_unlock(&fs_info
->zone_active_bgs_lock
);
2148 spin_unlock(&block_group
->lock
);
2149 spin_unlock(&fs_info
->zone_active_bgs_lock
);
2153 static void wait_eb_writebacks(struct btrfs_block_group
*block_group
)
2155 struct btrfs_fs_info
*fs_info
= block_group
->fs_info
;
2156 const u64 end
= block_group
->start
+ block_group
->length
;
2157 struct radix_tree_iter iter
;
2158 struct extent_buffer
*eb
;
2162 radix_tree_for_each_slot(slot
, &fs_info
->buffer_radix
, &iter
,
2163 block_group
->start
>> fs_info
->sectorsize_bits
) {
2164 eb
= radix_tree_deref_slot(slot
);
2167 if (radix_tree_deref_retry(eb
)) {
2168 slot
= radix_tree_iter_retry(&iter
);
2172 if (eb
->start
< block_group
->start
)
2174 if (eb
->start
>= end
)
2177 slot
= radix_tree_iter_resume(slot
, &iter
);
2179 wait_on_extent_buffer_writeback(eb
);
static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_chunk_map *map;
	const bool is_metadata = (block_group->flags &
				  (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM));
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	int ret = 0;
	int i;

	spin_lock(&block_group->lock);
	if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
		spin_unlock(&block_group->lock);
		return 0;
	}

	/* Check if we have unwritten allocated space */
	if (is_metadata &&
	    block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) {
		spin_unlock(&block_group->lock);
		return -EAGAIN;
	}

	/*
	 * If we are sure that the block group is full (= no more room left for
	 * new allocation) and the IO for the last usable block is completed, we
	 * don't need to wait for the other IOs. This holds because we ensure
	 * the sequential IO submissions using the ZONE_APPEND command for data
	 * and block_group->meta_write_pointer for metadata.
	 */
	if (!fully_written) {
		if (test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
			spin_unlock(&block_group->lock);
			return -EAGAIN;
		}
		spin_unlock(&block_group->lock);

		ret = btrfs_inc_block_group_ro(block_group, false);
		if (ret)
			return ret;

		/* Ensure all writes in this block group finish */
		btrfs_wait_block_group_reservations(block_group);
		/* No need to wait for NOCOW writers. Zoned mode does not allow that */
		btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group);
		/* Wait for extent buffers to be written. */
		if (is_metadata)
			wait_eb_writebacks(block_group);

		spin_lock(&block_group->lock);

		/*
		 * Bail out if someone already deactivated the block group, or
		 * allocated space is left in the block group.
		 */
		if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
			      &block_group->runtime_flags)) {
			spin_unlock(&block_group->lock);
			btrfs_dec_block_group_ro(block_group);
			return 0;
		}

		if (block_group->reserved ||
		    test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
			     &block_group->runtime_flags)) {
			spin_unlock(&block_group->lock);
			btrfs_dec_block_group_ro(block_group);
			return -EAGAIN;
		}
	}

	clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
	block_group->alloc_offset = block_group->zone_capacity;
	if (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))
		block_group->meta_write_pointer = block_group->start +
						  block_group->zone_capacity;
	block_group->free_space_ctl->free_space = 0;
	btrfs_clear_treelog_bg(block_group);
	btrfs_clear_data_reloc_bg(block_group);
	spin_unlock(&block_group->lock);

	down_read(&dev_replace->rwsem);
	map = block_group->physical_map;
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;
		const u64 physical = map->stripes[i].physical;
		struct btrfs_zoned_device_info *zinfo = device->zone_info;
		unsigned int nofs_flags;

		if (zinfo->max_active_zones == 0)
			continue;

		nofs_flags = memalloc_nofs_save();
		ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
				       physical >> SECTOR_SHIFT,
				       zinfo->zone_size >> SECTOR_SHIFT);
		memalloc_nofs_restore(nofs_flags);

		if (ret) {
			up_read(&dev_replace->rwsem);
			return ret;
		}

		if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
			zinfo->reserved_active_zones++;
		btrfs_dev_clear_active_zone(device, physical);
	}
	up_read(&dev_replace->rwsem);

	if (!fully_written)
		btrfs_dec_block_group_ro(block_group);

	spin_lock(&fs_info->zone_active_bgs_lock);
	ASSERT(!list_empty(&block_group->active_bg_list));
	list_del_init(&block_group->active_bg_list);
	spin_unlock(&fs_info->zone_active_bgs_lock);

	/* For active_bg_list */
	btrfs_put_block_group(block_group);

	clear_and_wake_up_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);

	return 0;
}

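/* Finish a block group's zones even if it is not yet fully written. */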
int btrfs_zone_finish(struct btrfs_block_group *block_group)
{
	if (!btrfs_is_zoned(block_group->fs_info))
		return 0;

	return do_zone_finish(block_group, false);
}

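/*
 * Check whether some device still has enough active zones left to activate a
 * block group with the given profile @flags. For data block groups, the zones
 * reserved for metadata/system use do not count. When no device qualifies,
 * BTRFS_FS_NEED_ZONE_FINISH is set to signal that a zone must be finished
 * before a new one can be activated.
 */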
bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
{
	struct btrfs_fs_info *fs_info = fs_devices->fs_info;
	struct btrfs_device *device;
	bool ret = false;

	if (!btrfs_is_zoned(fs_info))
		return true;

	/* Check if there is a device with active zones left */
	mutex_lock(&fs_info->chunk_mutex);
	spin_lock(&fs_info->zone_active_bgs_lock);
	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
		struct btrfs_zoned_device_info *zinfo = device->zone_info;
		int reserved = 0;

		if (!device->bdev)
			continue;

		if (!zinfo->max_active_zones) {
			ret = true;
			break;
		}

		if (flags & BTRFS_BLOCK_GROUP_DATA)
			reserved = zinfo->reserved_active_zones;

		switch (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
		case 0: /* single */
			ret = (atomic_read(&zinfo->active_zones_left) >= (1 + reserved));
			break;
		case BTRFS_BLOCK_GROUP_DUP:
			ret = (atomic_read(&zinfo->active_zones_left) >= (2 + reserved));
			break;
		}
		if (ret)
			break;
	}
	spin_unlock(&fs_info->zone_active_bgs_lock);
	mutex_unlock(&fs_info->chunk_mutex);

	if (!ret)
		set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);

	return ret;
}

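/*
 * Called on write completion for [@logical, @logical + @length). If the range
 * reaches the end of the block group's usable capacity, so that not even a
 * minimal allocation fits anymore, finish the block group right away.
 */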
void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
{
	struct btrfs_block_group *block_group;
	u64 min_alloc_bytes;

	if (!btrfs_is_zoned(fs_info))
		return;

	block_group = btrfs_lookup_block_group(fs_info, logical);
	ASSERT(block_group);

	/* No MIXED_BG on zoned btrfs. */
	if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
		min_alloc_bytes = fs_info->sectorsize;
	else
		min_alloc_bytes = fs_info->nodesize;

	/* Bail out if we can allocate more data from this block group. */
	if (logical + length + min_alloc_bytes <=
	    block_group->start + block_group->zone_capacity)
		goto out;

	do_zone_finish(block_group, true);

out:
	btrfs_put_block_group(block_group);
}

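/*
 * Work item: wait for the block group's last extent buffer to finish
 * writeback, then zone finish the block group.
 */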
static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
{
	struct btrfs_block_group *bg =
		container_of(work, struct btrfs_block_group, zone_finish_work);

	wait_on_extent_buffer_writeback(bg->last_eb);
	free_extent_buffer(bg->last_eb);
	btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);

	btrfs_put_block_group(bg);
}

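/*
 * If less than one tree block of zone capacity remains past @eb, schedule a
 * work item that waits for @eb's writeback and then finishes the block
 * group's zones.
 */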
void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
				   struct extent_buffer *eb)
{
	if (!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &bg->runtime_flags) ||
	    eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
		return;

	if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) {
		btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing",
			  bg->start);
		return;
	}

	/* For the work */
	btrfs_get_block_group(bg);
	atomic_inc(&eb->refs);
	bg->last_eb = eb;
	INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
	queue_work(system_unbound_wq, &bg->zone_finish_work);
}

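/* Forget the dedicated data relocation block group if it is this one. */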
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;

	spin_lock(&fs_info->relocation_bg_lock);
	if (fs_info->data_reloc_bg == bg->start)
		fs_info->data_reloc_bg = 0;
	spin_unlock(&fs_info->relocation_bg_lock);
}

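/* Free the zone report cache of every device. */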
void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;

	if (!btrfs_is_zoned(fs_info))
		return;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (device->zone_info) {
			vfree(device->zone_info->zone_cache);
			device->zone_info->zone_cache = NULL;
		}
	}
	mutex_unlock(&fs_devices->device_list_mutex);
}

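/*
 * Decide whether zoned reclaim should run: true once the used portion of all
 * devices reaches fs_info->bg_reclaim_threshold percent, false when the
 * threshold is 0 (reclaim disabled).
 */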
bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	u64 used = 0;
	u64 total = 0;
	u64 factor;

	ASSERT(btrfs_is_zoned(fs_info));

	if (fs_info->bg_reclaim_threshold == 0)
		return false;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (!device->bdev)
			continue;

		total += device->disk_total_bytes;
		used += device->bytes_used;
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	factor = div64_u64(used * 100, total);
	return factor >= fs_info->bg_reclaim_threshold;
}

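/*
 * Called when a relocation write to [@logical, @logical + @length) completed.
 * Once the written range reaches the block group's allocation offset, clear
 * the ZONED_DATA_RELOC flag so the block group is available for regular
 * allocations again.
 */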
void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
				       u64 length)
{
	struct btrfs_block_group *block_group;

	if (!btrfs_is_zoned(fs_info))
		return;

	block_group = btrfs_lookup_block_group(fs_info, logical);
	/* It should be called on a previous data relocation block group. */
	ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));

	spin_lock(&block_group->lock);
	if (!test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))
		goto out;

	/* All relocation extents are written. */
	if (block_group->start + block_group->alloc_offset == logical + length) {
		/*
		 * Now, release this block group for further allocations and
		 * zone finish it.
		 */
		clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
			  &block_group->runtime_flags);
	}

out:
	spin_unlock(&block_group->lock);
	btrfs_put_block_group(block_group);
}

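/*
 * Pick the active block group with the least unallocated zone capacity,
 * skipping SYSTEM and data relocation block groups, and finish it. Returns 1
 * if a block group was finished, 0 if none qualified, or a negative errno.
 */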
int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_group *block_group;
	struct btrfs_block_group *min_bg = NULL;
	u64 min_avail = U64_MAX;
	int ret;

	spin_lock(&fs_info->zone_active_bgs_lock);
	list_for_each_entry(block_group, &fs_info->zone_active_bgs,
			    active_bg_list) {
		u64 avail;

		spin_lock(&block_group->lock);
		if (block_group->reserved || block_group->alloc_offset == 0 ||
		    (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) ||
		    test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
			spin_unlock(&block_group->lock);
			continue;
		}

		avail = block_group->zone_capacity - block_group->alloc_offset;
		if (min_avail > avail) {
			if (min_bg)
				btrfs_put_block_group(min_bg);
			min_bg = block_group;
			min_avail = avail;
			btrfs_get_block_group(min_bg);
		}
		spin_unlock(&block_group->lock);
	}
	spin_unlock(&fs_info->zone_active_bgs_lock);

	if (!min_bg)
		return 0;

	ret = btrfs_zone_finish(min_bg);
	btrfs_put_block_group(min_bg);

	return ret < 0 ? ret : 1;
}

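/*
 * Try to activate one metadata or system block group of @space_info. If none
 * can be activated and @do_finish is set, finish the fullest active block
 * group to release an active zone, then retry. Returns 1 when a block group
 * was activated, 0 when nothing could be done, or a negative errno.
 */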
int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				bool do_finish)
{
	struct btrfs_block_group *bg;
	int index;

	if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
		return 0;

	for (;;) {
		int ret;
		bool need_finish = false;

		down_read(&space_info->groups_sem);
		for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) {
			list_for_each_entry(bg, &space_info->block_groups[index],
					    list) {
				if (!spin_trylock(&bg->lock))
					continue;
				if (btrfs_zoned_bg_is_full(bg) ||
				    test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
					     &bg->runtime_flags)) {
					spin_unlock(&bg->lock);
					continue;
				}
				spin_unlock(&bg->lock);

				if (btrfs_zone_activate(bg)) {
					up_read(&space_info->groups_sem);
					return 1;
				}

				need_finish = true;
			}
		}
		up_read(&space_info->groups_sem);

		if (!do_finish || !need_finish)
			break;

		ret = btrfs_zone_finish_one_bg(fs_info);
		if (ret == 0)
			break;
		if (ret < 0)
			return ret;
	}

	return 0;
}

/*
 * Reserve zones for one metadata block group, one tree-log block group, and one
 * system block group.
 */
void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_block_group *block_group;
	struct btrfs_device *device;
	/* Reserve zones for normal SINGLE metadata and tree-log block group. */
	unsigned int metadata_reserve = 2;
	/* Reserve a zone for SINGLE system block group. */
	unsigned int system_reserve = 1;

	if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags))
		return;

	/*
	 * This function is called from the mount context. So, there is no
	 * parallel process touching the bits. No need for read_seqretry().
	 */
	if (fs_info->avail_metadata_alloc_bits & BTRFS_BLOCK_GROUP_DUP)
		metadata_reserve = 4;
	if (fs_info->avail_system_alloc_bits & BTRFS_BLOCK_GROUP_DUP)
		system_reserve = 2;

	/* Apply the reservation on all the devices. */
	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (!device->bdev)
			continue;

		device->zone_info->reserved_active_zones =
			metadata_reserve + system_reserve;
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	/* Release reservation for currently active block groups. */
	spin_lock(&fs_info->zone_active_bgs_lock);
	list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
		struct btrfs_chunk_map *map = block_group->physical_map;

		if (!(block_group->flags &
		      (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)))
			continue;

		for (int i = 0; i < map->num_stripes; i++)
			map->stripes[i].dev->zone_info->reserved_active_zones--;
	}
	spin_unlock(&fs_info->zone_active_bgs_lock);
}