// SPDX-License-Identifier: GPL-2.0

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/sched/mm.h>
#include <linux/atomic.h>
#include <linux/vmalloc.h>
#include "rcu-string.h"
#include "block-group.h"
#include "dev-replace.h"
#include "space-info.h"
#include "accessors.h"

/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES   4096
/* Invalid allocation pointer value for missing devices */
#define WP_MISSING_DEV ((u64)-1)
/* Pseudo write pointer value for conventional zone */
#define WP_CONVENTIONAL ((u64)-2)

 * Location of the first zone of superblock logging zone pairs.
 * - primary superblock: 0B (zone 0)
 * - first copy: 512G (zone starting at that offset)
 * - second copy: 4T (zone starting at that offset)
#define BTRFS_SB_LOG_PRIMARY_OFFSET	(0ULL)
#define BTRFS_SB_LOG_FIRST_OFFSET	(512ULL * SZ_1G)
#define BTRFS_SB_LOG_SECOND_OFFSET	(4096ULL * SZ_1G)

#define BTRFS_SB_LOG_FIRST_SHIFT	const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
#define BTRFS_SB_LOG_SECOND_SHIFT	const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)

/* Number of superblock log zones */
#define BTRFS_NR_SB_LOG_ZONES 2

 * Minimum of active zones we need:
 * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors
 * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group
 * - 1 zone for tree-log dedicated block group
 * - 1 zone for relocation
#define BTRFS_MIN_ACTIVE_ZONES		(BTRFS_SUPER_MIRROR_MAX + 5)

 * Minimum / maximum supported zone size. Currently, SMR disks have a zone
 * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range.
 * We do not expect the zone size to become larger than 8GiB or smaller than
 * 4MiB in the near future.
#define BTRFS_MAX_ZONE_SIZE		SZ_8G
#define BTRFS_MIN_ZONE_SIZE		SZ_4M

#define SUPER_INFO_SECTORS	((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)

static void wait_eb_writebacks(struct btrfs_block_group *block_group);
static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written);
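
/*
 * A superblock log zone is considered full when it is in the FULL condition
 * or when its write pointer leaves no room for another whole superblock
 * (SUPER_INFO_SECTORS) before the zone capacity.
 */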
static inline bool sb_zone_is_full(const struct blk_zone *zone)
	return (zone->cond == BLK_ZONE_COND_FULL) ||
		(zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity);
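
/* Report-zones callback: copy each reported zone into the caller's array. */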
static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
	struct blk_zone *zones = data;

	memcpy(&zones[idx], zone, sizeof(*zone));
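
/*
 * Find the position for the next superblock write inside a pair of
 * superblock log zones, based on the empty/full state of both zones.
 * Returns -ENOENT when both zones are still empty, i.e. no superblock
 * has been written yet.
 */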
static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
	bool empty[BTRFS_NR_SB_LOG_ZONES];
	bool full[BTRFS_NR_SB_LOG_ZONES];

	for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
		ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
		empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
		full[i] = sb_zone_is_full(&zones[i]);

	 * Possible states of log buffer zones
	 *           Empty[0]  In use[0]  Full[0]
	 * *: Special case, no superblock is written
	 * 0: Use write pointer of zones[0]
	 * 1: Use write pointer of zones[1]
	 * C: Compare super blocks from zones[0] and zones[1], use the latest
	 *    one determined by generation

	if (empty[0] && empty[1]) {
		/* Special case to distinguish no superblock to read */
		*wp_ret = zones[0].start << SECTOR_SHIFT;
	} else if (full[0] && full[1]) {
		/* Compare two super blocks */
		struct address_space *mapping = bdev->bd_mapping;
		struct page *page[BTRFS_NR_SB_LOG_ZONES];
		struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];

		for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
			u64 zone_end = (zones[i].start + zones[i].capacity) << SECTOR_SHIFT;
			u64 bytenr = ALIGN_DOWN(zone_end, BTRFS_SUPER_INFO_SIZE) -
						BTRFS_SUPER_INFO_SIZE;

			page[i] = read_cache_page_gfp(mapping,
					bytenr >> PAGE_SHIFT, GFP_NOFS);
			if (IS_ERR(page[i])) {
					btrfs_release_disk_super(super[0]);
				return PTR_ERR(page[i]);
			super[i] = page_address(page[i]);

		if (btrfs_super_generation(super[0]) >
		    btrfs_super_generation(super[1]))
			sector = zones[1].start;
			sector = zones[0].start;

		for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
			btrfs_release_disk_super(super[i]);
	} else if (!full[0] && (empty[1] || full[1])) {
		sector = zones[0].wp;
	} else if (full[0]) {
		sector = zones[1].wp;

	*wp_ret = sector << SECTOR_SHIFT;

 * Get the first zone number of the superblock mirror
static inline u32 sb_zone_number(int shift, int mirror)
	ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
	case 0: zone = 0; break;
	case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
	case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;

	ASSERT(zone <= U32_MAX);
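
/*
 * Translate a zone number into a device position, either in units of 512B
 * sectors (zone_start_sector) or in bytes (zone_start_physical).
 *
 * As a worked example for sb_zone_number() above: with 256MiB zones
 * (shift == 28), mirror 1 maps to zone 1 << (39 - 28) == 2048, i.e. the
 * zone that starts at the 512GiB offset.
 */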
static inline sector_t zone_start_sector(u32 zone_number,
					 struct block_device *bdev)
	return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev));

static inline u64 zone_start_physical(u32 zone_number,
				      struct btrfs_zoned_device_info *zone_info)
	return (u64)zone_number << zone_info->zone_size_shift;

 * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block
 * device into static sized chunks and fakes a conventional zone on each of
static int emulate_report_zones(struct btrfs_device *device, u64 pos,
				struct blk_zone *zones, unsigned int nr_zones)
	const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT;
	sector_t bdev_size = bdev_nr_sectors(device->bdev);

	pos >>= SECTOR_SHIFT;
	for (i = 0; i < nr_zones; i++) {
		zones[i].start = i * zone_sectors + pos;
		zones[i].len = zone_sectors;
		zones[i].capacity = zone_sectors;
		zones[i].wp = zones[i].start + zone_sectors;
		zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL;
		zones[i].cond = BLK_ZONE_COND_NOT_WP;

		if (zones[i].wp >= bdev_size) {

static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
			       struct blk_zone *zones, unsigned int *nr_zones)
	struct btrfs_zoned_device_info *zinfo = device->zone_info;

	if (!bdev_is_zoned(device->bdev)) {
		ret = emulate_report_zones(device, pos, zones, *nr_zones);

	if (zinfo->zone_cache) {
		ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
		zno = pos >> zinfo->zone_size_shift;
		 * We cannot report zones beyond the zone end. So, it is OK to
		 * cap *nr_zones at the end.
		*nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno);

		for (i = 0; i < *nr_zones; i++) {
			struct blk_zone *zone_info;

			zone_info = &zinfo->zone_cache[zno + i];

		if (i == *nr_zones) {
			/* Cache hit on all the zones */
			memcpy(zones, zinfo->zone_cache + zno,
			       sizeof(*zinfo->zone_cache) * *nr_zones);

	ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
				  copy_zone_info_cb, zones);
		btrfs_err_in_rcu(device->fs_info,
			"zoned: failed to read zone %llu on %s (devid %llu)",
			pos, rcu_str_deref(device->name),

	if (zinfo->zone_cache) {
		u32 zno = pos >> zinfo->zone_size_shift;

		memcpy(zinfo->zone_cache + zno, zones,
		       sizeof(*zinfo->zone_cache) * *nr_zones);

/* The emulated zone size is determined from the size of device extent */
static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_dev_extent *dext;

	key.type = BTRFS_DEV_EXTENT_KEY;

	path = btrfs_alloc_path();

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);

	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_leaf(root, path);
		/* No dev extents at all? Not good */

	leaf = path->nodes[0];
	dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
	fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
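
/* Load zone information for every present device of a ZONED filesystem. */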
int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;

	/* fs_info->zone_size might not be set yet. Use the incompat flag here. */
	if (!btrfs_fs_incompat(fs_info, ZONED))

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		/* We can skip reading of zone info for missing devices */
		ret = btrfs_get_dev_zone_info(device, true);
	mutex_unlock(&fs_devices->device_list_mutex);
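
/*
 * Build the per-device zone information: zone size and zone count, the
 * sequential/empty/active zone bitmaps, an optional zone cache, and the
 * cached superblock log zones. A non-zoned device gets emulated
 * conventional zones sized by fs_info->zone_size.
 */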
int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_zoned_device_info *zone_info = NULL;
	struct block_device *bdev = device->bdev;
	unsigned int max_active_zones;
	unsigned int nactive;
	struct blk_zone *zones = NULL;
	unsigned int i, nreported = 0, nr_zones;
	sector_t zone_sectors;
	char *model, *emulated;

	 * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
	if (!btrfs_fs_incompat(fs_info, ZONED))

	if (device->zone_info)

	zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);

	device->zone_info = zone_info;

	if (!bdev_is_zoned(bdev)) {
		if (!fs_info->zone_size) {
			ret = calculate_emulated_zone_size(fs_info);

		ASSERT(fs_info->zone_size);
		zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
		zone_sectors = bdev_zone_sectors(bdev);

	ASSERT(is_power_of_two_u64(zone_sectors));
	zone_info->zone_size = zone_sectors << SECTOR_SHIFT;

	/* We reject devices with a zone size larger than 8GB */
	if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
		btrfs_err_in_rcu(fs_info,
		"zoned: %s: zone size %llu larger than supported maximum %llu",
				 rcu_str_deref(device->name),
				 zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
	} else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) {
		btrfs_err_in_rcu(fs_info,
		"zoned: %s: zone size %llu smaller than supported minimum %u",
				 rcu_str_deref(device->name),
				 zone_info->zone_size, BTRFS_MIN_ZONE_SIZE);
	nr_sectors = bdev_nr_sectors(bdev);
	zone_info->zone_size_shift = ilog2(zone_info->zone_size);
	zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
	if (!IS_ALIGNED(nr_sectors, zone_sectors))
		zone_info->nr_zones++;

	max_active_zones = bdev_max_active_zones(bdev);
	if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
		btrfs_err_in_rcu(fs_info,
"zoned: %s: max active zones %u is too small, need at least %u active zones",
				 rcu_str_deref(device->name), max_active_zones,
				 BTRFS_MIN_ACTIVE_ZONES);
	zone_info->max_active_zones = max_active_zones;

	zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->seq_zones) {

	zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->empty_zones) {

	zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->active_zones) {

	zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);

	 * Enable zone cache only for a zoned device. On a non-zoned device, we
	 * fill the zone info with emulated CONVENTIONAL zones, so no need to
	if (populate_cache && bdev_is_zoned(device->bdev)) {
		zone_info->zone_cache = vcalloc(zone_info->nr_zones,
						sizeof(struct blk_zone));
		if (!zone_info->zone_cache) {
			btrfs_err_in_rcu(device->fs_info,
				"zoned: failed to allocate zone cache for %s",
				rcu_str_deref(device->name));

	while (sector < nr_sectors) {
		nr_zones = BTRFS_REPORT_NR_ZONES;
		ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,

		for (i = 0; i < nr_zones; i++) {
			if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
				__set_bit(nreported, zone_info->seq_zones);
			switch (zones[i].cond) {
			case BLK_ZONE_COND_EMPTY:
				__set_bit(nreported, zone_info->empty_zones);
			case BLK_ZONE_COND_IMP_OPEN:
			case BLK_ZONE_COND_EXP_OPEN:
			case BLK_ZONE_COND_CLOSED:
				__set_bit(nreported, zone_info->active_zones);
		sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;

	if (nreported != zone_info->nr_zones) {
		btrfs_err_in_rcu(device->fs_info,
				 "inconsistent number of zones on %s (%u/%u)",
				 rcu_str_deref(device->name), nreported,
				 zone_info->nr_zones);

	if (max_active_zones) {
		if (nactive > max_active_zones) {
			btrfs_err_in_rcu(device->fs_info,
			"zoned: %u active zones on %s exceeds max_active_zones %u",
					 nactive, rcu_str_deref(device->name),
		atomic_set(&zone_info->active_zones_left,
			   max_active_zones - nactive);
		set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags);

	/* Validate superblock log */
	nr_zones = BTRFS_NR_SB_LOG_ZONES;
	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;

		sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
		if (sb_zone + 1 >= zone_info->nr_zones)

		ret = btrfs_get_dev_zones(device,
					  zone_start_physical(sb_zone, zone_info),
					  &zone_info->sb_zones[sb_pos],
		if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
			btrfs_err_in_rcu(device->fs_info,
	"zoned: failed to read super block log zone info at devid %llu zone %u",
					 device->devid, sb_zone);

		 * If zones[0] is conventional, always use the beginning of the
		 * zone to record superblock. No need to validate in that case.
		if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
		    BLK_ZONE_TYPE_CONVENTIONAL)

		ret = sb_write_pointer(device->bdev,
				       &zone_info->sb_zones[sb_pos], &sb_wp);
		if (ret != -ENOENT && ret) {
			btrfs_err_in_rcu(device->fs_info,
			"zoned: super block log zone corrupted devid %llu zone %u",
					 device->devid, sb_zone);

	if (bdev_is_zoned(bdev)) {
		model = "host-managed zoned";
		emulated = "emulated ";

	btrfs_info_in_rcu(fs_info,
		"%s block device %s, %u %szones of %llu bytes",
		model, rcu_str_deref(device->name), zone_info->nr_zones,
		emulated, zone_info->zone_size);

	btrfs_destroy_dev_zone_info(device);
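
/* Free the bitmaps and the zone cache attached to a device's zone info. */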
void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
	struct btrfs_zoned_device_info *zone_info = device->zone_info;

	bitmap_free(zone_info->active_zones);
	bitmap_free(zone_info->seq_zones);
	bitmap_free(zone_info->empty_zones);
	vfree(zone_info->zone_cache);

	device->zone_info = NULL;
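
/*
 * Deep-copy a device's zone information: the zone bitmaps are duplicated,
 * while the zone cache is intentionally not cloned.
 */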
struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev)
	struct btrfs_zoned_device_info *zone_info;

	zone_info = kmemdup(orig_dev->zone_info, sizeof(*zone_info), GFP_KERNEL);

	zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->seq_zones)
	bitmap_copy(zone_info->seq_zones, orig_dev->zone_info->seq_zones,
		    zone_info->nr_zones);

	zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->empty_zones)
	bitmap_copy(zone_info->empty_zones, orig_dev->zone_info->empty_zones,
		    zone_info->nr_zones);

	zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->active_zones)
	bitmap_copy(zone_info->active_zones, orig_dev->zone_info->active_zones,
		    zone_info->nr_zones);
	zone_info->zone_cache = NULL;

	bitmap_free(zone_info->seq_zones);
	bitmap_free(zone_info->empty_zones);
	bitmap_free(zone_info->active_zones);

static int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone)
	unsigned int nr_zones = 1;

	ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
	if (ret != 0 || !nr_zones)
		return ret ? ret : -EIO;
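
/*
 * When the ZONED incompat flag is not set, make sure no host-managed zoned
 * device is part of the filesystem.
 */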
static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info)
	struct btrfs_device *device;

	list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
		if (device->bdev && bdev_is_zoned(device->bdev)) {
		"zoned: mode not enabled but zoned device found: %pg",

int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
	struct queue_limits *lim = &fs_info->limits;
	struct btrfs_device *device;

	 * Host-Managed devices can't be used without the ZONED flag. With the
	 * ZONED all devices can be used, using zone emulation if required.
	if (!btrfs_fs_incompat(fs_info, ZONED))
		return btrfs_check_for_zoned_device(fs_info);

	blk_set_stacking_limits(lim);

	list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
		struct btrfs_zoned_device_info *zone_info = device->zone_info;

			zone_size = zone_info->zone_size;
		} else if (zone_info->zone_size != zone_size) {
		"zoned: unequal block device zone sizes: have %llu found %llu",
				  zone_info->zone_size, zone_size);

		 * With the zoned emulation, we can have non-zoned device on the
		 * zoned mode. In this case, we don't have a valid max zone
		if (bdev_is_zoned(device->bdev)) {
			blk_stack_limits(lim,
					 &bdev_get_queue(device->bdev)->limits,

	 * stripe_size is always aligned to BTRFS_STRIPE_LEN in
	 * btrfs_create_chunk(). Since we want stripe_len == zone_size,
	 * check the alignment here.
	if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
			  "zoned: zone size %llu not aligned to stripe %u",
			  zone_size, BTRFS_STRIPE_LEN);

	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		btrfs_err(fs_info, "zoned: mixed block groups not supported");

	fs_info->zone_size = zone_size;
	 * Also limit max_zone_append_size by max_segments * PAGE_SIZE.
	 * Technically, we can have multiple pages per segment. But, since
	 * we add the pages one by one to a bio, and cannot increase the
	 * metadata reservation even if it increases the number of extents, it
	 * is safe to stick with the limit.
	fs_info->max_zone_append_size = ALIGN_DOWN(
		min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT,
		     (u64)lim->max_sectors << SECTOR_SHIFT,
		     (u64)lim->max_segments << PAGE_SHIFT),
		fs_info->sectorsize);
	fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
	if (fs_info->max_zone_append_size < fs_info->max_extent_size)
		fs_info->max_extent_size = fs_info->max_zone_append_size;

	 * Check mount options here, because we might change fs_info->zoned
	 * from fs_info->zone_size.
	ret = btrfs_check_mountopts_zoned(fs_info, &fs_info->mount_opt);

	btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);

int btrfs_check_mountopts_zoned(const struct btrfs_fs_info *info,
				unsigned long long *mount_opt)
	if (!btrfs_is_zoned(info))

	 * Space cache writing is not COWed. Disable that to avoid write errors
	 * in sequential zones.
	if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) {
		btrfs_err(info, "zoned: space cache v1 is not supported");

	if (btrfs_raw_test_opt(*mount_opt, NODATACOW)) {
		btrfs_err(info, "zoned: NODATACOW not supported");

	if (btrfs_raw_test_opt(*mount_opt, DISCARD_ASYNC)) {
			   "zoned: async discard ignored and disabled for zoned mode");
		btrfs_clear_opt(*mount_opt, DISCARD_ASYNC);
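
/*
 * Pick the superblock location inside a pair of superblock log zones.
 * For WRITE the current write pointer is used (resetting a stale full zone
 * first); for READ the location of the most recently written superblock is
 * returned.
 */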
static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
			   int rw, u64 *bytenr_ret)
	if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
		*bytenr_ret = zones[0].start << SECTOR_SHIFT;

	ret = sb_write_pointer(bdev, zones, &wp);
	if (ret != -ENOENT && ret < 0)

		struct blk_zone *reset = NULL;

		if (wp == zones[0].start << SECTOR_SHIFT)
		else if (wp == zones[1].start << SECTOR_SHIFT)

		if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
			unsigned int nofs_flags;

			ASSERT(sb_zone_is_full(reset));

			nofs_flags = memalloc_nofs_save();
			ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
					       reset->start, reset->len);
			memalloc_nofs_restore(nofs_flags);

			reset->cond = BLK_ZONE_COND_EMPTY;
			reset->wp = reset->start;
	} else if (ret != -ENOENT) {
		 * For READ, we want the previous one. Move write pointer to
		 * the end of a zone, if it is at the head of a zone.
		if (wp == zones[0].start << SECTOR_SHIFT)
			zone_end = zones[1].start + zones[1].capacity;
		else if (wp == zones[1].start << SECTOR_SHIFT)
			zone_end = zones[0].start + zones[0].capacity;

			wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT,
					BTRFS_SUPER_INFO_SIZE);

		wp -= BTRFS_SUPER_INFO_SIZE;

int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
	struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
	sector_t zone_sectors;
	u8 zone_sectors_shift;

	if (!bdev_is_zoned(bdev)) {
		*bytenr_ret = btrfs_sb_offset(mirror);

	ASSERT(rw == READ || rw == WRITE);

	zone_sectors = bdev_zone_sectors(bdev);
	if (!is_power_of_2(zone_sectors))
	zone_sectors_shift = ilog2(zone_sectors);
	nr_sectors = bdev_nr_sectors(bdev);
	nr_zones = nr_sectors >> zone_sectors_shift;

	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
	if (sb_zone + 1 >= nr_zones)

	ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev),
				  BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
	if (ret != BTRFS_NR_SB_LOG_ZONES)

	return sb_log_location(bdev, zones, rw, bytenr_ret);

int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
	struct btrfs_zoned_device_info *zinfo = device->zone_info;

	 * For a zoned filesystem on a non-zoned block device, use the same
	 * super block locations as regular filesystem. Doing so, the super
	 * block can always be retrieved and the zoned flag of the volume
	 * detected from the super block information.
	if (!bdev_is_zoned(device->bdev)) {
		*bytenr_ret = btrfs_sb_offset(mirror);

	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
	if (zone_num + 1 >= zinfo->nr_zones)

	return sb_log_location(device->bdev,
			       &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],

static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
	if (zone_num + 1 >= zinfo->nr_zones)

	if (!test_bit(zone_num, zinfo->seq_zones))

int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	struct blk_zone *zone;

	if (!is_sb_log_zone(zinfo, mirror))

	zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
	for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
		/* Advance the next zone */
		if (zone->cond == BLK_ZONE_COND_FULL) {

		if (zone->cond == BLK_ZONE_COND_EMPTY)
			zone->cond = BLK_ZONE_COND_IMP_OPEN;

		zone->wp += SUPER_INFO_SECTORS;

		if (sb_zone_is_full(zone)) {
			 * No room left to write new superblock. Since
			 * superblock is written with REQ_SYNC, it is safe to
			 * finish the zone now.
			 * If the write pointer is exactly at the capacity,
			 * explicit ZONE_FINISH is not necessary.
			if (zone->wp != zone->start + zone->capacity) {
				unsigned int nofs_flags;

				nofs_flags = memalloc_nofs_save();
				ret = blkdev_zone_mgmt(device->bdev,
						REQ_OP_ZONE_FINISH, zone->start,
				memalloc_nofs_restore(nofs_flags);

			zone->wp = zone->start + zone->len;
			zone->cond = BLK_ZONE_COND_FULL;

	/* All the zones are FULL. Should not reach here. */

int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
	unsigned int nofs_flags;
	sector_t zone_sectors;
	u8 zone_sectors_shift;

	zone_sectors = bdev_zone_sectors(bdev);
	zone_sectors_shift = ilog2(zone_sectors);
	nr_sectors = bdev_nr_sectors(bdev);
	nr_zones = nr_sectors >> zone_sectors_shift;

	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
	if (sb_zone + 1 >= nr_zones)

	nofs_flags = memalloc_nofs_save();
	ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
			       zone_start_sector(sb_zone, bdev),
			       zone_sectors * BTRFS_NR_SB_LOG_ZONES);
	memalloc_nofs_restore(nofs_flags);

 * Find allocatable zones within a given region.
 * @device:	the device to allocate a region on
 * @hole_start: the position of the hole to allocate the region
 * @num_bytes:	size of wanted region
 * @hole_end:	the end of the hole
 * @return:	position of allocatable zones
 * Allocatable region should not contain any superblock locations.
u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
				 u64 hole_end, u64 num_bytes)
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	const u8 shift = zinfo->zone_size_shift;
	u64 nzones = num_bytes >> shift;
	u64 pos = hole_start;

	ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
	ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));

	while (pos < hole_end) {
		begin = pos >> shift;
		end = begin + nzones;

		if (end > zinfo->nr_zones)

		/* Check if zones in the region are all empty */
		if (btrfs_dev_is_sequential(device, pos) &&
		    !bitmap_test_range_all_set(zinfo->empty_zones, begin, nzones)) {
			pos += zinfo->zone_size;

		for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
			sb_zone = sb_zone_number(shift, i);
			if (!(end <= sb_zone ||
			      sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
				pos = zone_start_physical(
					sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo);

			/* We also need to exclude regular superblock positions */
			sb_pos = btrfs_sb_offset(i);
			if (!(pos + num_bytes <= sb_pos ||
			      sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
				pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,

static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos)
	struct btrfs_zoned_device_info *zone_info = device->zone_info;
	unsigned int zno = (pos >> zone_info->zone_size_shift);

	/* We can use any number of zones */
	if (zone_info->max_active_zones == 0)

	if (!test_bit(zno, zone_info->active_zones)) {
		/* Active zone left? */
		if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0)
		if (test_and_set_bit(zno, zone_info->active_zones)) {
			/* Someone already set the bit */
			atomic_inc(&zone_info->active_zones_left);

static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
	struct btrfs_zoned_device_info *zone_info = device->zone_info;
	unsigned int zno = (pos >> zone_info->zone_size_shift);

	/* We can use any number of zones */
	if (zone_info->max_active_zones == 0)

	if (test_and_clear_bit(zno, zone_info->active_zones))
		atomic_inc(&zone_info->active_zones_left);
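
/*
 * Reset the device zones covering [physical, physical + length), marking
 * them empty and inactive again, and tell the caller via @bytes how much
 * was reset.
 */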
int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
			    u64 length, u64 *bytes)
	unsigned int nofs_flags;

	nofs_flags = memalloc_nofs_save();
	ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
			       physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT);
	memalloc_nofs_restore(nofs_flags);

		btrfs_dev_set_zone_empty(device, physical);
		btrfs_dev_clear_active_zone(device, physical);
		physical += device->zone_info->zone_size;
		length -= device->zone_info->zone_size;

int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	const u8 shift = zinfo->zone_size_shift;
	unsigned long begin = start >> shift;
	unsigned long nbits = size >> shift;

	ASSERT(IS_ALIGNED(start, zinfo->zone_size));
	ASSERT(IS_ALIGNED(size, zinfo->zone_size));

	if (begin + nbits > zinfo->nr_zones)

	/* All the zones are conventional */
	if (bitmap_test_range_all_zero(zinfo->seq_zones, begin, nbits))

	/* All the zones are sequential and empty */
	if (bitmap_test_range_all_set(zinfo->seq_zones, begin, nbits) &&
	    bitmap_test_range_all_set(zinfo->empty_zones, begin, nbits))

	for (pos = start; pos < start + size; pos += zinfo->zone_size) {
		if (!btrfs_dev_is_sequential(device, pos) ||
		    btrfs_dev_is_empty_zone(device, pos))

		/* Free regions should be empty */
	"zoned: resetting device %s (devid %llu) zone %llu for allocation",
			  rcu_str_deref(device->name), device->devid, pos >> shift);

		ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,

 * Calculate an allocation pointer from the extent allocation information
 * for a block group consisting of conventional zones. It points to the
 * end of the highest addressed extent in the block group as an allocation
static int calculate_alloc_pointer(struct btrfs_block_group *cache,
				   u64 *offset_ret, bool new)
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_root *root;
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_key key;
	struct btrfs_key found_key;

	 * Avoid tree lookups for a new block group, there's no use for it.
	 * It must always be 0.
	 * Also, we have a lock chain of extent buffer lock -> chunk mutex.
	 * For a new block group, this function is called from
	 * btrfs_make_block_group() which is already taking the chunk mutex.
	 * Thus, we cannot call calculate_alloc_pointer() which takes extent
	 * buffer locks to avoid deadlock.

	path = btrfs_alloc_path();

	key.objectid = cache->start + cache->length;

	root = btrfs_extent_root(fs_info, key.objectid);
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	/* We should not find the exact match */

	ret = btrfs_previous_extent_item(root, path, cache->start);

	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);

	if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
		length = found_key.offset;
		length = fs_info->nodesize;

	if (!(found_key.objectid >= cache->start &&
	      found_key.objectid + length <= cache->start + cache->length)) {

	*offset_ret = found_key.objectid + length - cache->start;
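
/*
 * Fill a zone_info entry for one stripe of a block group: record the
 * physical position, query the device zone to derive the allocation offset
 * and capacity, and mark the stripe as active where appropriate. A missing
 * device yields WP_MISSING_DEV, a conventional zone yields WP_CONVENTIONAL.
 */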
static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
				struct zone_info *info, unsigned long *active,
				struct btrfs_chunk_map *map)
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	struct btrfs_device *device;
	int dev_replace_is_ongoing = 0;
	unsigned int nofs_flag;
	struct blk_zone zone;

	info->physical = map->stripes[zone_idx].physical;

	down_read(&dev_replace->rwsem);
	device = map->stripes[zone_idx].dev;

	if (!device->bdev) {
		up_read(&dev_replace->rwsem);
		info->alloc_offset = WP_MISSING_DEV;

	/* Consider a zone as active if we can allow any number of active zones. */
	if (!device->zone_info->max_active_zones)
		__set_bit(zone_idx, active);

	if (!btrfs_dev_is_sequential(device, info->physical)) {
		up_read(&dev_replace->rwsem);
		info->alloc_offset = WP_CONVENTIONAL;

	/* This zone will be used for allocation, so mark this zone non-empty. */
	btrfs_dev_clear_zone_empty(device, info->physical);

	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
		btrfs_dev_clear_zone_empty(dev_replace->tgtdev, info->physical);

	 * The group is mapped to a sequential zone. Get the zone write pointer
	 * to determine the allocation offset within the zone.
	WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size));
	nofs_flag = memalloc_nofs_save();
	ret = btrfs_get_dev_zone(device, info->physical, &zone);
	memalloc_nofs_restore(nofs_flag);
		up_read(&dev_replace->rwsem);
		if (ret != -EIO && ret != -EOPNOTSUPP)
		info->alloc_offset = WP_MISSING_DEV;

	if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
		btrfs_err_in_rcu(fs_info,
		"zoned: unexpected conventional zone %llu on device %s (devid %llu)",
			zone.start << SECTOR_SHIFT, rcu_str_deref(device->name),
		up_read(&dev_replace->rwsem);

	info->capacity = (zone.capacity << SECTOR_SHIFT);

	switch (zone.cond) {
	case BLK_ZONE_COND_OFFLINE:
	case BLK_ZONE_COND_READONLY:
		"zoned: offline/readonly zone %llu on device %s (devid %llu)",
			  (info->physical >> device->zone_info->zone_size_shift),
			  rcu_str_deref(device->name), device->devid);
		info->alloc_offset = WP_MISSING_DEV;
	case BLK_ZONE_COND_EMPTY:
		info->alloc_offset = 0;
	case BLK_ZONE_COND_FULL:
		info->alloc_offset = info->capacity;
		/* Partially used zone. */
		info->alloc_offset = ((zone.wp - zone.start) << SECTOR_SHIFT);
		__set_bit(zone_idx, active);

	up_read(&dev_replace->rwsem);

static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
					 struct zone_info *info,
					 unsigned long *active)
	if (info->alloc_offset == WP_MISSING_DEV) {
		btrfs_err(bg->fs_info,
			"zoned: cannot recover write pointer for zone %llu",

	bg->alloc_offset = info->alloc_offset;
	bg->zone_capacity = info->capacity;
	if (test_bit(0, active))
		set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);

static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
				      struct btrfs_chunk_map *map,
				      struct zone_info *zone_info,
				      unsigned long *active)
	struct btrfs_fs_info *fs_info = bg->fs_info;

	if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
		btrfs_err(fs_info, "zoned: data DUP profile needs raid-stripe-tree");

	bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity);

	if (zone_info[0].alloc_offset == WP_MISSING_DEV) {
		btrfs_err(bg->fs_info,
			  "zoned: cannot recover write pointer for zone %llu",
			  zone_info[0].physical);
	if (zone_info[1].alloc_offset == WP_MISSING_DEV) {
		btrfs_err(bg->fs_info,
			  "zoned: cannot recover write pointer for zone %llu",
			  zone_info[1].physical);
	if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) {
		btrfs_err(bg->fs_info,
			  "zoned: write pointer offset mismatch of zones in DUP profile");

	if (test_bit(0, active) != test_bit(1, active)) {
		if (!btrfs_zone_activate(bg))
	} else if (test_bit(0, active)) {
		set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);

	bg->alloc_offset = zone_info[0].alloc_offset;

static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
					struct btrfs_chunk_map *map,
					struct zone_info *zone_info,
					unsigned long *active)
	struct btrfs_fs_info *fs_info = bg->fs_info;

	if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
			  btrfs_bg_type_to_raid_name(map->type));

	/* In case a device is missing we have a cap of 0, so don't use it. */
	bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity);

	for (i = 0; i < map->num_stripes; i++) {
		if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
		    zone_info[i].alloc_offset == WP_CONVENTIONAL)

		if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) &&
		    !btrfs_test_opt(fs_info, DEGRADED)) {
		"zoned: write pointer offset mismatch of zones in %s profile",
				  btrfs_bg_type_to_raid_name(map->type));
		if (test_bit(0, active) != test_bit(i, active)) {
			if (!btrfs_test_opt(fs_info, DEGRADED) &&
			    !btrfs_zone_activate(bg)) {
			if (test_bit(0, active))
				set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);

	if (zone_info[0].alloc_offset != WP_MISSING_DEV)
		bg->alloc_offset = zone_info[0].alloc_offset;
		bg->alloc_offset = zone_info[i - 1].alloc_offset;

static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
					struct btrfs_chunk_map *map,
					struct zone_info *zone_info,
					unsigned long *active)
	struct btrfs_fs_info *fs_info = bg->fs_info;

	if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
			  btrfs_bg_type_to_raid_name(map->type));

	for (int i = 0; i < map->num_stripes; i++) {
		if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
		    zone_info[i].alloc_offset == WP_CONVENTIONAL)

		if (test_bit(0, active) != test_bit(i, active)) {
			if (!btrfs_zone_activate(bg))
			if (test_bit(0, active))
				set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
		bg->zone_capacity += zone_info[i].capacity;
		bg->alloc_offset += zone_info[i].alloc_offset;

static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
					 struct btrfs_chunk_map *map,
					 struct zone_info *zone_info,
					 unsigned long *active)
	struct btrfs_fs_info *fs_info = bg->fs_info;

	if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
			  btrfs_bg_type_to_raid_name(map->type));

	for (int i = 0; i < map->num_stripes; i++) {
		if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
		    zone_info[i].alloc_offset == WP_CONVENTIONAL)

		if (test_bit(0, active) != test_bit(i, active)) {
			if (!btrfs_zone_activate(bg))
			if (test_bit(0, active))
				set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);

		if ((i % map->sub_stripes) == 0) {
			bg->zone_capacity += zone_info[i].capacity;
			bg->alloc_offset += zone_info[i].alloc_offset;
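
/*
 * Load the zoned-specific fields of a block group (alloc_offset,
 * zone_capacity, active state) from the underlying device zones,
 * dispatching to the per-profile helpers above. Conventional zones fall
 * back to calculate_alloc_pointer().
 */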
int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_chunk_map *map;
	u64 logical = cache->start;
	u64 length = cache->length;
	struct zone_info *zone_info = NULL;
	unsigned long *active = NULL;
	u32 num_sequential = 0, num_conventional = 0;

	if (!btrfs_is_zoned(fs_info))

	if (!IS_ALIGNED(length, fs_info->zone_size)) {
		"zoned: block group %llu len %llu unaligned to zone size %llu",
			  logical, length, fs_info->zone_size);

	map = btrfs_find_chunk_map(fs_info, logical, length);

	cache->physical_map = map;

	zone_info = kcalloc(map->num_stripes, sizeof(*zone_info), GFP_NOFS);

	active = bitmap_zalloc(map->num_stripes, GFP_NOFS);

	for (i = 0; i < map->num_stripes; i++) {
		ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map);

		if (zone_info[i].alloc_offset == WP_CONVENTIONAL)

	if (num_sequential > 0)
		set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);

	if (num_conventional > 0) {
		/* Zone capacity is always zone size in emulation */
		cache->zone_capacity = cache->length;
		ret = calculate_alloc_pointer(cache, &last_alloc, new);
			"zoned: failed to determine allocation offset of bg %llu",
		} else if (map->num_stripes == num_conventional) {
			cache->alloc_offset = last_alloc;
			set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);

	profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
	case 0: /* single */
		ret = btrfs_load_block_group_single(cache, &zone_info[0], active);
	case BTRFS_BLOCK_GROUP_DUP:
		ret = btrfs_load_block_group_dup(cache, map, zone_info, active);
	case BTRFS_BLOCK_GROUP_RAID1:
	case BTRFS_BLOCK_GROUP_RAID1C3:
	case BTRFS_BLOCK_GROUP_RAID1C4:
		ret = btrfs_load_block_group_raid1(cache, map, zone_info, active);
	case BTRFS_BLOCK_GROUP_RAID0:
		ret = btrfs_load_block_group_raid0(cache, map, zone_info, active);
	case BTRFS_BLOCK_GROUP_RAID10:
		ret = btrfs_load_block_group_raid10(cache, map, zone_info, active);
	case BTRFS_BLOCK_GROUP_RAID5:
	case BTRFS_BLOCK_GROUP_RAID6:
		btrfs_err(fs_info, "zoned: profile %s not yet supported",
			  btrfs_bg_type_to_raid_name(map->type));

	if (ret == -EIO && profile != 0 && profile != BTRFS_BLOCK_GROUP_RAID0 &&
	    profile != BTRFS_BLOCK_GROUP_RAID10) {
		 * Detected broken write pointer. Make this block group
		 * unallocatable by setting the allocation pointer at the end of
		 * allocatable region. Relocating this block group will fix the
		 * Currently, we cannot handle RAID0 or RAID10 case like this
		 * because we don't have a proper zone_capacity value. But,
		 * reading from this block group won't work anyway by a missing
		cache->alloc_offset = cache->zone_capacity;

	/* Reject non SINGLE data profiles without RST */
	if ((map->type & BTRFS_BLOCK_GROUP_DATA) &&
	    (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
	    !fs_info->stripe_root) {
		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
			  btrfs_bg_type_to_raid_name(map->type));

	if (cache->alloc_offset > cache->zone_capacity) {
"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
			  cache->alloc_offset, cache->zone_capacity,

	/* An extent is allocated after the write pointer */
	if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
			"zoned: got wrong write pointer in BG %llu: %llu > %llu",
			  logical, last_alloc, cache->alloc_offset);

	cache->meta_write_pointer = cache->alloc_offset + cache->start;
	if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) {
		btrfs_get_block_group(cache);
		spin_lock(&fs_info->zone_active_bgs_lock);
		list_add_tail(&cache->active_bg_list,
			      &fs_info->zone_active_bgs);
		spin_unlock(&fs_info->zone_active_bgs_lock);

		btrfs_free_chunk_map(cache->physical_map);
		cache->physical_map = NULL;

	bitmap_free(active);

void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
	if (!btrfs_is_zoned(cache->fs_info))

	WARN_ON(cache->bytes_super != 0);
	unusable = (cache->alloc_offset - cache->used) +
		   (cache->length - cache->zone_capacity);
	free = cache->zone_capacity - cache->alloc_offset;

	/* We only need ->free_space in ALLOC_SEQ block groups */
	cache->cached = BTRFS_CACHE_FINISHED;
	cache->free_space_ctl->free_space = free;
	cache->zone_unusable = unusable;
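
/*
 * Decide whether a data write bio should be submitted as a zone append:
 * only for data inodes writing into a block group backed by sequential
 * zones, and never for the data relocation inode.
 */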
bool btrfs_use_zone_append(struct btrfs_bio *bbio)
	u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT);
	struct btrfs_inode *inode = bbio->inode;
	struct btrfs_fs_info *fs_info = bbio->fs_info;
	struct btrfs_block_group *cache;

	if (!btrfs_is_zoned(fs_info))

	if (!inode || !is_data_inode(inode))

	if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE)

	 * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the
	 * extent layout the relocation code has.
	 * Furthermore we have set aside our own block group from which only the
	 * relocation "process" can allocate and make sure only one process at a
	 * time can add pages to an extent that gets relocated, so it's safe to
	 * use regular REQ_OP_WRITE for this special case.
	if (btrfs_is_data_reloc_root(inode->root))

	cache = btrfs_lookup_block_group(fs_info, start);

	ret = !!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
	btrfs_put_block_group(cache);

void btrfs_record_physical_zoned(struct btrfs_bio *bbio)
	const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
	struct btrfs_ordered_sum *sum = bbio->sums;

	if (physical < bbio->orig_physical)
		sum->logical -= bbio->orig_physical - physical;
		sum->logical += physical - bbio->orig_physical;

static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered,
	struct extent_map_tree *em_tree = &ordered->inode->extent_tree;
	struct extent_map *em;

	ordered->disk_bytenr = logical;

	write_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, ordered->file_offset,
				   ordered->num_bytes);
	/* The em should be a new COW extent, thus it should not have an offset. */
	ASSERT(em->offset == 0);
	em->disk_bytenr = logical;
	free_extent_map(em);
	write_unlock(&em_tree->lock);

static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered,
				      u64 logical, u64 len)
	struct btrfs_ordered_extent *new;

	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
	    split_extent_map(ordered->inode, ordered->file_offset,
			     ordered->num_bytes, len, logical))

	new = btrfs_split_ordered_extent(ordered, len);
	new->disk_bytenr = logical;
	btrfs_finish_one_ordered(new);
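
/*
 * Fix up an ordered extent after zone-append completion: split it wherever
 * the recorded physical addresses are discontiguous and rewrite its
 * disk_bytenr (and the cached extent map) to match where the data actually
 * landed.
 */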
void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered)
	struct btrfs_inode *inode = ordered->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_ordered_sum *sum;

	 * Write to pre-allocated region is for the data relocation, and so
	 * it should use WRITE operation. No split/rewrite are necessary.
	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))

	ASSERT(!list_empty(&ordered->list));
	/* The ordered->list can be empty in the above pre-alloc case. */
	sum = list_first_entry(&ordered->list, struct btrfs_ordered_sum, list);
	logical = sum->logical;

	while (len < ordered->disk_num_bytes) {
		sum = list_next_entry(sum, list);
		if (sum->logical == logical + len) {

		if (!btrfs_zoned_split_ordered(ordered, logical, len)) {
			set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
			btrfs_err(fs_info, "failed to split ordered extent");
		logical = sum->logical;

	if (ordered->disk_bytenr != logical)
		btrfs_rewrite_logical_zoned(ordered, logical);

	 * If we end up here for nodatasum I/O, the btrfs_ordered_sum structures
	 * were allocated by btrfs_alloc_dummy_sum only to record the logical
	 * addresses and don't contain actual checksums. We thus must free them
	 * here so that we don't attempt to log the csums later.
	if ((inode->flags & BTRFS_INODE_NODATASUM) ||
	    test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state)) {
		while ((sum = list_first_entry_or_null(&ordered->list,
						       typeof(*sum), list))) {
			list_del(&sum->list);

static bool check_bg_is_active(struct btrfs_eb_write_context *ctx,
			       struct btrfs_block_group **active_bg)
	const struct writeback_control *wbc = ctx->wbc;
	struct btrfs_block_group *block_group = ctx->zoned_bg;
	struct btrfs_fs_info *fs_info = block_group->fs_info;

	if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))

	if (fs_info->treelog_bg == block_group->start) {
		if (!btrfs_zone_activate(block_group)) {
			int ret_fin = btrfs_zone_finish_one_bg(fs_info);

			if (ret_fin != 1 || !btrfs_zone_activate(block_group))
	} else if (*active_bg != block_group) {
		struct btrfs_block_group *tgt = *active_bg;

		/* zoned_meta_io_lock protects fs_info->active_{meta,system}_bg. */
		lockdep_assert_held(&fs_info->zoned_meta_io_lock);

			 * If there is an unsent IO left in the allocated area,
			 * we cannot wait for them as it may cause a deadlock.
			if (tgt->meta_write_pointer < tgt->start + tgt->alloc_offset) {
				if (wbc->sync_mode == WB_SYNC_NONE ||
				    (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync))

			/* Pivot active metadata/system block group. */
			btrfs_zoned_meta_io_unlock(fs_info);
			wait_eb_writebacks(tgt);
			do_zone_finish(tgt, true);
			btrfs_zoned_meta_io_lock(fs_info);
			if (*active_bg == tgt) {
				btrfs_put_block_group(tgt);

		if (!btrfs_zone_activate(block_group))
		if (*active_bg != block_group) {
			ASSERT(*active_bg == NULL);
			*active_bg = block_group;
			btrfs_get_block_group(block_group);

 * Check if @ctx->eb is aligned to the write pointer.
 * 0:        @ctx->eb is at the write pointer. You can write it.
 * -EAGAIN:  There is a hole. The caller should handle the case.
 * -EBUSY:   There is a hole, but the caller can just bail out.
int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
				   struct btrfs_eb_write_context *ctx)
	const struct writeback_control *wbc = ctx->wbc;
	const struct extent_buffer *eb = ctx->eb;
	struct btrfs_block_group *block_group = ctx->zoned_bg;

	if (!btrfs_is_zoned(fs_info))

		if (block_group->start > eb->start ||
		    block_group->start + block_group->length <= eb->start) {
			btrfs_put_block_group(block_group);
			ctx->zoned_bg = NULL;

		block_group = btrfs_lookup_block_group(fs_info, eb->start);
		ctx->zoned_bg = block_group;

	if (block_group->meta_write_pointer == eb->start) {
		struct btrfs_block_group **tgt;

		if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags))

		if (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)
			tgt = &fs_info->active_system_bg;
			tgt = &fs_info->active_meta_bg;
		if (check_bg_is_active(ctx, tgt))

	 * Since we may release fs_info->zoned_meta_io_lock, someone can already
	 * start writing this eb. In that case, we can just bail out.
	if (block_group->meta_write_pointer > eb->start)

	/* If for_sync, this hole will be filled with transaction commit. */
	if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)

int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
	if (!btrfs_dev_is_sequential(device, physical))

	return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT,
				    length >> SECTOR_SHIFT, GFP_NOFS, 0);

static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
			  struct blk_zone *zone)
	struct btrfs_io_context *bioc = NULL;
	u64 mapped_length = PAGE_SIZE;
	unsigned int nofs_flag;

	ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
			      &mapped_length, &bioc, NULL, NULL);
	if (ret || !bioc || mapped_length < PAGE_SIZE) {

	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {

	nofs_flag = memalloc_nofs_save();
	nmirrors = (int)bioc->num_stripes;
	for (i = 0; i < nmirrors; i++) {
		u64 physical = bioc->stripes[i].physical;
		struct btrfs_device *dev = bioc->stripes[i].dev;

		/* Missing device */

		ret = btrfs_get_dev_zone(dev, physical, zone);
		/* Failing device */
		if (ret == -EIO || ret == -EOPNOTSUPP)
	memalloc_nofs_restore(nofs_flag);

	btrfs_put_bioc(bioc);

 * Synchronize write pointer in a zone at @physical_start on @tgt_dev, by
 * filling zeros between @physical_pos to a write pointer of dev-replace
int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
				  u64 physical_start, u64 physical_pos)
	struct btrfs_fs_info *fs_info = tgt_dev->fs_info;
	struct blk_zone zone;

	if (!btrfs_dev_is_sequential(tgt_dev, physical_pos))

	ret = read_zone_info(fs_info, logical, &zone);

	wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT);

	if (physical_pos == wp)

	if (physical_pos > wp)

	length = wp - physical_pos;
	return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length);

 * Activate block group and underlying device zones
 * @block_group: the block group to activate
 * Return: true on success, false otherwise
bool btrfs_zone_activate(struct btrfs_block_group *block_group)
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_chunk_map *map;
	struct btrfs_device *device;
	const bool is_data = (block_group->flags & BTRFS_BLOCK_GROUP_DATA);

	if (!btrfs_is_zoned(block_group->fs_info))

	map = block_group->physical_map;

	spin_lock(&fs_info->zone_active_bgs_lock);
	spin_lock(&block_group->lock);
	if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {

	if (btrfs_zoned_bg_is_full(block_group)) {

	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_zoned_device_info *zinfo;

		device = map->stripes[i].dev;
		physical = map->stripes[i].physical;
		zinfo = device->zone_info;

		if (zinfo->max_active_zones == 0)

			reserved = zinfo->reserved_active_zones;
		 * For the data block group, leave active zones for one
		 * metadata block group and one system block group.
		if (atomic_read(&zinfo->active_zones_left) <= reserved) {

		if (!btrfs_dev_set_active_zone(device, physical)) {
			/* Cannot activate the zone */
			zinfo->reserved_active_zones--;

	/* Successfully activated all the zones */
	set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
	spin_unlock(&block_group->lock);

	/* For the active block group list */
	btrfs_get_block_group(block_group);
	list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
	spin_unlock(&fs_info->zone_active_bgs_lock);

	spin_unlock(&block_group->lock);
	spin_unlock(&fs_info->zone_active_bgs_lock);

static void wait_eb_writebacks(struct btrfs_block_group *block_group)
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	const u64 end = block_group->start + block_group->length;
	struct radix_tree_iter iter;
	struct extent_buffer *eb;

	radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter,
				 block_group->start >> fs_info->sectorsize_bits) {
		eb = radix_tree_deref_slot(slot);
		if (radix_tree_deref_retry(eb)) {
			slot = radix_tree_iter_retry(&iter);

		if (eb->start < block_group->start)
		if (eb->start >= end)

		slot = radix_tree_iter_resume(slot, &iter);
		wait_on_extent_buffer_writeback(eb);
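
/*
 * Deactivate a block group and finish its zones. When @fully_written is
 * false, the block group is first marked read-only and all outstanding
 * writes (reservations, ordered extents, extent buffer writeback) are
 * waited for; the zones are then explicitly finished on every stripe
 * device and the group is removed from the active list.
 */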
static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_chunk_map *map;
	const bool is_metadata = (block_group->flags &
			(BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM));
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	int ret = 0;
	int i;

	spin_lock(&block_group->lock);
	if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
		spin_unlock(&block_group->lock);
		return 0;
	}

	/* Check if we have unwritten allocated space */
	if (is_metadata &&
	    block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) {
		spin_unlock(&block_group->lock);
		return -EAGAIN;
	}

	/*
	 * If we are sure that the block group is full (= no more room left for
	 * new allocation) and the IO for the last usable block is completed, we
	 * don't need to wait for the other IOs. This holds because we ensure
	 * the sequential IO submissions using the ZONE_APPEND command for data
	 * and block_group->meta_write_pointer for metadata.
	 */
	if (!fully_written) {
		if (test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
			spin_unlock(&block_group->lock);
			return -EAGAIN;
		}
		spin_unlock(&block_group->lock);

		ret = btrfs_inc_block_group_ro(block_group, false);
		if (ret)
			return ret;

		/* Ensure all writes in this block group finish */
		btrfs_wait_block_group_reservations(block_group);
		/* No need to wait for NOCOW writers. Zoned mode does not allow that */
		btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group);
		/* Wait for extent buffers to be written. */
		if (is_metadata)
			wait_eb_writebacks(block_group);

		spin_lock(&block_group->lock);

		/*
		 * Bail out if someone already deactivated the block group, or
		 * allocated space is left in the block group.
		 */
		if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
			      &block_group->runtime_flags)) {
			spin_unlock(&block_group->lock);
			btrfs_dec_block_group_ro(block_group);
			return 0;
		}

		if (block_group->reserved ||
		    test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
			     &block_group->runtime_flags)) {
			spin_unlock(&block_group->lock);
			btrfs_dec_block_group_ro(block_group);
			return -EAGAIN;
		}
	}

	clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
	block_group->alloc_offset = block_group->zone_capacity;
	if (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))
		block_group->meta_write_pointer = block_group->start +
						  block_group->zone_capacity;
	block_group->free_space_ctl->free_space = 0;
	btrfs_clear_treelog_bg(block_group);
	btrfs_clear_data_reloc_bg(block_group);
	spin_unlock(&block_group->lock);

	down_read(&dev_replace->rwsem);
	map = block_group->physical_map;
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;
		const u64 physical = map->stripes[i].physical;
		struct btrfs_zoned_device_info *zinfo = device->zone_info;
		unsigned int nofs_flags;

		if (zinfo->max_active_zones == 0)
			continue;

		nofs_flags = memalloc_nofs_save();
		ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
				       physical >> SECTOR_SHIFT,
				       zinfo->zone_size >> SECTOR_SHIFT);
		memalloc_nofs_restore(nofs_flags);

		if (ret) {
			up_read(&dev_replace->rwsem);
			return ret;
		}

		if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
			zinfo->reserved_active_zones++;
		btrfs_dev_clear_active_zone(device, physical);
	}
	up_read(&dev_replace->rwsem);

	if (!fully_written)
		btrfs_dec_block_group_ro(block_group);

	spin_lock(&fs_info->zone_active_bgs_lock);
	ASSERT(!list_empty(&block_group->active_bg_list));
	list_del_init(&block_group->active_bg_list);
	spin_unlock(&fs_info->zone_active_bgs_lock);

	/* For active_bg_list */
	btrfs_put_block_group(block_group);

	clear_and_wake_up_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);

	return 0;
}
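
/*
 * Finish a block group from a direct caller's context. No-op on regular
 * (non-zoned) filesystems; otherwise a thin wrapper around do_zone_finish()
 * with fully_written == false, so outstanding IO is waited for first.
 */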
int btrfs_zone_finish(struct btrfs_block_group *block_group)
{
	if (!btrfs_is_zoned(block_group->fs_info))
		return 0;

	return do_zone_finish(block_group, false);
}
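
/*
 * Check if some device in @fs_devices still has room to activate a zone for
 * a new block group with the given profile @flags. For DATA allocations the
 * per-device reservation for metadata/system zones is honoured. If no device
 * qualifies, BTRFS_FS_NEED_ZONE_FINISH is set so the allocator knows a zone
 * must be finished before a new one can be activated.
 */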
bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
{
	struct btrfs_fs_info *fs_info = fs_devices->fs_info;
	struct btrfs_device *device;
	bool ret = false;

	if (!btrfs_is_zoned(fs_info))
		return true;

	/* Check if there is a device with active zones left */
	mutex_lock(&fs_info->chunk_mutex);
	spin_lock(&fs_info->zone_active_bgs_lock);
	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
		struct btrfs_zoned_device_info *zinfo = device->zone_info;
		int reserved = 0;

		if (!device->bdev)
			continue;

		if (!zinfo->max_active_zones) {
			ret = true;
			break;
		}

		if (flags & BTRFS_BLOCK_GROUP_DATA)
			reserved = zinfo->reserved_active_zones;

		switch (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
		case 0: /* single */
			ret = (atomic_read(&zinfo->active_zones_left) >= (1 + reserved));
			break;
		case BTRFS_BLOCK_GROUP_DUP:
			ret = (atomic_read(&zinfo->active_zones_left) >= (2 + reserved));
			break;
		}
		if (ret)
			break;
	}
	spin_unlock(&fs_info->zone_active_bgs_lock);
	mutex_unlock(&fs_info->chunk_mutex);

	if (!ret)
		set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);

	return ret;
}
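
/*
 * Called on IO completion of a write ending at @logical + @length. If the
 * remaining room up to the block group's zone capacity is smaller than a
 * minimal allocation unit (sectorsize for data, nodesize for metadata), the
 * block group can never be written again, so finish it right away with
 * fully_written == true.
 */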
void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
{
	struct btrfs_block_group *block_group;
	u64 min_alloc_bytes;

	if (!btrfs_is_zoned(fs_info))
		return;

	block_group = btrfs_lookup_block_group(fs_info, logical);
	ASSERT(block_group);

	/* No MIXED_BG on zoned btrfs. */
	if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
		min_alloc_bytes = fs_info->sectorsize;
	else
		min_alloc_bytes = fs_info->nodesize;

	/* Bail out if we can allocate more data from this block group. */
	if (logical + length + min_alloc_bytes <=
	    block_group->start + block_group->zone_capacity)
		goto out;

	do_zone_finish(block_group, true);

out:
	btrfs_put_block_group(block_group);
}
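
/*
 * Deferred variant of the above, run from bg->zone_finish_work: wait for the
 * last extent buffer of the block group to finish writeback, then finish the
 * zone and drop the references taken when the work was scheduled.
 */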
static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
{
	struct btrfs_block_group *bg =
		container_of(work, struct btrfs_block_group, zone_finish_work);

	wait_on_extent_buffer_writeback(bg->last_eb);
	free_extent_buffer(bg->last_eb);
	btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
	btrfs_put_block_group(bg);
}
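
/*
 * Schedule the deferred zone finishing work for @bg when the just written
 * extent buffer @eb sits close enough to the end of the zone capacity that
 * no further tree block fits behind it. Takes a block group reference and an
 * extent buffer reference that the work function releases.
 */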
void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
				   struct extent_buffer *eb)
{
	if (!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &bg->runtime_flags) ||
	    eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
		return;

	if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) {
		btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing",
			  bg->start);
		return;
	}

	/* For the work */
	btrfs_get_block_group(bg);
	atomic_inc(&eb->refs);
	bg->last_eb = eb;
	INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
	queue_work(system_unbound_wq, &bg->zone_finish_work);
}
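
/*
 * Forget @bg as the current data relocation block group, if it is the one
 * recorded in fs_info->data_reloc_bg.
 */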
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;

	spin_lock(&fs_info->relocation_bg_lock);
	if (fs_info->data_reloc_bg == bg->start)
		fs_info->data_reloc_bg = 0;
	spin_unlock(&fs_info->relocation_bg_lock);
}
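
/*
 * Free the zone report caches attached to each device's zone info. The cache
 * only speeds up the repeated zone reports issued while loading block group
 * zone information, so it is presumably safe to drop once that is complete.
 */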
void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;

	if (!btrfs_is_zoned(fs_info))
		return;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (device->zone_info) {
			vfree(device->zone_info->zone_cache);
			device->zone_info->zone_cache = NULL;
		}
	}
	mutex_unlock(&fs_devices->device_list_mutex);
}
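
/*
 * Decide whether zoned reclaim (automatic block group relocation) should
 * run: return true once the percentage of bytes used out of the total disk
 * bytes across all devices reaches fs_info->bg_reclaim_threshold (0 disables
 * reclaim). For example, 150GiB used across devices totalling 200GiB gives a
 * factor of 150 * 100 / 200 = 75.
 */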
bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	u64 used = 0;
	u64 total = 0;
	u64 factor;

	ASSERT(btrfs_is_zoned(fs_info));

	if (fs_info->bg_reclaim_threshold == 0)
		return false;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (!device->bdev)
			continue;

		total += device->disk_total_bytes;
		used += device->bytes_used;
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	factor = div64_u64(used * 100, total);
	return factor >= fs_info->bg_reclaim_threshold;
}
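
/*
 * Called when a data relocation write covering [@logical, @logical + @length)
 * has completed. Once the last relocation extent of the block group is
 * written (write end == start + alloc_offset), clear the ZONED_DATA_RELOC
 * flag so the block group becomes available again for normal allocation and
 * for zone finishing.
 */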
void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
				       u64 length)
{
	struct btrfs_block_group *block_group;

	if (!btrfs_is_zoned(fs_info))
		return;

	block_group = btrfs_lookup_block_group(fs_info, logical);
	/* It should be called on a previous data relocation block group. */
	ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));

	spin_lock(&block_group->lock);
	if (!test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))
		goto out;

	/* All relocation extents are written. */
	if (block_group->start + block_group->alloc_offset == logical + length) {
		/*
		 * Now, release this block group for further allocations and
		 * zone finish.
		 */
		clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
			  &block_group->runtime_flags);
	}

out:
	spin_unlock(&block_group->lock);
	btrfs_put_block_group(block_group);
}
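
/*
 * Pick the active block group with the least remaining zone capacity (the
 * one closest to being full) and finish it to free up an active zone.
 * Reserved, empty, system and data relocation block groups are skipped.
 * Returns 1 if a block group was finished, 0 if there was no candidate, or a
 * negative errno.
 */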
int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_group *block_group;
	struct btrfs_block_group *min_bg = NULL;
	u64 min_avail = U64_MAX;
	int ret;

	spin_lock(&fs_info->zone_active_bgs_lock);
	list_for_each_entry(block_group, &fs_info->zone_active_bgs,
			    active_bg_list) {
		u64 avail;

		spin_lock(&block_group->lock);
		if (block_group->reserved || block_group->alloc_offset == 0 ||
		    (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) ||
		    test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
			spin_unlock(&block_group->lock);
			continue;
		}

		avail = block_group->zone_capacity - block_group->alloc_offset;
		if (min_avail > avail) {
			if (min_bg)
				btrfs_put_block_group(min_bg);
			min_bg = block_group;
			min_avail = avail;
			btrfs_get_block_group(min_bg);
		}
		spin_unlock(&block_group->lock);
	}
	spin_unlock(&fs_info->zone_active_bgs_lock);

	if (!min_bg)
		return 0;

	ret = btrfs_zone_finish(min_bg);
	btrfs_put_block_group(min_bg);

	return ret < 0 ? ret : 1;
}
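
/*
 * Try to activate one existing, not yet full block group of @space_info. If
 * none can be activated because no active zone is left and @do_finish is
 * true, finish the fullest active block group and retry. Returns 1 if a
 * block group was activated, 0 if none could be, or a negative errno.
 */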
int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				bool do_finish)
{
	struct btrfs_block_group *bg;
	int index;

	if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
		return 0;

	for (;;) {
		int ret;
		bool need_finish = false;

		down_read(&space_info->groups_sem);
		for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) {
			list_for_each_entry(bg, &space_info->block_groups[index],
					    list) {
				if (!spin_trylock(&bg->lock))
					continue;
				if (btrfs_zoned_bg_is_full(bg) ||
				    test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
					     &bg->runtime_flags)) {
					spin_unlock(&bg->lock);
					continue;
				}
				spin_unlock(&bg->lock);

				if (btrfs_zone_activate(bg)) {
					up_read(&space_info->groups_sem);
					return 1;
				}

				need_finish = true;
			}
		}
		up_read(&space_info->groups_sem);

		if (!do_finish || !need_finish)
			break;

		ret = btrfs_zone_finish_one_bg(fs_info);
		if (ret == 0)
			break;
		if (ret < 0)
			return ret;
	}

	return 0;
}
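
/*
 * Example of the arithmetic below: with DUP metadata and DUP system chunks,
 * metadata_reserve = 4 (metadata + tree-log block group, two copies each) and
 * system_reserve = 2, so six active zones are reserved on every writable
 * device; one reserved zone per stripe is then released for each
 * metadata/system block group that is already active at mount time.
 */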
/*
 * Reserve zones for one metadata block group, one tree-log block group, and one
 * system block group.
 */
void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_block_group *block_group;
	struct btrfs_device *device;
	/* Reserve zones for normal SINGLE metadata and tree-log block group. */
	unsigned int metadata_reserve = 2;
	/* Reserve a zone for SINGLE system block group. */
	unsigned int system_reserve = 1;

	if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags))
		return;

	/*
	 * This function is called from the mount context. So, there is no
	 * parallel process touching the bits. No need for read_seqretry().
	 */
	if (fs_info->avail_metadata_alloc_bits & BTRFS_BLOCK_GROUP_DUP)
		metadata_reserve = 4;
	if (fs_info->avail_system_alloc_bits & BTRFS_BLOCK_GROUP_DUP)
		system_reserve = 2;

	/* Apply the reservation on all the devices. */
	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (!device->bdev)
			continue;

		device->zone_info->reserved_active_zones =
			metadata_reserve + system_reserve;
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	/* Release reservation for currently active block groups. */
	spin_lock(&fs_info->zone_active_bgs_lock);
	list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
		struct btrfs_chunk_map *map = block_group->physical_map;

		if (!(block_group->flags &
		      (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)))
			continue;

		for (int i = 0; i < map->num_stripes; i++)
			map->stripes[i].dev->zone_info->reserved_active_zones--;
	}
	spin_unlock(&fs_info->zone_active_bgs_lock);
}