1 // SPDX-License-Identifier: GPL-2.0
#include <linux/slab.h>
#include <linux/blkdev.h>

#include "ctree.h"
#include "volumes.h"
#include "zoned.h"
#include "rcu-string.h"
#include "disk-io.h"
10 /* Maximum number of zones to report per blkdev_report_zones() call */
11 #define BTRFS_REPORT_NR_ZONES 4096
13 /* Number of superblock log zones */
14 #define BTRFS_NR_SB_LOG_ZONES 2
16 static int copy_zone_info_cb(struct blk_zone
*zone
, unsigned int idx
, void *data
)
18 struct blk_zone
*zones
= data
;
20 memcpy(&zones
[idx
], zone
, sizeof(*zone
));
25 static int sb_write_pointer(struct block_device
*bdev
, struct blk_zone
*zones
,
28 bool empty
[BTRFS_NR_SB_LOG_ZONES
];
29 bool full
[BTRFS_NR_SB_LOG_ZONES
];
32 ASSERT(zones
[0].type
!= BLK_ZONE_TYPE_CONVENTIONAL
&&
33 zones
[1].type
!= BLK_ZONE_TYPE_CONVENTIONAL
);
35 empty
[0] = (zones
[0].cond
== BLK_ZONE_COND_EMPTY
);
36 empty
[1] = (zones
[1].cond
== BLK_ZONE_COND_EMPTY
);
37 full
[0] = (zones
[0].cond
== BLK_ZONE_COND_FULL
);
38 full
[1] = (zones
[1].cond
== BLK_ZONE_COND_FULL
);
41 * Possible states of log buffer zones
43 * Empty[0] In use[0] Full[0]
49 * *: Special case, no superblock is written
50 * 0: Use write pointer of zones[0]
51 * 1: Use write pointer of zones[1]
52 * C: Compare super blcoks from zones[0] and zones[1], use the latest
53 * one determined by generation
57 if (empty
[0] && empty
[1]) {
58 /* Special case to distinguish no superblock to read */
59 *wp_ret
= zones
[0].start
<< SECTOR_SHIFT
;
61 } else if (full
[0] && full
[1]) {
62 /* Compare two super blocks */
63 struct address_space
*mapping
= bdev
->bd_inode
->i_mapping
;
64 struct page
*page
[BTRFS_NR_SB_LOG_ZONES
];
65 struct btrfs_super_block
*super
[BTRFS_NR_SB_LOG_ZONES
];
68 for (i
= 0; i
< BTRFS_NR_SB_LOG_ZONES
; i
++) {
71 bytenr
= ((zones
[i
].start
+ zones
[i
].len
)
72 << SECTOR_SHIFT
) - BTRFS_SUPER_INFO_SIZE
;
74 page
[i
] = read_cache_page_gfp(mapping
,
75 bytenr
>> PAGE_SHIFT
, GFP_NOFS
);
76 if (IS_ERR(page
[i
])) {
78 btrfs_release_disk_super(super
[0]);
79 return PTR_ERR(page
[i
]);
81 super
[i
] = page_address(page
[i
]);
84 if (super
[0]->generation
> super
[1]->generation
)
85 sector
= zones
[1].start
;
87 sector
= zones
[0].start
;
89 for (i
= 0; i
< BTRFS_NR_SB_LOG_ZONES
; i
++)
90 btrfs_release_disk_super(super
[i
]);
91 } else if (!full
[0] && (empty
[1] || full
[1])) {
98 *wp_ret
= sector
<< SECTOR_SHIFT
;
103 * The following zones are reserved as the circular buffer on ZONED btrfs.
104 * - The primary superblock: zones 0 and 1
105 * - The first copy: zones 16 and 17
106 * - The second copy: zones 1024 or zone at 256GB which is minimum, and
109 static inline u32
sb_zone_number(int shift
, int mirror
)
111 ASSERT(mirror
< BTRFS_SUPER_MIRROR_MAX
);
116 case 2: return min_t(u64
, btrfs_sb_offset(mirror
) >> shift
, 1024);
122 static int btrfs_get_dev_zones(struct btrfs_device
*device
, u64 pos
,
123 struct blk_zone
*zones
, unsigned int *nr_zones
)
130 ret
= blkdev_report_zones(device
->bdev
, pos
>> SECTOR_SHIFT
, *nr_zones
,
131 copy_zone_info_cb
, zones
);
133 btrfs_err_in_rcu(device
->fs_info
,
134 "zoned: failed to read zone %llu on %s (devid %llu)",
135 pos
, rcu_str_deref(device
->name
),
146 int btrfs_get_dev_zone_info(struct btrfs_device
*device
)
148 struct btrfs_zoned_device_info
*zone_info
= NULL
;
149 struct block_device
*bdev
= device
->bdev
;
150 struct request_queue
*queue
= bdev_get_queue(bdev
);
153 struct blk_zone
*zones
= NULL
;
154 unsigned int i
, nreported
= 0, nr_zones
;
155 unsigned int zone_sectors
;
158 if (!bdev_is_zoned(bdev
))
161 if (device
->zone_info
)
164 zone_info
= kzalloc(sizeof(*zone_info
), GFP_KERNEL
);
168 nr_sectors
= bdev_nr_sectors(bdev
);
169 zone_sectors
= bdev_zone_sectors(bdev
);
170 /* Check if it's power of 2 (see is_power_of_2) */
171 ASSERT(zone_sectors
!= 0 && (zone_sectors
& (zone_sectors
- 1)) == 0);
172 zone_info
->zone_size
= zone_sectors
<< SECTOR_SHIFT
;
173 zone_info
->zone_size_shift
= ilog2(zone_info
->zone_size
);
174 zone_info
->max_zone_append_size
=
175 (u64
)queue_max_zone_append_sectors(queue
) << SECTOR_SHIFT
;
176 zone_info
->nr_zones
= nr_sectors
>> ilog2(zone_sectors
);
177 if (!IS_ALIGNED(nr_sectors
, zone_sectors
))
178 zone_info
->nr_zones
++;
180 zone_info
->seq_zones
= bitmap_zalloc(zone_info
->nr_zones
, GFP_KERNEL
);
181 if (!zone_info
->seq_zones
) {
186 zone_info
->empty_zones
= bitmap_zalloc(zone_info
->nr_zones
, GFP_KERNEL
);
187 if (!zone_info
->empty_zones
) {
192 zones
= kcalloc(BTRFS_REPORT_NR_ZONES
, sizeof(struct blk_zone
), GFP_KERNEL
);
199 while (sector
< nr_sectors
) {
200 nr_zones
= BTRFS_REPORT_NR_ZONES
;
201 ret
= btrfs_get_dev_zones(device
, sector
<< SECTOR_SHIFT
, zones
,
206 for (i
= 0; i
< nr_zones
; i
++) {
207 if (zones
[i
].type
== BLK_ZONE_TYPE_SEQWRITE_REQ
)
208 __set_bit(nreported
, zone_info
->seq_zones
);
209 if (zones
[i
].cond
== BLK_ZONE_COND_EMPTY
)
210 __set_bit(nreported
, zone_info
->empty_zones
);
213 sector
= zones
[nr_zones
- 1].start
+ zones
[nr_zones
- 1].len
;
216 if (nreported
!= zone_info
->nr_zones
) {
217 btrfs_err_in_rcu(device
->fs_info
,
218 "inconsistent number of zones on %s (%u/%u)",
219 rcu_str_deref(device
->name
), nreported
,
220 zone_info
->nr_zones
);
225 /* Validate superblock log */
226 nr_zones
= BTRFS_NR_SB_LOG_ZONES
;
227 for (i
= 0; i
< BTRFS_SUPER_MIRROR_MAX
; i
++) {
230 int sb_pos
= BTRFS_NR_SB_LOG_ZONES
* i
;
232 sb_zone
= sb_zone_number(zone_info
->zone_size_shift
, i
);
233 if (sb_zone
+ 1 >= zone_info
->nr_zones
)
236 sector
= sb_zone
<< (zone_info
->zone_size_shift
- SECTOR_SHIFT
);
237 ret
= btrfs_get_dev_zones(device
, sector
<< SECTOR_SHIFT
,
238 &zone_info
->sb_zones
[sb_pos
],
243 if (nr_zones
!= BTRFS_NR_SB_LOG_ZONES
) {
244 btrfs_err_in_rcu(device
->fs_info
,
245 "zoned: failed to read super block log zone info at devid %llu zone %u",
246 device
->devid
, sb_zone
);
252 * If zones[0] is conventional, always use the beggining of the
253 * zone to record superblock. No need to validate in that case.
255 if (zone_info
->sb_zones
[BTRFS_NR_SB_LOG_ZONES
* i
].type
==
256 BLK_ZONE_TYPE_CONVENTIONAL
)
259 ret
= sb_write_pointer(device
->bdev
,
260 &zone_info
->sb_zones
[sb_pos
], &sb_wp
);
261 if (ret
!= -ENOENT
&& ret
) {
262 btrfs_err_in_rcu(device
->fs_info
,
263 "zoned: super block log zone corrupted devid %llu zone %u",
264 device
->devid
, sb_zone
);
273 device
->zone_info
= zone_info
;
275 /* device->fs_info is not safe to use for printing messages */
276 btrfs_info_in_rcu(NULL
,
277 "host-%s zoned block device %s, %u zones of %llu bytes",
278 bdev_zoned_model(bdev
) == BLK_ZONED_HM
? "managed" : "aware",
279 rcu_str_deref(device
->name
), zone_info
->nr_zones
,
280 zone_info
->zone_size
);
286 bitmap_free(zone_info
->empty_zones
);
287 bitmap_free(zone_info
->seq_zones
);
293 void btrfs_destroy_dev_zone_info(struct btrfs_device
*device
)
295 struct btrfs_zoned_device_info
*zone_info
= device
->zone_info
;
300 bitmap_free(zone_info
->seq_zones
);
301 bitmap_free(zone_info
->empty_zones
);
303 device
->zone_info
= NULL
;
306 int btrfs_get_dev_zone(struct btrfs_device
*device
, u64 pos
,
307 struct blk_zone
*zone
)
309 unsigned int nr_zones
= 1;
312 ret
= btrfs_get_dev_zones(device
, pos
, zone
, &nr_zones
);
313 if (ret
!= 0 || !nr_zones
)
314 return ret
? ret
: -EIO
;
319 int btrfs_check_zoned_mode(struct btrfs_fs_info
*fs_info
)
321 struct btrfs_fs_devices
*fs_devices
= fs_info
->fs_devices
;
322 struct btrfs_device
*device
;
323 u64 zoned_devices
= 0;
326 u64 max_zone_append_size
= 0;
327 const bool incompat_zoned
= btrfs_is_zoned(fs_info
);
330 /* Count zoned devices */
331 list_for_each_entry(device
, &fs_devices
->devices
, dev_list
) {
332 enum blk_zoned_model model
;
337 model
= bdev_zoned_model(device
->bdev
);
338 if (model
== BLK_ZONED_HM
||
339 (model
== BLK_ZONED_HA
&& incompat_zoned
)) {
340 struct btrfs_zoned_device_info
*zone_info
;
342 zone_info
= device
->zone_info
;
345 zone_size
= zone_info
->zone_size
;
346 } else if (zone_info
->zone_size
!= zone_size
) {
348 "zoned: unequal block device zone sizes: have %llu found %llu",
349 device
->zone_info
->zone_size
,
354 if (!max_zone_append_size
||
355 (zone_info
->max_zone_append_size
&&
356 zone_info
->max_zone_append_size
< max_zone_append_size
))
357 max_zone_append_size
=
358 zone_info
->max_zone_append_size
;
363 if (!zoned_devices
&& !incompat_zoned
)
366 if (!zoned_devices
&& incompat_zoned
) {
367 /* No zoned block device found on ZONED filesystem */
369 "zoned: no zoned devices found on a zoned filesystem");
374 if (zoned_devices
&& !incompat_zoned
) {
376 "zoned: mode not enabled but zoned device found");
381 if (zoned_devices
!= nr_devices
) {
383 "zoned: cannot mix zoned and regular devices");
389 * stripe_size is always aligned to BTRFS_STRIPE_LEN in
390 * __btrfs_alloc_chunk(). Since we want stripe_len == zone_size,
391 * check the alignment here.
393 if (!IS_ALIGNED(zone_size
, BTRFS_STRIPE_LEN
)) {
395 "zoned: zone size %llu not aligned to stripe %u",
396 zone_size
, BTRFS_STRIPE_LEN
);
401 if (btrfs_fs_incompat(fs_info
, MIXED_GROUPS
)) {
402 btrfs_err(fs_info
, "zoned: mixed block groups not supported");
407 fs_info
->zone_size
= zone_size
;
408 fs_info
->max_zone_append_size
= max_zone_append_size
;
410 btrfs_info(fs_info
, "zoned mode enabled with zone size %llu", zone_size
);
415 int btrfs_check_mountopts_zoned(struct btrfs_fs_info
*info
)
417 if (!btrfs_is_zoned(info
))
421 * Space cache writing is not COWed. Disable that to avoid write errors
422 * in sequential zones.
424 if (btrfs_test_opt(info
, SPACE_CACHE
)) {
425 btrfs_err(info
, "zoned: space cache v1 is not supported");
429 if (btrfs_test_opt(info
, NODATACOW
)) {
430 btrfs_err(info
, "zoned: NODATACOW not supported");
437 static int sb_log_location(struct block_device
*bdev
, struct blk_zone
*zones
,
438 int rw
, u64
*bytenr_ret
)
443 if (zones
[0].type
== BLK_ZONE_TYPE_CONVENTIONAL
) {
444 *bytenr_ret
= zones
[0].start
<< SECTOR_SHIFT
;
448 ret
= sb_write_pointer(bdev
, zones
, &wp
);
449 if (ret
!= -ENOENT
&& ret
< 0)
453 struct blk_zone
*reset
= NULL
;
455 if (wp
== zones
[0].start
<< SECTOR_SHIFT
)
457 else if (wp
== zones
[1].start
<< SECTOR_SHIFT
)
460 if (reset
&& reset
->cond
!= BLK_ZONE_COND_EMPTY
) {
461 ASSERT(reset
->cond
== BLK_ZONE_COND_FULL
);
463 ret
= blkdev_zone_mgmt(bdev
, REQ_OP_ZONE_RESET
,
464 reset
->start
, reset
->len
,
469 reset
->cond
= BLK_ZONE_COND_EMPTY
;
470 reset
->wp
= reset
->start
;
472 } else if (ret
!= -ENOENT
) {
473 /* For READ, we want the precious one */
474 if (wp
== zones
[0].start
<< SECTOR_SHIFT
)
475 wp
= (zones
[1].start
+ zones
[1].len
) << SECTOR_SHIFT
;
476 wp
-= BTRFS_SUPER_INFO_SIZE
;
484 int btrfs_sb_log_location_bdev(struct block_device
*bdev
, int mirror
, int rw
,
487 struct blk_zone zones
[BTRFS_NR_SB_LOG_ZONES
];
488 unsigned int zone_sectors
;
492 u8 zone_sectors_shift
;
496 if (!bdev_is_zoned(bdev
)) {
497 *bytenr_ret
= btrfs_sb_offset(mirror
);
501 ASSERT(rw
== READ
|| rw
== WRITE
);
503 zone_sectors
= bdev_zone_sectors(bdev
);
504 if (!is_power_of_2(zone_sectors
))
506 zone_size
= zone_sectors
<< SECTOR_SHIFT
;
507 zone_sectors_shift
= ilog2(zone_sectors
);
508 nr_sectors
= bdev_nr_sectors(bdev
);
509 nr_zones
= nr_sectors
>> zone_sectors_shift
;
511 sb_zone
= sb_zone_number(zone_sectors_shift
+ SECTOR_SHIFT
, mirror
);
512 if (sb_zone
+ 1 >= nr_zones
)
515 ret
= blkdev_report_zones(bdev
, sb_zone
<< zone_sectors_shift
,
516 BTRFS_NR_SB_LOG_ZONES
, copy_zone_info_cb
,
520 if (ret
!= BTRFS_NR_SB_LOG_ZONES
)
523 return sb_log_location(bdev
, zones
, rw
, bytenr_ret
);
526 int btrfs_sb_log_location(struct btrfs_device
*device
, int mirror
, int rw
,
529 struct btrfs_zoned_device_info
*zinfo
= device
->zone_info
;
533 *bytenr_ret
= btrfs_sb_offset(mirror
);
537 zone_num
= sb_zone_number(zinfo
->zone_size_shift
, mirror
);
538 if (zone_num
+ 1 >= zinfo
->nr_zones
)
541 return sb_log_location(device
->bdev
,
542 &zinfo
->sb_zones
[BTRFS_NR_SB_LOG_ZONES
* mirror
],
546 static inline bool is_sb_log_zone(struct btrfs_zoned_device_info
*zinfo
,
554 zone_num
= sb_zone_number(zinfo
->zone_size_shift
, mirror
);
555 if (zone_num
+ 1 >= zinfo
->nr_zones
)
558 if (!test_bit(zone_num
, zinfo
->seq_zones
))
564 void btrfs_advance_sb_log(struct btrfs_device
*device
, int mirror
)
566 struct btrfs_zoned_device_info
*zinfo
= device
->zone_info
;
567 struct blk_zone
*zone
;
569 if (!is_sb_log_zone(zinfo
, mirror
))
572 zone
= &zinfo
->sb_zones
[BTRFS_NR_SB_LOG_ZONES
* mirror
];
573 if (zone
->cond
!= BLK_ZONE_COND_FULL
) {
574 if (zone
->cond
== BLK_ZONE_COND_EMPTY
)
575 zone
->cond
= BLK_ZONE_COND_IMP_OPEN
;
577 zone
->wp
+= (BTRFS_SUPER_INFO_SIZE
>> SECTOR_SHIFT
);
579 if (zone
->wp
== zone
->start
+ zone
->len
)
580 zone
->cond
= BLK_ZONE_COND_FULL
;
586 ASSERT(zone
->cond
!= BLK_ZONE_COND_FULL
);
587 if (zone
->cond
== BLK_ZONE_COND_EMPTY
)
588 zone
->cond
= BLK_ZONE_COND_IMP_OPEN
;
590 zone
->wp
+= (BTRFS_SUPER_INFO_SIZE
>> SECTOR_SHIFT
);
592 if (zone
->wp
== zone
->start
+ zone
->len
)
593 zone
->cond
= BLK_ZONE_COND_FULL
;
596 int btrfs_reset_sb_log_zones(struct block_device
*bdev
, int mirror
)
598 sector_t zone_sectors
;
600 u8 zone_sectors_shift
;
604 zone_sectors
= bdev_zone_sectors(bdev
);
605 zone_sectors_shift
= ilog2(zone_sectors
);
606 nr_sectors
= bdev_nr_sectors(bdev
);
607 nr_zones
= nr_sectors
>> zone_sectors_shift
;
609 sb_zone
= sb_zone_number(zone_sectors_shift
+ SECTOR_SHIFT
, mirror
);
610 if (sb_zone
+ 1 >= nr_zones
)
613 return blkdev_zone_mgmt(bdev
, REQ_OP_ZONE_RESET
,
614 sb_zone
<< zone_sectors_shift
,
615 zone_sectors
* BTRFS_NR_SB_LOG_ZONES
, GFP_NOFS
);