// SPDX-License-Identifier: GPL-2.0
/*
 * Zoned block device handling
 *
 * Copyright (c) 2015, Hannes Reinecke
 * Copyright (c) 2015, SUSE Linux GmbH
 *
 * Copyright (c) 2016, Damien Le Moal
 * Copyright (c) 2016, Western Digital
 * Copyright (c) 2024, Western Digital Corporation or its affiliates.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
#include <linux/spinlock.h>
#include <linux/refcount.h>
#include <linux/mempool.h>

#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-mq-debugfs.h"

#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
static const char *const zone_cond_name[] = {
	ZONE_COND_NAME(NOT_WP),
	ZONE_COND_NAME(EMPTY),
	ZONE_COND_NAME(IMP_OPEN),
	ZONE_COND_NAME(EXP_OPEN),
	ZONE_COND_NAME(CLOSED),
	ZONE_COND_NAME(READONLY),
	ZONE_COND_NAME(FULL),
	ZONE_COND_NAME(OFFLINE),
};
#undef ZONE_COND_NAME
/*
 * Per-zone write plug.
 * @node: hlist_node structure for managing the plug using a hash table.
 * @link: To list the plug in the zone write plug error list of the disk.
 * @ref: Zone write plug reference counter. A zone write plug reference is
 *       always at least 1 when the plug is hashed in the disk plug hash table.
 *       The reference is incremented whenever a new BIO needing plugging is
 *       submitted and when a function needs to manipulate a plug. The
 *       reference count is decremented whenever a plugged BIO completes and
 *       when a function that referenced the plug returns. The initial
 *       reference is dropped whenever the zone of the zone write plug is reset,
 *       finished and when the zone becomes full (last write BIO to the zone
 *       completes).
 * @lock: Spinlock to atomically manipulate the plug.
 * @flags: Flags indicating the plug state.
 * @zone_no: The number of the zone the plug is managing.
 * @wp_offset: The zone write pointer location relative to the start of the zone
 *             as a number of 512B sectors.
 * @bio_list: The list of BIOs that are currently plugged.
 * @bio_work: Work struct to handle issuing of plugged BIOs.
 * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
 * @disk: The gendisk the plug belongs to.
 */
struct blk_zone_wplug {
	struct hlist_node	node;
	struct list_head	link;
	refcount_t		ref;
	spinlock_t		lock;
	unsigned int		flags;
	unsigned int		zone_no;
	unsigned int		wp_offset;
	struct bio_list		bio_list;
	struct work_struct	bio_work;
	struct rcu_head		rcu_head;
	struct gendisk		*disk;
};

/*
 * Zone write plug flags bits:
 *  - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
 *    that is, that write BIOs are being throttled due to a write BIO already
 *    being executed or the zone write plug bio list is not empty.
 *  - BLK_ZONE_WPLUG_ERROR: Indicates that a write error happened which will be
 *    recovered with a report zone to update the zone write pointer offset.
 *  - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed
 *    from the disk hash table and that the initial reference to the zone
 *    write plug set when the plug was first added to the hash table has been
 *    dropped. This flag is set when a zone is reset, finished or becomes full,
 *    to prevent new references to the zone write plug from being taken for
 *    newly incoming BIOs. A zone write plug flagged with this flag will be
 *    freed once all remaining references from BIOs or functions are dropped.
 */
#define BLK_ZONE_WPLUG_PLUGGED		(1U << 0)
#define BLK_ZONE_WPLUG_ERROR		(1U << 1)
#define BLK_ZONE_WPLUG_UNHASHED		(1U << 2)

#define BLK_ZONE_WPLUG_BUSY	(BLK_ZONE_WPLUG_PLUGGED | BLK_ZONE_WPLUG_ERROR)
/**
 * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX.
 * @zone_cond: BLK_ZONE_COND_XXX.
 *
 * Description: Centralize block layer function to convert BLK_ZONE_COND_XXX
 * into string format. Useful in the debugging and tracing of zone conditions.
 * For an invalid BLK_ZONE_COND_XXX it returns the string "UNKNOWN".
 */
const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
{
	static const char *zone_cond_str = "UNKNOWN";

	if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond])
		zone_cond_str = zone_cond_name[zone_cond];

	return zone_cond_str;
}
EXPORT_SYMBOL_GPL(blk_zone_cond_str);
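
/*
 * Example (illustrative sketch only, not part of this file): a driver or
 * tracing helper can use blk_zone_cond_str() to log a reported zone
 * condition. The pr_debug() call and the my_report_cb() callback name are
 * hypothetical and only shown for usage context.
 *
 *	static int my_report_cb(struct blk_zone *zone, unsigned int idx,
 *				void *data)
 *	{
 *		pr_debug("zone %llu: cond %s\n", zone->start,
 *			 blk_zone_cond_str(zone->cond));
 *		return 0;
 *	}
 */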
/**
 * blkdev_report_zones - Get zones information
 * @bdev:	Target block device
 * @sector:	Sector from which to report zones
 * @nr_zones:	Maximum number of zones to report
 * @cb:		Callback function called for each reported zone
 * @data:	Private data for the callback
 *
 * Description:
 *    Get zone information starting from the zone containing @sector for at
 *    most @nr_zones, and call @cb for each zone reported by the device.
 *    To report all zones in a device starting from @sector, the BLK_ALL_ZONES
 *    constant can be passed to @nr_zones.
 *    Returns the number of zones reported by the device, or a negative errno
 *    value in case of failure.
 *
 *    Note: The caller must use memalloc_noXX_save/restore() calls to control
 *    memory allocations done within this function.
 */
int blkdev_report_zones(struct block_device *bdev, sector_t sector,
			unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct gendisk *disk = bdev->bd_disk;
	sector_t capacity = get_capacity(disk);

	if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
		return -EOPNOTSUPP;

	if (!nr_zones || sector >= capacity)
		return 0;

	return disk->fops->report_zones(disk, sector, nr_zones, cb, data);
}
EXPORT_SYMBOL_GPL(blkdev_report_zones);
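
/*
 * Example (illustrative sketch only): how a caller could report all zones of
 * a zoned block device while honoring the memory allocation note above. The
 * my_zone_cb() callback and my_data pointer are hypothetical.
 *
 *	unsigned int noio_flag = memalloc_noio_save();
 *	int ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES,
 *				      my_zone_cb, my_data);
 *	memalloc_noio_restore(noio_flag);
 *	if (ret < 0)
 *		pr_warn("report zones failed %d\n", ret);
 */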
static int blkdev_zone_reset_all(struct block_device *bdev)
{
	struct bio bio;

	bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC);
	return submit_bio_wait(&bio);
}

/**
 * blkdev_zone_mgmt - Execute a zone management operation on a range of zones
 * @bdev:	Target block device
 * @op:		Operation to be performed on the zones
 * @sector:	Start sector of the first zone to operate on
 * @nr_sectors:	Number of sectors, should be at least the length of one zone
 *		and must be zone size aligned.
 *
 * Description:
 *    Perform the specified operation on the range of zones specified by
 *    @sector..@sector+@nr_sectors. Specifying the entire disk sector range
 *    is valid, but the specified range should not contain conventional zones.
 *    The operation to execute on each zone can be a zone reset, open, close
 *    or finish request.
 */
int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
		     sector_t sector, sector_t nr_sectors)
{
	sector_t zone_sectors = bdev_zone_sectors(bdev);
	sector_t capacity = bdev_nr_sectors(bdev);
	sector_t end_sector = sector + nr_sectors;
	struct bio *bio = NULL;
	int ret;

	if (!bdev_is_zoned(bdev))
		return -EOPNOTSUPP;

	if (bdev_read_only(bdev))
		return -EPERM;

	if (!op_is_zone_mgmt(op))
		return -EOPNOTSUPP;

	if (end_sector <= sector || end_sector > capacity)
		/* Out of range */
		return -EINVAL;

	/* Check alignment (handle eventual smaller last zone) */
	if (!bdev_is_zone_start(bdev, sector))
		return -EINVAL;

	if (!bdev_is_zone_start(bdev, nr_sectors) && end_sector != capacity)
		return -EINVAL;

	/*
	 * In the case of a zone reset operation over all zones, use
	 * REQ_OP_ZONE_RESET_ALL.
	 */
	if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity)
		return blkdev_zone_reset_all(bdev);

	while (sector < end_sector) {
		bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL);
		bio->bi_iter.bi_sector = sector;
		sector += zone_sectors;

		/* This may take a while, so be nice to others */
		cond_resched();
	}

	ret = submit_bio_wait(bio);
	bio_put(bio);

	return ret;
}
EXPORT_SYMBOL_GPL(blkdev_zone_mgmt);
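
/*
 * Example (illustrative sketch only): resetting a single zone with
 * blkdev_zone_mgmt(). The bdev and zone_sector variables are assumed to be
 * provided by the caller and are hypothetical here.
 *
 *	sector_t zone_sectors = bdev_zone_sectors(bdev);
 *	int ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
 *				   zone_sector, zone_sectors);
 *	if (ret)
 *		pr_warn("zone reset failed %d\n", ret);
 */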
struct zone_report_args {
	struct blk_zone __user *zones;
};

static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx,
				    void *data)
{
	struct zone_report_args *args = data;

	if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone)))
		return -EFAULT;
	return 0;
}

/*
 * BLKREPORTZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
			      unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct zone_report_args args;
	struct blk_zone_report rep;
	int ret;

	if (!argp)
		return -EINVAL;

	if (!bdev_is_zoned(bdev))
		return -ENOTTY;

	if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
		return -EFAULT;

	if (!rep.nr_zones)
		return -EINVAL;

	args.zones = argp + sizeof(struct blk_zone_report);
	ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
				  blkdev_copy_zone_to_user, &args);
	if (ret < 0)
		return ret;

	rep.nr_zones = ret;
	rep.flags = BLK_ZONE_REP_CAPACITY;
	if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report)))
		return -EFAULT;
	return 0;
}
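
/*
 * Example (illustrative userspace sketch, not kernel code): issuing
 * BLKREPORTZONE on an open zoned block device file descriptor. The buffer
 * sizing and error handling are minimal and only meant to show the ioctl
 * calling convention handled above.
 *
 *	struct blk_zone_report *rep =
 *		calloc(1, sizeof(*rep) + 16 * sizeof(struct blk_zone));
 *	rep->sector = 0;
 *	rep->nr_zones = 16;
 *	if (ioctl(fd, BLKREPORTZONE, rep) < 0)
 *		perror("BLKREPORTZONE");
 */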
static int blkdev_truncate_zone_range(struct block_device *bdev,
		blk_mode_t mode, const struct blk_zone_range *zrange)
{
	loff_t start, end;

	if (zrange->sector + zrange->nr_sectors <= zrange->sector ||
	    zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk))
		/* Out of range */
		return -EINVAL;

	start = zrange->sector << SECTOR_SHIFT;
	end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1;

	return truncate_bdev_range(bdev, mode, start, end);
}

/*
 * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
			   unsigned int cmd, unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct blk_zone_range zrange;
	enum req_op op;
	int ret;

	if (!argp)
		return -EINVAL;

	if (!bdev_is_zoned(bdev))
		return -ENOTTY;

	if (!(mode & BLK_OPEN_WRITE))
		return -EBADF;

	if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
		return -EFAULT;

	switch (cmd) {
	case BLKRESETZONE:
		op = REQ_OP_ZONE_RESET;

		/* Invalidate the page cache, including dirty pages. */
		filemap_invalidate_lock(bdev->bd_mapping);
		ret = blkdev_truncate_zone_range(bdev, mode, &zrange);
		if (ret)
			goto fail;
		break;
	case BLKOPENZONE:
		op = REQ_OP_ZONE_OPEN;
		break;
	case BLKCLOSEZONE:
		op = REQ_OP_ZONE_CLOSE;
		break;
	case BLKFINISHZONE:
		op = REQ_OP_ZONE_FINISH;
		break;
	default:
		return -ENOTTY;
	}

	ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);

fail:
	if (cmd == BLKRESETZONE)
		filemap_invalidate_unlock(bdev->bd_mapping);

	return ret;
}
static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone)
{
	return zone->start + zone->len >= get_capacity(disk);
}

static bool disk_zone_is_full(struct gendisk *disk,
			      unsigned int zno, unsigned int offset_in_zone)
{
	if (zno < disk->nr_zones - 1)
		return offset_in_zone >= disk->zone_capacity;
	return offset_in_zone >= disk->last_zone_capacity;
}

static bool disk_zone_wplug_is_full(struct gendisk *disk,
				    struct blk_zone_wplug *zwplug)
{
	return disk_zone_is_full(disk, zwplug->zone_no, zwplug->wp_offset);
}

static bool disk_insert_zone_wplug(struct gendisk *disk,
				   struct blk_zone_wplug *zwplug)
{
	struct blk_zone_wplug *zwplg;
	unsigned long flags;
	unsigned int idx =
		hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits);

	/*
	 * Add the new zone write plug to the hash table, but carefully as we
	 * are racing with other submission context, so we may already have a
	 * zone write plug for the same zone.
	 */
	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) {
		if (zwplg->zone_no == zwplug->zone_no) {
			spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
			return false;
		}
	}
	hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);

	return true;
}
static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
						  sector_t sector)
{
	unsigned int zno = disk_zone_no(disk, sector);
	unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits);
	struct blk_zone_wplug *zwplug;

	rcu_read_lock();

	hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) {
		if (zwplug->zone_no == zno &&
		    refcount_inc_not_zero(&zwplug->ref)) {
			rcu_read_unlock();
			return zwplug;
		}
	}

	rcu_read_unlock();

	return NULL;
}

static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
{
	struct blk_zone_wplug *zwplug =
		container_of(rcu_head, struct blk_zone_wplug, rcu_head);

	mempool_free(zwplug, zwplug->disk->zone_wplugs_pool);
}

static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
{
	if (refcount_dec_and_test(&zwplug->ref)) {
		WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));
		WARN_ON_ONCE(!list_empty(&zwplug->link));
		WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED));

		call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
	}
}
static inline bool disk_should_remove_zone_wplug(struct gendisk *disk,
						 struct blk_zone_wplug *zwplug)
{
	/* If the zone write plug was already removed, we are done. */
	if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
		return false;

	/* If the zone write plug is still busy, it cannot be removed. */
	if (zwplug->flags & BLK_ZONE_WPLUG_BUSY)
		return false;

	/*
	 * Completions of BIOs with blk_zone_write_plug_bio_endio() may
	 * happen after handling a request completion with
	 * blk_zone_write_plug_finish_request() (e.g. with split BIOs
	 * that are chained). In such case, disk_zone_wplug_unplug_bio()
	 * should not attempt to remove the zone write plug until all BIO
	 * completions are seen. Check by looking at the zone write plug
	 * reference count, which is 2 when the plug is unused (one reference
	 * taken when the plug was allocated and another reference taken by the
	 * caller context).
	 */
	if (refcount_read(&zwplug->ref) > 2)
		return false;

	/* We can remove zone write plugs for zones that are empty or full. */
	return !zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug);
}

static void disk_remove_zone_wplug(struct gendisk *disk,
				   struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	/* If the zone write plug was already removed, we have nothing to do. */
	if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
		return;

	/*
	 * Mark the zone write plug as unhashed and drop the extra reference we
	 * took when the plug was inserted in the hash table.
	 */
	zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED;
	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	hlist_del_init_rcu(&zwplug->node);
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
	disk_put_zone_wplug(zwplug);
}

static void blk_zone_wplug_bio_work(struct work_struct *work);
/*
 * Get a reference on the write plug for the zone containing @sector.
 * If the plug does not exist, it is allocated and hashed.
 * Return a pointer to the zone write plug with the plug spinlock held.
 */
static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
					sector_t sector, gfp_t gfp_mask,
					unsigned long *flags)
{
	unsigned int zno = disk_zone_no(disk, sector);
	struct blk_zone_wplug *zwplug;

again:
	zwplug = disk_get_zone_wplug(disk, sector);
	if (zwplug) {
		/*
		 * Check that a BIO completion or a zone reset or finish
		 * operation has not already removed the zone write plug from
		 * the hash table and dropped its reference count. In such case,
		 * we need to get a new plug so start over from the beginning.
		 */
		spin_lock_irqsave(&zwplug->lock, *flags);
		if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
			spin_unlock_irqrestore(&zwplug->lock, *flags);
			disk_put_zone_wplug(zwplug);
			goto again;
		}
		return zwplug;
	}

	/*
	 * Allocate and initialize a zone write plug with an extra reference
	 * so that it is not freed when the zone write plug becomes idle without
	 * the zone being full.
	 */
	zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask);
	if (!zwplug)
		return NULL;

	INIT_HLIST_NODE(&zwplug->node);
	INIT_LIST_HEAD(&zwplug->link);
	refcount_set(&zwplug->ref, 2);
	spin_lock_init(&zwplug->lock);
	zwplug->flags = 0;
	zwplug->zone_no = zno;
	zwplug->wp_offset = sector & (disk->queue->limits.chunk_sectors - 1);
	bio_list_init(&zwplug->bio_list);
	INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
	zwplug->disk = disk;

	spin_lock_irqsave(&zwplug->lock, *flags);

	/*
	 * Insert the new zone write plug in the hash table. This can fail only
	 * if another context already inserted a plug. Retry from the beginning
	 * in such case.
	 */
	if (!disk_insert_zone_wplug(disk, zwplug)) {
		spin_unlock_irqrestore(&zwplug->lock, *flags);
		mempool_free(zwplug, disk->zone_wplugs_pool);
		goto again;
	}

	return zwplug;
}

static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug,
					       struct bio *bio)
{
	struct request_queue *q = zwplug->disk->queue;

	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
	bio_io_error(bio);
	disk_put_zone_wplug(zwplug);
	blk_queue_exit(q);
}

/*
 * Abort (fail) all plugged BIOs of a zone write plug.
 */
static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
{
	struct bio *bio;

	while ((bio = bio_list_pop(&zwplug->bio_list)))
		blk_zone_wplug_bio_io_error(zwplug, bio);
}
/*
 * Abort (fail) all plugged BIOs of a zone write plug that are not aligned
 * with the assumed write pointer location of the zone when the BIO will
 * be issued.
 */
static void disk_zone_wplug_abort_unaligned(struct gendisk *disk,
					    struct blk_zone_wplug *zwplug)
{
	unsigned int wp_offset = zwplug->wp_offset;
	struct bio_list bl = BIO_EMPTY_LIST;
	struct bio *bio;

	while ((bio = bio_list_pop(&zwplug->bio_list))) {
		if (disk_zone_is_full(disk, zwplug->zone_no, wp_offset) ||
		    (bio_op(bio) != REQ_OP_ZONE_APPEND &&
		     bio_offset_from_zone_start(bio) != wp_offset)) {
			blk_zone_wplug_bio_io_error(zwplug, bio);
			continue;
		}

		wp_offset += bio_sectors(bio);
		bio_list_add(&bl, bio);
	}

	bio_list_merge(&zwplug->bio_list, &bl);
}

static inline void disk_zone_wplug_set_error(struct gendisk *disk,
					     struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	if (zwplug->flags & BLK_ZONE_WPLUG_ERROR)
		return;

	/*
	 * At this point, we already have a reference on the zone write plug.
	 * However, since we are going to add the plug to the disk zone write
	 * plugs work list, increase its reference count. This reference will
	 * be dropped in disk_zone_wplugs_work() once the error state is
	 * handled, or in disk_zone_wplug_clear_error() if the zone is reset or
	 * finished.
	 */
	zwplug->flags |= BLK_ZONE_WPLUG_ERROR;
	refcount_inc(&zwplug->ref);

	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	list_add_tail(&zwplug->link, &disk->zone_wplugs_err_list);
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
}

static inline void disk_zone_wplug_clear_error(struct gendisk *disk,
					       struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR))
		return;

	/*
	 * We are racing with the error handling work which drops the reference
	 * on the zone write plug after handling the error state. So remove the
	 * plug from the error list and drop its reference count only if the
	 * error handling has not yet started, that is, if the zone write plug
	 * is still listed.
	 */
	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	if (!list_empty(&zwplug->link)) {
		list_del_init(&zwplug->link);
		zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR;
		disk_put_zone_wplug(zwplug);
	}
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
}
/*
 * Set a zone write plug write pointer offset to either 0 (zone reset case)
 * or to the zone size (zone finish case). This aborts all plugged BIOs, which
 * is fine to do as doing a zone reset or zone finish while writes are in-flight
 * is a mistake from the user which will most likely cause all plugged BIOs to
 * fail.
 */
static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
					  struct blk_zone_wplug *zwplug,
					  unsigned int wp_offset)
{
	unsigned long flags;

	spin_lock_irqsave(&zwplug->lock, flags);

	/*
	 * Make sure that a BIO completion or another zone reset or finish
	 * operation has not already removed the plug from the hash table.
	 */
	if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
		spin_unlock_irqrestore(&zwplug->lock, flags);
		return;
	}

	/* Update the zone write pointer and abort all plugged BIOs. */
	zwplug->wp_offset = wp_offset;
	disk_zone_wplug_abort(zwplug);

	/*
	 * Updating the write pointer offset puts back the zone
	 * in a good state. So clear the error flag and decrement the
	 * error count if we were in error state.
	 */
	disk_zone_wplug_clear_error(disk, zwplug);

	/*
	 * The zone write plug now has no BIO plugged: remove it from the
	 * hash table so that it cannot be seen. The plug will be freed
	 * when the last reference is dropped.
	 */
	if (disk_should_remove_zone_wplug(disk, zwplug))
		disk_remove_zone_wplug(disk, zwplug);

	spin_unlock_irqrestore(&zwplug->lock, flags);
}

static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio,
						  unsigned int wp_offset)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	sector_t sector = bio->bi_iter.bi_sector;
	struct blk_zone_wplug *zwplug;

	/* Conventional zones cannot be reset nor finished. */
	if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
		bio_io_error(bio);
		return true;
	}

	/*
	 * If we have a zone write plug, set its write pointer offset to 0
	 * (reset case) or to the zone size (finish case). This will abort all
	 * BIOs plugged for the target zone. It is fine as resetting or
	 * finishing zones while writes are still in-flight will result in the
	 * writes failing anyway.
	 */
	zwplug = disk_get_zone_wplug(disk, sector);
	if (zwplug) {
		disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset);
		disk_put_zone_wplug(zwplug);
	}

	return false;
}
static bool blk_zone_wplug_handle_reset_all(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug;
	sector_t sector;

	/*
	 * Set the write pointer offset of all zone write plugs to 0. This will
	 * abort all plugged BIOs. It is fine as resetting zones while writes
	 * are still in-flight will result in the writes failing anyway.
	 */
	for (sector = 0; sector < get_capacity(disk);
	     sector += disk->queue->limits.chunk_sectors) {
		zwplug = disk_get_zone_wplug(disk, sector);
		if (zwplug) {
			disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
			disk_put_zone_wplug(zwplug);
		}
	}

	return false;
}

static inline void blk_zone_wplug_add_bio(struct blk_zone_wplug *zwplug,
					  struct bio *bio, unsigned int nr_segs)
{
	/*
	 * Grab an extra reference on the BIO request queue usage counter.
	 * This reference will be reused to submit a request for the BIO for
	 * blk-mq devices and dropped when the BIO is failed and after
	 * it is issued in the case of BIO-based devices.
	 */
	percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter);

	/*
	 * The BIO is being plugged and thus will have to wait for the on-going
	 * write and for all other writes already plugged. So polling makes
	 * no sense.
	 */
	bio_clear_polled(bio);

	/*
	 * Reuse the poll cookie field to store the number of segments when
	 * split to the hardware limits.
	 */
	bio->__bi_nr_segments = nr_segs;

	/*
	 * We always receive BIOs after they are split and ready to be issued.
	 * The block layer passes the parts of a split BIO in order, and the
	 * user must also issue writes sequentially. So simply add the new BIO
	 * at the tail of the list to preserve the sequential write order.
	 */
	bio_list_add(&zwplug->bio_list, bio);
}
/*
 * Called from bio_attempt_back_merge() when a BIO was merged with a request.
 */
void blk_zone_write_plug_bio_merged(struct bio *bio)
{
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	/*
	 * If the BIO was already plugged, then we were called through
	 * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge().
	 * For this case, we already hold a reference on the zone write plug for
	 * the BIO and blk_zone_write_plug_init_request() will handle the
	 * zone write pointer offset update.
	 */
	if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
		return;

	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * Get a reference on the zone write plug of the target zone and advance
	 * the zone write pointer offset. Given that this is a merge, we already
	 * have at least one request and one BIO referencing the zone write
	 * plug. So this should not fail.
	 */
	zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk,
				     bio->bi_iter.bi_sector);
	if (WARN_ON_ONCE(!zwplug))
		return;

	spin_lock_irqsave(&zwplug->lock, flags);
	zwplug->wp_offset += bio_sectors(bio);
	spin_unlock_irqrestore(&zwplug->lock, flags);
}

/*
 * Attempt to merge plugged BIOs with a newly prepared request for a BIO that
 * already went through zone write plugging (either a new BIO or one that was
 * unplugged).
 */
void blk_zone_write_plug_init_request(struct request *req)
{
	sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req);
	struct request_queue *q = req->q;
	struct gendisk *disk = q->disk;
	struct blk_zone_wplug *zwplug =
		disk_get_zone_wplug(disk, blk_rq_pos(req));
	unsigned long flags;
	struct bio *bio;

	if (WARN_ON_ONCE(!zwplug))
		return;

	/*
	 * Indicate that completion of this request needs to be handled with
	 * blk_zone_write_plug_finish_request(), which will drop the reference
	 * on the zone write plug we took above on entry to this function.
	 */
	req->rq_flags |= RQF_ZONE_WRITE_PLUGGING;

	if (blk_queue_nomerges(q))
		return;

	/*
	 * Walk through the list of plugged BIOs to check if they can be merged
	 * into the back of the request.
	 */
	spin_lock_irqsave(&zwplug->lock, flags);
	while (!disk_zone_wplug_is_full(disk, zwplug)) {
		bio = bio_list_peek(&zwplug->bio_list);
		if (!bio)
			break;

		if (bio->bi_iter.bi_sector != req_back_sector ||
		    !blk_rq_merge_ok(req, bio))
			break;

		WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES &&
			     !bio->__bi_nr_segments);

		bio_list_pop(&zwplug->bio_list);
		if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) !=
		    BIO_MERGE_OK) {
			bio_list_add_head(&zwplug->bio_list, bio);
			break;
		}

		/*
		 * Drop the extra reference on the queue usage we got when
		 * plugging the BIO and advance the write pointer offset.
		 */
		blk_queue_exit(q);
		zwplug->wp_offset += bio_sectors(bio);

		req_back_sector += bio_sectors(bio);
	}
	spin_unlock_irqrestore(&zwplug->lock, flags);
}
/*
 * Check and prepare a BIO for submission by incrementing the write pointer
 * offset of its zone write plug and changing zone append operations into
 * regular write when zone append emulation is needed.
 */
static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
				       struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;

	/*
	 * Check that the user is not attempting to write to a full zone.
	 * We know such BIO will fail, and that would potentially overflow our
	 * write pointer offset beyond the end of the zone.
	 */
	if (disk_zone_wplug_is_full(disk, zwplug))
		goto err;

	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		/*
		 * Use a regular write starting at the current write pointer.
		 * Similarly to native zone append operations, do not allow
		 * merging.
		 */
		bio->bi_opf &= ~REQ_OP_MASK;
		bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE;
		bio->bi_iter.bi_sector += zwplug->wp_offset;

		/*
		 * Remember that this BIO is in fact a zone append operation
		 * so that we can restore its operation code on completion.
		 */
		bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND);
	} else {
		/*
		 * Check for non-sequential writes early because we avoid a
		 * whole lot of error handling trouble if we don't send it off
		 * to the driver.
		 */
		if (bio_offset_from_zone_start(bio) != zwplug->wp_offset)
			goto err;
	}

	/* Advance the zone write pointer offset. */
	zwplug->wp_offset += bio_sectors(bio);

	return true;

err:
	/* We detected an invalid write BIO: schedule error recovery. */
	disk_zone_wplug_set_error(disk, zwplug);
	kblockd_schedule_work(&disk->zone_wplugs_work);
	return false;
}

static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	sector_t sector = bio->bi_iter.bi_sector;
	struct blk_zone_wplug *zwplug;
	gfp_t gfp_mask = GFP_NOIO;
	unsigned long flags;

	/*
	 * BIOs must be fully contained within a zone so that we use the correct
	 * zone write plug for the entire BIO. For blk-mq devices, the block
	 * layer should already have done any splitting required to ensure this
	 * and this BIO should thus not be straddling zone boundaries. For
	 * BIO-based devices, it is the responsibility of the driver to split
	 * the bio before submitting it.
	 */
	if (WARN_ON_ONCE(bio_straddles_zones(bio))) {
		bio_io_error(bio);
		return true;
	}

	/* Conventional zones do not need write plugging. */
	if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
		/* Zone append to conventional zones is not allowed. */
		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
			bio_io_error(bio);
			return true;
		}
		return false;
	}

	if (bio->bi_opf & REQ_NOWAIT)
		gfp_mask = GFP_NOWAIT;

	zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags);
	if (!zwplug) {
		if (bio->bi_opf & REQ_NOWAIT)
			bio_wouldblock_error(bio);
		else
			bio_io_error(bio);
		return true;
	}

	/* Indicate that this BIO is being handled using zone write plugging. */
	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * If the zone is already plugged or has a pending error, add the BIO
	 * to the plug BIO list. Otherwise, plug and let the BIO execute.
	 */
	if (zwplug->flags & BLK_ZONE_WPLUG_BUSY)
		goto plug;

	/*
	 * If an error is detected when preparing the BIO, add it to the BIO
	 * list so that error recovery can deal with it.
	 */
	if (!blk_zone_wplug_prepare_bio(zwplug, bio))
		goto plug;

	zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;

	spin_unlock_irqrestore(&zwplug->lock, flags);

	return false;

plug:
	zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
	blk_zone_wplug_add_bio(zwplug, bio, nr_segs);

	spin_unlock_irqrestore(&zwplug->lock, flags);

	return true;
}
/**
 * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
 * @bio: The BIO being submitted
 * @nr_segs: The number of physical segments of @bio
 *
 * Handle write, write zeroes and zone append operations requiring emulation
 * using zone write plugging.
 *
 * Return true whenever @bio execution needs to be delayed through the zone
 * write plug. Otherwise, return false to let the submission path process
 * @bio normally.
 */
bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
{
	struct block_device *bdev = bio->bi_bdev;

	if (!bdev->bd_disk->zone_wplugs_hash)
		return false;

	/*
	 * If the BIO already has the plugging flag set, then it was already
	 * handled through this path and this is a submission from the zone
	 * plug bio submit work.
	 */
	if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
		return false;

	/*
	 * We do not need to do anything special for empty flush BIOs, e.g.
	 * BIOs such as issued by blkdev_issue_flush(). This is because it is
	 * the responsibility of the user to first wait for the completion of
	 * write operations for flush to have any effect on the persistence of
	 * the written data.
	 */
	if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
		return false;

	/*
	 * Regular writes and write zeroes need to be handled through the target
	 * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH
	 * which may need to go through the flush machinery depending on the
	 * target device capabilities. Plugging such writes is fine as the flush
	 * machinery operates at the request level, below the plug, and
	 * completion of the flush sequence will go through the regular BIO
	 * completion, which will handle zone write plugging.
	 * Zone append operations for devices that requested emulation must
	 * also be plugged so that these BIOs can be changed into regular
	 * write BIOs.
	 * Zone reset, reset all and finish commands need special treatment
	 * to correctly track the write pointer offset of zones. These commands
	 * are not plugged as we do not need serialization with write
	 * operations. It is the responsibility of the user to not issue reset
	 * and finish commands when write operations are in flight.
	 */
	switch (bio_op(bio)) {
	case REQ_OP_ZONE_APPEND:
		if (!bdev_emulates_zone_append(bdev))
			return false;
		fallthrough;
	case REQ_OP_WRITE:
	case REQ_OP_WRITE_ZEROES:
		return blk_zone_wplug_handle_write(bio, nr_segs);
	case REQ_OP_ZONE_RESET:
		return blk_zone_wplug_handle_reset_or_finish(bio, 0);
	case REQ_OP_ZONE_FINISH:
		return blk_zone_wplug_handle_reset_or_finish(bio,
						bdev_zone_sectors(bdev));
	case REQ_OP_ZONE_RESET_ALL:
		return blk_zone_wplug_handle_reset_all(bio);
	default:
		return false;
	}
}
EXPORT_SYMBOL_GPL(blk_zone_plug_bio);
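
/*
 * Example (illustrative sketch only): how a BIO-based driver relying on zone
 * append emulation could hook zone write plugging in its submission path.
 * The my_submit_bio() and my_issue_bio() names are hypothetical; blk-mq
 * drivers do not need this, as the block layer calls blk_zone_plug_bio()
 * for them.
 *
 *	static void my_submit_bio(struct bio *bio)
 *	{
 *		if (blk_zone_plug_bio(bio, 0))
 *			return;	// BIO plugged, it will be resubmitted later.
 *		my_issue_bio(bio);	// hypothetical driver issue path
 *	}
 */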
static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk,
					      struct blk_zone_wplug *zwplug)
{
	/*
	 * Take a reference on the zone write plug and schedule the submission
	 * of the next plugged BIO. blk_zone_wplug_bio_work() will release the
	 * reference we take here.
	 */
	WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
	refcount_inc(&zwplug->ref);
	queue_work(disk->zone_wplugs_wq, &zwplug->bio_work);
}

static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
				       struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	spin_lock_irqsave(&zwplug->lock, flags);

	/*
	 * If we had an error, schedule error recovery. The recovery work
	 * will restart submission of plugged BIOs.
	 */
	if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) {
		spin_unlock_irqrestore(&zwplug->lock, flags);
		kblockd_schedule_work(&disk->zone_wplugs_work);
		return;
	}

	/* Schedule submission of the next plugged BIO if we have one. */
	if (!bio_list_empty(&zwplug->bio_list)) {
		disk_zone_wplug_schedule_bio_work(disk, zwplug);
		spin_unlock_irqrestore(&zwplug->lock, flags);
		return;
	}

	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;

	/*
	 * If the zone is full (it was fully written or finished) or empty
	 * (it was reset), remove its zone write plug from the hash table.
	 */
	if (disk_should_remove_zone_wplug(disk, zwplug))
		disk_remove_zone_wplug(disk, zwplug);

	spin_unlock_irqrestore(&zwplug->lock, flags);
}
void blk_zone_write_plug_bio_endio(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug =
		disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
	unsigned long flags;

	if (WARN_ON_ONCE(!zwplug))
		return;

	/* Make sure we do not see this BIO again by clearing the plug flag. */
	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * If this is a regular write emulating a zone append operation,
	 * restore the original operation code.
	 */
	if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
		bio->bi_opf &= ~REQ_OP_MASK;
		bio->bi_opf |= REQ_OP_ZONE_APPEND;
	}

	/*
	 * If the BIO failed, mark the plug as having an error to trigger
	 * error recovery.
	 */
	if (bio->bi_status != BLK_STS_OK) {
		spin_lock_irqsave(&zwplug->lock, flags);
		disk_zone_wplug_set_error(disk, zwplug);
		spin_unlock_irqrestore(&zwplug->lock, flags);
	}

	/* Drop the reference we took when the BIO was issued. */
	disk_put_zone_wplug(zwplug);

	/*
	 * For BIO-based devices, blk_zone_write_plug_finish_request()
	 * is not called. So we need to schedule execution of the next
	 * plugged BIO here.
	 */
	if (bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO))
		disk_zone_wplug_unplug_bio(disk, zwplug);

	/* Drop the reference we took when entering this function. */
	disk_put_zone_wplug(zwplug);
}

void blk_zone_write_plug_finish_request(struct request *req)
{
	struct gendisk *disk = req->q->disk;
	struct blk_zone_wplug *zwplug;

	zwplug = disk_get_zone_wplug(disk, req->__sector);
	if (WARN_ON_ONCE(!zwplug))
		return;

	req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING;

	/*
	 * Drop the reference we took when the request was initialized in
	 * blk_zone_write_plug_init_request().
	 */
	disk_put_zone_wplug(zwplug);

	disk_zone_wplug_unplug_bio(disk, zwplug);

	/* Drop the reference we took when entering this function. */
	disk_put_zone_wplug(zwplug);
}

static void blk_zone_wplug_bio_work(struct work_struct *work)
{
	struct blk_zone_wplug *zwplug =
		container_of(work, struct blk_zone_wplug, bio_work);
	struct block_device *bdev;
	unsigned long flags;
	struct bio *bio;

	/*
	 * Submit the next plugged BIO. If we do not have any, clear
	 * the plugged flag.
	 */
	spin_lock_irqsave(&zwplug->lock, flags);

	bio = bio_list_pop(&zwplug->bio_list);
	if (!bio) {
		zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
		spin_unlock_irqrestore(&zwplug->lock, flags);
		goto put_zwplug;
	}

	if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
		/* Error recovery will decide what to do with the BIO. */
		bio_list_add_head(&zwplug->bio_list, bio);
		spin_unlock_irqrestore(&zwplug->lock, flags);
		goto put_zwplug;
	}

	spin_unlock_irqrestore(&zwplug->lock, flags);

	bdev = bio->bi_bdev;
	submit_bio_noacct_nocheck(bio);

	/*
	 * blk-mq devices will reuse the extra reference on the request queue
	 * usage counter we took when the BIO was plugged, but the submission
	 * path for BIO-based devices will not do that. So drop this extra
	 * reference here.
	 */
	if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO))
		blk_queue_exit(bdev->bd_disk->queue);

put_zwplug:
	/* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */
	disk_put_zone_wplug(zwplug);
}
static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
{
	switch (zone->cond) {
	case BLK_ZONE_COND_IMP_OPEN:
	case BLK_ZONE_COND_EXP_OPEN:
	case BLK_ZONE_COND_CLOSED:
		return zone->wp - zone->start;
	case BLK_ZONE_COND_FULL:
		return zone->len;
	case BLK_ZONE_COND_EMPTY:
		return 0;
	case BLK_ZONE_COND_NOT_WP:
	case BLK_ZONE_COND_OFFLINE:
	case BLK_ZONE_COND_READONLY:
	default:
		/*
		 * Conventional, offline and read-only zones do not have a valid
		 * write pointer.
		 */
		return UINT_MAX;
	}
}

static int blk_zone_wplug_report_zone_cb(struct blk_zone *zone,
					 unsigned int idx, void *data)
{
	struct blk_zone *zonep = data;

	*zonep = *zone;
	return 0;
}
static void disk_zone_wplug_handle_error(struct gendisk *disk,
					 struct blk_zone_wplug *zwplug)
{
	sector_t zone_start_sector =
		bdev_zone_sectors(disk->part0) * zwplug->zone_no;
	unsigned int noio_flag;
	struct blk_zone zone;
	unsigned long flags;
	int ret;

	/* Get the current zone information from the device. */
	noio_flag = memalloc_noio_save();
	ret = disk->fops->report_zones(disk, zone_start_sector, 1,
				       blk_zone_wplug_report_zone_cb, &zone);
	memalloc_noio_restore(noio_flag);

	spin_lock_irqsave(&zwplug->lock, flags);

	/*
	 * A zone reset or finish may have cleared the error already. In such
	 * case, do nothing as the report zones may have seen the "old" write
	 * pointer value before the reset/finish operation completed.
	 */
	if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR))
		goto unlock;

	zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR;

	if (ret != 1) {
		/*
		 * We failed to get the zone information, meaning that something
		 * is likely really wrong with the device. Abort all remaining
		 * plugged BIOs as otherwise we could end up waiting forever on
		 * plugged BIOs to complete if there is a queue freeze on-going.
		 */
		disk_zone_wplug_abort(zwplug);
		goto unplug;
	}

	/* Update the zone write pointer offset. */
	zwplug->wp_offset = blk_zone_wp_offset(&zone);
	disk_zone_wplug_abort_unaligned(disk, zwplug);

	/* Restart BIO submission if we still have any BIO left. */
	if (!bio_list_empty(&zwplug->bio_list)) {
		disk_zone_wplug_schedule_bio_work(disk, zwplug);
		goto unlock;
	}

unplug:
	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
	if (disk_should_remove_zone_wplug(disk, zwplug))
		disk_remove_zone_wplug(disk, zwplug);

unlock:
	spin_unlock_irqrestore(&zwplug->lock, flags);
}

static void disk_zone_wplugs_work(struct work_struct *work)
{
	struct gendisk *disk =
		container_of(work, struct gendisk, zone_wplugs_work);
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);

	while (!list_empty(&disk->zone_wplugs_err_list)) {
		zwplug = list_first_entry(&disk->zone_wplugs_err_list,
					  struct blk_zone_wplug, link);
		list_del_init(&zwplug->link);
		spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);

		disk_zone_wplug_handle_error(disk, zwplug);
		disk_put_zone_wplug(zwplug);

		spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	}

	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
}
static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
{
	return 1U << disk->zone_wplugs_hash_bits;
}

void disk_init_zone_resources(struct gendisk *disk)
{
	spin_lock_init(&disk->zone_wplugs_lock);
	INIT_LIST_HEAD(&disk->zone_wplugs_err_list);
	INIT_WORK(&disk->zone_wplugs_work, disk_zone_wplugs_work);
}

/*
 * For the size of a disk zone write plug hash table, use the size of the
 * zone write plug mempool, which is the maximum of the disk open zones and
 * active zones limits. But do not exceed 4KB (512 hlist head entries), that
 * is, 9 bits. For a disk that has no limits, mempool size defaults to 128.
 */
#define BLK_ZONE_WPLUG_MAX_HASH_BITS		9
#define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE	128

static int disk_alloc_zone_resources(struct gendisk *disk,
				     unsigned int pool_size)
{
	unsigned int i;

	disk->zone_wplugs_hash_bits =
		min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS);

	disk->zone_wplugs_hash =
		kcalloc(disk_zone_wplugs_hash_size(disk),
			sizeof(struct hlist_head), GFP_KERNEL);
	if (!disk->zone_wplugs_hash)
		return -ENOMEM;

	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
		INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]);

	disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size,
						sizeof(struct blk_zone_wplug));
	if (!disk->zone_wplugs_pool)
		goto free_hash;

	disk->zone_wplugs_wq =
		alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI,
				pool_size, disk->disk_name);
	if (!disk->zone_wplugs_wq)
		goto destroy_pool;

	return 0;

destroy_pool:
	mempool_destroy(disk->zone_wplugs_pool);
	disk->zone_wplugs_pool = NULL;
free_hash:
	kfree(disk->zone_wplugs_hash);
	disk->zone_wplugs_hash = NULL;
	disk->zone_wplugs_hash_bits = 0;
	return -ENOMEM;
}
static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
{
	struct blk_zone_wplug *zwplug;
	unsigned int i;

	if (!disk->zone_wplugs_hash)
		return;

	/* Free all the zone write plugs we have. */
	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
		while (!hlist_empty(&disk->zone_wplugs_hash[i])) {
			zwplug = hlist_entry(disk->zone_wplugs_hash[i].first,
					     struct blk_zone_wplug, node);
			refcount_inc(&zwplug->ref);
			disk_remove_zone_wplug(disk, zwplug);
			disk_put_zone_wplug(zwplug);
		}
	}

	kfree(disk->zone_wplugs_hash);
	disk->zone_wplugs_hash = NULL;
	disk->zone_wplugs_hash_bits = 0;
}

static unsigned int disk_set_conv_zones_bitmap(struct gendisk *disk,
					       unsigned long *bitmap)
{
	unsigned int nr_conv_zones = 0;
	unsigned long flags;

	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	if (bitmap)
		nr_conv_zones = bitmap_weight(bitmap, disk->nr_zones);
	bitmap = rcu_replace_pointer(disk->conv_zones_bitmap, bitmap,
				     lockdep_is_held(&disk->zone_wplugs_lock));
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);

	kfree_rcu_mightsleep(bitmap);

	return nr_conv_zones;
}
void disk_free_zone_resources(struct gendisk *disk)
{
	if (!disk->zone_wplugs_pool)
		return;

	cancel_work_sync(&disk->zone_wplugs_work);

	if (disk->zone_wplugs_wq) {
		destroy_workqueue(disk->zone_wplugs_wq);
		disk->zone_wplugs_wq = NULL;
	}

	disk_destroy_zone_wplugs_hash_table(disk);

	/*
	 * Wait for the zone write plugs to be RCU-freed before
	 * destroying the mempool.
	 */
	rcu_barrier();

	mempool_destroy(disk->zone_wplugs_pool);
	disk->zone_wplugs_pool = NULL;

	disk_set_conv_zones_bitmap(disk, NULL);
	disk->zone_capacity = 0;
	disk->last_zone_capacity = 0;
	disk->nr_zones = 0;
}

static inline bool disk_need_zone_resources(struct gendisk *disk)
{
	/*
	 * All mq zoned devices need zone resources so that the block layer
	 * can automatically handle write BIO plugging. BIO-based device drivers
	 * (e.g. DM devices) are normally responsible for handling zone write
	 * ordering and do not need zone resources, unless the driver requires
	 * zone append emulation.
	 */
	return queue_is_mq(disk->queue) ||
		queue_emulates_zone_append(disk->queue);
}

static int disk_revalidate_zone_resources(struct gendisk *disk,
					  unsigned int nr_zones)
{
	struct queue_limits *lim = &disk->queue->limits;
	unsigned int pool_size;

	if (!disk_need_zone_resources(disk))
		return 0;

	/*
	 * If the device has no limit on the maximum number of open and active
	 * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE.
	 */
	pool_size = max(lim->max_open_zones, lim->max_active_zones);
	if (!pool_size)
		pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_zones);

	if (!disk->zone_wplugs_hash)
		return disk_alloc_zone_resources(disk, pool_size);

	return 0;
}
struct blk_revalidate_zone_args {
	struct gendisk	*disk;
	unsigned long	*conv_zones_bitmap;
	unsigned int	nr_zones;
	unsigned int	zone_capacity;
	unsigned int	last_zone_capacity;
	sector_t	sector;
};

/*
 * Update the disk zone resources information and device queue limits.
 * The disk queue is frozen when this is executed.
 */
static int disk_update_zone_resources(struct gendisk *disk,
				      struct blk_revalidate_zone_args *args)
{
	struct request_queue *q = disk->queue;
	unsigned int nr_seq_zones, nr_conv_zones;
	unsigned int pool_size;
	struct queue_limits lim;
	int ret;

	disk->nr_zones = args->nr_zones;
	disk->zone_capacity = args->zone_capacity;
	disk->last_zone_capacity = args->last_zone_capacity;
	nr_conv_zones =
		disk_set_conv_zones_bitmap(disk, args->conv_zones_bitmap);
	if (nr_conv_zones >= disk->nr_zones) {
		pr_warn("%s: Invalid number of conventional zones %u / %u\n",
			disk->disk_name, nr_conv_zones, disk->nr_zones);
		return -ENODEV;
	}

	lim = queue_limits_start_update(q);

	/*
	 * Some devices can advertise zone resource limits that are larger than
	 * the number of sequential zones of the zoned block device, e.g. a
	 * small ZNS namespace. For such case, assume that the zoned device has
	 * no zone resource limits.
	 */
	nr_seq_zones = disk->nr_zones - nr_conv_zones;
	if (lim.max_open_zones >= nr_seq_zones)
		lim.max_open_zones = 0;
	if (lim.max_active_zones >= nr_seq_zones)
		lim.max_active_zones = 0;

	if (!disk->zone_wplugs_pool)
		goto commit;

	/*
	 * If the device has no limit on the maximum number of open and active
	 * zones, set its max open zone limit to the mempool size to indicate
	 * to the user that there is a potential performance impact due to
	 * dynamic zone write plug allocation when simultaneously writing to
	 * more zones than the size of the mempool.
	 */
	pool_size = max(lim.max_open_zones, lim.max_active_zones);
	if (!pool_size)
		pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones);

	mempool_resize(disk->zone_wplugs_pool, pool_size);

	if (!lim.max_open_zones && !lim.max_active_zones) {
		if (pool_size < nr_seq_zones)
			lim.max_open_zones = pool_size;
		else
			lim.max_open_zones = 0;
	}

commit:
	blk_mq_freeze_queue(q);
	ret = queue_limits_commit_update(q, &lim);
	blk_mq_unfreeze_queue(q);

	return ret;
}
static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx,
				    struct blk_revalidate_zone_args *args)
{
	struct gendisk *disk = args->disk;

	if (zone->capacity != zone->len) {
		pr_warn("%s: Invalid conventional zone capacity\n",
			disk->disk_name);
		return -ENODEV;
	}

	if (disk_zone_is_last(disk, zone))
		args->last_zone_capacity = zone->capacity;

	if (!disk_need_zone_resources(disk))
		return 0;

	if (!args->conv_zones_bitmap) {
		args->conv_zones_bitmap =
			bitmap_zalloc(args->nr_zones, GFP_NOIO);
		if (!args->conv_zones_bitmap)
			return -ENOMEM;
	}

	set_bit(idx, args->conv_zones_bitmap);

	return 0;
}

static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
				   struct blk_revalidate_zone_args *args)
{
	struct gendisk *disk = args->disk;
	struct blk_zone_wplug *zwplug;
	unsigned int wp_offset;
	unsigned long flags;

	/*
	 * Remember the capacity of the first sequential zone and check
	 * if it is constant for all zones, ignoring the last zone as it can be
	 * smaller.
	 */
	if (!args->zone_capacity)
		args->zone_capacity = zone->capacity;
	if (disk_zone_is_last(disk, zone)) {
		args->last_zone_capacity = zone->capacity;
	} else if (zone->capacity != args->zone_capacity) {
		pr_warn("%s: Invalid variable zone capacity\n",
			disk->disk_name);
		return -ENODEV;
	}

	/*
	 * We need to track the write pointer of all zones that are not
	 * empty nor full. So make sure we have a zone write plug for
	 * such zone if the device has a zone write plug hash table.
	 */
	if (!disk->zone_wplugs_hash)
		return 0;

	wp_offset = blk_zone_wp_offset(zone);
	if (!wp_offset || wp_offset >= zone->capacity)
		return 0;

	zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags);
	if (!zwplug)
		return -ENOMEM;
	spin_unlock_irqrestore(&zwplug->lock, flags);
	disk_put_zone_wplug(zwplug);

	return 0;
}
/*
 * Helper function to check the validity of zones of a zoned block device.
 */
static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
				  void *data)
{
	struct blk_revalidate_zone_args *args = data;
	struct gendisk *disk = args->disk;
	sector_t zone_sectors = disk->queue->limits.chunk_sectors;
	int ret;

	/* Check for bad zones and holes in the zone report */
	if (zone->start != args->sector) {
		pr_warn("%s: Zone gap at sectors %llu..%llu\n",
			disk->disk_name, args->sector, zone->start);
		return -ENODEV;
	}

	if (zone->start >= get_capacity(disk) || !zone->len) {
		pr_warn("%s: Invalid zone start %llu, length %llu\n",
			disk->disk_name, zone->start, zone->len);
		return -ENODEV;
	}

	/*
	 * All zones must have the same size, with the exception on an eventual
	 * smaller last zone.
	 */
	if (!disk_zone_is_last(disk, zone)) {
		if (zone->len != zone_sectors) {
			pr_warn("%s: Invalid zoned device with non constant zone size\n",
				disk->disk_name);
			return -ENODEV;
		}
	} else if (zone->len > zone_sectors) {
		pr_warn("%s: Invalid zoned device with larger last zone size\n",
			disk->disk_name);
		return -ENODEV;
	}

	if (!zone->capacity || zone->capacity > zone->len) {
		pr_warn("%s: Invalid zone capacity\n",
			disk->disk_name);
		return -ENODEV;
	}

	/* Check zone type */
	switch (zone->type) {
	case BLK_ZONE_TYPE_CONVENTIONAL:
		ret = blk_revalidate_conv_zone(zone, idx, args);
		break;
	case BLK_ZONE_TYPE_SEQWRITE_REQ:
		ret = blk_revalidate_seq_zone(zone, idx, args);
		break;
	case BLK_ZONE_TYPE_SEQWRITE_PREF:
	default:
		pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
			disk->disk_name, (int)zone->type, zone->start);
		ret = -ENODEV;
	}

	if (!ret)
		args->sector += zone->len;

	return ret;
}
/**
 * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs
 * @disk:	Target disk
 *
 * Helper function for low-level device drivers to check, (re) allocate and
 * initialize resources used for managing zoned disks. This function should
 * normally be called by blk-mq based drivers when a zoned gendisk is probed
 * and when the zone configuration of the gendisk changes (e.g. after a format).
 * Before calling this function, the device driver must already have set the
 * device zone size (chunk_sector limit) and the max zone append limit.
 * BIO based drivers can also use this function as long as the device queue
 * can be safely frozen.
 */
int blk_revalidate_disk_zones(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	sector_t zone_sectors = q->limits.chunk_sectors;
	sector_t capacity = get_capacity(disk);
	struct blk_revalidate_zone_args args = { };
	unsigned int noio_flag;
	int ret;

	if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
		return -EIO;

	if (!capacity)
		return -ENODEV;

	/*
	 * Checks that the device driver indicated a valid zone size and that
	 * the max zone append limit is set.
	 */
	if (!zone_sectors || !is_power_of_2(zone_sectors)) {
		pr_warn("%s: Invalid non power of two zone size (%llu)\n",
			disk->disk_name, zone_sectors);
		return -ENODEV;
	}

	/*
	 * Ensure that all memory allocations in this context are done as if
	 * GFP_NOIO was specified.
	 */
	args.disk = disk;
	args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors);
	noio_flag = memalloc_noio_save();
	ret = disk_revalidate_zone_resources(disk, args.nr_zones);
	if (ret) {
		memalloc_noio_restore(noio_flag);
		return ret;
	}

	ret = disk->fops->report_zones(disk, 0, UINT_MAX,
				       blk_revalidate_zone_cb, &args);
	if (!ret) {
		pr_warn("%s: No zones reported\n", disk->disk_name);
		ret = -ENODEV;
	}
	memalloc_noio_restore(noio_flag);

	/*
	 * If zones were reported, make sure that the entire disk capacity
	 * has been checked.
	 */
	if (ret > 0 && args.sector != capacity) {
		pr_warn("%s: Missing zones from sector %llu\n",
			disk->disk_name, args.sector);
		ret = -ENODEV;
	}

	/*
	 * Set the new disk zone parameters only once the queue is frozen and
	 * all I/Os are completed.
	 */
	if (ret > 0)
		ret = disk_update_zone_resources(disk, &args);
	else
		pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
	if (ret) {
		blk_mq_freeze_queue(q);
		disk_free_zone_resources(disk);
		blk_mq_unfreeze_queue(q);
	}

	return ret;
}
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
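
/*
 * Example (illustrative sketch only): a blk-mq driver revalidating zones
 * after setting its zone queue limits. The my_disk pointer, the limit values
 * and the exact queue_limits field names are assumptions that depend on the
 * kernel version and the driver.
 *
 *	lim.chunk_sectors = my_zone_size_sectors;
 *	lim.max_zone_append_sectors = my_zone_append_max_sectors;
 *	// ... commit the limits and add the disk ...
 *	ret = blk_revalidate_disk_zones(my_disk);
 *	if (ret)
 *		pr_warn("zone revalidation failed %d\n", ret);
 */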
#ifdef CONFIG_BLK_DEBUG_FS

int queue_zone_wplugs_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct gendisk *disk = q->disk;
	struct blk_zone_wplug *zwplug;
	unsigned int zwp_wp_offset, zwp_flags;
	unsigned int zwp_zone_no, zwp_ref;
	unsigned int zwp_bio_list_size, i;
	unsigned long flags;

	if (!disk->zone_wplugs_hash)
		return 0;

	rcu_read_lock();
	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
		hlist_for_each_entry_rcu(zwplug,
					 &disk->zone_wplugs_hash[i], node) {
			spin_lock_irqsave(&zwplug->lock, flags);
			zwp_zone_no = zwplug->zone_no;
			zwp_flags = zwplug->flags;
			zwp_ref = refcount_read(&zwplug->ref);
			zwp_wp_offset = zwplug->wp_offset;
			zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
			spin_unlock_irqrestore(&zwplug->lock, flags);

			seq_printf(m, "%u 0x%x %u %u %u\n",
				   zwp_zone_no, zwp_flags, zwp_ref,
				   zwp_wp_offset, zwp_bio_list_size);
		}
	}
	rcu_read_unlock();

	return 0;
}

#endif