// SPDX-License-Identifier: GPL-2.0
/*
 * Zoned block device handling
 *
 * Copyright (c) 2015, Hannes Reinecke
 * Copyright (c) 2015, SUSE Linux GmbH
 *
 * Copyright (c) 2016, Damien Le Moal
 * Copyright (c) 2016, Western Digital
 * Copyright (c) 2024, Western Digital Corporation or its affiliates.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/mempool.h>

#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-mq-debugfs.h"
#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
static const char *const zone_cond_name[] = {
	ZONE_COND_NAME(NOT_WP),
	ZONE_COND_NAME(EMPTY),
	ZONE_COND_NAME(IMP_OPEN),
	ZONE_COND_NAME(EXP_OPEN),
	ZONE_COND_NAME(CLOSED),
	ZONE_COND_NAME(READONLY),
	ZONE_COND_NAME(FULL),
	ZONE_COND_NAME(OFFLINE),
};
#undef ZONE_COND_NAME
/*
 * Per-zone write plug.
 * @node: hlist_node structure for managing the plug using a hash table.
 * @link: To list the plug in the zone write plug error list of the disk.
 * @ref: Zone write plug reference counter. A zone write plug reference is
 *       always at least 1 when the plug is hashed in the disk plug hash table.
 *       The reference is incremented whenever a new BIO needing plugging is
 *       submitted and when a function needs to manipulate a plug. The
 *       reference count is decremented whenever a plugged BIO completes and
 *       when a function that referenced the plug returns. The initial
 *       reference is dropped whenever the zone of the zone write plug is reset,
 *       finished and when the zone becomes full (last write BIO to the zone
 *       completing).
 * @lock: Spinlock to atomically manipulate the plug.
 * @flags: Flags indicating the plug state.
 * @zone_no: The number of the zone the plug is managing.
 * @wp_offset: The zone write pointer location relative to the start of the
 *             zone as a number of 512B sectors.
 * @bio_list: The list of BIOs that are currently plugged.
 * @bio_work: Work struct to handle issuing of plugged BIOs.
 * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
 * @disk: The gendisk the plug belongs to.
 */
struct blk_zone_wplug {
	struct hlist_node	node;
	struct list_head	link;
	atomic_t		ref;
	spinlock_t		lock;
	unsigned int		flags;
	unsigned int		zone_no;
	unsigned int		wp_offset;
	struct bio_list		bio_list;
	struct work_struct	bio_work;
	struct rcu_head		rcu_head;
	struct gendisk		*disk;
};
/*
 * Zone write plug flags bits:
 *  - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
 *    that is, that write BIOs are being throttled due to a write BIO already
 *    being executed or the zone write plug bio list is not empty.
 *  - BLK_ZONE_WPLUG_ERROR: Indicates that a write error happened which will be
 *    recovered with a report zone to update the zone write pointer offset.
 *  - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed
 *    from the disk hash table and that the initial reference to the zone
 *    write plug set when the plug was first added to the hash table has been
 *    dropped. This flag is set when a zone is reset, finished or becomes full,
 *    to prevent new references to the zone write plug from being taken for
 *    newly incoming BIOs. A zone write plug flagged with this flag will be
 *    freed once all remaining references from BIOs or functions are dropped.
 */
#define BLK_ZONE_WPLUG_PLUGGED		(1U << 0)
#define BLK_ZONE_WPLUG_ERROR		(1U << 1)
#define BLK_ZONE_WPLUG_UNHASHED		(1U << 2)

#define BLK_ZONE_WPLUG_BUSY	(BLK_ZONE_WPLUG_PLUGGED | BLK_ZONE_WPLUG_ERROR)
/**
 * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX.
 * @zone_cond: BLK_ZONE_COND_XXX.
 *
 * Description: Centralize block layer function to convert BLK_ZONE_COND_XXX
 * into string format. Useful in the debugging and tracing zone conditions. For
 * invalid BLK_ZONE_COND_XXX it returns string "UNKNOWN".
 */
const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
{
	static const char *zone_cond_str = "UNKNOWN";

	if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond])
		zone_cond_str = zone_cond_name[zone_cond];

	return zone_cond_str;
}
EXPORT_SYMBOL_GPL(blk_zone_cond_str);
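/*
 * Example (illustrative only, not part of this file): a driver tracing a zone
 * report could print the condition of a reported zone with:
 *
 *	pr_debug("zone %llu: %s\n", zone->start,
 *		 blk_zone_cond_str(zone->cond));
 */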
/**
 * blkdev_report_zones - Get zones information
 * @bdev:	Target block device
 * @sector:	Sector from which to report zones
 * @nr_zones:	Maximum number of zones to report
 * @cb:		Callback function called for each reported zone
 * @data:	Private data for the callback
 *
 * Description:
 *    Get zone information starting from the zone containing @sector for at
 *    most @nr_zones, and call @cb for each zone reported by the device.
 *    To report all zones in a device starting from @sector, the BLK_ALL_ZONES
 *    constant can be passed to @nr_zones.
 *    Returns the number of zones reported by the device, or a negative errno
 *    value in case of failure.
 *
 *    Note: The caller must use memalloc_noXX_save/restore() calls to control
 *    memory allocations done within this function.
 */
int blkdev_report_zones(struct block_device *bdev, sector_t sector,
			unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct gendisk *disk = bdev->bd_disk;
	sector_t capacity = get_capacity(disk);

	if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
		return -EOPNOTSUPP;

	if (!nr_zones || sector >= capacity)
		return 0;

	return disk->fops->report_zones(disk, sector, nr_zones, cb, data);
}
EXPORT_SYMBOL_GPL(blkdev_report_zones);
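/*
 * Example (illustrative only, not part of this file): a caller passes a
 * report_zones_cb callback and private data, e.g. to count zones that are not
 * empty. The function and variable names below are hypothetical.
 *
 *	static int count_used_zone_cb(struct blk_zone *zone, unsigned int idx,
 *				      void *data)
 *	{
 *		unsigned int *nr_used = data;
 *
 *		if (zone->cond != BLK_ZONE_COND_EMPTY &&
 *		    zone->cond != BLK_ZONE_COND_NOT_WP)
 *			(*nr_used)++;
 *		return 0;
 *	}
 *
 *	unsigned int nr_used = 0;
 *	int ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES,
 *				      count_used_zone_cb, &nr_used);
 */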
static int blkdev_zone_reset_all(struct block_device *bdev)
{
	struct bio bio;

	bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC);
	return submit_bio_wait(&bio);
}
/**
 * blkdev_zone_mgmt - Execute a zone management operation on a range of zones
 * @bdev:	Target block device
 * @op:		Operation to be performed on the zones
 * @sector:	Start sector of the first zone to operate on
 * @nr_sectors:	Number of sectors, should be at least the length of one zone
 *		and must be zone size aligned.
 *
 * Description:
 *    Perform the specified operation on the range of zones specified by
 *    @sector..@sector+@nr_sectors. Specifying the entire disk sector range
 *    is valid, but the specified range should not contain conventional zones.
 *    The operation to execute on each zone can be a zone reset, open, close
 *    or finish request.
 */
int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
		     sector_t sector, sector_t nr_sectors)
{
	sector_t zone_sectors = bdev_zone_sectors(bdev);
	sector_t capacity = bdev_nr_sectors(bdev);
	sector_t end_sector = sector + nr_sectors;
	struct bio *bio = NULL;
	int ret;

	if (!bdev_is_zoned(bdev))
		return -EOPNOTSUPP;

	if (bdev_read_only(bdev))
		return -EPERM;

	if (!op_is_zone_mgmt(op))
		return -EOPNOTSUPP;

	if (end_sector <= sector || end_sector > capacity)
		return -EINVAL;

	/* Check alignment (handle eventual smaller last zone) */
	if (!bdev_is_zone_start(bdev, sector))
		return -EINVAL;

	if (!bdev_is_zone_start(bdev, nr_sectors) && end_sector != capacity)
		return -EINVAL;

	/*
	 * In the case of a zone reset operation over all zones, use
	 * REQ_OP_ZONE_RESET_ALL.
	 */
	if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity)
		return blkdev_zone_reset_all(bdev);

	while (sector < end_sector) {
		bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL);
		bio->bi_iter.bi_sector = sector;
		sector += zone_sectors;

		/* This may take a while, so be nice to others */
		cond_resched();
	}

	ret = submit_bio_wait(bio);
	bio_put(bio);

	return ret;
}
EXPORT_SYMBOL_GPL(blkdev_zone_mgmt);
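/*
 * Example (illustrative only): resetting the single zone starting at sector
 * "zone_start" of a zoned block device could be done with:
 *
 *	ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, zone_start,
 *			       bdev_zone_sectors(bdev));
 *
 * and finishing (transitioning to full) that same zone with
 * REQ_OP_ZONE_FINISH. The "zone_start" and "ret" variables are assumed to be
 * provided by the caller.
 */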
struct zone_report_args {
	struct blk_zone __user *zones;
};

static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx,
				    void *data)
{
	struct zone_report_args *args = data;

	if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone)))
		return -EFAULT;
	return 0;
}
/*
 * BLKREPORTZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
			      unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct zone_report_args args;
	struct blk_zone_report rep;
	int ret;

	if (!argp)
		return -EINVAL;

	if (!bdev_is_zoned(bdev))
		return -ENOTTY;

	if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
		return -EFAULT;

	if (!rep.nr_zones)
		return -EINVAL;

	args.zones = argp + sizeof(struct blk_zone_report);
	ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
				  blkdev_copy_zone_to_user, &args);
	if (ret < 0)
		return ret;

	rep.nr_zones = ret;
	rep.flags = BLK_ZONE_REP_CAPACITY;
	if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report)))
		return -EFAULT;
	return 0;
}
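/*
 * Example (illustrative only): user space calls BLKREPORTZONE with a
 * struct blk_zone_report immediately followed by an array of struct blk_zone
 * entries, e.g.:
 *
 *	struct blk_zone_report *rep =
 *		calloc(1, sizeof(*rep) + nr * sizeof(struct blk_zone));
 *	rep->sector = 0;
 *	rep->nr_zones = nr;
 *	ioctl(fd, BLKREPORTZONE, rep);
 *
 * On return, rep->nr_zones holds the number of zones actually reported and
 * the zone array follows the report header.
 */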
static int blkdev_truncate_zone_range(struct block_device *bdev,
				      blk_mode_t mode,
				      const struct blk_zone_range *zrange)
{
	loff_t start, end;

	if (zrange->sector + zrange->nr_sectors <= zrange->sector ||
	    zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk))
		return -EINVAL;

	start = zrange->sector << SECTOR_SHIFT;
	end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1;

	return truncate_bdev_range(bdev, mode, start, end);
}
/*
 * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
			   unsigned int cmd, unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct blk_zone_range zrange;
	enum req_op op;
	int ret;

	if (!argp)
		return -EINVAL;

	if (!bdev_is_zoned(bdev))
		return -ENOTTY;

	if (!(mode & BLK_OPEN_WRITE))
		return -EBADF;

	if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
		return -EFAULT;

	switch (cmd) {
	case BLKRESETZONE:
		op = REQ_OP_ZONE_RESET;

		/* Invalidate the page cache, including dirty pages. */
		filemap_invalidate_lock(bdev->bd_mapping);
		ret = blkdev_truncate_zone_range(bdev, mode, &zrange);
		if (ret)
			goto fail;
		break;
	case BLKOPENZONE:
		op = REQ_OP_ZONE_OPEN;
		break;
	case BLKCLOSEZONE:
		op = REQ_OP_ZONE_CLOSE;
		break;
	case BLKFINISHZONE:
		op = REQ_OP_ZONE_FINISH;
		break;
	default:
		return -ENOTTY;
	}

	ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);

fail:
	if (cmd == BLKRESETZONE)
		filemap_invalidate_unlock(bdev->bd_mapping);

	return ret;
}
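/*
 * Example (illustrative only): from user space, resetting a zone goes through
 * this handler via the BLKRESETZONE ioctl, e.g.:
 *
 *	struct blk_zone_range range = {
 *		.sector = zone_start,
 *		.nr_sectors = zone_len,
 *	};
 *	ioctl(fd, BLKRESETZONE, &range);
 *
 * with fd an open, writable file descriptor on the zoned block device.
 */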
static inline bool disk_zone_is_conv(struct gendisk *disk, sector_t sector)
{
	if (!disk->conv_zones_bitmap)
		return false;
	return test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap);
}
static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone)
{
	return zone->start + zone->len >= get_capacity(disk);
}
static bool disk_zone_is_full(struct gendisk *disk,
			      unsigned int zno, unsigned int offset_in_zone)
{
	if (zno < disk->nr_zones - 1)
		return offset_in_zone >= disk->zone_capacity;
	return offset_in_zone >= disk->last_zone_capacity;
}
static bool disk_zone_wplug_is_full(struct gendisk *disk,
				    struct blk_zone_wplug *zwplug)
{
	return disk_zone_is_full(disk, zwplug->zone_no, zwplug->wp_offset);
}
static bool disk_insert_zone_wplug(struct gendisk *disk,
				   struct blk_zone_wplug *zwplug)
{
	struct blk_zone_wplug *zwplg;
	unsigned long flags;
	unsigned int idx =
		hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits);

	/*
	 * Add the new zone write plug to the hash table, but carefully as we
	 * are racing with other submission context, so we may already have a
	 * zone write plug for the same zone.
	 */
	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) {
		if (zwplg->zone_no == zwplug->zone_no) {
			spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
			return false;
		}
	}
	hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);

	return true;
}
static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
						  sector_t sector)
{
	unsigned int zno = disk_zone_no(disk, sector);
	unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits);
	struct blk_zone_wplug *zwplug;

	rcu_read_lock();

	hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) {
		if (zwplug->zone_no == zno &&
		    atomic_inc_not_zero(&zwplug->ref)) {
			rcu_read_unlock();
			return zwplug;
		}
	}

	rcu_read_unlock();

	return NULL;
}
static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
{
	struct blk_zone_wplug *zwplug =
		container_of(rcu_head, struct blk_zone_wplug, rcu_head);

	mempool_free(zwplug, zwplug->disk->zone_wplugs_pool);
}
static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
{
	if (atomic_dec_and_test(&zwplug->ref)) {
		WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));
		WARN_ON_ONCE(!list_empty(&zwplug->link));
		WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED));

		call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
	}
}
static inline bool disk_should_remove_zone_wplug(struct gendisk *disk,
						 struct blk_zone_wplug *zwplug)
{
	/* If the zone write plug was already removed, we are done. */
	if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
		return false;

	/* If the zone write plug is still busy, it cannot be removed. */
	if (zwplug->flags & BLK_ZONE_WPLUG_BUSY)
		return false;

	/*
	 * Completions of BIOs with blk_zone_write_plug_bio_endio() may
	 * happen after handling a request completion with
	 * blk_zone_write_plug_finish_request() (e.g. with split BIOs
	 * that are chained). In such case, disk_zone_wplug_unplug_bio()
	 * should not attempt to remove the zone write plug until all BIO
	 * completions are seen. Check by looking at the zone write plug
	 * reference count, which is 2 when the plug is unused (one reference
	 * taken when the plug was allocated and another reference taken by the
	 * caller context).
	 */
	if (atomic_read(&zwplug->ref) > 2)
		return false;

	/* We can remove zone write plugs for zones that are empty or full. */
	return !zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug);
}
static void disk_remove_zone_wplug(struct gendisk *disk,
				   struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	/* If the zone write plug was already removed, we have nothing to do. */
	if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
		return;

	/*
	 * Mark the zone write plug as unhashed and drop the extra reference we
	 * took when the plug was inserted in the hash table.
	 */
	zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED;
	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	hlist_del_init_rcu(&zwplug->node);
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
	disk_put_zone_wplug(zwplug);
}
static void blk_zone_wplug_bio_work(struct work_struct *work);
/*
 * Get a reference on the write plug for the zone containing @sector.
 * If the plug does not exist, it is allocated and hashed.
 * Return a pointer to the zone write plug with the plug spinlock held.
 */
static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
					sector_t sector, gfp_t gfp_mask,
					unsigned long *flags)
{
	unsigned int zno = disk_zone_no(disk, sector);
	struct blk_zone_wplug *zwplug;

again:
	zwplug = disk_get_zone_wplug(disk, sector);
	if (zwplug) {
		/*
		 * Check that a BIO completion or a zone reset or finish
		 * operation has not already removed the zone write plug from
		 * the hash table and dropped its reference count. In such case,
		 * we need to get a new plug so start over from the beginning.
		 */
		spin_lock_irqsave(&zwplug->lock, *flags);
		if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
			spin_unlock_irqrestore(&zwplug->lock, *flags);
			disk_put_zone_wplug(zwplug);
			goto again;
		}
		return zwplug;
	}

	/*
	 * Allocate and initialize a zone write plug with an extra reference
	 * so that it is not freed when the zone write plug becomes idle without
	 * the zone being full.
	 */
	zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask);
	if (!zwplug)
		return NULL;

	INIT_HLIST_NODE(&zwplug->node);
	INIT_LIST_HEAD(&zwplug->link);
	atomic_set(&zwplug->ref, 2);
	spin_lock_init(&zwplug->lock);
	zwplug->flags = 0;
	zwplug->zone_no = zno;
	zwplug->wp_offset = sector & (disk->queue->limits.chunk_sectors - 1);
	bio_list_init(&zwplug->bio_list);
	INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
	zwplug->disk = disk;

	spin_lock_irqsave(&zwplug->lock, *flags);

	/*
	 * Insert the new zone write plug in the hash table. This can fail only
	 * if another context already inserted a plug. Retry from the beginning
	 * and free the plug we allocated.
	 */
	if (!disk_insert_zone_wplug(disk, zwplug)) {
		spin_unlock_irqrestore(&zwplug->lock, *flags);
		mempool_free(zwplug, disk->zone_wplugs_pool);
		goto again;
	}

	return zwplug;
}
static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug,
					       struct bio *bio)
{
	struct request_queue *q = zwplug->disk->queue;

	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
	bio_io_error(bio);
	disk_put_zone_wplug(zwplug);
	blk_queue_exit(q);
}
/*
 * Abort (fail) all plugged BIOs of a zone write plug.
 */
static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
{
	struct bio *bio;

	while ((bio = bio_list_pop(&zwplug->bio_list)))
		blk_zone_wplug_bio_io_error(zwplug, bio);
}
/*
 * Abort (fail) all plugged BIOs of a zone write plug that are not aligned
 * with the assumed write pointer location of the zone when the BIO will
 * be issued.
 */
static void disk_zone_wplug_abort_unaligned(struct gendisk *disk,
					    struct blk_zone_wplug *zwplug)
{
	unsigned int wp_offset = zwplug->wp_offset;
	struct bio_list bl = BIO_EMPTY_LIST;
	struct bio *bio;

	while ((bio = bio_list_pop(&zwplug->bio_list))) {
		if (disk_zone_is_full(disk, zwplug->zone_no, wp_offset) ||
		    (bio_op(bio) != REQ_OP_ZONE_APPEND &&
		     bio_offset_from_zone_start(bio) != wp_offset)) {
			blk_zone_wplug_bio_io_error(zwplug, bio);
			continue;
		}

		wp_offset += bio_sectors(bio);
		bio_list_add(&bl, bio);
	}

	bio_list_merge(&zwplug->bio_list, &bl);
}
static inline void disk_zone_wplug_set_error(struct gendisk *disk,
					     struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	if (zwplug->flags & BLK_ZONE_WPLUG_ERROR)
		return;

	/*
	 * At this point, we already have a reference on the zone write plug.
	 * However, since we are going to add the plug to the disk zone write
	 * plugs work list, increase its reference count. This reference will
	 * be dropped in disk_zone_wplugs_work() once the error state is
	 * handled, or in disk_zone_wplug_clear_error() if the zone is reset or
	 * finished.
	 */
	zwplug->flags |= BLK_ZONE_WPLUG_ERROR;
	atomic_inc(&zwplug->ref);

	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	list_add_tail(&zwplug->link, &disk->zone_wplugs_err_list);
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
}
static inline void disk_zone_wplug_clear_error(struct gendisk *disk,
					       struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR))
		return;

	/*
	 * We are racing with the error handling work which drops the reference
	 * on the zone write plug after handling the error state. So remove the
	 * plug from the error list and drop its reference count only if the
	 * error handling has not yet started, that is, if the zone write plug
	 * is still listed.
	 */
	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	if (!list_empty(&zwplug->link)) {
		list_del_init(&zwplug->link);
		zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR;
		disk_put_zone_wplug(zwplug);
	}
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
}
/*
 * Set a zone write plug write pointer offset to either 0 (zone reset case)
 * or to the zone size (zone finish case). This aborts all plugged BIOs, which
 * is fine to do as doing a zone reset or zone finish while writes are
 * in-flight is a mistake from the user which will most likely cause all
 * plugged BIOs to fail.
 */
static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
					  struct blk_zone_wplug *zwplug,
					  unsigned int wp_offset)
{
	unsigned long flags;

	spin_lock_irqsave(&zwplug->lock, flags);

	/*
	 * Make sure that a BIO completion or another zone reset or finish
	 * operation has not already removed the plug from the hash table.
	 */
	if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
		spin_unlock_irqrestore(&zwplug->lock, flags);
		return;
	}

	/* Update the zone write pointer and abort all plugged BIOs. */
	zwplug->wp_offset = wp_offset;
	disk_zone_wplug_abort(zwplug);

	/*
	 * Updating the write pointer offset puts back the zone
	 * in a good state. So clear the error flag and decrement the
	 * error count if we were in error state.
	 */
	disk_zone_wplug_clear_error(disk, zwplug);

	/*
	 * The zone write plug now has no BIO plugged: remove it from the
	 * hash table so that it cannot be seen. The plug will be freed
	 * when the last reference is dropped.
	 */
	if (disk_should_remove_zone_wplug(disk, zwplug))
		disk_remove_zone_wplug(disk, zwplug);

	spin_unlock_irqrestore(&zwplug->lock, flags);
}
static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio,
						  unsigned int wp_offset)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	sector_t sector = bio->bi_iter.bi_sector;
	struct blk_zone_wplug *zwplug;

	/* Conventional zones cannot be reset nor finished. */
	if (disk_zone_is_conv(disk, sector)) {
		bio_io_error(bio);
		return true;
	}

	/*
	 * If we have a zone write plug, set its write pointer offset to 0
	 * (reset case) or to the zone size (finish case). This will abort all
	 * BIOs plugged for the target zone. It is fine as resetting or
	 * finishing zones while writes are still in-flight will result in the
	 * writes failing anyway.
	 */
	zwplug = disk_get_zone_wplug(disk, sector);
	if (zwplug) {
		disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset);
		disk_put_zone_wplug(zwplug);
	}

	return false;
}
static bool blk_zone_wplug_handle_reset_all(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug;
	sector_t sector;

	/*
	 * Set the write pointer offset of all zone write plugs to 0. This will
	 * abort all plugged BIOs. It is fine as resetting zones while writes
	 * are still in-flight will result in the writes failing anyway.
	 */
	for (sector = 0; sector < get_capacity(disk);
	     sector += disk->queue->limits.chunk_sectors) {
		zwplug = disk_get_zone_wplug(disk, sector);
		if (zwplug) {
			disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
			disk_put_zone_wplug(zwplug);
		}
	}

	return false;
}
static inline void blk_zone_wplug_add_bio(struct blk_zone_wplug *zwplug,
					  struct bio *bio, unsigned int nr_segs)
{
	/*
	 * Grab an extra reference on the BIO request queue usage counter.
	 * This reference will be reused to submit a request for the BIO for
	 * blk-mq devices and dropped when the BIO is failed and after
	 * it is issued in the case of BIO-based devices.
	 */
	percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter);

	/*
	 * The BIO is being plugged and thus will have to wait for the on-going
	 * write and for all other writes already plugged. So polling makes
	 * no sense.
	 */
	bio_clear_polled(bio);

	/*
	 * Reuse the poll cookie field to store the number of segments when
	 * split to the hardware limits.
	 */
	bio->__bi_nr_segments = nr_segs;

	/*
	 * We always receive BIOs after they are split and ready to be issued.
	 * The block layer passes the parts of a split BIO in order, and the
	 * user must also issue writes sequentially. So simply add the new BIO
	 * at the tail of the list to preserve the sequential write order.
	 */
	bio_list_add(&zwplug->bio_list, bio);
}
/*
 * Called from bio_attempt_back_merge() when a BIO was merged with a request.
 */
void blk_zone_write_plug_bio_merged(struct bio *bio)
{
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	/*
	 * If the BIO was already plugged, then we were called through
	 * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge().
	 * For this case, we already hold a reference on the zone write plug for
	 * the BIO and blk_zone_write_plug_init_request() will handle the
	 * zone write pointer offset update.
	 */
	if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
		return;

	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * Get a reference on the zone write plug of the target zone and advance
	 * the zone write pointer offset. Given that this is a merge, we already
	 * have at least one request and one BIO referencing the zone write
	 * plug. So this should not fail.
	 */
	zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk,
				     bio->bi_iter.bi_sector);
	if (WARN_ON_ONCE(!zwplug))
		return;

	spin_lock_irqsave(&zwplug->lock, flags);
	zwplug->wp_offset += bio_sectors(bio);
	spin_unlock_irqrestore(&zwplug->lock, flags);
}
/*
 * Attempt to merge plugged BIOs with a newly prepared request for a BIO that
 * already went through zone write plugging (either a new BIO or one that was
 * unplugged).
 */
void blk_zone_write_plug_init_request(struct request *req)
{
	sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req);
	struct request_queue *q = req->q;
	struct gendisk *disk = q->disk;
	struct blk_zone_wplug *zwplug =
		disk_get_zone_wplug(disk, blk_rq_pos(req));
	unsigned long flags;
	struct bio *bio;

	if (WARN_ON_ONCE(!zwplug))
		return;

	/*
	 * Indicate that completion of this request needs to be handled with
	 * blk_zone_write_plug_finish_request(), which will drop the reference
	 * on the zone write plug we took above on entry to this function.
	 */
	req->rq_flags |= RQF_ZONE_WRITE_PLUGGING;

	if (blk_queue_nomerges(q))
		return;

	/*
	 * Walk through the list of plugged BIOs to check if they can be merged
	 * into the back of the request.
	 */
	spin_lock_irqsave(&zwplug->lock, flags);
	while (!disk_zone_wplug_is_full(disk, zwplug)) {
		bio = bio_list_peek(&zwplug->bio_list);
		if (!bio)
			break;

		if (bio->bi_iter.bi_sector != req_back_sector ||
		    !blk_rq_merge_ok(req, bio))
			break;

		WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES &&
			     !bio->__bi_nr_segments);

		bio_list_pop(&zwplug->bio_list);
		if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) !=
		    BIO_MERGE_OK) {
			bio_list_add_head(&zwplug->bio_list, bio);
			break;
		}

		/*
		 * Drop the extra reference on the queue usage we got when
		 * plugging the BIO and advance the write pointer offset.
		 */
		blk_queue_exit(q);
		zwplug->wp_offset += bio_sectors(bio);

		req_back_sector += bio_sectors(bio);
	}
	spin_unlock_irqrestore(&zwplug->lock, flags);
}
/*
 * Check and prepare a BIO for submission by incrementing the write pointer
 * offset of its zone write plug and changing zone append operations into
 * regular write when zone append emulation is needed.
 */
static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
				       struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;

	/*
	 * Check that the user is not attempting to write to a full zone.
	 * We know such BIO will fail, and that would potentially overflow our
	 * write pointer offset beyond the end of the zone.
	 */
	if (disk_zone_wplug_is_full(disk, zwplug))
		goto err;

	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		/*
		 * Use a regular write starting at the current write pointer.
		 * Similarly to native zone append operations, do not allow
		 * merging.
		 */
		bio->bi_opf &= ~REQ_OP_MASK;
		bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE;
		bio->bi_iter.bi_sector += zwplug->wp_offset;

		/*
		 * Remember that this BIO is in fact a zone append operation
		 * so that we can restore its operation code on completion.
		 */
		bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND);
	} else {
		/*
		 * Check for non-sequential writes early because we avoid a
		 * whole lot of error handling trouble if we don't send it off
		 * to the driver.
		 */
		if (bio_offset_from_zone_start(bio) != zwplug->wp_offset)
			goto err;
	}

	/* Advance the zone write pointer offset. */
	zwplug->wp_offset += bio_sectors(bio);

	return true;

err:
	/* We detected an invalid write BIO: schedule error recovery. */
	disk_zone_wplug_set_error(disk, zwplug);
	kblockd_schedule_work(&disk->zone_wplugs_work);
	return false;
}
static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	sector_t sector = bio->bi_iter.bi_sector;
	struct blk_zone_wplug *zwplug;
	gfp_t gfp_mask = GFP_NOIO;
	unsigned long flags;

	/*
	 * BIOs must be fully contained within a zone so that we use the correct
	 * zone write plug for the entire BIO. For blk-mq devices, the block
	 * layer should already have done any splitting required to ensure this
	 * and this BIO should thus not be straddling zone boundaries. For
	 * BIO-based devices, it is the responsibility of the driver to split
	 * the bio before submitting it.
	 */
	if (WARN_ON_ONCE(bio_straddles_zones(bio))) {
		bio_io_error(bio);
		return true;
	}

	/* Conventional zones do not need write plugging. */
	if (disk_zone_is_conv(disk, sector)) {
		/* Zone append to conventional zones is not allowed. */
		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
			bio_io_error(bio);
			return true;
		}
		return false;
	}

	if (bio->bi_opf & REQ_NOWAIT)
		gfp_mask = GFP_NOWAIT;

	zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags);
	if (!zwplug) {
		if (bio->bi_opf & REQ_NOWAIT)
			bio_wouldblock_error(bio);
		else
			bio_io_error(bio);
		return true;
	}

	/* Indicate that this BIO is being handled using zone write plugging. */
	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * If the zone is already plugged or has a pending error, add the BIO
	 * to the plug BIO list. Otherwise, plug and let the BIO execute.
	 */
	if (zwplug->flags & BLK_ZONE_WPLUG_BUSY)
		goto plug;

	/*
	 * If an error is detected when preparing the BIO, add it to the BIO
	 * list so that error recovery can deal with it.
	 */
	if (!blk_zone_wplug_prepare_bio(zwplug, bio))
		goto plug;

	zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;

	spin_unlock_irqrestore(&zwplug->lock, flags);

	return false;

plug:
	zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
	blk_zone_wplug_add_bio(zwplug, bio, nr_segs);

	spin_unlock_irqrestore(&zwplug->lock, flags);

	return true;
}
/**
 * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
 * @bio: The BIO being submitted
 * @nr_segs: The number of physical segments of @bio
 *
 * Handle write, write zeroes and zone append operations requiring emulation
 * using zone write plugging.
 *
 * Return true whenever @bio execution needs to be delayed through the zone
 * write plug. Otherwise, return false to let the submission path process
 * @bio normally.
 */
bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
{
	struct block_device *bdev = bio->bi_bdev;

	if (!bdev->bd_disk->zone_wplugs_hash)
		return false;

	/*
	 * If the BIO already has the plugging flag set, then it was already
	 * handled through this path and this is a submission from the zone
	 * plug bio submit work.
	 */
	if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
		return false;

	/*
	 * We do not need to do anything special for empty flush BIOs, e.g.
	 * BIOs such as issued by blkdev_issue_flush(). This is because it is
	 * the responsibility of the user to first wait for the completion of
	 * write operations for flush to have any effect on the persistence of
	 * the written data.
	 */
	if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
		return false;

	/*
	 * Regular writes and write zeroes need to be handled through the target
	 * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH
	 * which may need to go through the flush machinery depending on the
	 * target device capabilities. Plugging such writes is fine as the flush
	 * machinery operates at the request level, below the plug, and
	 * completion of the flush sequence will go through the regular BIO
	 * completion, which will handle zone write plugging.
	 * Zone append operations for devices that requested emulation must
	 * also be plugged so that these BIOs can be changed into regular
	 * write BIOs.
	 * Zone reset, reset all and finish commands need special treatment
	 * to correctly track the write pointer offset of zones. These commands
	 * are not plugged as we do not need serialization with write
	 * operations. It is the responsibility of the user to not issue reset
	 * and finish commands when write operations are in flight.
	 */
	switch (bio_op(bio)) {
	case REQ_OP_ZONE_APPEND:
		if (!bdev_emulates_zone_append(bdev))
			return false;
		fallthrough;
	case REQ_OP_WRITE:
	case REQ_OP_WRITE_ZEROES:
		return blk_zone_wplug_handle_write(bio, nr_segs);
	case REQ_OP_ZONE_RESET:
		return blk_zone_wplug_handle_reset_or_finish(bio, 0);
	case REQ_OP_ZONE_FINISH:
		return blk_zone_wplug_handle_reset_or_finish(bio,
						bdev_zone_sectors(bdev));
	case REQ_OP_ZONE_RESET_ALL:
		return blk_zone_wplug_handle_reset_all(bio);
	default:
		return false;
	}
}
EXPORT_SYMBOL_GPL(blk_zone_plug_bio);
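/*
 * Example (illustrative only): a BIO-based zoned driver that relies on zone
 * write plugging (e.g. for zone append emulation) would typically call this
 * helper from its submit_bio handler before remapping and issuing the BIO.
 * The driver function names below are hypothetical.
 *
 *	static void foo_submit_bio(struct bio *bio)
 *	{
 *		if (blk_zone_plug_bio(bio, 0))
 *			return;	// BIO plugged, it will be resubmitted later
 *		foo_remap_and_issue(bio);
 *	}
 *
 * For blk-mq devices, the block layer core calls blk_zone_plug_bio() itself
 * during BIO submission, so such drivers do not need to call it directly.
 */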
static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk,
					      struct blk_zone_wplug *zwplug)
{
	/*
	 * Take a reference on the zone write plug and schedule the submission
	 * of the next plugged BIO. blk_zone_wplug_bio_work() will release the
	 * reference we take here.
	 */
	WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
	atomic_inc(&zwplug->ref);
	queue_work(disk->zone_wplugs_wq, &zwplug->bio_work);
}
static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
				       struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	spin_lock_irqsave(&zwplug->lock, flags);

	/*
	 * If we had an error, schedule error recovery. The recovery work
	 * will restart submission of plugged BIOs.
	 */
	if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) {
		spin_unlock_irqrestore(&zwplug->lock, flags);
		kblockd_schedule_work(&disk->zone_wplugs_work);
		return;
	}

	/* Schedule submission of the next plugged BIO if we have one. */
	if (!bio_list_empty(&zwplug->bio_list)) {
		disk_zone_wplug_schedule_bio_work(disk, zwplug);
		spin_unlock_irqrestore(&zwplug->lock, flags);
		return;
	}

	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;

	/*
	 * If the zone is full (it was fully written or finished) or empty
	 * (it was reset), remove its zone write plug from the hash table.
	 */
	if (disk_should_remove_zone_wplug(disk, zwplug))
		disk_remove_zone_wplug(disk, zwplug);

	spin_unlock_irqrestore(&zwplug->lock, flags);
}
void blk_zone_write_plug_bio_endio(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug =
		disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
	unsigned long flags;

	if (WARN_ON_ONCE(!zwplug))
		return;

	/* Make sure we do not see this BIO again by clearing the plug flag. */
	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * If this is a regular write emulating a zone append operation,
	 * restore the original operation code.
	 */
	if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
		bio->bi_opf &= ~REQ_OP_MASK;
		bio->bi_opf |= REQ_OP_ZONE_APPEND;
	}

	/*
	 * If the BIO failed, mark the plug as having an error to trigger
	 * recovery.
	 */
	if (bio->bi_status != BLK_STS_OK) {
		spin_lock_irqsave(&zwplug->lock, flags);
		disk_zone_wplug_set_error(disk, zwplug);
		spin_unlock_irqrestore(&zwplug->lock, flags);
	}

	/* Drop the reference we took when the BIO was issued. */
	disk_put_zone_wplug(zwplug);

	/*
	 * For BIO-based devices, blk_zone_write_plug_finish_request()
	 * is not called. So we need to schedule execution of the next
	 * plugged BIO here.
	 */
	if (bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO))
		disk_zone_wplug_unplug_bio(disk, zwplug);

	/* Drop the reference we took when entering this function. */
	disk_put_zone_wplug(zwplug);
}
void blk_zone_write_plug_finish_request(struct request *req)
{
	struct gendisk *disk = req->q->disk;
	struct blk_zone_wplug *zwplug;

	zwplug = disk_get_zone_wplug(disk, req->__sector);
	if (WARN_ON_ONCE(!zwplug))
		return;

	req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING;

	/*
	 * Drop the reference we took when the request was initialized in
	 * blk_zone_write_plug_init_request().
	 */
	disk_put_zone_wplug(zwplug);

	disk_zone_wplug_unplug_bio(disk, zwplug);

	/* Drop the reference we took when entering this function. */
	disk_put_zone_wplug(zwplug);
}
static void blk_zone_wplug_bio_work(struct work_struct *work)
{
	struct blk_zone_wplug *zwplug =
		container_of(work, struct blk_zone_wplug, bio_work);
	struct block_device *bdev;
	unsigned long flags;
	struct bio *bio;

	/*
	 * Submit the next plugged BIO. If we do not have any, clear
	 * the plugged flag.
	 */
	spin_lock_irqsave(&zwplug->lock, flags);

	bio = bio_list_pop(&zwplug->bio_list);
	if (!bio) {
		zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
		spin_unlock_irqrestore(&zwplug->lock, flags);
		goto put_zwplug;
	}

	if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
		/* Error recovery will decide what to do with the BIO. */
		bio_list_add_head(&zwplug->bio_list, bio);
		spin_unlock_irqrestore(&zwplug->lock, flags);
		goto put_zwplug;
	}

	spin_unlock_irqrestore(&zwplug->lock, flags);

	bdev = bio->bi_bdev;
	submit_bio_noacct_nocheck(bio);

	/*
	 * blk-mq devices will reuse the extra reference on the request queue
	 * usage counter we took when the BIO was plugged, but the submission
	 * path for BIO-based devices will not do that. So drop this extra
	 * reference here.
	 */
	if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO))
		blk_queue_exit(bdev->bd_disk->queue);

put_zwplug:
	/* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */
	disk_put_zone_wplug(zwplug);
}
static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
{
	switch (zone->cond) {
	case BLK_ZONE_COND_IMP_OPEN:
	case BLK_ZONE_COND_EXP_OPEN:
	case BLK_ZONE_COND_CLOSED:
		return zone->wp - zone->start;
	case BLK_ZONE_COND_FULL:
		return zone->len;
	case BLK_ZONE_COND_EMPTY:
		return 0;
	case BLK_ZONE_COND_NOT_WP:
	case BLK_ZONE_COND_OFFLINE:
	case BLK_ZONE_COND_READONLY:
	default:
		/*
		 * Conventional, offline and read-only zones do not have a valid
		 * write pointer.
		 */
		return 0;
	}
}
static int blk_zone_wplug_report_zone_cb(struct blk_zone *zone,
					 unsigned int idx, void *data)
{
	struct blk_zone *zonep = data;

	*zonep = *zone;
	return 0;
}
static void disk_zone_wplug_handle_error(struct gendisk *disk,
					 struct blk_zone_wplug *zwplug)
{
	sector_t zone_start_sector =
		bdev_zone_sectors(disk->part0) * zwplug->zone_no;
	unsigned int noio_flag;
	struct blk_zone zone;
	unsigned long flags;
	int ret;

	/* Get the current zone information from the device. */
	noio_flag = memalloc_noio_save();
	ret = disk->fops->report_zones(disk, zone_start_sector, 1,
				       blk_zone_wplug_report_zone_cb, &zone);
	memalloc_noio_restore(noio_flag);

	spin_lock_irqsave(&zwplug->lock, flags);

	/*
	 * A zone reset or finish may have cleared the error already. In such
	 * case, do nothing as the report zones may have seen the "old" write
	 * pointer value before the reset/finish operation completed.
	 */
	if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR))
		goto unlock;

	zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR;

	if (ret != 1) {
		/*
		 * We failed to get the zone information, meaning that something
		 * is likely really wrong with the device. Abort all remaining
		 * plugged BIOs as otherwise we could end up waiting forever on
		 * plugged BIOs to complete if there is a queue freeze on-going.
		 */
		disk_zone_wplug_abort(zwplug);
		goto unplug;
	}

	/* Update the zone write pointer offset. */
	zwplug->wp_offset = blk_zone_wp_offset(&zone);
	disk_zone_wplug_abort_unaligned(disk, zwplug);

	/* Restart BIO submission if we still have any BIO left. */
	if (!bio_list_empty(&zwplug->bio_list)) {
		disk_zone_wplug_schedule_bio_work(disk, zwplug);
		goto unlock;
	}

unplug:
	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
	if (disk_should_remove_zone_wplug(disk, zwplug))
		disk_remove_zone_wplug(disk, zwplug);

unlock:
	spin_unlock_irqrestore(&zwplug->lock, flags);
}
static void disk_zone_wplugs_work(struct work_struct *work)
{
	struct gendisk *disk =
		container_of(work, struct gendisk, zone_wplugs_work);
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);

	while (!list_empty(&disk->zone_wplugs_err_list)) {
		zwplug = list_first_entry(&disk->zone_wplugs_err_list,
					  struct blk_zone_wplug, link);
		list_del_init(&zwplug->link);
		spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);

		disk_zone_wplug_handle_error(disk, zwplug);
		disk_put_zone_wplug(zwplug);

		spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	}

	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
}
static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
{
	return 1U << disk->zone_wplugs_hash_bits;
}
void disk_init_zone_resources(struct gendisk *disk)
{
	spin_lock_init(&disk->zone_wplugs_lock);
	INIT_LIST_HEAD(&disk->zone_wplugs_err_list);
	INIT_WORK(&disk->zone_wplugs_work, disk_zone_wplugs_work);
}
/*
 * For the size of a disk zone write plug hash table, use the size of the
 * zone write plug mempool, which is the maximum of the disk open zones and
 * active zones limits. But do not exceed 4KB (512 hlist head entries), that
 * is, 9 bits. For a disk that has no limits, the mempool size defaults to 128.
 */
#define BLK_ZONE_WPLUG_MAX_HASH_BITS		9
#define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE	128
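/*
 * Example of the sizing rule above: with the default pool size of 128 zone
 * write plugs, the hash table uses min(ilog2(128) + 1, 9) = 8 bits, that is,
 * 256 hlist heads. A device advertising 1024 max open zones is capped at
 * 9 bits (512 heads, 4KB of hlist heads on 64-bit systems).
 */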
static int disk_alloc_zone_resources(struct gendisk *disk,
				     unsigned int pool_size)
{
	unsigned int i;

	disk->zone_wplugs_hash_bits =
		min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS);

	disk->zone_wplugs_hash =
		kcalloc(disk_zone_wplugs_hash_size(disk),
			sizeof(struct hlist_head), GFP_KERNEL);
	if (!disk->zone_wplugs_hash)
		return -ENOMEM;

	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
		INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]);

	disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size,
						sizeof(struct blk_zone_wplug));
	if (!disk->zone_wplugs_pool)
		goto free_hash;

	disk->zone_wplugs_wq =
		alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI,
				pool_size, disk->disk_name);
	if (!disk->zone_wplugs_wq)
		goto destroy_pool;

	return 0;

destroy_pool:
	mempool_destroy(disk->zone_wplugs_pool);
	disk->zone_wplugs_pool = NULL;
free_hash:
	kfree(disk->zone_wplugs_hash);
	disk->zone_wplugs_hash = NULL;
	disk->zone_wplugs_hash_bits = 0;
	return -ENOMEM;
}
static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
{
	struct blk_zone_wplug *zwplug;
	unsigned int i;

	if (!disk->zone_wplugs_hash)
		return;

	/* Free all the zone write plugs we have. */
	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
		while (!hlist_empty(&disk->zone_wplugs_hash[i])) {
			zwplug = hlist_entry(disk->zone_wplugs_hash[i].first,
					     struct blk_zone_wplug, node);
			atomic_inc(&zwplug->ref);
			disk_remove_zone_wplug(disk, zwplug);
			disk_put_zone_wplug(zwplug);
		}
	}

	kfree(disk->zone_wplugs_hash);
	disk->zone_wplugs_hash = NULL;
	disk->zone_wplugs_hash_bits = 0;
}
void disk_free_zone_resources(struct gendisk *disk)
{
	if (!disk->zone_wplugs_pool)
		return;

	cancel_work_sync(&disk->zone_wplugs_work);

	if (disk->zone_wplugs_wq) {
		destroy_workqueue(disk->zone_wplugs_wq);
		disk->zone_wplugs_wq = NULL;
	}

	disk_destroy_zone_wplugs_hash_table(disk);

	/*
	 * Wait for the zone write plugs to be RCU-freed before
	 * destroying the mempool.
	 */
	rcu_barrier();

	mempool_destroy(disk->zone_wplugs_pool);
	disk->zone_wplugs_pool = NULL;

	bitmap_free(disk->conv_zones_bitmap);
	disk->conv_zones_bitmap = NULL;
	disk->zone_capacity = 0;
	disk->last_zone_capacity = 0;
}
static inline bool disk_need_zone_resources(struct gendisk *disk)
{
	/*
	 * All mq zoned devices need zone resources so that the block layer
	 * can automatically handle write BIO plugging. BIO-based device drivers
	 * (e.g. DM devices) are normally responsible for handling zone write
	 * ordering and do not need zone resources, unless the driver requires
	 * zone append emulation.
	 */
	return queue_is_mq(disk->queue) ||
		queue_emulates_zone_append(disk->queue);
}
static int disk_revalidate_zone_resources(struct gendisk *disk,
					  unsigned int nr_zones)
{
	struct queue_limits *lim = &disk->queue->limits;
	unsigned int pool_size;

	if (!disk_need_zone_resources(disk))
		return 0;

	/*
	 * If the device has no limit on the maximum number of open and active
	 * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE.
	 */
	pool_size = max(lim->max_open_zones, lim->max_active_zones);
	if (!pool_size)
		pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_zones);

	if (!disk->zone_wplugs_hash)
		return disk_alloc_zone_resources(disk, pool_size);

	return 0;
}
struct blk_revalidate_zone_args {
	struct gendisk	*disk;
	unsigned long	*conv_zones_bitmap;
	unsigned int	nr_zones;
	unsigned int	zone_capacity;
	unsigned int	last_zone_capacity;
	sector_t	sector;
};
/*
 * Update the disk zone resources information and device queue limits.
 * The disk queue is frozen when this is executed.
 */
static int disk_update_zone_resources(struct gendisk *disk,
				      struct blk_revalidate_zone_args *args)
{
	struct request_queue *q = disk->queue;
	unsigned int nr_seq_zones, nr_conv_zones = 0;
	unsigned int pool_size;
	struct queue_limits lim;

	disk->nr_zones = args->nr_zones;
	disk->zone_capacity = args->zone_capacity;
	disk->last_zone_capacity = args->last_zone_capacity;
	swap(disk->conv_zones_bitmap, args->conv_zones_bitmap);
	if (disk->conv_zones_bitmap)
		nr_conv_zones = bitmap_weight(disk->conv_zones_bitmap,
					      disk->nr_zones);
	if (nr_conv_zones >= disk->nr_zones) {
		pr_warn("%s: Invalid number of conventional zones %u / %u\n",
			disk->disk_name, nr_conv_zones, disk->nr_zones);
		return -ENODEV;
	}

	lim = queue_limits_start_update(q);

	/*
	 * Some devices can advertise zone resource limits that are larger than
	 * the number of sequential zones of the zoned block device, e.g. a
	 * small ZNS namespace. For such case, assume that the zoned device has
	 * no zone resource limits.
	 */
	nr_seq_zones = disk->nr_zones - nr_conv_zones;
	if (lim.max_open_zones >= nr_seq_zones)
		lim.max_open_zones = 0;
	if (lim.max_active_zones >= nr_seq_zones)
		lim.max_active_zones = 0;

	if (!disk->zone_wplugs_pool)
		goto commit;

	/*
	 * If the device has no limit on the maximum number of open and active
	 * zones, set its max open zone limit to the mempool size to indicate
	 * to the user that there is a potential performance impact due to
	 * dynamic zone write plug allocation when simultaneously writing to
	 * more zones than the size of the mempool.
	 */
	pool_size = max(lim.max_open_zones, lim.max_active_zones);
	if (!pool_size)
		pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones);

	mempool_resize(disk->zone_wplugs_pool, pool_size);

	if (!lim.max_open_zones && !lim.max_active_zones) {
		if (pool_size < nr_seq_zones)
			lim.max_open_zones = pool_size;
		else
			lim.max_open_zones = 0;
	}

commit:
	return queue_limits_commit_update(q, &lim);
}
static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx,
				    struct blk_revalidate_zone_args *args)
{
	struct gendisk *disk = args->disk;

	if (zone->capacity != zone->len) {
		pr_warn("%s: Invalid conventional zone capacity\n",
			disk->disk_name);
		return -ENODEV;
	}

	if (disk_zone_is_last(disk, zone))
		args->last_zone_capacity = zone->capacity;

	if (!disk_need_zone_resources(disk))
		return 0;

	if (!args->conv_zones_bitmap) {
		args->conv_zones_bitmap =
			bitmap_zalloc(args->nr_zones, GFP_NOIO);
		if (!args->conv_zones_bitmap)
			return -ENOMEM;
	}

	set_bit(idx, args->conv_zones_bitmap);

	return 0;
}
static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
				   struct blk_revalidate_zone_args *args)
{
	struct gendisk *disk = args->disk;
	struct blk_zone_wplug *zwplug;
	unsigned int wp_offset;
	unsigned long flags;

	/*
	 * Remember the capacity of the first sequential zone and check
	 * if it is constant for all zones, ignoring the last zone as it can be
	 * smaller.
	 */
	if (!args->zone_capacity)
		args->zone_capacity = zone->capacity;
	if (disk_zone_is_last(disk, zone)) {
		args->last_zone_capacity = zone->capacity;
	} else if (zone->capacity != args->zone_capacity) {
		pr_warn("%s: Invalid variable zone capacity\n",
			disk->disk_name);
		return -ENODEV;
	}

	/*
	 * We need to track the write pointer of all zones that are not
	 * empty nor full. So make sure we have a zone write plug for
	 * such zone if the device has a zone write plug hash table.
	 */
	if (!disk->zone_wplugs_hash)
		return 0;

	wp_offset = blk_zone_wp_offset(zone);
	if (!wp_offset || wp_offset >= zone->capacity)
		return 0;

	zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags);
	if (!zwplug)
		return -ENOMEM;
	spin_unlock_irqrestore(&zwplug->lock, flags);
	disk_put_zone_wplug(zwplug);

	return 0;
}
/*
 * Helper function to check the validity of zones of a zoned block device.
 */
static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
				  void *data)
{
	struct blk_revalidate_zone_args *args = data;
	struct gendisk *disk = args->disk;
	sector_t zone_sectors = disk->queue->limits.chunk_sectors;
	int ret;

	/* Check for bad zones and holes in the zone report */
	if (zone->start != args->sector) {
		pr_warn("%s: Zone gap at sectors %llu..%llu\n",
			disk->disk_name, args->sector, zone->start);
		return -ENODEV;
	}

	if (zone->start >= get_capacity(disk) || !zone->len) {
		pr_warn("%s: Invalid zone start %llu, length %llu\n",
			disk->disk_name, zone->start, zone->len);
		return -ENODEV;
	}

	/*
	 * All zones must have the same size, with the exception on an eventual
	 * smaller last zone.
	 */
	if (!disk_zone_is_last(disk, zone)) {
		if (zone->len != zone_sectors) {
			pr_warn("%s: Invalid zoned device with non constant zone size\n",
				disk->disk_name);
			return -ENODEV;
		}
	} else if (zone->len > zone_sectors) {
		pr_warn("%s: Invalid zoned device with larger last zone size\n",
			disk->disk_name);
		return -ENODEV;
	}

	if (!zone->capacity || zone->capacity > zone->len) {
		pr_warn("%s: Invalid zone capacity\n",
			disk->disk_name);
		return -ENODEV;
	}

	/* Check zone type */
	switch (zone->type) {
	case BLK_ZONE_TYPE_CONVENTIONAL:
		ret = blk_revalidate_conv_zone(zone, idx, args);
		break;
	case BLK_ZONE_TYPE_SEQWRITE_REQ:
		ret = blk_revalidate_seq_zone(zone, idx, args);
		break;
	case BLK_ZONE_TYPE_SEQWRITE_PREF:
	default:
		pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
			disk->disk_name, (int)zone->type, zone->start);
		ret = -ENODEV;
	}

	if (!ret)
		args->sector += zone->len;

	return ret;
}
/**
 * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs
 * @disk:	Target disk
 *
 * Helper function for low-level device drivers to check, (re) allocate and
 * initialize resources used for managing zoned disks. This function should
 * normally be called by blk-mq based drivers when a zoned gendisk is probed
 * and when the zone configuration of the gendisk changes (e.g. after a
 * format). Before calling this function, the device driver must already have
 * set the device zone size (chunk_sectors limit) and the max zone append
 * limit. BIO based drivers can also use this function as long as the device
 * queue can be safely frozen.
 */
int blk_revalidate_disk_zones(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	sector_t zone_sectors = q->limits.chunk_sectors;
	sector_t capacity = get_capacity(disk);
	struct blk_revalidate_zone_args args = { };
	unsigned int noio_flag;
	int ret;

	if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
		return -EIO;

	if (!capacity)
		return -ENODEV;

	/*
	 * Checks that the device driver indicated a valid zone size and that
	 * the max zone append limit is set.
	 */
	if (!zone_sectors || !is_power_of_2(zone_sectors)) {
		pr_warn("%s: Invalid non power of two zone size (%llu)\n",
			disk->disk_name, zone_sectors);
		return -ENODEV;
	}

	if (!queue_max_zone_append_sectors(q)) {
		pr_warn("%s: Invalid 0 maximum zone append limit\n",
			disk->disk_name);
		return -ENODEV;
	}

	/*
	 * Ensure that all memory allocations in this context are done as if
	 * GFP_NOIO was specified.
	 */
	args.disk = disk;
	args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors);
	noio_flag = memalloc_noio_save();
	ret = disk_revalidate_zone_resources(disk, args.nr_zones);
	if (ret) {
		memalloc_noio_restore(noio_flag);
		return ret;
	}
	ret = disk->fops->report_zones(disk, 0, UINT_MAX,
				       blk_revalidate_zone_cb, &args);
	if (!ret) {
		pr_warn("%s: No zones reported\n", disk->disk_name);
		ret = -ENODEV;
	}
	memalloc_noio_restore(noio_flag);

	/*
	 * If zones were reported, make sure that the entire disk capacity
	 * has been checked.
	 */
	if (ret > 0 && args.sector != capacity) {
		pr_warn("%s: Missing zones from sector %llu\n",
			disk->disk_name, args.sector);
		ret = -ENODEV;
	}

	/*
	 * Set the new disk zone parameters only once the queue is frozen and
	 * all I/Os are completed.
	 */
	blk_mq_freeze_queue(q);
	if (ret > 0)
		ret = disk_update_zone_resources(disk, &args);
	else
		pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
	if (ret)
		disk_free_zone_resources(disk);
	blk_mq_unfreeze_queue(q);

	kfree(args.conv_zones_bitmap);

	return ret;
}
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
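/*
 * Example (illustrative only): a zoned blk-mq driver would typically set the
 * zone size (chunk_sectors limit) and the max zone append limit in its queue
 * limits when the device is probed, and then call:
 *
 *	ret = blk_revalidate_disk_zones(disk);
 *
 * It would call it again whenever the zone configuration changes (e.g. after
 * a low-level format), so that the zone write plug resources and the
 * conventional zone bitmap are rebuilt.
 */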
#ifdef CONFIG_BLK_DEBUG_FS

int queue_zone_wplugs_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct gendisk *disk = q->disk;
	struct blk_zone_wplug *zwplug;
	unsigned int zwp_wp_offset, zwp_flags;
	unsigned int zwp_zone_no, zwp_ref;
	unsigned int zwp_bio_list_size, i;
	unsigned long flags;

	if (!disk->zone_wplugs_hash)
		return 0;

	rcu_read_lock();
	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
		hlist_for_each_entry_rcu(zwplug,
					 &disk->zone_wplugs_hash[i], node) {
			spin_lock_irqsave(&zwplug->lock, flags);
			zwp_zone_no = zwplug->zone_no;
			zwp_flags = zwplug->flags;
			zwp_ref = atomic_read(&zwplug->ref);
			zwp_wp_offset = zwplug->wp_offset;
			zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
			spin_unlock_irqrestore(&zwplug->lock, flags);

			seq_printf(m, "%u 0x%x %u %u %u\n",
				   zwp_zone_no, zwp_flags, zwp_ref,
				   zwp_wp_offset, zwp_bio_list_size);
		}
	}
	rcu_read_unlock();

	return 0;
}

#endif