/*
 * Zoned block device handling
 *
 * Copyright (c) 2015, Hannes Reinecke
 * Copyright (c) 2015, SUSE Linux GmbH
 *
 * Copyright (c) 2016, Damien Le Moal
 * Copyright (c) 2016, Western Digital
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/rbtree.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>

#include "blk.h"
static inline sector_t blk_zone_start(struct request_queue *q,
				      sector_t sector)
{
	sector_t zone_mask = blk_queue_zone_sectors(q) - 1;

	return sector & ~zone_mask;
}
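
/*
 * Example (illustrative, not from the original file): with 256 MiB zones on a
 * 512 B logical block device, blk_queue_zone_sectors(q) == 524288, so
 * zone_mask == 524287 and a request at sector 1000000 belongs to the zone
 * starting at 1000000 & ~524287 == 524288. The block layer requires a
 * power-of-two zone size, which is what makes this mask trick valid.
 */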
/*
 * Return true if a request is a write request that needs zone write locking.
 */
bool blk_req_needs_zone_write_lock(struct request *rq)
{
	if (!rq->q->seq_zones_wlock)
		return false;

	if (blk_rq_is_passthrough(rq))
		return false;

	switch (req_op(rq)) {
	case REQ_OP_WRITE_ZEROES:
	case REQ_OP_WRITE_SAME:
	case REQ_OP_WRITE:
		return blk_rq_zone_is_seq(rq);
	default:
		return false;
	}
}
EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);
void __blk_req_zone_write_lock(struct request *rq)
{
	if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
					  rq->q->seq_zones_wlock)))
		return;

	WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
	rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
}
EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock);
void __blk_req_zone_write_unlock(struct request *rq)
{
	rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED;
	if (rq->q->seq_zones_wlock)
		WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq),
						 rq->q->seq_zones_wlock));
}
EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
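
/*
 * Usage sketch (assumed, not part of this file): a request-based I/O
 * scheduler serializes writes to a sequential zone by pairing these helpers,
 * normally through the blk_req_zone_write_lock()/blk_req_zone_write_unlock()
 * wrappers declared in <linux/blkdev.h>:
 *
 *	if (blk_req_needs_zone_write_lock(rq))
 *		__blk_req_zone_write_lock(rq);	// before dispatching rq
 *	...
 *	blk_req_zone_write_unlock(rq);		// when rq completes
 *
 * mq-deadline uses this scheme to keep at most one write in flight per zone.
 */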
static inline unsigned int __blkdev_nr_zones(struct request_queue *q,
					     sector_t nr_sectors)
{
	unsigned long zone_sectors = blk_queue_zone_sectors(q);

	return (nr_sectors + zone_sectors - 1) >> ilog2(zone_sectors);
}
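
/*
 * Example (illustrative): with zone_sectors == 524288 (so ilog2() == 19), a
 * range of 1048577 sectors yields (1048577 + 524287) >> 19 == 3 zones: two
 * full zones plus a single sector spilling into a third. The round-up also
 * covers an eventual smaller last zone.
 */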
/**
 * blkdev_nr_zones - Get number of zones
 * @bdev:	Target block device
 *
 * Description:
 *    Return the total number of zones of a zoned block device.
 *    For a regular block device, the number of zones is always 0.
 */
unsigned int blkdev_nr_zones(struct block_device *bdev)
{
	struct request_queue *q = bdev_get_queue(bdev);

	if (!blk_queue_is_zoned(q))
		return 0;

	return __blkdev_nr_zones(q, bdev->bd_part->nr_sects);
}
EXPORT_SYMBOL_GPL(blkdev_nr_zones);
/*
 * Check that a zone report belongs to this partition, and if yes, fix its start
 * sector and write pointer and return true. Return false otherwise.
 */
static bool blkdev_report_zone(struct block_device *bdev, struct blk_zone *rep)
{
	sector_t offset = get_start_sect(bdev);

	if (rep->start < offset)
		return false;

	rep->start -= offset;
	if (rep->start + rep->len > bdev->bd_part->nr_sects)
		return false;

	if (rep->type == BLK_ZONE_TYPE_CONVENTIONAL)
		rep->wp = rep->start + rep->len;
	else
		rep->wp -= offset;
	return true;
}
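
/*
 * Example (illustrative): for a partition starting at device sector 524288,
 * a zone reported at device sector 1048576 is remapped to partition-relative
 * sector 524288. A conventional zone has no meaningful write pointer, so its
 * wp is pinned to the end of the zone instead of being offset-adjusted.
 */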
static int blk_report_zones(struct gendisk *disk, sector_t sector,
			    struct blk_zone *zones, unsigned int *nr_zones,
			    gfp_t gfp_mask)
{
	struct request_queue *q = disk->queue;
	unsigned int z = 0, n, nrz = *nr_zones;
	sector_t capacity = get_capacity(disk);
	int ret;

	while (z < nrz && sector < capacity) {
		n = nrz - z;
		ret = disk->fops->report_zones(disk, sector, &zones[z], &n,
					       gfp_mask);
		if (ret)
			return ret;
		if (!n)
			break;
		sector += blk_queue_zone_sectors(q) * n;
		z += n;
	}

	WARN_ON(z > *nr_zones);
	*nr_zones = z;

	return 0;
}
/**
 * blkdev_report_zones - Get zones information
 * @bdev:	Target block device
 * @sector:	Sector from which to report zones
 * @zones:	Array of zone structures where to return the zones information
 * @nr_zones:	Number of zone structures in the zone array
 * @gfp_mask:	Memory allocation flags (for bio_alloc)
 *
 * Description:
 *    Get zone information starting from the zone containing @sector.
 *    The number of zone information reported may be less than the number
 *    requested by @nr_zones. The number of zones actually reported is
 *    returned in @nr_zones.
 */
int blkdev_report_zones(struct block_device *bdev, sector_t sector,
			struct blk_zone *zones, unsigned int *nr_zones,
			gfp_t gfp_mask)
{
	struct request_queue *q = bdev_get_queue(bdev);
	unsigned int i, nrz;
	int ret;

	if (!blk_queue_is_zoned(q))
		return -EOPNOTSUPP;

	/*
	 * A block device that advertised itself as zoned must have a
	 * report_zones method. If it does not have one defined, the device
	 * driver has a bug. So warn about that.
	 */
	if (WARN_ON_ONCE(!bdev->bd_disk->fops->report_zones))
		return -EOPNOTSUPP;

	if (!*nr_zones || sector >= bdev->bd_part->nr_sects) {
		*nr_zones = 0;
		return 0;
	}

	nrz = min(*nr_zones,
		  __blkdev_nr_zones(q, bdev->bd_part->nr_sects - sector));
	ret = blk_report_zones(bdev->bd_disk, get_start_sect(bdev) + sector,
			       zones, &nrz, gfp_mask);
	if (ret)
		return ret;

	/* Remap the reported zones to the partition's sector range */
	for (i = 0; i < nrz; i++) {
		if (!blkdev_report_zone(bdev, zones))
			break;
		zones++;
	}

	*nr_zones = i;

	return 0;
}
EXPORT_SYMBOL_GPL(blkdev_report_zones);
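
/*
 * Caller sketch (assumed, not from this file): an in-kernel user, e.g. a
 * file system, could read back zone information as follows; the sizes and
 * variable names are hypothetical.
 *
 *	struct blk_zone *zones;
 *	unsigned int nr_zones = 16;
 *	int ret;
 *
 *	zones = kcalloc(nr_zones, sizeof(*zones), GFP_KERNEL);
 *	if (!zones)
 *		return -ENOMEM;
 *	ret = blkdev_report_zones(bdev, 0, zones, &nr_zones, GFP_KERNEL);
 *	// on success, nr_zones holds the number of entries actually filled
 *	kfree(zones);
 */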
/**
 * blkdev_reset_zones - Reset zones write pointer
 * @bdev:	Target block device
 * @sector:	Start sector of the first zone to reset
 * @nr_sectors:	Number of sectors, at least the length of one zone
 * @gfp_mask:	Memory allocation flags (for bio_alloc)
 *
 * Description:
 *    Reset the write pointer of the zones contained in the range
 *    @sector..@sector+@nr_sectors. Specifying the entire disk sector range
 *    is valid, but the specified range should not contain conventional zones.
 */
int blkdev_reset_zones(struct block_device *bdev,
		       sector_t sector, sector_t nr_sectors,
		       gfp_t gfp_mask)
{
	struct request_queue *q = bdev_get_queue(bdev);
	sector_t zone_sectors;
	sector_t end_sector = sector + nr_sectors;
	struct bio *bio = NULL;
	struct blk_plug plug;
	int ret;

	if (!blk_queue_is_zoned(q))
		return -EOPNOTSUPP;

	if (bdev_read_only(bdev))
		return -EPERM;

	if (!nr_sectors || end_sector > bdev->bd_part->nr_sects)
		/* Out of range */
		return -EINVAL;

	/* Check alignment (handle eventual smaller last zone) */
	zone_sectors = blk_queue_zone_sectors(q);
	if (sector & (zone_sectors - 1))
		return -EINVAL;

	if ((nr_sectors & (zone_sectors - 1)) &&
	    end_sector != bdev->bd_part->nr_sects)
		return -EINVAL;

	blk_start_plug(&plug);
	while (sector < end_sector) {

		bio = blk_next_bio(bio, 0, gfp_mask);
		bio->bi_iter.bi_sector = sector;
		bio_set_dev(bio, bdev);
		bio_set_op_attrs(bio, REQ_OP_ZONE_RESET, 0);

		sector += zone_sectors;

		/* This may take a while, so be nice to others */
		cond_resched();
	}

	ret = submit_bio_wait(bio);
	bio_put(bio);

	blk_finish_plug(&plug);

	return ret;
}
EXPORT_SYMBOL_GPL(blkdev_reset_zones);
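
/*
 * Caller sketch (assumed): resetting a single zone given a zone-aligned
 * start sector; passing the full device sector range resets every zone.
 *
 *	sector_t zone_sectors = blk_queue_zone_sectors(bdev_get_queue(bdev));
 *
 *	// zone_start is a hypothetical zone-aligned sector
 *	ret = blkdev_reset_zones(bdev, zone_start, zone_sectors, GFP_KERNEL);
 */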
/*
 * BLKREPORTZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
			      unsigned int cmd, unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct request_queue *q;
	struct blk_zone_report rep;
	struct blk_zone *zones;
	int ret;

	if (!argp)
		return -EINVAL;

	q = bdev_get_queue(bdev);
	if (!q)
		return -ENXIO;

	if (!blk_queue_is_zoned(q))
		return -ENOTTY;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;

	if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
		return -EFAULT;

	if (!rep.nr_zones)
		return -EINVAL;

	rep.nr_zones = min(blkdev_nr_zones(bdev), rep.nr_zones);

	zones = kvmalloc_array(rep.nr_zones, sizeof(struct blk_zone),
			       GFP_KERNEL | __GFP_ZERO);
	if (!zones)
		return -ENOMEM;

	ret = blkdev_report_zones(bdev, rep.sector,
				  zones, &rep.nr_zones,
				  GFP_KERNEL);
	if (ret)
		goto out;

	if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report))) {
		ret = -EFAULT;
		goto out;
	}

	if (rep.nr_zones) {
		if (copy_to_user(argp + sizeof(struct blk_zone_report), zones,
				 sizeof(struct blk_zone) * rep.nr_zones))
			ret = -EFAULT;
	}

out:
	kvfree(zones);

	return ret;
}
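
/*
 * Userspace sketch (assumed, not from this file): BLKREPORTZONE takes a
 * struct blk_zone_report immediately followed by the zone array, as defined
 * in <linux/blkzoned.h>.
 *
 *	struct blk_zone_report *rep;
 *	unsigned int n = 128;	// hypothetical array size
 *
 *	rep = calloc(1, sizeof(*rep) + n * sizeof(struct blk_zone));
 *	rep->sector = 0;
 *	rep->nr_zones = n;
 *	if (!ioctl(fd, BLKREPORTZONE, rep))
 *		printf("%u zones reported\n", rep->nr_zones);
 */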
/*
 * BLKRESETZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode,
			     unsigned int cmd, unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct request_queue *q;
	struct blk_zone_range zrange;

	if (!argp)
		return -EINVAL;

	q = bdev_get_queue(bdev);
	if (!q)
		return -ENXIO;

	if (!blk_queue_is_zoned(q))
		return -ENOTTY;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;

	if (!(mode & FMODE_WRITE))
		return -EBADF;

	if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
		return -EFAULT;

	return blkdev_reset_zones(bdev, zrange.sector, zrange.nr_sectors,
				  GFP_KERNEL);
}
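
/*
 * Userspace sketch (assumed): the device must be open for writing and the
 * range zone-aligned, mirroring the checks in blkdev_reset_zones().
 *
 *	struct blk_zone_range zrange = {
 *		.sector		= zone_start,	// hypothetical aligned sector
 *		.nr_sectors	= zone_sectors,
 *	};
 *
 *	ret = ioctl(fd, BLKRESETZONE, &zrange);
 */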
static inline unsigned long *blk_alloc_zone_bitmap(int node,
						   unsigned int nr_zones)
{
	return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long),
			    GFP_NOIO, node);
}
/*
 * Allocate an array of struct blk_zone to get nr_zones zone information.
 * The allocated array may be smaller than nr_zones.
 */
static struct blk_zone *blk_alloc_zones(int node, unsigned int *nr_zones)
{
	size_t size = *nr_zones * sizeof(struct blk_zone);
	struct page *page;
	int order;

	for (order = get_order(size); order >= 0; order--) {
		page = alloc_pages_node(node, GFP_NOIO | __GFP_ZERO, order);
		if (page) {
			*nr_zones = min_t(unsigned int, *nr_zones,
					  (PAGE_SIZE << order) / sizeof(struct blk_zone));
			return page_address(page);
		}
	}

	return NULL;
}
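
/*
 * Example (illustrative, assuming 4 KiB pages and a 64 B struct blk_zone):
 * asking for 10240 zones needs 640 KiB, i.e. an order-8 allocation. If that
 * fails, the loop falls back to order 7 (512 KiB) and trims *nr_zones to
 * 8192, trading report batch size for allocation reliability.
 */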
void blk_queue_free_zone_bitmaps(struct request_queue *q)
{
	kfree(q->seq_zones_bitmap);
	q->seq_zones_bitmap = NULL;
	kfree(q->seq_zones_wlock);
	q->seq_zones_wlock = NULL;
}
/**
 * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps
 * @disk:	Target disk
 *
 * Helper function for low-level device drivers to (re)allocate and initialize
 * a disk request queue zone bitmaps. This function should normally be called
 * within the disk ->revalidate method. For BIO based queues, no zone bitmap
 * is allocated.
 */
int blk_revalidate_disk_zones(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	unsigned int nr_zones = __blkdev_nr_zones(q, get_capacity(disk));
	unsigned long *seq_zones_wlock = NULL, *seq_zones_bitmap = NULL;
	unsigned int i, rep_nr_zones = 0, z = 0, nrz;
	struct blk_zone *zones = NULL;
	sector_t sector = 0;
	int ret = 0;

	/*
	 * BIO based queues do not use a scheduler so only q->nr_zones
	 * needs to be updated so that the sysfs exposed value is correct.
	 */
	if (!queue_is_rq_based(q)) {
		q->nr_zones = nr_zones;
		return 0;
	}

	if (!blk_queue_is_zoned(q) || !nr_zones) {
		nr_zones = 0;
		goto update;
	}

	/* Allocate bitmaps */
	ret = -ENOMEM;
	seq_zones_wlock = blk_alloc_zone_bitmap(q->node, nr_zones);
	if (!seq_zones_wlock)
		goto out;
	seq_zones_bitmap = blk_alloc_zone_bitmap(q->node, nr_zones);
	if (!seq_zones_bitmap)
		goto out;

	/* Get zone information and initialize seq_zones_bitmap */
	rep_nr_zones = nr_zones;
	zones = blk_alloc_zones(q->node, &rep_nr_zones);
	if (!zones)
		goto out;

	while (z < nr_zones) {
		nrz = min(nr_zones - z, rep_nr_zones);
		ret = blk_report_zones(disk, sector, zones, &nrz, GFP_NOIO);
		if (ret)
			goto out;
		if (!nrz)
			break;
		for (i = 0; i < nrz; i++) {
			if (zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL)
				set_bit(z, seq_zones_bitmap);
			z++;
		}
		sector += nrz * blk_queue_zone_sectors(q);
	}

	if (WARN_ON(z != nr_zones)) {
		ret = -EIO;
		goto out;
	}

update:
	/*
	 * Install the new bitmaps, making sure the queue is stopped and
	 * all I/Os are completed (i.e. a scheduler is not referencing the
	 * bitmaps).
	 */
	blk_mq_freeze_queue(q);
	q->nr_zones = nr_zones;
	swap(q->seq_zones_wlock, seq_zones_wlock);
	swap(q->seq_zones_bitmap, seq_zones_bitmap);
	blk_mq_unfreeze_queue(q);

out:
	free_pages((unsigned long)zones,
		   get_order(rep_nr_zones * sizeof(struct blk_zone)));
	kfree(seq_zones_wlock);
	kfree(seq_zones_bitmap);

	if (ret) {
		pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
		blk_mq_freeze_queue(q);
		blk_queue_free_zone_bitmaps(q);
		blk_mq_unfreeze_queue(q);
	}

	return ret;
}
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
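
/*
 * Driver sketch (assumed): a zoned block driver would typically call this
 * helper from its disk revalidation path once the device geometry is known;
 * the function name below is hypothetical.
 *
 *	static int my_zoned_revalidate_disk(struct gendisk *disk)
 *	{
 *		// refresh capacity and zone model first, then:
 *		return blk_revalidate_disk_zones(disk);
 *	}
 */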