// SPDX-License-Identifier: GPL-2.0
/*
 * NVMe ZNS-ZBD command implementation.
 * Copyright (C) 2021 Western Digital Corporation or its affiliates.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/nvme.h>
#include <linux/blkdev.h>
#include "nvmet.h"

/*
 * We set the Memory Page Size Minimum (MPSMIN) for the target controller to 0,
 * to which 12 is added in nvme_enable_ctrl(), resulting in 2^12 = 4k as the
 * page_shift value. When calculating the ZASL use shift by 12.
 */
#define NVMET_MPSMIN_SHIFT	12

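/*
 * Example: a device advertising a maximum zone append size of 1024 sectors
 * (512 KiB) yields ilog2(1024 >> 3) = 7, i.e. a ZASL of 2^7 * 4 KiB = 512 KiB.
 */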
static inline u8 nvmet_zasl(unsigned int zone_append_sects)
{
	/*
	 * Zone Append Size Limit (zasl) is expressed as a power of 2 value
	 * with the minimum memory page size (i.e. 12) as unit.
	 */
	return ilog2(zone_append_sects >> (NVMET_MPSMIN_SHIFT - 9));
}

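/*
 * Report-zones callback used when enabling a namespace: the presence of any
 * conventional zone makes the device unusable as a ZNS namespace.
 */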
static int validate_conv_zones_cb(struct blk_zone *z,
				  unsigned int i, void *data)
{
	if (z->type == BLK_ZONE_TYPE_CONVENTIONAL)
		return -EPROTONOSUPPORT;
	return 0;
}

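/*
 * Validate a zoned block device for use as a ZNS namespace and, on success,
 * record the namespace block size shift and the subsystem ZASL.
 */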
bool nvmet_bdev_zns_enable(struct nvmet_ns *ns)
{
	u8 zasl = nvmet_zasl(bdev_max_zone_append_sectors(ns->bdev));
	struct gendisk *bd_disk = ns->bdev->bd_disk;
	int ret;

	if (ns->subsys->zasl) {
		if (ns->subsys->zasl > zasl)
			return false;
	} else {
		ns->subsys->zasl = zasl;
	}

	/*
	 * Generic zoned block devices may have a smaller last zone which is
	 * not supported by ZNS. Exclude zoned drives that have such smaller
	 * last zones.
	 */
	if (get_capacity(bd_disk) & (bdev_zone_sectors(ns->bdev) - 1))
		return false;

	/*
	 * ZNS does not define a conventional zone type. Use report zones
	 * to detect if the device has conventional zones and reject it if
	 * it does.
	 */
	ret = blkdev_report_zones(ns->bdev, 0, bdev_nr_zones(ns->bdev),
				  validate_conv_zones_cb, NULL);
	if (ret < 0)
		return false;

	ns->blksize_shift = blksize_bits(bdev_logical_block_size(ns->bdev));

	return true;
}

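/*
 * Build the ZNS Identify Controller data structure: the reported ZASL is
 * capped by the transport MDTS when the fabrics driver provides one.
 */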
void nvmet_execute_identify_ctrl_zns(struct nvmet_req *req)
{
	u8 zasl = req->sq->ctrl->subsys->zasl;
	struct nvmet_ctrl *ctrl = req->sq->ctrl;
	struct nvme_id_ctrl_zns *id;
	u16 status;

	id = kzalloc(sizeof(*id), GFP_KERNEL);
	if (!id) {
		status = NVME_SC_INTERNAL;
		goto out;
	}

	if (ctrl->ops->get_mdts)
		id->zasl = min_t(u8, ctrl->ops->get_mdts(ctrl), zasl);
	else
		id->zasl = zasl;

	status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));

	kfree(id);
out:
	nvmet_req_complete(req, status);
}

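/*
 * Build the ZNS Identify Namespace data structure: zone size (zsze) in
 * logical blocks plus the maximum open (mor) and active (mar) resources,
 * both reported as 0's based values.
 */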
void nvmet_execute_identify_ns_zns(struct nvmet_req *req)
{
	struct nvme_id_ns_zns *id_zns = NULL;
	u64 zsze;
	u16 status;
	u32 mar, mor;

	if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) {
		req->error_loc = offsetof(struct nvme_identify, nsid);
		status = NVME_SC_INVALID_NS | NVME_STATUS_DNR;
		goto out;
	}

	id_zns = kzalloc(sizeof(*id_zns), GFP_KERNEL);
	if (!id_zns) {
		status = NVME_SC_INTERNAL;
		goto out;
	}

	status = nvmet_req_find_ns(req);
	if (status)
		goto done;

	if (nvmet_ns_revalidate(req->ns)) {
		mutex_lock(&req->ns->subsys->lock);
		nvmet_ns_changed(req->ns->subsys, req->ns->nsid);
		mutex_unlock(&req->ns->subsys->lock);
	}

	if (!bdev_is_zoned(req->ns->bdev)) {
		status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
		req->error_loc = offsetof(struct nvme_identify, nsid);
		goto out;
	}

	zsze = (bdev_zone_sectors(req->ns->bdev) << 9) >>
					req->ns->blksize_shift;
	id_zns->lbafe[0].zsze = cpu_to_le64(zsze);

	mor = bdev_max_open_zones(req->ns->bdev);
	if (!mor)
		mor = U32_MAX;
	else
		mor--;
	id_zns->mor = cpu_to_le32(mor);

	mar = bdev_max_active_zones(req->ns->bdev);
	if (!mar)
		mar = U32_MAX;
	else
		mar--;
	id_zns->mar = cpu_to_le32(mar);

done:
	status = nvmet_copy_to_sgl(req, 0, id_zns, sizeof(*id_zns));
out:
	kfree(id_zns);
	nvmet_req_complete(req, status);
}

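/*
 * Validate the Zone Management Receive command fields (SLBA range, buffer
 * size, reporting option and zone state filter) before running the report.
 */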
static u16 nvmet_bdev_validate_zone_mgmt_recv(struct nvmet_req *req)
{
	sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->zmr.slba);
	u32 out_bufsize = (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2;

	if (sect >= get_capacity(req->ns->bdev->bd_disk)) {
		req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, slba);
		return NVME_SC_LBA_RANGE | NVME_STATUS_DNR;
	}

	if (out_bufsize < sizeof(struct nvme_zone_report)) {
		req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, numd);
		return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
	}

	if (req->cmd->zmr.zra != NVME_ZRA_ZONE_REPORT) {
		req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, zra);
		return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
	}

	switch (req->cmd->zmr.pr) {
	case 0:
	case 1:
		break;
	default:
		req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, pr);
		return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
	}

	switch (req->cmd->zmr.zrasf) {
	case NVME_ZRASF_ZONE_REPORT_ALL:
	case NVME_ZRASF_ZONE_STATE_EMPTY:
	case NVME_ZRASF_ZONE_STATE_IMP_OPEN:
	case NVME_ZRASF_ZONE_STATE_EXP_OPEN:
	case NVME_ZRASF_ZONE_STATE_CLOSED:
	case NVME_ZRASF_ZONE_STATE_FULL:
	case NVME_ZRASF_ZONE_STATE_READONLY:
	case NVME_ZRASF_ZONE_STATE_OFFLINE:
		break;
	default:
		req->error_loc =
			offsetof(struct nvme_zone_mgmt_recv_cmd, zrasf);
		return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
	}

	return NVME_SC_SUCCESS;
}

struct nvmet_report_zone_data {
	struct nvmet_req *req;
	u64 out_buf_offset;
	u64 out_nr_zones;
	u64 nr_zones;
	u8 zrasf;
};

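/*
 * Per-zone report callback: convert eligible zones to NVMe zone descriptors
 * and copy them into the host buffer, counting every matching zone.
 */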
static int nvmet_bdev_report_zone_cb(struct blk_zone *z, unsigned i, void *d)
{
	static const unsigned int nvme_zrasf_to_blk_zcond[] = {
		[NVME_ZRASF_ZONE_STATE_EMPTY]	 = BLK_ZONE_COND_EMPTY,
		[NVME_ZRASF_ZONE_STATE_IMP_OPEN] = BLK_ZONE_COND_IMP_OPEN,
		[NVME_ZRASF_ZONE_STATE_EXP_OPEN] = BLK_ZONE_COND_EXP_OPEN,
		[NVME_ZRASF_ZONE_STATE_CLOSED]	 = BLK_ZONE_COND_CLOSED,
		[NVME_ZRASF_ZONE_STATE_READONLY] = BLK_ZONE_COND_READONLY,
		[NVME_ZRASF_ZONE_STATE_FULL]	 = BLK_ZONE_COND_FULL,
		[NVME_ZRASF_ZONE_STATE_OFFLINE]	 = BLK_ZONE_COND_OFFLINE,
	};
	struct nvmet_report_zone_data *rz = d;

	if (rz->zrasf != NVME_ZRASF_ZONE_REPORT_ALL &&
	    z->cond != nvme_zrasf_to_blk_zcond[rz->zrasf])
		return 0;

	if (rz->nr_zones < rz->out_nr_zones) {
		struct nvme_zone_descriptor zdesc = { };
		u16 status;

		zdesc.zcap = nvmet_sect_to_lba(rz->req->ns, z->capacity);
		zdesc.zslba = nvmet_sect_to_lba(rz->req->ns, z->start);
		zdesc.wp = nvmet_sect_to_lba(rz->req->ns, z->wp);
		zdesc.za = z->reset ? 1 << 2 : 0;
		zdesc.zs = z->cond << 4;
		zdesc.zt = z->type;

		status = nvmet_copy_to_sgl(rz->req, rz->out_buf_offset, &zdesc,
					   sizeof(zdesc));
		if (status)
			return -EINVAL;

		rz->out_buf_offset += sizeof(zdesc);
	}

	rz->nr_zones++;

	return 0;
}

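/* Number of zones from the starting LBA of the report to the end of the device. */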
static unsigned long nvmet_req_nr_zones_from_slba(struct nvmet_req *req)
{
	unsigned int sect = nvmet_lba_to_sect(req->ns, req->cmd->zmr.slba);

	return bdev_nr_zones(req->ns->bdev) - bdev_zone_no(req->ns->bdev, sect);
}

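/* How many zone descriptors fit in the host buffer after the report header. */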
static unsigned long get_nr_zones_from_buf(struct nvmet_req *req, u32 bufsize)
{
	if (bufsize <= sizeof(struct nvme_zone_report))
		return 0;

	return (bufsize - sizeof(struct nvme_zone_report)) /
		sizeof(struct nvme_zone_descriptor);
}

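/*
 * Workqueue handler for Zone Management Receive: validate the command, run
 * blkdev_report_zones() from the requested SLBA and copy the resulting
 * report header (number of zones) back to the host.
 */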
static void nvmet_bdev_zone_zmgmt_recv_work(struct work_struct *w)
{
	struct nvmet_req *req = container_of(w, struct nvmet_req, z.zmgmt_work);
	sector_t start_sect = nvmet_lba_to_sect(req->ns, req->cmd->zmr.slba);
	unsigned long req_slba_nr_zones = nvmet_req_nr_zones_from_slba(req);
	u32 out_bufsize = (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2;
	__le64 nr_zones;
	u16 status;
	int ret;
	struct nvmet_report_zone_data rz_data = {
		.out_nr_zones = get_nr_zones_from_buf(req, out_bufsize),
		/* leave the place for report zone header */
		.out_buf_offset = sizeof(struct nvme_zone_report),
		.zrasf = req->cmd->zmr.zrasf,
		.nr_zones = 0,
		.req = req,
	};

	status = nvmet_bdev_validate_zone_mgmt_recv(req);
	if (status)
		goto out;

	if (!req_slba_nr_zones) {
		status = NVME_SC_SUCCESS;
		goto out;
	}

	ret = blkdev_report_zones(req->ns->bdev, start_sect, req_slba_nr_zones,
				  nvmet_bdev_report_zone_cb, &rz_data);
	if (ret < 0) {
		status = NVME_SC_INTERNAL;
		goto out;
	}

	/*
	 * When partial bit is set nr_zones must indicate the number of zone
	 * descriptors actually transferred.
	 */
	if (req->cmd->zmr.pr)
		rz_data.nr_zones = min(rz_data.nr_zones, rz_data.out_nr_zones);

	nr_zones = cpu_to_le64(rz_data.nr_zones);
	status = nvmet_copy_to_sgl(req, 0, &nr_zones, sizeof(nr_zones));

out:
	nvmet_req_complete(req, status);
}

void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
{
	INIT_WORK(&req->z.zmgmt_work, nvmet_bdev_zone_zmgmt_recv_work);
	queue_work(zbd_wq, &req->z.zmgmt_work);
}

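/* Map an NVMe Zone Send Action to the corresponding block layer request op. */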
static inline enum req_op zsa_req_op(u8 zsa)
{
	switch (zsa) {
	case NVME_ZONE_OPEN:
		return REQ_OP_ZONE_OPEN;
	case NVME_ZONE_CLOSE:
		return REQ_OP_ZONE_CLOSE;
	case NVME_ZONE_FINISH:
		return REQ_OP_ZONE_FINISH;
	case NVME_ZONE_RESET:
		return REQ_OP_ZONE_RESET;
	default:
		return REQ_OP_LAST;
	}
}

static u16 blkdev_zone_mgmt_errno_to_nvme_status(int ret)
{
	switch (ret) {
	case 0:
		return NVME_SC_SUCCESS;
	case -EINVAL:
	case -EIO:
		return NVME_SC_ZONE_INVALID_TRANSITION | NVME_STATUS_DNR;
	default:
		return NVME_SC_INTERNAL;
	}
}

struct nvmet_zone_mgmt_send_all_data {
	unsigned long *zbitmap;
	struct nvmet_req *req;
};

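/*
 * Mark in the bitmap the zones that are in a state eligible for the requested
 * zone send action (e.g. only closed zones are selected by "open all").
 */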
static int zmgmt_send_scan_cb(struct blk_zone *z, unsigned i, void *d)
{
	struct nvmet_zone_mgmt_send_all_data *data = d;

	switch (zsa_req_op(data->req->cmd->zms.zsa)) {
	case REQ_OP_ZONE_OPEN:
		switch (z->cond) {
		case BLK_ZONE_COND_CLOSED:
			break;
		default:
			return 0;
		}
		break;
	case REQ_OP_ZONE_CLOSE:
		switch (z->cond) {
		case BLK_ZONE_COND_IMP_OPEN:
		case BLK_ZONE_COND_EXP_OPEN:
			break;
		default:
			return 0;
		}
		break;
	case REQ_OP_ZONE_FINISH:
		switch (z->cond) {
		case BLK_ZONE_COND_IMP_OPEN:
		case BLK_ZONE_COND_EXP_OPEN:
		case BLK_ZONE_COND_CLOSED:
			break;
		default:
			return 0;
		}
		break;
	default:
		return -EINVAL;
	}

	set_bit(i, data->zbitmap);

	return 0;
}

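/*
 * The block layer has no "manage all zones" primitive for open/close/finish,
 * so emulate "select all" by scanning the device, building a bitmap of
 * eligible zones and chaining one zone management bio per selected zone.
 */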
static u16 nvmet_bdev_zone_mgmt_emulate_all(struct nvmet_req *req)
{
	struct block_device *bdev = req->ns->bdev;
	unsigned int nr_zones = bdev_nr_zones(bdev);
	struct bio *bio = NULL;
	sector_t sector = 0;
	int ret;
	struct nvmet_zone_mgmt_send_all_data d = {
		.req = req,
	};

	d.zbitmap = kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(*(d.zbitmap)),
				 GFP_NOIO, bdev->bd_disk->node_id);
	if (!d.zbitmap) {
		ret = -ENOMEM;
		goto out;
	}

	/* Scan and build bitmap of the eligible zones */
	ret = blkdev_report_zones(bdev, 0, nr_zones, zmgmt_send_scan_cb, &d);
	if (ret != nr_zones) {
		if (ret > 0)
			ret = -EIO;
		goto out;
	} else {
		/* We scanned all the zones */
		ret = 0;
	}

	while (sector < bdev_nr_sectors(bdev)) {
		if (test_bit(disk_zone_no(bdev->bd_disk, sector), d.zbitmap)) {
			bio = blk_next_bio(bio, bdev, 0,
				zsa_req_op(req->cmd->zms.zsa) | REQ_SYNC,
				GFP_KERNEL);
			bio->bi_iter.bi_sector = sector;
			/* This may take a while, so be nice to others */
			cond_resched();
		}
		sector += bdev_zone_sectors(bdev);
	}

	if (bio) {
		ret = submit_bio_wait(bio);
		bio_put(bio);
	}

out:
	kfree(d.zbitmap);

	return blkdev_zone_mgmt_errno_to_nvme_status(ret);
}

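/*
 * "Select all" handling: a reset of all zones maps directly to a block layer
 * zone reset over the whole capacity, while open/close/finish fall back to
 * the per-zone emulation above.
 */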
static u16 nvmet_bdev_execute_zmgmt_send_all(struct nvmet_req *req)
{
	int ret;

	switch (zsa_req_op(req->cmd->zms.zsa)) {
	case REQ_OP_ZONE_RESET:
		ret = blkdev_zone_mgmt(req->ns->bdev, REQ_OP_ZONE_RESET, 0,
				       get_capacity(req->ns->bdev->bd_disk));
		if (ret < 0)
			return blkdev_zone_mgmt_errno_to_nvme_status(ret);
		break;
	case REQ_OP_ZONE_OPEN:
	case REQ_OP_ZONE_CLOSE:
	case REQ_OP_ZONE_FINISH:
		return nvmet_bdev_zone_mgmt_emulate_all(req);
	default:
		/* this is needed to quiet compiler warning */
		req->error_loc = offsetof(struct nvme_zone_mgmt_send_cmd, zsa);
		return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
	}

	return NVME_SC_SUCCESS;
}

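/*
 * Workqueue handler for Zone Management Send: validate the zone send action
 * and the SLBA (which must be zone aligned), then issue the corresponding
 * block layer zone management operation.
 */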
static void nvmet_bdev_zmgmt_send_work(struct work_struct *w)
{
	struct nvmet_req *req = container_of(w, struct nvmet_req, z.zmgmt_work);
	sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->zms.slba);
	enum req_op op = zsa_req_op(req->cmd->zms.zsa);
	struct block_device *bdev = req->ns->bdev;
	sector_t zone_sectors = bdev_zone_sectors(bdev);
	u16 status = NVME_SC_SUCCESS;
	int ret;

	if (op == REQ_OP_LAST) {
		req->error_loc = offsetof(struct nvme_zone_mgmt_send_cmd, zsa);
		status = NVME_SC_ZONE_INVALID_TRANSITION | NVME_STATUS_DNR;
		goto out;
	}

	/* when select all bit is set slba field is ignored */
	if (req->cmd->zms.select_all) {
		status = nvmet_bdev_execute_zmgmt_send_all(req);
		goto out;
	}

	if (sect >= get_capacity(bdev->bd_disk)) {
		req->error_loc = offsetof(struct nvme_zone_mgmt_send_cmd, slba);
		status = NVME_SC_LBA_RANGE | NVME_STATUS_DNR;
		goto out;
	}

	if (sect & (zone_sectors - 1)) {
		req->error_loc = offsetof(struct nvme_zone_mgmt_send_cmd, slba);
		status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
		goto out;
	}

	ret = blkdev_zone_mgmt(bdev, op, sect, zone_sectors);
	if (ret < 0)
		status = blkdev_zone_mgmt_errno_to_nvme_status(ret);

out:
	nvmet_req_complete(req, status);
}

void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
{
	INIT_WORK(&req->z.zmgmt_work, nvmet_bdev_zmgmt_send_work);
	queue_work(zbd_wq, &req->z.zmgmt_work);
}

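/*
 * Zone append completion: on success the written LBA is returned to the host
 * in the completion queue entry result field.
 */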
static void nvmet_bdev_zone_append_bio_done(struct bio *bio)
{
	struct nvmet_req *req = bio->bi_private;

	if (bio->bi_status == BLK_STS_OK) {
		req->cqe->result.u64 =
			nvmet_sect_to_lba(req->ns, bio->bi_iter.bi_sector);
	}

	nvmet_req_complete(req, blk_to_nvme_status(req, bio->bi_status));
	nvmet_req_bio_put(req, bio);
}

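/*
 * Execute a Zone Append command: validate the transfer length against the
 * device zone append limit and the SLBA (which must be the start of a zone),
 * then build and submit a REQ_OP_ZONE_APPEND bio from the request SGL.
 */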
void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
{
	sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->rw.slba);
	const blk_opf_t opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
	u16 status = NVME_SC_SUCCESS;
	unsigned int total_len = 0;
	struct scatterlist *sg;
	u32 data_len = nvmet_rw_data_len(req);
	struct bio *bio;
	int sg_cnt;

	/* Request is completed on len mismatch in nvmet_check_transfer_len() */
	if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req)))
		return;

	if (data_len >
	    bdev_max_zone_append_sectors(req->ns->bdev) << SECTOR_SHIFT) {
		req->error_loc = offsetof(struct nvme_rw_command, length);
		status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
		goto out;
	}

	if (!req->sg_cnt) {
		nvmet_req_complete(req, 0);
		return;
	}

	if (sect >= get_capacity(req->ns->bdev->bd_disk)) {
		req->error_loc = offsetof(struct nvme_rw_command, slba);
		status = NVME_SC_LBA_RANGE | NVME_STATUS_DNR;
		goto out;
	}

	if (sect & (bdev_zone_sectors(req->ns->bdev) - 1)) {
		req->error_loc = offsetof(struct nvme_rw_command, slba);
		status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
		goto out;
	}

	if (nvmet_use_inline_bvec(req)) {
		bio = &req->z.inline_bio;
		bio_init(bio, req->ns->bdev, req->inline_bvec,
			 ARRAY_SIZE(req->inline_bvec), opf);
	} else {
		bio = bio_alloc(req->ns->bdev, req->sg_cnt, opf, GFP_KERNEL);
	}

	bio->bi_end_io = nvmet_bdev_zone_append_bio_done;
	bio->bi_iter.bi_sector = sect;
	bio->bi_private = req;
	if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA))
		bio->bi_opf |= REQ_FUA;

	for_each_sg(req->sg, sg, req->sg_cnt, sg_cnt) {
		unsigned int len = sg->length;

		if (bio_add_pc_page(bdev_get_queue(bio->bi_bdev), bio,
				    sg_page(sg), len, sg->offset) != len) {
			status = NVME_SC_INTERNAL;
			goto out_put_bio;
		}
		total_len += len;
	}

	if (total_len != data_len) {
		status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
		goto out_put_bio;
	}

	submit_bio(bio);
	return;

out_put_bio:
	nvmet_req_bio_put(req, bio);
out:
	nvmet_req_complete(req, status);
}

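/*
 * Dispatch ZNS specific I/O commands; anything else falls back to the
 * regular block device command parser.
 */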
u16 nvmet_bdev_zns_parse_io_cmd(struct nvmet_req *req)
{
	struct nvme_command *cmd = req->cmd;

	switch (cmd->common.opcode) {
	case nvme_cmd_zone_append:
		req->execute = nvmet_bdev_execute_zone_append;
		return 0;
	case nvme_cmd_zone_mgmt_recv:
		req->execute = nvmet_bdev_execute_zone_mgmt_recv;
		return 0;
	case nvme_cmd_zone_mgmt_send:
		req->execute = nvmet_bdev_execute_zone_mgmt_send;
		return 0;
	default:
		return nvmet_bdev_parse_io_cmd(req);
	}
}