/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2018 Nexenta Systems, Inc.
 * Copyright 2016 Tegile Systems, Inc. All rights reserved.
 * Copyright (c) 2016 The MathWorks, Inc. All rights reserved.
 * Copyright 2018 Joyent, Inc.
 */

/*
 * blkdev driver for NVMe compliant storage devices
 *
 * This driver was written to conform to version 1.2.1 of the NVMe
 * specification. It may work with newer versions, but that is completely
 * untested and disabled by default.
 *
 * The driver has only been tested on x86 systems and will not work on big-
 * endian systems without changes to the code accessing registers and data
 * structures used by the hardware.
 *
 * The driver will use a single interrupt while configuring the device as the
 * specification requires, but contrary to the specification it will try to use
 * a single-message MSI(-X) or FIXED interrupt. Later in the attach process it
 * will switch to multiple-message MSI(-X) if supported. The driver wants to
 * have one interrupt vector per CPU, but it will work correctly if fewer are
 * available. Interrupts can be shared by queues; the interrupt handler will
 * iterate through the I/O queue array by steps of n_intr_cnt. Usually only
 * the admin queue will share an interrupt with one I/O queue. The interrupt
 * handler will retrieve completed commands from all queues sharing an interrupt
 * vector and will post them to a taskq for completion processing.
 *
 * NVMe devices can have up to 65535 I/O queue pairs, with each queue holding up
 * to 65536 I/O commands. The driver will configure one I/O queue pair per
 * available interrupt vector, with the queue length usually much smaller than
 * the maximum of 65536. If the hardware doesn't provide enough queues, fewer
 * interrupt vectors will be used.
 *
 * Additionally the hardware provides a single special admin queue pair that can
 * hold up to 4096 admin commands.
 *
 * From the hardware perspective both queues of a queue pair are independent,
 * but they share some driver state: the command array (holding pointers to
 * commands currently being processed by the hardware) and the active command
 * counter. Access to a queue pair and the shared state is protected by
 * nq_mutex.
 *
 * When a command is submitted to a queue pair the active command counter is
 * incremented and a pointer to the command is stored in the command array. The
 * array index is used as command identifier (CID) in the submission queue
 * entry. Some commands may take a very long time to complete, and if the queue
 * wraps around in that time a submission may find the next array slot still
 * in use by a long-running command. In this case the array is sequentially
 * searched for the next free slot. The length of the command array is the same
 * as the configured queue length. Queue overrun is prevented by the semaphore,
 * so a command submission may block if the queue is full.
 *
 * For kernel core dump support the driver can do polled I/O. As interrupts are
 * turned off while dumping the driver will just submit a command in the regular
 * way, and then repeatedly attempt a command retrieval until it gets the
 * command back.
 *
 * NVMe devices can have multiple namespaces, each being an independent data
 * store. The driver supports multiple namespaces and creates a blkdev interface
 * for each namespace found. Namespaces can have various attributes to support
 * protection information. This driver does not support protection information
 * and ignores namespaces that have these attributes.
 *
 * As of NVMe 1.1 namespaces can have a 64bit Extended Unique Identifier
 * (EUI64). This driver uses the EUI64, if present, to generate the devid and
 * passes it to blkdev to use it in the device node names. As this is currently
 * untested, namespaces with an EUI64 are ignored by default.
 *
 * We currently support only (2 << NVME_MINOR_INST_SHIFT) - 2 namespaces in a
 * single controller. This is an artificial limit imposed by the driver to be
 * able to address a reasonable number of controllers and namespaces using a
 * 32bit minor node number.
 *
 * For each NVMe device the driver exposes one minor node for the controller and
 * one minor node for each namespace. The only operations supported by those
 * minor nodes are open(9E), close(9E), and ioctl(9E). This serves as the
 * interface for the nvmeadm(8) utility.
 *
 * This driver uses blkdev to do all the heavy lifting involved with presenting
 * a disk device to the system. As a result, the processing of I/O requests is
 * relatively simple as blkdev takes care of partitioning, boundary checks, DMA
 * setup, and splitting of transfers into manageable chunks.
 *
 * I/O requests coming in from blkdev are turned into NVM commands and posted to
 * an I/O queue. The queue is selected by taking the CPU id modulo the number of
 * queues. There is currently no timeout handling of I/O commands.
 *
 * Blkdev also supports querying device/media information and generating a
 * devid. The driver reports the best block size as determined by the namespace
 * format back to blkdev as physical block size to support partition and block
 * alignment. The devid is either based on the namespace EUI64, if present, or
 * composed using the device vendor ID, model number, serial number, and the
 * namespace ID.
 *
 * Error handling is currently limited to detecting fatal hardware errors,
 * either by asynchronous events, or synchronously through command status or
 * admin command timeouts. In case of severe errors the device is fenced off
 * and all further requests will return EIO. FMA is then called to fault the
 * device.
 *
 * The hardware has a limit for outstanding asynchronous event requests. Before
 * this limit is known the driver assumes it is at least 1 and posts a single
 * asynchronous request. Later, when the limit is known, more asynchronous event
 * requests are posted to allow quicker reception of error information. When an
 * asynchronous event is posted by the hardware the driver will parse the error
 * status fields and log information or fault the device, depending on the
 * severity of the asynchronous event. The asynchronous event request is then
 * reused and posted to the admin queue again.
 *
 * On command completion the command status is checked for errors. In case of
 * errors indicating a driver bug the driver panics. Almost all other error
 * status values just cause EIO to be returned.
 *
 * Command timeouts are currently detected for all admin commands except
 * asynchronous event requests. If a command times out and the hardware appears
 * to be healthy the driver attempts to abort the command. The original command
 * timeout is also applied to the abort command. If the abort times out too the
 * driver assumes the device to be dead, fences it off, and calls FMA to retire
 * it. In all other cases the aborted command should return immediately with a
 * status indicating it was aborted, and the driver will wait indefinitely for
 * that to happen. No timeout handling of normal I/O commands is presently done.
 *
 * Any command that times out due to the controller dropping dead will be put on
 * the nvme_lost_cmds list if it references DMA memory. This will prevent the
 * DMA memory from being reused by the system and later being written to by a
 * "dead" NVMe controller.
 *
 * Each queue pair has its own nq_mutex, which must be held when accessing the
 * associated queue registers or the shared state of the queue pair. Callers of
 * nvme_unqueue_cmd() must make sure that nq_mutex is held, while
 * nvme_submit_{admin,io}_cmd() and nvme_retrieve_cmd() take care of this
 * themselves.
 *
 * Each command also has its own nc_mutex, which is associated with the
 * condition variable nc_cv. It is only used on admin commands which are run
 * synchronously. In that case it must be held across calls to
 * nvme_submit_{admin,io}_cmd() and nvme_wait_cmd(), which is taken care of by
 * nvme_admin_cmd(). It must also be held whenever the completion state of the
 * command is changed or while an admin command timeout is handled.
 *
 * If both nc_mutex and nq_mutex must be held, nc_mutex must be acquired first.
 * More than one nc_mutex may only be held when aborting commands. In this case,
 * the nc_mutex of the command to be aborted must be held across the call to
 * nvme_abort_cmd() to prevent the command from completing while the abort is in
 * progress.
 *
 * Each minor node has its own nm_mutex, which protects the open count nm_ocnt
 * and exclusive-open flag nm_oexcl.
 *
 * Quiesce / Fast Reboot:
 *
 * The driver currently does not support fast reboot. A quiesce(9E) entry point
 * is still provided which is used to send a shutdown notification to the
 * device.
 *
 * Driver Configuration:
 *
 * The following driver properties can be changed to control some aspects of the
 * device operation (see the example fragment below):
 * - strict-version: can be set to 0 to allow devices conforming to newer
 *   major versions to be used
 * - ignore-unknown-vendor-status: can be set to 1 to not treat any vendor
 *   specific command status as a fatal error leading to device faulting
 * - admin-queue-len: the maximum length of the admin queue (16-4096)
 * - io-queue-len: the maximum length of the I/O queues (16-65536)
 * - async-event-limit: the maximum number of asynchronous event requests to be
 *   posted by the driver
 * - volatile-write-cache-enable: can be set to 0 to disable the volatile write
 *   cache
 * - min-phys-block-size: the minimum physical block size to report to blkdev,
 *   which is among other things the basis for ZFS vdev ashift
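 *
 * An illustrative nvme.conf fragment (assuming the standard driver.conf(4)
 * property mechanism; the property names are the ones listed above, the
 * values are examples only, not recommendations):
 *
 *      strict-version=1;
 *      ignore-unknown-vendor-status=0;
 *      admin-queue-len=256;
 *      io-queue-len=1024;
 *      async-event-limit=10;
 *      volatile-write-cache-enable=1;
 *      min-phys-block-size=4096;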
 *
 * TODO:
 * - figure out sane default for I/O queue depth reported to blkdev
 * - FMA handling of media errors
 * - support for devices supporting very large I/O requests using chained PRPs
 * - support for configuring hardware parameters like interrupt coalescing
 * - support for media formatting and hard partitioning into namespaces
 * - support for big-endian systems
 * - support for fast reboot
 * - support for firmware updates
 * - support for NVMe Subsystem Reset (1.1)
 * - support for Scatter/Gather lists (1.1)
 * - support for Reservations (1.1)
 * - support for power management
 */
#include <sys/byteorder.h>

#error nvme driver needs porting for big-endian platforms

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/bitmap.h>
#include <sys/sysmacros.h>
#include <sys/param.h>
#include <sys/varargs.h>
#include <sys/cpuvar.h>
#include <sys/disp.h>
#include <sys/blkdev.h>
#include <sys/atomic.h>
#include <sys/archsystm.h>
#include <sys/sata/sata_hba.h>
#include <sys/stat.h>
#include <sys/policy.h>
#include <sys/list.h>

#include <sys/nvme.h>

#include <sys/x86_archext.h>

#include "nvme_reg.h"
#include "nvme_var.h"

/*
 * Assertions to make sure that we've properly captured various aspects of the
 * packed structures and haven't broken them during updates.
 */
CTASSERT(sizeof (nvme_identify_ctrl_t) == 0x1000);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_oacs) == 256);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_sqes) == 512);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_subnqn) == 768);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_nvmof) == 1792);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_psd) == 2048);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_vs) == 3072);

CTASSERT(sizeof (nvme_identify_nsid_t) == 0x1000);
CTASSERT(offsetof(nvme_identify_nsid_t, id_fpi) == 32);
CTASSERT(offsetof(nvme_identify_nsid_t, id_nguid) == 104);
CTASSERT(offsetof(nvme_identify_nsid_t, id_lbaf) == 128);
CTASSERT(offsetof(nvme_identify_nsid_t, id_vs) == 384);

CTASSERT(sizeof (nvme_identify_primary_caps_t) == 0x1000);
CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vqfrt) == 32);
CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vifrt) == 64);

/* NVMe spec version supported */
static const int nvme_version_major = 1;

/* tunable for admin command timeout in seconds, default is 1s */
int nvme_admin_cmd_timeout = 1;

/* tunable for FORMAT NVM command timeout in seconds, default is 600s */
int nvme_format_cmd_timeout = 600;
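/*
 * Illustrative only: like other global driver variables, the two timeout
 * tunables above can be overridden at boot time from /etc/system, e.g.
 *
 *      set nvme:nvme_admin_cmd_timeout = 10
 *      set nvme:nvme_format_cmd_timeout = 1200
 *
 * The values shown are examples, not recommendations.
 */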
static int nvme_attach(dev_info_t *, ddi_attach_cmd_t);
static int nvme_detach(dev_info_t *, ddi_detach_cmd_t);
static int nvme_quiesce(dev_info_t *);
static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *);
static int nvme_setup_interrupts(nvme_t *, int, int);
static void nvme_release_interrupts(nvme_t *);
static uint_t nvme_intr(caddr_t, caddr_t);

static void nvme_shutdown(nvme_t *, int, boolean_t);
static boolean_t nvme_reset(nvme_t *, boolean_t);
static int nvme_init(nvme_t *);
static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
static void nvme_free_cmd(nvme_cmd_t *);
static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
static void nvme_admin_cmd(nvme_cmd_t *, int);
static void nvme_submit_admin_cmd(nvme_qpair_t *, nvme_cmd_t *);
static int nvme_submit_io_cmd(nvme_qpair_t *, nvme_cmd_t *);
static void nvme_submit_cmd_common(nvme_qpair_t *, nvme_cmd_t *);
static nvme_cmd_t *nvme_unqueue_cmd(nvme_t *, nvme_qpair_t *, int);
static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
static void nvme_wait_cmd(nvme_cmd_t *, uint_t);
static void nvme_wakeup_cmd(void *);
static void nvme_async_event_task(void *);

static int nvme_check_unknown_cmd_status(nvme_cmd_t *);
static int nvme_check_vendor_cmd_status(nvme_cmd_t *);
static int nvme_check_integrity_cmd_status(nvme_cmd_t *);
static int nvme_check_specific_cmd_status(nvme_cmd_t *);
static int nvme_check_generic_cmd_status(nvme_cmd_t *);
static inline int nvme_check_cmd_status(nvme_cmd_t *);

static int nvme_abort_cmd(nvme_cmd_t *, uint_t);
static void nvme_async_event(nvme_t *);
static int nvme_format_nvm(nvme_t *, boolean_t, uint32_t, uint8_t, boolean_t,
    uint8_t, boolean_t, uint8_t);
static int nvme_get_logpage(nvme_t *, boolean_t, void **, size_t *, uint8_t,
static int nvme_identify(nvme_t *, boolean_t, uint32_t, void **);
static int nvme_set_features(nvme_t *, boolean_t, uint32_t, uint8_t, uint32_t,
static int nvme_get_features(nvme_t *, boolean_t, uint32_t, uint8_t, uint32_t *,
static int nvme_write_cache_set(nvme_t *, boolean_t);
static int nvme_set_nqueues(nvme_t *, uint16_t *);

static void nvme_free_dma(nvme_dma_t *);
static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *,
static int nvme_zalloc_queue_dma(nvme_t *, uint32_t, uint16_t, uint_t,
static void nvme_free_qpair(nvme_qpair_t *);
static int nvme_alloc_qpair(nvme_t *, uint32_t, nvme_qpair_t **, int);
static int nvme_create_io_qpair(nvme_t *, nvme_qpair_t *, uint16_t);

static inline void nvme_put64(nvme_t *, uintptr_t, uint64_t);
static inline void nvme_put32(nvme_t *, uintptr_t, uint32_t);
static inline uint64_t nvme_get64(nvme_t *, uintptr_t);
static inline uint32_t nvme_get32(nvme_t *, uintptr_t);

static boolean_t nvme_check_regs_hdl(nvme_t *);
static boolean_t nvme_check_dma_hdl(nvme_dma_t *);

static int nvme_fill_prp(nvme_cmd_t *, bd_xfer_t *);

static void nvme_bd_xfer_done(void *);
static void nvme_bd_driveinfo(void *, bd_drive_t *);
static int nvme_bd_mediainfo(void *, bd_media_t *);
static int nvme_bd_cmd(nvme_namespace_t *, bd_xfer_t *, uint8_t);
static int nvme_bd_read(void *, bd_xfer_t *);
static int nvme_bd_write(void *, bd_xfer_t *);
static int nvme_bd_sync(void *, bd_xfer_t *);
static int nvme_bd_devid(void *, dev_info_t *, ddi_devid_t *);

static int nvme_prp_dma_constructor(void *, void *, int);
static void nvme_prp_dma_destructor(void *, void *);

static void nvme_prepare_devid(nvme_t *, uint32_t);

static int nvme_open(dev_t *, int, int, cred_t *);
static int nvme_close(dev_t, int, int, cred_t *);
static int nvme_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);

#define NVME_MINOR_INST_SHIFT   9
#define NVME_MINOR(inst, nsid)  (((inst) << NVME_MINOR_INST_SHIFT) | (nsid))
#define NVME_MINOR_INST(minor)  ((minor) >> NVME_MINOR_INST_SHIFT)
#define NVME_MINOR_NSID(minor)  ((minor) & ((1 << NVME_MINOR_INST_SHIFT) - 1))
#define NVME_MINOR_MAX          (NVME_MINOR(1, 0) - 2)
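/*
 * Illustrative example of the encoding above: with NVME_MINOR_INST_SHIFT == 9,
 * NVME_MINOR(3, 2) == (3 << 9) | 2 == 1538; decoding gives
 * NVME_MINOR_INST(1538) == 3 and NVME_MINOR_NSID(1538) == 2.
 */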
static void *nvme_state;
static kmem_cache_t *nvme_cmd_cache;
/*
 * DMA attributes for queue DMA memory
 *
 * Queue DMA memory must be page aligned. The maximum length of a queue is
 * 65536 entries, and an entry can be 64 bytes long.
 */
static ddi_dma_attr_t nvme_queue_dma_attr = {
        .dma_attr_version       = DMA_ATTR_V0,
        .dma_attr_addr_lo       = 0,
        .dma_attr_addr_hi       = 0xffffffffffffffffULL,
        .dma_attr_count_max     = (UINT16_MAX + 1) * sizeof (nvme_sqe_t) - 1,
        .dma_attr_align         = 0x1000,
        .dma_attr_burstsizes    = 0x7ff,
        .dma_attr_minxfer       = 0x1000,
        .dma_attr_maxxfer       = (UINT16_MAX + 1) * sizeof (nvme_sqe_t),
        .dma_attr_seg           = 0xffffffffffffffffULL,
        .dma_attr_sgllen        = 1,
        .dma_attr_granular      = 1,
};

/*
 * DMA attributes for transfers using Physical Region Page (PRP) entries
 *
 * A PRP entry describes one page of DMA memory using the page size specified
 * in the controller configuration's memory page size register (CC.MPS). It uses
 * a 64bit base address aligned to this page size. There is no limitation on
 * chaining PRPs together for arbitrarily large DMA transfers.
 */
static ddi_dma_attr_t nvme_prp_dma_attr = {
        .dma_attr_version       = DMA_ATTR_V0,
        .dma_attr_addr_lo       = 0,
        .dma_attr_addr_hi       = 0xffffffffffffffffULL,
        .dma_attr_count_max     = 0xfff,
        .dma_attr_align         = 0x1000,
        .dma_attr_burstsizes    = 0x7ff,
        .dma_attr_minxfer       = 0x1000,
        .dma_attr_maxxfer       = 0x1000,
        .dma_attr_seg           = 0xfff,
        .dma_attr_sgllen        = -1,
        .dma_attr_granular      = 1,
};

/*
 * DMA attributes for transfers using scatter/gather lists
 *
 * A SGL entry describes a chunk of DMA memory using a 64bit base address and a
 * 32bit length field. SGL Segment and SGL Last Segment entries require the
 * length to be a multiple of 16 bytes.
 */
static ddi_dma_attr_t nvme_sgl_dma_attr = {
        .dma_attr_version       = DMA_ATTR_V0,
        .dma_attr_addr_lo       = 0,
        .dma_attr_addr_hi       = 0xffffffffffffffffULL,
        .dma_attr_count_max     = 0xffffffffUL,
        .dma_attr_burstsizes    = 0x7ff,
        .dma_attr_minxfer       = 0x10,
        .dma_attr_maxxfer       = 0xfffffffffULL,
        .dma_attr_seg           = 0xffffffffffffffffULL,
        .dma_attr_sgllen        = -1,
        .dma_attr_granular      = 0x10,
};
static ddi_device_acc_attr_t nvme_reg_acc_attr = {
        .devacc_attr_version    = DDI_DEVICE_ATTR_V0,
        .devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC,
        .devacc_attr_dataorder  = DDI_STRICTORDER_ACC
};

static struct cb_ops nvme_cb_ops = {
        .cb_open        = nvme_open,
        .cb_close       = nvme_close,
        .cb_strategy    = nodev,
        .cb_ioctl       = nvme_ioctl,
        .cb_chpoll      = nochpoll,
        .cb_prop_op     = ddi_prop_op,
        .cb_flag        = D_NEW | D_MP,
};

static struct dev_ops nvme_dev_ops = {
        .devo_rev       = DEVO_REV,
        .devo_getinfo   = ddi_no_info,
        .devo_identify  = nulldev,
        .devo_probe     = nulldev,
        .devo_attach    = nvme_attach,
        .devo_detach    = nvme_detach,
        .devo_cb_ops    = &nvme_cb_ops,
        .devo_bus_ops   = NULL,
        .devo_quiesce   = nvme_quiesce,
};

static struct modldrv nvme_modldrv = {
        .drv_modops     = &mod_driverops,
        .drv_linkinfo   = "NVMe v1.1b",
        .drv_dev_ops    = &nvme_dev_ops
};

static struct modlinkage nvme_modlinkage = {
        .ml_linkage     = { &nvme_modldrv, NULL }
};

static bd_ops_t nvme_bd_ops = {
        .o_version      = BD_OPS_VERSION_0,
        .o_drive_info   = nvme_bd_driveinfo,
        .o_media_info   = nvme_bd_mediainfo,
        .o_devid_init   = nvme_bd_devid,
        .o_sync_cache   = nvme_bd_sync,
        .o_read         = nvme_bd_read,
        .o_write        = nvme_bd_write,
};
/*
 * This list will hold commands that have timed out and couldn't be aborted.
 * As we don't know what the hardware may still do with the DMA memory we can't
 * free them, so we'll keep them forever on this list where we can easily look
 * at them.
 */
static struct list nvme_lost_cmds;
static kmutex_t nvme_lc_mutex;
        error = ddi_soft_state_init(&nvme_state, sizeof (nvme_t), 1);
        if (error != DDI_SUCCESS)

        nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache",
            sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0);

        mutex_init(&nvme_lc_mutex, NULL, MUTEX_DRIVER, NULL);
        list_create(&nvme_lost_cmds, sizeof (nvme_cmd_t),
            offsetof(nvme_cmd_t, nc_list));

        bd_mod_init(&nvme_dev_ops);

        error = mod_install(&nvme_modlinkage);
        if (error != DDI_SUCCESS) {
                ddi_soft_state_fini(&nvme_state);
                mutex_destroy(&nvme_lc_mutex);
                list_destroy(&nvme_lost_cmds);
                bd_mod_fini(&nvme_dev_ops);

        if (!list_is_empty(&nvme_lost_cmds))
                return (DDI_FAILURE);

        error = mod_remove(&nvme_modlinkage);
        if (error == DDI_SUCCESS) {
                ddi_soft_state_fini(&nvme_state);
                kmem_cache_destroy(nvme_cmd_cache);
                mutex_destroy(&nvme_lc_mutex);
                list_destroy(&nvme_lost_cmds);
                bd_mod_fini(&nvme_dev_ops);

_info(struct modinfo *modinfop)
        return (mod_info(&nvme_modlinkage, modinfop));

nvme_put64(nvme_t *nvme, uintptr_t reg, uint64_t val)
        ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);

        /*LINTED: E_BAD_PTR_CAST_ALIGN*/
        ddi_put64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg), val);

nvme_put32(nvme_t *nvme, uintptr_t reg, uint32_t val)
        ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);

        /*LINTED: E_BAD_PTR_CAST_ALIGN*/
        ddi_put32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg), val);

static inline uint64_t
nvme_get64(nvme_t *nvme, uintptr_t reg)
        ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);

        /*LINTED: E_BAD_PTR_CAST_ALIGN*/
        val = ddi_get64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg));

static inline uint32_t
nvme_get32(nvme_t *nvme, uintptr_t reg)
        ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);

        /*LINTED: E_BAD_PTR_CAST_ALIGN*/
        val = ddi_get32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg));
nvme_check_regs_hdl(nvme_t *nvme)
        ddi_fm_error_t error;

        ddi_fm_acc_err_get(nvme->n_regh, &error, DDI_FME_VERSION);

        if (error.fme_status != DDI_FM_OK)

nvme_check_dma_hdl(nvme_dma_t *dma)
        ddi_fm_error_t error;

        ddi_fm_dma_err_get(dma->nd_dmah, &error, DDI_FME_VERSION);

        if (error.fme_status != DDI_FM_OK)

nvme_free_dma_common(nvme_dma_t *dma)
        if (dma->nd_dmah != NULL)
                (void) ddi_dma_unbind_handle(dma->nd_dmah);
        if (dma->nd_acch != NULL)
                ddi_dma_mem_free(&dma->nd_acch);
        if (dma->nd_dmah != NULL)
                ddi_dma_free_handle(&dma->nd_dmah);

nvme_free_dma(nvme_dma_t *dma)
        nvme_free_dma_common(dma);
        kmem_free(dma, sizeof (*dma));

nvme_prp_dma_destructor(void *buf, void *private)
        nvme_dma_t *dma = (nvme_dma_t *)buf;

        nvme_free_dma_common(dma);

nvme_alloc_dma_common(nvme_t *nvme, nvme_dma_t *dma,
    size_t len, uint_t flags, ddi_dma_attr_t *dma_attr)
        if (ddi_dma_alloc_handle(nvme->n_dip, dma_attr, DDI_DMA_SLEEP, NULL,
            &dma->nd_dmah) != DDI_SUCCESS) {
                /*
                 * Due to DDI_DMA_SLEEP this can't be DDI_DMA_NORESOURCES, and
                 * the only other possible error is DDI_DMA_BADATTR which
                 * indicates a driver bug which should cause a panic.
                 */
                dev_err(nvme->n_dip, CE_PANIC,
                    "!failed to get DMA handle, check DMA attributes");
                return (DDI_FAILURE);

        /*
         * ddi_dma_mem_alloc() can only fail when DDI_DMA_NOSLEEP is specified
         * or the flags are conflicting, which isn't the case here.
         */
        (void) ddi_dma_mem_alloc(dma->nd_dmah, len, &nvme->n_reg_acc_attr,
            DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, &dma->nd_memp,
            &dma->nd_len, &dma->nd_acch);

        if (ddi_dma_addr_bind_handle(dma->nd_dmah, NULL, dma->nd_memp,
            dma->nd_len, flags | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL,
            &dma->nd_cookie, &dma->nd_ncookie) != DDI_DMA_MAPPED) {
                dev_err(nvme->n_dip, CE_WARN,
                    "!failed to bind DMA memory");
                atomic_inc_32(&nvme->n_dma_bind_err);
                nvme_free_dma_common(dma);
                return (DDI_FAILURE);

        return (DDI_SUCCESS);
nvme_zalloc_dma(nvme_t *nvme, size_t len, uint_t flags,
    ddi_dma_attr_t *dma_attr, nvme_dma_t **ret)
        nvme_dma_t *dma = kmem_zalloc(sizeof (nvme_dma_t), KM_SLEEP);

        if (nvme_alloc_dma_common(nvme, dma, len, flags, dma_attr) !=
                kmem_free(dma, sizeof (nvme_dma_t));
                return (DDI_FAILURE);

        bzero(dma->nd_memp, dma->nd_len);

        return (DDI_SUCCESS);

nvme_prp_dma_constructor(void *buf, void *private, int flags)
        nvme_dma_t *dma = (nvme_dma_t *)buf;
        nvme_t *nvme = (nvme_t *)private;

        if (nvme_alloc_dma_common(nvme, dma, nvme->n_pagesize,
            DDI_DMA_READ, &nvme->n_prp_dma_attr) != DDI_SUCCESS) {

        ASSERT(dma->nd_ncookie == 1);

        dma->nd_cached = B_TRUE;

nvme_zalloc_queue_dma(nvme_t *nvme, uint32_t nentry, uint16_t qe_len,
    uint_t flags, nvme_dma_t **dma)
        uint32_t len = nentry * qe_len;
        ddi_dma_attr_t q_dma_attr = nvme->n_queue_dma_attr;

        len = roundup(len, nvme->n_pagesize);

        q_dma_attr.dma_attr_minxfer = len;

        if (nvme_zalloc_dma(nvme, len, flags, &q_dma_attr, dma)
                dev_err(nvme->n_dip, CE_WARN,
                    "!failed to get DMA memory for queue");

        if ((*dma)->nd_ncookie != 1) {
                dev_err(nvme->n_dip, CE_WARN,
                    "!got too many cookies for queue DMA");

        return (DDI_SUCCESS);

        return (DDI_FAILURE);
nvme_free_qpair(nvme_qpair_t *qp)
        mutex_destroy(&qp->nq_mutex);
        sema_destroy(&qp->nq_sema);

        if (qp->nq_sqdma != NULL)
                nvme_free_dma(qp->nq_sqdma);
        if (qp->nq_cqdma != NULL)
                nvme_free_dma(qp->nq_cqdma);

        if (qp->nq_active_cmds > 0)
                for (i = 0; i != qp->nq_nentry; i++)
                        if (qp->nq_cmd[i] != NULL)
                                nvme_free_cmd(qp->nq_cmd[i]);

        if (qp->nq_cmd != NULL)
                kmem_free(qp->nq_cmd, sizeof (nvme_cmd_t *) * qp->nq_nentry);

        kmem_free(qp, sizeof (nvme_qpair_t));

nvme_alloc_qpair(nvme_t *nvme, uint32_t nentry, nvme_qpair_t **nqp,
        nvme_qpair_t *qp = kmem_zalloc(sizeof (*qp), KM_SLEEP);

        mutex_init(&qp->nq_mutex, NULL, MUTEX_DRIVER,
            DDI_INTR_PRI(nvme->n_intr_pri));
        sema_init(&qp->nq_sema, nentry, NULL, SEMA_DRIVER, NULL);

        if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_sqe_t),
            DDI_DMA_WRITE, &qp->nq_sqdma) != DDI_SUCCESS)

        if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_cqe_t),
            DDI_DMA_READ, &qp->nq_cqdma) != DDI_SUCCESS)

        qp->nq_sq = (nvme_sqe_t *)qp->nq_sqdma->nd_memp;
        qp->nq_cq = (nvme_cqe_t *)qp->nq_cqdma->nd_memp;
        qp->nq_nentry = nentry;

        qp->nq_sqtdbl = NVME_REG_SQTDBL(nvme, idx);
        qp->nq_cqhdbl = NVME_REG_CQHDBL(nvme, idx);

        qp->nq_cmd = kmem_zalloc(sizeof (nvme_cmd_t *) * nentry, KM_SLEEP);

        return (DDI_SUCCESS);

        return (DDI_FAILURE);
nvme_alloc_cmd(nvme_t *nvme, int kmflag)
        nvme_cmd_t *cmd = kmem_cache_alloc(nvme_cmd_cache, kmflag);

        bzero(cmd, sizeof (nvme_cmd_t));

        mutex_init(&cmd->nc_mutex, NULL, MUTEX_DRIVER,
            DDI_INTR_PRI(nvme->n_intr_pri));
        cv_init(&cmd->nc_cv, NULL, CV_DRIVER, NULL);

nvme_free_cmd(nvme_cmd_t *cmd)
        /* Don't free commands on the lost commands list. */
        if (list_link_active(&cmd->nc_list))

        if (cmd->nc_dma->nd_cached)
                kmem_cache_free(cmd->nc_nvme->n_prp_cache,

        nvme_free_dma(cmd->nc_dma);

        cv_destroy(&cmd->nc_cv);
        mutex_destroy(&cmd->nc_mutex);

        kmem_cache_free(nvme_cmd_cache, cmd);

nvme_submit_admin_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
        sema_p(&qp->nq_sema);
        nvme_submit_cmd_common(qp, cmd);

nvme_submit_io_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
        if (sema_tryp(&qp->nq_sema) == 0)

        nvme_submit_cmd_common(qp, cmd);
nvme_submit_cmd_common(nvme_qpair_t *qp, nvme_cmd_t *cmd)
        nvme_reg_sqtdbl_t tail = { 0 };

        mutex_enter(&qp->nq_mutex);
        cmd->nc_completed = B_FALSE;

        /*
         * Try to insert the cmd into the active cmd array at the nq_next_cmd
         * slot. If the slot is already occupied advance to the next slot and
         * try again. This can happen for long running commands like async event
         * requests.
         */
        while (qp->nq_cmd[qp->nq_next_cmd] != NULL)
                qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
        qp->nq_cmd[qp->nq_next_cmd] = cmd;

        qp->nq_active_cmds++;

        cmd->nc_sqe.sqe_cid = qp->nq_next_cmd;
        bcopy(&cmd->nc_sqe, &qp->nq_sq[qp->nq_sqtail], sizeof (nvme_sqe_t));
        (void) ddi_dma_sync(qp->nq_sqdma->nd_dmah,
            sizeof (nvme_sqe_t) * qp->nq_sqtail,
            sizeof (nvme_sqe_t), DDI_DMA_SYNC_FORDEV);
        qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;

        tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry;
        nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r);

        mutex_exit(&qp->nq_mutex);
nvme_unqueue_cmd(nvme_t *nvme, nvme_qpair_t *qp, int cid)
        ASSERT(mutex_owned(&qp->nq_mutex));
        ASSERT3S(cid, <, qp->nq_nentry);

        cmd = qp->nq_cmd[cid];
        qp->nq_cmd[cid] = NULL;
        ASSERT3U(qp->nq_active_cmds, >, 0);
        qp->nq_active_cmds--;
        sema_v(&qp->nq_sema);

        ASSERT3P(cmd, !=, NULL);
        ASSERT3P(cmd->nc_nvme, ==, nvme);
        ASSERT3S(cmd->nc_sqe.sqe_cid, ==, cid);
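/*
 * Note (added, illustrative): nvme_retrieve_cmd() below relies on the NVMe
 * completion queue phase tag. The controller toggles the phase bit it writes
 * into completion entries each time the queue wraps around, so comparing the
 * entry's sf_p bit against the queue's expected nq_phase tells the driver
 * whether the entry at nq_cqhead is new or has already been consumed.
 */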
nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp)
        nvme_reg_cqhdbl_t head = { 0 };

        (void) ddi_dma_sync(qp->nq_cqdma->nd_dmah, 0,
            sizeof (nvme_cqe_t) * qp->nq_nentry, DDI_DMA_SYNC_FORKERNEL);

        mutex_enter(&qp->nq_mutex);
        cqe = &qp->nq_cq[qp->nq_cqhead];

        /* Check phase tag of CQE. Hardware inverts it for new entries. */
        if (cqe->cqe_sf.sf_p == qp->nq_phase) {
                mutex_exit(&qp->nq_mutex);

        ASSERT(nvme->n_ioq[cqe->cqe_sqid] == qp);

        cmd = nvme_unqueue_cmd(nvme, qp, cqe->cqe_cid);

        ASSERT(cmd->nc_sqid == cqe->cqe_sqid);
        bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t));

        qp->nq_sqhead = cqe->cqe_sqhd;

        head.b.cqhdbl_cqh = qp->nq_cqhead = (qp->nq_cqhead + 1) % qp->nq_nentry;

        /* Toggle phase on wrap-around. */
        if (qp->nq_cqhead == 0)
                qp->nq_phase = qp->nq_phase ? 0 : 1;

        nvme_put32(cmd->nc_nvme, qp->nq_cqhdbl, head.r);
        mutex_exit(&qp->nq_mutex);
nvme_check_unknown_cmd_status(nvme_cmd_t *cmd)
        nvme_cqe_t *cqe = &cmd->nc_cqe;

        dev_err(cmd->nc_nvme->n_dip, CE_WARN,
            "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
            "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
            cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
            cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);

        if (cmd->nc_xfer != NULL)
                bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);

        if (cmd->nc_nvme->n_strict_version) {
                cmd->nc_nvme->n_dead = B_TRUE;
                ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST);

nvme_check_vendor_cmd_status(nvme_cmd_t *cmd)
        nvme_cqe_t *cqe = &cmd->nc_cqe;

        dev_err(cmd->nc_nvme->n_dip, CE_WARN,
            "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
            "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
            cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
            cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);

        if (!cmd->nc_nvme->n_ignore_unknown_vendor_status) {
                cmd->nc_nvme->n_dead = B_TRUE;
                ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST);
nvme_check_integrity_cmd_status(nvme_cmd_t *cmd)
        nvme_cqe_t *cqe = &cmd->nc_cqe;

        switch (cqe->cqe_sf.sf_sc) {
        case NVME_CQE_SC_INT_NVM_WRITE:
                /* TODO: post ereport */
                if (cmd->nc_xfer != NULL)
                        bd_error(cmd->nc_xfer, BD_ERR_MEDIA);

        case NVME_CQE_SC_INT_NVM_READ:
                /* TODO: post ereport */
                if (cmd->nc_xfer != NULL)
                        bd_error(cmd->nc_xfer, BD_ERR_MEDIA);

        return (nvme_check_unknown_cmd_status(cmd));

nvme_check_generic_cmd_status(nvme_cmd_t *cmd)
        nvme_cqe_t *cqe = &cmd->nc_cqe;

        switch (cqe->cqe_sf.sf_sc) {
        case NVME_CQE_SC_GEN_SUCCESS:

        /*
         * Errors indicating a bug in the driver should cause a panic.
         */
        case NVME_CQE_SC_GEN_INV_OPC:
                /* Invalid Command Opcode */
                if (!cmd->nc_dontpanic)
                        dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
                            "programming error: invalid opcode in cmd %p",

        case NVME_CQE_SC_GEN_INV_FLD:
                /* Invalid Field in Command */
                if (!cmd->nc_dontpanic)
                        dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
                            "programming error: invalid field in cmd %p",

        case NVME_CQE_SC_GEN_ID_CNFL:
                /* Command ID Conflict */
                dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
                    "cmd ID conflict in cmd %p", (void *)cmd);

        case NVME_CQE_SC_GEN_INV_NS:
                /* Invalid Namespace or Format */
                if (!cmd->nc_dontpanic)
                        dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
                            "programming error: invalid NS/format in cmd %p",

        case NVME_CQE_SC_GEN_NVM_LBA_RANGE:
                /* LBA Out Of Range */
                dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
                    "LBA out of range in cmd %p", (void *)cmd);

        /*
         * Non-fatal errors, handle gracefully.
         */
        case NVME_CQE_SC_GEN_DATA_XFR_ERR:
                /* Data Transfer Error (DMA) */
                /* TODO: post ereport */
                atomic_inc_32(&cmd->nc_nvme->n_data_xfr_err);
                if (cmd->nc_xfer != NULL)
                        bd_error(cmd->nc_xfer, BD_ERR_NTRDY);

        case NVME_CQE_SC_GEN_INTERNAL_ERR:
                /*
                 * Internal Error. The spec (v1.0, section 4.5.1.2) says
                 * detailed error information is returned as async event,
                 * so we pretty much ignore the error here and handle it
                 * in the async event handler.
                 */
                atomic_inc_32(&cmd->nc_nvme->n_internal_err);
                if (cmd->nc_xfer != NULL)
                        bd_error(cmd->nc_xfer, BD_ERR_NTRDY);

        case NVME_CQE_SC_GEN_ABORT_REQUEST:
                /*
                 * Command Abort Requested. This normally happens only when a
                 * command times out.
                 */
                /* TODO: post ereport or change blkdev to handle this? */
                atomic_inc_32(&cmd->nc_nvme->n_abort_rq_err);

        case NVME_CQE_SC_GEN_ABORT_PWRLOSS:
                /* Command Aborted due to Power Loss Notification */
                ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST);
                cmd->nc_nvme->n_dead = B_TRUE;

        case NVME_CQE_SC_GEN_ABORT_SQ_DEL:
                /* Command Aborted due to SQ Deletion */
                atomic_inc_32(&cmd->nc_nvme->n_abort_sq_del);

        case NVME_CQE_SC_GEN_NVM_CAP_EXC:
                /* Capacity Exceeded */
                atomic_inc_32(&cmd->nc_nvme->n_nvm_cap_exc);
                if (cmd->nc_xfer != NULL)
                        bd_error(cmd->nc_xfer, BD_ERR_MEDIA);

        case NVME_CQE_SC_GEN_NVM_NS_NOTRDY:
                /* Namespace Not Ready */
                atomic_inc_32(&cmd->nc_nvme->n_nvm_ns_notrdy);
                if (cmd->nc_xfer != NULL)
                        bd_error(cmd->nc_xfer, BD_ERR_NTRDY);

        return (nvme_check_unknown_cmd_status(cmd));
nvme_check_specific_cmd_status(nvme_cmd_t *cmd)
        nvme_cqe_t *cqe = &cmd->nc_cqe;

        switch (cqe->cqe_sf.sf_sc) {
        case NVME_CQE_SC_SPC_INV_CQ:
                /* Completion Queue Invalid */
                ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE);
                atomic_inc_32(&cmd->nc_nvme->n_inv_cq_err);

        case NVME_CQE_SC_SPC_INV_QID:
                /* Invalid Queue Identifier */
                ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE ||
                    cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_SQUEUE ||
                    cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE ||
                    cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE);
                atomic_inc_32(&cmd->nc_nvme->n_inv_qid_err);

        case NVME_CQE_SC_SPC_MAX_QSZ_EXC:
                /* Max Queue Size Exceeded */
                ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE ||
                    cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE);
                atomic_inc_32(&cmd->nc_nvme->n_max_qsz_exc);

        case NVME_CQE_SC_SPC_ABRT_CMD_EXC:
                /* Abort Command Limit Exceeded */
                ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT);
                dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
                    "abort command limit exceeded in cmd %p", (void *)cmd);

        case NVME_CQE_SC_SPC_ASYNC_EVREQ_EXC:
                /* Async Event Request Limit Exceeded */
                ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ASYNC_EVENT);
                dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
                    "async event request limit exceeded in cmd %p",

        case NVME_CQE_SC_SPC_INV_INT_VECT:
                /* Invalid Interrupt Vector */
                ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE);
                atomic_inc_32(&cmd->nc_nvme->n_inv_int_vect);

        case NVME_CQE_SC_SPC_INV_LOG_PAGE:
                /* Invalid Log Page */
                ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_GET_LOG_PAGE);
                atomic_inc_32(&cmd->nc_nvme->n_inv_log_page);

        case NVME_CQE_SC_SPC_INV_FORMAT:
                /* Invalid Format */
                ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_FORMAT);
                atomic_inc_32(&cmd->nc_nvme->n_inv_format);
                if (cmd->nc_xfer != NULL)
                        bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);

        case NVME_CQE_SC_SPC_INV_Q_DEL:
                /* Invalid Queue Deletion */
                ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE);
                atomic_inc_32(&cmd->nc_nvme->n_inv_q_del);

        case NVME_CQE_SC_SPC_NVM_CNFL_ATTR:
                /* Conflicting Attributes */
                ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_DSET_MGMT ||
                    cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ ||
                    cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
                atomic_inc_32(&cmd->nc_nvme->n_cnfl_attr);
                if (cmd->nc_xfer != NULL)
                        bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);

        case NVME_CQE_SC_SPC_NVM_INV_PROT:
                /* Invalid Protection Information */
                ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_COMPARE ||
                    cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ ||
                    cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
                atomic_inc_32(&cmd->nc_nvme->n_inv_prot);
                if (cmd->nc_xfer != NULL)
                        bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);

        case NVME_CQE_SC_SPC_NVM_READONLY:
                /* Write to Read Only Range */
                ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
                atomic_inc_32(&cmd->nc_nvme->n_readonly);
                if (cmd->nc_xfer != NULL)
                        bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);

        return (nvme_check_unknown_cmd_status(cmd));
nvme_check_cmd_status(nvme_cmd_t *cmd)
        nvme_cqe_t *cqe = &cmd->nc_cqe;

        /*
         * Take a shortcut if the controller is dead, or if
         * command status indicates no error.
         */
        if (cmd->nc_nvme->n_dead)

        if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
            cqe->cqe_sf.sf_sc == NVME_CQE_SC_GEN_SUCCESS)

        if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC)
                return (nvme_check_generic_cmd_status(cmd));
        else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC)
                return (nvme_check_specific_cmd_status(cmd));
        else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_INTEGRITY)
                return (nvme_check_integrity_cmd_status(cmd));
        else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_VENDOR)
                return (nvme_check_vendor_cmd_status(cmd));

        return (nvme_check_unknown_cmd_status(cmd));
nvme_abort_cmd(nvme_cmd_t *abort_cmd, uint_t sec)
        nvme_t *nvme = abort_cmd->nc_nvme;
        nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
        nvme_abort_cmd_t ac = { 0 };

        sema_p(&nvme->n_abort_sema);

        ac.b.ac_cid = abort_cmd->nc_sqe.sqe_cid;
        ac.b.ac_sqid = abort_cmd->nc_sqid;

        cmd->nc_sqe.sqe_opc = NVME_OPC_ABORT;
        cmd->nc_callback = nvme_wakeup_cmd;
        cmd->nc_sqe.sqe_cdw10 = ac.r;

        /*
         * Send the ABORT to the hardware. The ABORT command will return _after_
         * the aborted command has completed (aborted or otherwise), but since
         * we still hold the aborted command's mutex its callback hasn't been
         */
        nvme_admin_cmd(cmd, sec);
        sema_v(&nvme->n_abort_sema);

        if ((ret = nvme_check_cmd_status(cmd)) != 0) {
                dev_err(nvme->n_dip, CE_WARN,
                    "!ABORT failed with sct = %x, sc = %x",
                    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
                atomic_inc_32(&nvme->n_abort_failed);

        dev_err(nvme->n_dip, CE_WARN,
            "!ABORT of command %d/%d %ssuccessful",
            abort_cmd->nc_sqe.sqe_cid, abort_cmd->nc_sqid,
            cmd->nc_cqe.cqe_dw0 & 1 ? "un" : "");
        if ((cmd->nc_cqe.cqe_dw0 & 1) == 0)
                atomic_inc_32(&nvme->n_cmd_aborted);
/*
 * nvme_wait_cmd -- wait for command completion or timeout
 *
 * In case of a serious error or a timeout of the abort command the hardware
 * will be declared dead and FMA will be notified.
 */
nvme_wait_cmd(nvme_cmd_t *cmd, uint_t sec)
        clock_t timeout = ddi_get_lbolt() + drv_usectohz(sec * MICROSEC);
        nvme_t *nvme = cmd->nc_nvme;
        nvme_reg_csts_t csts;

        ASSERT(mutex_owned(&cmd->nc_mutex));

        while (!cmd->nc_completed) {
                if (cv_timedwait(&cmd->nc_cv, &cmd->nc_mutex, timeout) == -1)

        if (cmd->nc_completed)

        /*
         * The command timed out.
         *
         * Check controller for fatal status, any errors associated with the
         * register or DMA handle, or for a double timeout (abort command timed
         * out). If necessary log a warning and call FMA.
         */
        csts.r = nvme_get32(nvme, NVME_REG_CSTS);
        dev_err(nvme->n_dip, CE_WARN, "!command %d/%d timeout, "
            "OPC = %x, CFS = %d", cmd->nc_sqe.sqe_cid, cmd->nc_sqid,
            cmd->nc_sqe.sqe_opc, csts.b.csts_cfs);
        atomic_inc_32(&nvme->n_cmd_timeout);

        if (csts.b.csts_cfs ||
            nvme_check_regs_hdl(nvme) ||
            nvme_check_dma_hdl(cmd->nc_dma) ||
            cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT) {
                ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
                nvme->n_dead = B_TRUE;
        } else if (nvme_abort_cmd(cmd, sec) == 0) {
                /*
                 * If the abort succeeded the command should complete
                 * immediately with an appropriate status.
                 */
                while (!cmd->nc_completed)
                        cv_wait(&cmd->nc_cv, &cmd->nc_mutex);

        qp = nvme->n_ioq[cmd->nc_sqid];

        mutex_enter(&qp->nq_mutex);
        (void) nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid);
        mutex_exit(&qp->nq_mutex);

        /*
         * As we don't know what the presumed dead hardware might still do with
         * the DMA memory, we'll put the command on the lost commands list if it
         * has any DMA memory.
         */
        if (cmd->nc_dma != NULL) {
                mutex_enter(&nvme_lc_mutex);
                list_insert_head(&nvme_lost_cmds, cmd);
                mutex_exit(&nvme_lc_mutex);
nvme_wakeup_cmd(void *arg)
        nvme_cmd_t *cmd = arg;

        mutex_enter(&cmd->nc_mutex);
        cmd->nc_completed = B_TRUE;
        cv_signal(&cmd->nc_cv);
        mutex_exit(&cmd->nc_mutex);
nvme_async_event_task(void *arg)
        nvme_cmd_t *cmd = arg;
        nvme_t *nvme = cmd->nc_nvme;
        nvme_error_log_entry_t *error_log = NULL;
        nvme_health_log_t *health_log = NULL;
        nvme_async_event_t event;

        /*
         * Check for errors associated with the async request itself. The only
         * command-specific error is "async event limit exceeded", which
         * indicates a programming error in the driver and causes a panic in
         * nvme_check_cmd_status().
         *
         * Other possible errors are various scenarios where the async request
         * was aborted, or internal errors in the device. Internal errors are
         * reported to FMA, the command aborts need no special handling here.
         *
         * And finally, at least qemu nvme does not support async events,
         * and will return NVME_CQE_SC_GEN_INV_OPC | DNR. If so, we
         * will avoid posting async events.
         */

        if (nvme_check_cmd_status(cmd) != 0) {
                dev_err(cmd->nc_nvme->n_dip, CE_WARN,
                    "!async event request returned failure, sct = %x, "
                    "sc = %x, dnr = %d, m = %d", cmd->nc_cqe.cqe_sf.sf_sct,
                    cmd->nc_cqe.cqe_sf.sf_sc, cmd->nc_cqe.cqe_sf.sf_dnr,
                    cmd->nc_cqe.cqe_sf.sf_m);

                if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
                    cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INTERNAL_ERR) {
                        cmd->nc_nvme->n_dead = B_TRUE;
                        ddi_fm_service_impact(cmd->nc_nvme->n_dip,

                if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
                    cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_OPC &&
                    cmd->nc_cqe.cqe_sf.sf_dnr == 1) {
                        nvme->n_async_event_supported = B_FALSE;

        event.r = cmd->nc_cqe.cqe_dw0;

        /* Clear CQE and re-submit the async request. */
        bzero(&cmd->nc_cqe, sizeof (nvme_cqe_t));
        nvme_submit_admin_cmd(nvme->n_adminq, cmd);

        switch (event.b.ae_type) {
        case NVME_ASYNC_TYPE_ERROR:
                if (event.b.ae_logpage == NVME_LOGPAGE_ERROR) {
                        (void) nvme_get_logpage(nvme, B_FALSE,
                            (void **)&error_log, &logsize, event.b.ae_logpage);

                        dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in "
                            "async event reply: %d", event.b.ae_logpage);
                        atomic_inc_32(&nvme->n_wrong_logpage);

                switch (event.b.ae_info) {
                case NVME_ASYNC_ERROR_INV_SQ:
                        dev_err(nvme->n_dip, CE_PANIC, "programming error: "
                            "invalid submission queue");

                case NVME_ASYNC_ERROR_INV_DBL:
                        dev_err(nvme->n_dip, CE_PANIC, "programming error: "
                            "invalid doorbell write value");

                case NVME_ASYNC_ERROR_DIAGFAIL:
                        dev_err(nvme->n_dip, CE_WARN, "!diagnostic failure");
                        ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
                        nvme->n_dead = B_TRUE;
                        atomic_inc_32(&nvme->n_diagfail_event);

                case NVME_ASYNC_ERROR_PERSISTENT:
                        dev_err(nvme->n_dip, CE_WARN, "!persistent internal "
                        ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
                        nvme->n_dead = B_TRUE;
                        atomic_inc_32(&nvme->n_persistent_event);

                case NVME_ASYNC_ERROR_TRANSIENT:
                        dev_err(nvme->n_dip, CE_WARN, "!transient internal "
                        /* TODO: send ereport */
                        atomic_inc_32(&nvme->n_transient_event);

                case NVME_ASYNC_ERROR_FW_LOAD:
                        dev_err(nvme->n_dip, CE_WARN,
                            "!firmware image load error");
                        atomic_inc_32(&nvme->n_fw_load_event);

        case NVME_ASYNC_TYPE_HEALTH:
                if (event.b.ae_logpage == NVME_LOGPAGE_HEALTH) {
                        (void) nvme_get_logpage(nvme, B_FALSE,
                            (void **)&health_log, &logsize, event.b.ae_logpage,

                        dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in "
                            "async event reply: %d", event.b.ae_logpage);
                        atomic_inc_32(&nvme->n_wrong_logpage);

                switch (event.b.ae_info) {
                case NVME_ASYNC_HEALTH_RELIABILITY:
                        dev_err(nvme->n_dip, CE_WARN,
                            "!device reliability compromised");
                        /* TODO: send ereport */
                        atomic_inc_32(&nvme->n_reliability_event);

                case NVME_ASYNC_HEALTH_TEMPERATURE:
                        dev_err(nvme->n_dip, CE_WARN,
                            "!temperature above threshold");
                        /* TODO: send ereport */
                        atomic_inc_32(&nvme->n_temperature_event);

                case NVME_ASYNC_HEALTH_SPARE:
                        dev_err(nvme->n_dip, CE_WARN,
                            "!spare space below threshold");
                        /* TODO: send ereport */
                        atomic_inc_32(&nvme->n_spare_event);

        case NVME_ASYNC_TYPE_VENDOR:
                dev_err(nvme->n_dip, CE_WARN, "!vendor specific async event "
                    "received, info = %x, logpage = %x", event.b.ae_info,
                    event.b.ae_logpage);
                atomic_inc_32(&nvme->n_vendor_event);

                dev_err(nvme->n_dip, CE_WARN, "!unknown async event received, "
                    "type = %x, info = %x, logpage = %x", event.b.ae_type,
                    event.b.ae_info, event.b.ae_logpage);
                atomic_inc_32(&nvme->n_unknown_event);

        kmem_free(error_log, logsize);

        kmem_free(health_log, logsize);
nvme_admin_cmd(nvme_cmd_t *cmd, int sec)
        mutex_enter(&cmd->nc_mutex);
        nvme_submit_admin_cmd(cmd->nc_nvme->n_adminq, cmd);
        nvme_wait_cmd(cmd, sec);
        mutex_exit(&cmd->nc_mutex);

nvme_async_event(nvme_t *nvme)
        cmd = nvme_alloc_cmd(nvme, KM_SLEEP);

        cmd->nc_sqe.sqe_opc = NVME_OPC_ASYNC_EVENT;
        cmd->nc_callback = nvme_async_event_task;
        cmd->nc_dontpanic = B_TRUE;

        nvme_submit_admin_cmd(nvme->n_adminq, cmd);
nvme_format_nvm(nvme_t *nvme, boolean_t user, uint32_t nsid, uint8_t lbaf,
    boolean_t ms, uint8_t pi, boolean_t pil, uint8_t ses)
        nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
        nvme_format_nvm_t format_nvm = { 0 };

        format_nvm.b.fm_lbaf = lbaf & 0xf;
        format_nvm.b.fm_ms = ms ? 1 : 0;
        format_nvm.b.fm_pi = pi & 0x7;
        format_nvm.b.fm_pil = pil ? 1 : 0;
        format_nvm.b.fm_ses = ses & 0x7;

        cmd->nc_callback = nvme_wakeup_cmd;
        cmd->nc_sqe.sqe_nsid = nsid;
        cmd->nc_sqe.sqe_opc = NVME_OPC_NVM_FORMAT;
        cmd->nc_sqe.sqe_cdw10 = format_nvm.r;

        /*
         * Some devices like Samsung SM951 don't allow formatting of all
         * namespaces in one command. Handle that gracefully.
         */
        if (nsid == (uint32_t)-1)
                cmd->nc_dontpanic = B_TRUE;

        /*
         * If this format request was initiated by the user, then don't allow a
         * programmer error to panic the system.
         */
                cmd->nc_dontpanic = B_TRUE;

        nvme_admin_cmd(cmd, nvme_format_cmd_timeout);

        if ((ret = nvme_check_cmd_status(cmd)) != 0) {
                dev_err(nvme->n_dip, CE_WARN,
                    "!FORMAT failed with sct = %x, sc = %x",
                    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
nvme_get_logpage(nvme_t *nvme, boolean_t user, void **buf, size_t *bufsize,
    uint8_t logpage, ...)
        nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
        nvme_getlogpage_t getlogpage = { 0 };

        va_start(ap, logpage);

        cmd->nc_callback = nvme_wakeup_cmd;
        cmd->nc_sqe.sqe_opc = NVME_OPC_GET_LOG_PAGE;

        cmd->nc_dontpanic = B_TRUE;

        getlogpage.b.lp_lid = logpage;

        case NVME_LOGPAGE_ERROR:
                cmd->nc_sqe.sqe_nsid = (uint32_t)-1;
                /*
                 * The GET LOG PAGE command can use at most 2 pages to return
                 * data, PRP lists are not supported.
                 */
                *bufsize = MIN(2 * nvme->n_pagesize,
                    nvme->n_error_log_len * sizeof (nvme_error_log_entry_t));

        case NVME_LOGPAGE_HEALTH:
                cmd->nc_sqe.sqe_nsid = va_arg(ap, uint32_t);
                *bufsize = sizeof (nvme_health_log_t);

        case NVME_LOGPAGE_FWSLOT:
                cmd->nc_sqe.sqe_nsid = (uint32_t)-1;
                *bufsize = sizeof (nvme_fwslot_log_t);

                dev_err(nvme->n_dip, CE_WARN, "!unknown log page requested: %d",
                atomic_inc_32(&nvme->n_unknown_logpage);

        getlogpage.b.lp_numd = *bufsize / sizeof (uint32_t) - 1;

        cmd->nc_sqe.sqe_cdw10 = getlogpage.r;

        if (nvme_zalloc_dma(nvme, getlogpage.b.lp_numd * sizeof (uint32_t),
            DDI_DMA_READ, &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
                dev_err(nvme->n_dip, CE_WARN,
                    "!nvme_zalloc_dma failed for GET LOG PAGE");

        if (cmd->nc_dma->nd_ncookie > 2) {
                dev_err(nvme->n_dip, CE_WARN,
                    "!too many DMA cookies for GET LOG PAGE");
                atomic_inc_32(&nvme->n_too_many_cookies);

        cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress;
        if (cmd->nc_dma->nd_ncookie > 1) {
                ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
                    &cmd->nc_dma->nd_cookie);
                cmd->nc_sqe.sqe_dptr.d_prp[1] =
                    cmd->nc_dma->nd_cookie.dmac_laddress;

        nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);

        if ((ret = nvme_check_cmd_status(cmd)) != 0) {
                dev_err(nvme->n_dip, CE_WARN,
                    "!GET LOG PAGE failed with sct = %x, sc = %x",
                    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);

        *buf = kmem_alloc(*bufsize, KM_SLEEP);
        bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize);
nvme_identify(nvme_t *nvme, boolean_t user, uint32_t nsid, void **buf)
        nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);

        cmd->nc_callback = nvme_wakeup_cmd;
        cmd->nc_sqe.sqe_opc = NVME_OPC_IDENTIFY;
        cmd->nc_sqe.sqe_nsid = nsid;
        cmd->nc_sqe.sqe_cdw10 = nsid ? NVME_IDENTIFY_NSID : NVME_IDENTIFY_CTRL;

        if (nvme_zalloc_dma(nvme, NVME_IDENTIFY_BUFSIZE, DDI_DMA_READ,
            &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
                dev_err(nvme->n_dip, CE_WARN,
                    "!nvme_zalloc_dma failed for IDENTIFY");

        if (cmd->nc_dma->nd_ncookie > 2) {
                dev_err(nvme->n_dip, CE_WARN,
                    "!too many DMA cookies for IDENTIFY");
                atomic_inc_32(&nvme->n_too_many_cookies);

        cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress;
        if (cmd->nc_dma->nd_ncookie > 1) {
                ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
                    &cmd->nc_dma->nd_cookie);
                cmd->nc_sqe.sqe_dptr.d_prp[1] =
                    cmd->nc_dma->nd_cookie.dmac_laddress;

        cmd->nc_dontpanic = B_TRUE;

        nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);

        if ((ret = nvme_check_cmd_status(cmd)) != 0) {
                dev_err(nvme->n_dip, CE_WARN,
                    "!IDENTIFY failed with sct = %x, sc = %x",
                    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);

        *buf = kmem_alloc(NVME_IDENTIFY_BUFSIZE, KM_SLEEP);
        bcopy(cmd->nc_dma->nd_memp, *buf, NVME_IDENTIFY_BUFSIZE);
nvme_set_features(nvme_t *nvme, boolean_t user, uint32_t nsid, uint8_t feature,
    uint32_t val, uint32_t *res)
        _NOTE(ARGUNUSED(nsid));
        nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);

        ASSERT(res != NULL);

        cmd->nc_callback = nvme_wakeup_cmd;
        cmd->nc_sqe.sqe_opc = NVME_OPC_SET_FEATURES;
        cmd->nc_sqe.sqe_cdw10 = feature;
        cmd->nc_sqe.sqe_cdw11 = val;

        cmd->nc_dontpanic = B_TRUE;

        case NVME_FEAT_WRITE_CACHE:
                if (!nvme->n_write_cache_present)

        case NVME_FEAT_NQUEUES:

        nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);

        if ((ret = nvme_check_cmd_status(cmd)) != 0) {
                dev_err(nvme->n_dip, CE_WARN,
                    "!SET FEATURES %d failed with sct = %x, sc = %x",
                    feature, cmd->nc_cqe.cqe_sf.sf_sct,
                    cmd->nc_cqe.cqe_sf.sf_sc);

        *res = cmd->nc_cqe.cqe_dw0;
nvme_get_features(nvme_t *nvme, boolean_t user, uint32_t nsid, uint8_t feature,
    uint32_t *res, void **buf, size_t *bufsize)
        nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);

        ASSERT(res != NULL);

        if (bufsize != NULL)

        cmd->nc_callback = nvme_wakeup_cmd;
        cmd->nc_sqe.sqe_opc = NVME_OPC_GET_FEATURES;
        cmd->nc_sqe.sqe_cdw10 = feature;
        cmd->nc_sqe.sqe_cdw11 = *res;

        /*
         * For some of the optional features there doesn't seem to be a method
         * of detecting whether it is supported other than using it. This will
         * cause "Invalid Field in Command" error, which is normally considered
         * a programming error. Set the nc_dontpanic flag to override the panic
         * in nvme_check_generic_cmd_status().
         */
        case NVME_FEAT_ARBITRATION:
        case NVME_FEAT_POWER_MGMT:
        case NVME_FEAT_TEMPERATURE:
        case NVME_FEAT_ERROR:
        case NVME_FEAT_NQUEUES:
        case NVME_FEAT_INTR_COAL:
        case NVME_FEAT_INTR_VECT:
        case NVME_FEAT_WRITE_ATOM:
        case NVME_FEAT_ASYNC_EVENT:

        case NVME_FEAT_WRITE_CACHE:
                if (!nvme->n_write_cache_present)

        case NVME_FEAT_LBA_RANGE:
                if (!nvme->n_lba_range_supported)

                cmd->nc_dontpanic = B_TRUE;
                cmd->nc_sqe.sqe_nsid = nsid;
                ASSERT(bufsize != NULL);
                *bufsize = NVME_LBA_RANGE_BUFSIZE;

        case NVME_FEAT_AUTO_PST:
                if (!nvme->n_auto_pst_supported)

                ASSERT(bufsize != NULL);
                *bufsize = NVME_AUTO_PST_BUFSIZE;

        case NVME_FEAT_PROGRESS:
                if (!nvme->n_progress_supported)

                cmd->nc_dontpanic = B_TRUE;

                cmd->nc_dontpanic = B_TRUE;

        if (bufsize != NULL && *bufsize != 0) {
                if (nvme_zalloc_dma(nvme, *bufsize, DDI_DMA_READ,
                    &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
                        dev_err(nvme->n_dip, CE_WARN,
                            "!nvme_zalloc_dma failed for GET FEATURES");

                if (cmd->nc_dma->nd_ncookie > 2) {
                        dev_err(nvme->n_dip, CE_WARN,
                            "!too many DMA cookies for GET FEATURES");
                        atomic_inc_32(&nvme->n_too_many_cookies);

                cmd->nc_sqe.sqe_dptr.d_prp[0] =
                    cmd->nc_dma->nd_cookie.dmac_laddress;
                if (cmd->nc_dma->nd_ncookie > 1) {
                        ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
                            &cmd->nc_dma->nd_cookie);
                        cmd->nc_sqe.sqe_dptr.d_prp[1] =
                            cmd->nc_dma->nd_cookie.dmac_laddress;

        nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);

        if ((ret = nvme_check_cmd_status(cmd)) != 0) {
                boolean_t known = B_TRUE;

                /* Check if this is unsupported optional feature */
                if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
                    cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_FLD) {
                        case NVME_FEAT_LBA_RANGE:
                                nvme->n_lba_range_supported = B_FALSE;
                        case NVME_FEAT_PROGRESS:
                                nvme->n_progress_supported = B_FALSE;

                /* Report the error otherwise */
                dev_err(nvme->n_dip, CE_WARN,
                    "!GET FEATURES %d failed with sct = %x, sc = %x",
                    feature, cmd->nc_cqe.cqe_sf.sf_sct,
                    cmd->nc_cqe.cqe_sf.sf_sc);

        if (bufsize != NULL && *bufsize != 0) {
                ASSERT(buf != NULL);
                *buf = kmem_alloc(*bufsize, KM_SLEEP);
                bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize);

        *res = cmd->nc_cqe.cqe_dw0;
static int
nvme_write_cache_set(nvme_t *nvme, boolean_t enable)
{
	nvme_write_cache_t nwc = { 0 };

	if (enable)
		nwc.b.wc_wce = 1;

	return (nvme_set_features(nvme, B_FALSE, 0, NVME_FEAT_WRITE_CACHE,
	    nwc.r, &nwc.r));
}
static int
nvme_set_nqueues(nvme_t *nvme, uint16_t *nqueues)
{
	nvme_nqueues_t nq = { 0 };
	int ret;

	nq.b.nq_nsq = nq.b.nq_ncq = *nqueues - 1;

	ret = nvme_set_features(nvme, B_FALSE, 0, NVME_FEAT_NQUEUES, nq.r,
	    &nq.r);

	if (ret == 0) {
		/*
		 * Always use the same number of submission and completion
		 * queues, and never use more than the requested number of
		 * queues.
		 */
		*nqueues = MIN(*nqueues, MIN(nq.b.nq_nsq, nq.b.nq_ncq) + 1);
	}

	return (ret);
}
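
/*
 * Worked example for the 0-based NQUEUES encoding above: asking for 8 queue
 * pairs writes 7 into both the NSQ and NCQ fields. If the controller only
 * grants 4 pairs it returns 3 in each field of dw0, and the MIN() above
 * clamps *nqueues to MIN(8, 3 + 1) = 4.
 */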
static int
nvme_create_io_qpair(nvme_t *nvme, nvme_qpair_t *qp, uint16_t idx)
{
	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
	nvme_create_queue_dw10_t dw10 = { 0 };
	nvme_create_cq_dw11_t c_dw11 = { 0 };
	nvme_create_sq_dw11_t s_dw11 = { 0 };
	int ret;

	dw10.b.q_qid = idx;
	dw10.b.q_qsize = qp->nq_nentry - 1;

	c_dw11.b.cq_ien = 1;
	c_dw11.b.cq_iv = idx % nvme->n_intr_cnt;

	cmd->nc_callback = nvme_wakeup_cmd;
	cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_CQUEUE;
	cmd->nc_sqe.sqe_cdw10 = dw10.r;
	cmd->nc_sqe.sqe_cdw11 = c_dw11.r;
	cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_cqdma->nd_cookie.dmac_laddress;

	nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);

	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!CREATE CQUEUE failed with sct = %x, sc = %x",
		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
		goto fail;
	}

	nvme_free_cmd(cmd);

	s_dw11.b.sq_cqid = idx;

	cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
	cmd->nc_callback = nvme_wakeup_cmd;
	cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_SQUEUE;
	cmd->nc_sqe.sqe_cdw10 = dw10.r;
	cmd->nc_sqe.sqe_cdw11 = s_dw11.r;
	cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_sqdma->nd_cookie.dmac_laddress;

	nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);

	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!CREATE SQUEUE failed with sct = %x, sc = %x",
		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
		goto fail;
	}

fail:
	nvme_free_cmd(cmd);

	return (ret);
}
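
/*
 * Both queue sizes above are 0-based, hence the nq_nentry - 1. The completion
 * queue's interrupt vector is idx % n_intr_cnt, so when there are more I/O
 * queue pairs than interrupt vectors several queues end up sharing a vector;
 * nvme_intr() walks the queue array in steps of n_intr_cnt to cover them all.
 */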
static boolean_t
nvme_reset(nvme_t *nvme, boolean_t quiesce)
{
	nvme_reg_csts_t csts;
	int i;

	nvme_put32(nvme, NVME_REG_CC, 0);

	csts.r = nvme_get32(nvme, NVME_REG_CSTS);
	if (csts.b.csts_rdy == 1) {
		nvme_put32(nvme, NVME_REG_CC, 0);
		for (i = 0; i != nvme->n_timeout * 10; i++) {
			csts.r = nvme_get32(nvme, NVME_REG_CSTS);
			if (csts.b.csts_rdy == 0)
				break;

			if (quiesce)
				drv_usecwait(50000);
			else
				delay(drv_usectohz(50000));
		}
	}

	nvme_put32(nvme, NVME_REG_AQA, 0);
	nvme_put32(nvme, NVME_REG_ASQ, 0);
	nvme_put32(nvme, NVME_REG_ACQ, 0);

	csts.r = nvme_get32(nvme, NVME_REG_CSTS);
	return (csts.b.csts_rdy == 0 ? B_TRUE : B_FALSE);
}
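
/*
 * CAP.TO is expressed in 500 ms units, so polling CSTS.RDY up to
 * n_timeout * 10 times with a 50 ms pause between reads waits for at most
 * the timeout the controller advertised.
 */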
static void
nvme_shutdown(nvme_t *nvme, int mode, boolean_t quiesce)
{
	nvme_reg_cc_t cc;
	nvme_reg_csts_t csts;
	int i;

	ASSERT(mode == NVME_CC_SHN_NORMAL || mode == NVME_CC_SHN_ABRUPT);

	cc.r = nvme_get32(nvme, NVME_REG_CC);
	cc.b.cc_shn = mode & 0x3;
	nvme_put32(nvme, NVME_REG_CC, cc.r);

	for (i = 0; i != 10; i++) {
		csts.r = nvme_get32(nvme, NVME_REG_CSTS);
		if (csts.b.csts_shst == NVME_CSTS_SHN_COMPLETE)
			break;

		if (quiesce)
			drv_usecwait(100000);
		else
			delay(drv_usectohz(100000));
	}
}
static void
nvme_prepare_devid(nvme_t *nvme, uint32_t nsid)
{
	/*
	 * Section 7.7 of the spec describes how to get a unique ID for
	 * the controller: the vendor ID, the model name and the serial
	 * number shall be unique when combined.
	 *
	 * If a namespace has no EUI64 we use the above and add the hex
	 * namespace ID to get a unique ID for the namespace.
	 */
	char model[sizeof (nvme->n_idctl->id_model) + 1];
	char serial[sizeof (nvme->n_idctl->id_serial) + 1];

	bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model));
	bcopy(nvme->n_idctl->id_serial, serial,
	    sizeof (nvme->n_idctl->id_serial));

	model[sizeof (nvme->n_idctl->id_model)] = '\0';
	serial[sizeof (nvme->n_idctl->id_serial)] = '\0';

	nvme->n_ns[nsid - 1].ns_devid = kmem_asprintf("%4X-%s-%s-%X",
	    nvme->n_idctl->id_vid, model, serial, nsid);
}
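
/*
 * For a namespace without an EUI64 this yields a devid string of the form
 * "<vid>-<model>-<serial>-<nsid>", e.g. (with hypothetical identify data)
 * "8086-Some Model-S3R1AL123-1".
 */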
static int
nvme_init_ns(nvme_t *nvme, int nsid)
{
	nvme_namespace_t *ns = &nvme->n_ns[nsid - 1];
	nvme_identify_nsid_t *idns;
	int last_rp;

	ns->ns_nvme = nvme;

	if (nvme_identify(nvme, B_FALSE, nsid, (void **)&idns) != 0) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to identify namespace %d", nsid);
		return (DDI_FAILURE);
	}

	ns->ns_idns = idns;
	ns->ns_id = nsid;
	ns->ns_block_count = idns->id_nsize;
	ns->ns_block_size =
	    1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads;
	ns->ns_best_block_size = ns->ns_block_size;

	/*
	 * Get the EUI64 if present. Use it for devid and device node names.
	 */
	if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1))
		bcopy(idns->id_eui64, ns->ns_eui64, sizeof (ns->ns_eui64));

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	if (*(uint64_t *)ns->ns_eui64 != 0) {
		uint8_t *eui64 = ns->ns_eui64;

		(void) snprintf(ns->ns_name, sizeof (ns->ns_name),
		    "%02x%02x%02x%02x%02x%02x%02x%02x",
		    eui64[0], eui64[1], eui64[2], eui64[3],
		    eui64[4], eui64[5], eui64[6], eui64[7]);
	} else {
		(void) snprintf(ns->ns_name, sizeof (ns->ns_name), "%d",
		    ns->ns_id);

		nvme_prepare_devid(nvme, ns->ns_id);
	}

	/*
	 * Find the LBA format with no metadata and the best relative
	 * performance. A value of 3 means "degraded", 0 is best.
	 */
	last_rp = 3;
	for (int j = 0; j <= idns->id_nlbaf; j++) {
		if (idns->id_lbaf[j].lbaf_lbads == 0)
			break;
		if (idns->id_lbaf[j].lbaf_ms != 0)
			continue;
		if (idns->id_lbaf[j].lbaf_rp >= last_rp)
			continue;
		last_rp = idns->id_lbaf[j].lbaf_rp;
		ns->ns_best_block_size =
		    1 << idns->id_lbaf[j].lbaf_lbads;
	}

	if (ns->ns_best_block_size < nvme->n_min_block_size)
		ns->ns_best_block_size = nvme->n_min_block_size;

	/*
	 * We currently don't support namespaces that use either:
	 * - protection information
	 * - illegal block size (< 512)
	 */
	if (idns->id_dps.dp_pinfo) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!ignoring namespace %d, unsupported feature: "
		    "pinfo = %d", nsid, idns->id_dps.dp_pinfo);
		ns->ns_ignore = B_TRUE;
	} else if (ns->ns_block_size < 512) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!ignoring namespace %d, unsupported block size %"PRIu64,
		    nsid, (uint64_t)ns->ns_block_size);
		ns->ns_ignore = B_TRUE;
	} else {
		ns->ns_ignore = B_FALSE;
	}

	return (DDI_SUCCESS);
}
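
/*
 * lbaf_lbads above is the log2 of the LBA data size: 9 corresponds to
 * 512-byte blocks and 12 to 4096-byte blocks, while 0 marks an LBA format
 * entry that is not in use.
 */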
static int
nvme_init(nvme_t *nvme)
{
	nvme_reg_cc_t cc = { 0 };
	nvme_reg_aqa_t aqa = { 0 };
	nvme_reg_asq_t asq = { 0 };
	nvme_reg_acq_t acq = { 0 };
	nvme_reg_cap_t cap;
	nvme_reg_vs_t vs;
	nvme_reg_csts_t csts;
	int i;
	uint16_t nqueues;
	char model[sizeof (nvme->n_idctl->id_model) + 1];
	char *vendor, *product;

	/* Check controller version */
	vs.r = nvme_get32(nvme, NVME_REG_VS);
	nvme->n_version.v_major = vs.b.vs_mjr;
	nvme->n_version.v_minor = vs.b.vs_mnr;
	dev_err(nvme->n_dip, CE_CONT, "?NVMe spec version %d.%d",
	    nvme->n_version.v_major, nvme->n_version.v_minor);

	if (nvme->n_version.v_major > nvme_version_major) {
		dev_err(nvme->n_dip, CE_WARN, "!no support for version > %d.x",
		    nvme_version_major);
		if (nvme->n_strict_version)
			goto fail;
	}

	/* retrieve controller configuration */
	cap.r = nvme_get64(nvme, NVME_REG_CAP);

	if ((cap.b.cap_css & NVME_CAP_CSS_NVM) == 0) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!NVM command set not supported by hardware");
		goto fail;
	}

	nvme->n_nssr_supported = cap.b.cap_nssrs;
	nvme->n_doorbell_stride = 4 << cap.b.cap_dstrd;
	nvme->n_timeout = cap.b.cap_to;
	nvme->n_arbitration_mechanisms = cap.b.cap_ams;
	nvme->n_cont_queues_reqd = cap.b.cap_cqr;
	nvme->n_max_queue_entries = cap.b.cap_mqes + 1;

	/*
	 * The MPSMIN and MPSMAX fields in the CAP register use 0 to specify
	 * the base page size of 4k (1<<12), so add 12 here to get the real
	 * page size value.
	 */
	nvme->n_pageshift = MIN(MAX(cap.b.cap_mpsmin + 12, PAGESHIFT),
	    cap.b.cap_mpsmax + 12);
	nvme->n_pagesize = 1UL << (nvme->n_pageshift);

	/*
	 * Set up Queue DMA to transfer at least 1 page-aligned page at a time.
	 */
	nvme->n_queue_dma_attr.dma_attr_align = nvme->n_pagesize;
	nvme->n_queue_dma_attr.dma_attr_minxfer = nvme->n_pagesize;

	/*
	 * Set up PRP DMA to transfer 1 page-aligned page at a time.
	 * Maxxfer may be increased after we identified the controller limits.
	 */
	nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_pagesize;
	nvme->n_prp_dma_attr.dma_attr_minxfer = nvme->n_pagesize;
	nvme->n_prp_dma_attr.dma_attr_align = nvme->n_pagesize;
	nvme->n_prp_dma_attr.dma_attr_seg = nvme->n_pagesize - 1;

	/*
	 * Reset controller if it's still in ready state.
	 */
	if (nvme_reset(nvme, B_FALSE) == B_FALSE) {
		dev_err(nvme->n_dip, CE_WARN, "!unable to reset controller");
		ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
		nvme->n_dead = B_TRUE;
		goto fail;
	}

	/*
	 * Create the admin queue pair.
	 */
	if (nvme_alloc_qpair(nvme, nvme->n_admin_queue_len, &nvme->n_adminq, 0)
	    != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!unable to allocate admin qpair");
		goto fail;
	}

	nvme->n_ioq = kmem_alloc(sizeof (nvme_qpair_t *), KM_SLEEP);
	nvme->n_ioq[0] = nvme->n_adminq;

	nvme->n_progress |= NVME_ADMIN_QUEUE;

	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
	    "admin-queue-len", nvme->n_admin_queue_len);

	aqa.b.aqa_asqs = aqa.b.aqa_acqs = nvme->n_admin_queue_len - 1;
	asq = nvme->n_adminq->nq_sqdma->nd_cookie.dmac_laddress;
	acq = nvme->n_adminq->nq_cqdma->nd_cookie.dmac_laddress;

	ASSERT((asq & (nvme->n_pagesize - 1)) == 0);
	ASSERT((acq & (nvme->n_pagesize - 1)) == 0);

	nvme_put32(nvme, NVME_REG_AQA, aqa.r);
	nvme_put64(nvme, NVME_REG_ASQ, asq);
	nvme_put64(nvme, NVME_REG_ACQ, acq);

	cc.b.cc_ams = 0;	/* use Round-Robin arbitration */
	cc.b.cc_css = 0;	/* use NVM command set */
	cc.b.cc_mps = nvme->n_pageshift - 12;
	cc.b.cc_shn = 0;	/* no shutdown in progress */
	cc.b.cc_en = 1;		/* enable controller */
	cc.b.cc_iosqes = 6;	/* submission queue entry is 2^6 bytes long */
	cc.b.cc_iocqes = 4;	/* completion queue entry is 2^4 bytes long */

	nvme_put32(nvme, NVME_REG_CC, cc.r);

	/*
	 * Wait for the controller to become ready.
	 */
	csts.r = nvme_get32(nvme, NVME_REG_CSTS);
	if (csts.b.csts_rdy == 0) {
		for (i = 0; i != nvme->n_timeout * 10; i++) {
			delay(drv_usectohz(50000));
			csts.r = nvme_get32(nvme, NVME_REG_CSTS);

			if (csts.b.csts_cfs == 1) {
				dev_err(nvme->n_dip, CE_WARN,
				    "!controller fatal status at init");
				ddi_fm_service_impact(nvme->n_dip,
				    DDI_SERVICE_LOST);
				nvme->n_dead = B_TRUE;
				goto fail;
			}

			if (csts.b.csts_rdy == 1)
				break;
		}
	}

	if (csts.b.csts_rdy == 0) {
		dev_err(nvme->n_dip, CE_WARN, "!controller not ready");
		ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
		nvme->n_dead = B_TRUE;
		goto fail;
	}

	/*
	 * Assume an abort command limit of 1. We'll destroy and re-init
	 * that later when we know the true abort command limit.
	 */
	sema_init(&nvme->n_abort_sema, 1, NULL, SEMA_DRIVER, NULL);

	/*
	 * Setup initial interrupt for admin queue.
	 */
	if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX, 1)
	    != DDI_SUCCESS) &&
	    (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI, 1)
	    != DDI_SUCCESS) &&
	    (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_FIXED, 1)
	    != DDI_SUCCESS)) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to setup initial interrupt");
		goto fail;
	}

	/*
	 * Post an asynchronous event command to catch errors.
	 * We assume the asynchronous events are supported as required by
	 * specification (Figure 40 in section 5 of NVMe 1.2).
	 * However, since at least qemu does not follow the specification,
	 * we need a mechanism to protect ourselves.
	 */
	nvme->n_async_event_supported = B_TRUE;
	nvme_async_event(nvme);

	/*
	 * Identify Controller
	 */
	if (nvme_identify(nvme, B_FALSE, 0, (void **)&nvme->n_idctl) != 0) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to identify controller");
		goto fail;
	}

	/*
	 * Get Vendor & Product ID
	 */
	bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model));
	model[sizeof (nvme->n_idctl->id_model)] = '\0';
	sata_split_model(model, &vendor, &product);

	if (vendor == NULL)
		nvme->n_vendor = strdup("NVMe");
	else
		nvme->n_vendor = strdup(vendor);

	nvme->n_product = strdup(product);

	/*
	 * Get controller limits.
	 */
	nvme->n_async_event_limit = MAX(NVME_MIN_ASYNC_EVENT_LIMIT,
	    MIN(nvme->n_admin_queue_len / 10,
	    MIN(nvme->n_idctl->id_aerl + 1, nvme->n_async_event_limit)));

	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
	    "async-event-limit", nvme->n_async_event_limit);

	nvme->n_abort_command_limit = nvme->n_idctl->id_acl + 1;

	/*
	 * Reinitialize the semaphore with the true abort command limit
	 * supported by the hardware. It's not necessary to disable interrupts
	 * as only command aborts use the semaphore, and no commands are
	 * executed or aborted while we're here.
	 */
	sema_destroy(&nvme->n_abort_sema);
	sema_init(&nvme->n_abort_sema, nvme->n_abort_command_limit - 1, NULL,
	    SEMA_DRIVER, NULL);

	nvme->n_progress |= NVME_CTRL_LIMITS;

	if (nvme->n_idctl->id_mdts == 0)
		nvme->n_max_data_transfer_size = nvme->n_pagesize * 65536;
	else
		nvme->n_max_data_transfer_size =
		    1ull << (nvme->n_pageshift + nvme->n_idctl->id_mdts);

	nvme->n_error_log_len = nvme->n_idctl->id_elpe + 1;

	/*
	 * Limit n_max_data_transfer_size to what we can handle in one PRP.
	 * Chained PRPs are currently unsupported.
	 *
	 * This is a no-op on hardware which doesn't support a transfer size
	 * big enough to require chained PRPs.
	 */
	nvme->n_max_data_transfer_size = MIN(nvme->n_max_data_transfer_size,
	    (nvme->n_pagesize / sizeof (uint64_t) * nvme->n_pagesize));

	nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_max_data_transfer_size;

	/*
	 * Make sure the minimum/maximum queue entry sizes are not
	 * larger/smaller than the default.
	 */
	if (((1 << nvme->n_idctl->id_sqes.qes_min) > sizeof (nvme_sqe_t)) ||
	    ((1 << nvme->n_idctl->id_sqes.qes_max) < sizeof (nvme_sqe_t)) ||
	    ((1 << nvme->n_idctl->id_cqes.qes_min) > sizeof (nvme_cqe_t)) ||
	    ((1 << nvme->n_idctl->id_cqes.qes_max) < sizeof (nvme_cqe_t)))
		goto fail;

	/*
	 * Check for the presence of a Volatile Write Cache. If present,
	 * enable or disable based on the value of the property
	 * volatile-write-cache-enable (default is enabled).
	 */
	nvme->n_write_cache_present =
	    nvme->n_idctl->id_vwc.vwc_present == 0 ? B_FALSE : B_TRUE;

	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
	    "volatile-write-cache-present",
	    nvme->n_write_cache_present ? 1 : 0);

	if (!nvme->n_write_cache_present) {
		nvme->n_write_cache_enabled = B_FALSE;
	} else if (nvme_write_cache_set(nvme, nvme->n_write_cache_enabled)
	    != 0) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to %sable volatile write cache",
		    nvme->n_write_cache_enabled ? "en" : "dis");
		/*
		 * Assume the cache is (still) enabled.
		 */
		nvme->n_write_cache_enabled = B_TRUE;
	}

	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
	    "volatile-write-cache-enable",
	    nvme->n_write_cache_enabled ? 1 : 0);

	/*
	 * Assume LBA Range Type feature is supported. If it isn't this
	 * will be set to B_FALSE by nvme_get_features().
	 */
	nvme->n_lba_range_supported = B_TRUE;

	/*
	 * Check support for Autonomous Power State Transition.
	 */
	if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1))
		nvme->n_auto_pst_supported =
		    nvme->n_idctl->id_apsta.ap_sup == 0 ? B_FALSE : B_TRUE;

	/*
	 * Assume Software Progress Marker feature is supported. If it isn't
	 * this will be set to B_FALSE by nvme_get_features().
	 */
	nvme->n_progress_supported = B_TRUE;

	/*
	 * Identify Namespaces
	 */
	nvme->n_namespace_count = nvme->n_idctl->id_nn;

	if (nvme->n_namespace_count == 0) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!controllers without namespaces are not supported");
		goto fail;
	}

	if (nvme->n_namespace_count > NVME_MINOR_MAX) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!too many namespaces: %d, limiting to %d\n",
		    nvme->n_namespace_count, NVME_MINOR_MAX);
		nvme->n_namespace_count = NVME_MINOR_MAX;
	}

	nvme->n_ns = kmem_zalloc(sizeof (nvme_namespace_t) *
	    nvme->n_namespace_count, KM_SLEEP);

	for (i = 0; i != nvme->n_namespace_count; i++) {
		mutex_init(&nvme->n_ns[i].ns_minor.nm_mutex, NULL, MUTEX_DRIVER,
		    NULL);
		if (nvme_init_ns(nvme, i + 1) != DDI_SUCCESS)
			goto fail;
	}

	/*
	 * Try to set up MSI/MSI-X interrupts.
	 */
	if ((nvme->n_intr_types & (DDI_INTR_TYPE_MSI | DDI_INTR_TYPE_MSIX))
	    != 0) {
		nvme_release_interrupts(nvme);

		nqueues = MIN(UINT16_MAX, ncpus);

		if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX,
		    nqueues) != DDI_SUCCESS) &&
		    (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI,
		    nqueues) != DDI_SUCCESS)) {
			dev_err(nvme->n_dip, CE_WARN,
			    "!failed to setup MSI/MSI-X interrupts");
			goto fail;
		}
	}

	nqueues = nvme->n_intr_cnt;

	/*
	 * Create I/O queue pairs.
	 */
	if (nvme_set_nqueues(nvme, &nqueues) != 0) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to set number of I/O queues to %d",
		    nvme->n_intr_cnt);
		goto fail;
	}

	/*
	 * Reallocate I/O queue array
	 */
	kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *));
	nvme->n_ioq = kmem_zalloc(sizeof (nvme_qpair_t *) *
	    (nqueues + 1), KM_SLEEP);
	nvme->n_ioq[0] = nvme->n_adminq;

	nvme->n_ioq_count = nqueues;

	/*
	 * If we got fewer queues than we asked for we might as well give
	 * some of the interrupt vectors back to the system.
	 */
	if (nvme->n_ioq_count < nvme->n_intr_cnt) {
		nvme_release_interrupts(nvme);

		if (nvme_setup_interrupts(nvme, nvme->n_intr_type,
		    nvme->n_ioq_count) != DDI_SUCCESS) {
			dev_err(nvme->n_dip, CE_WARN,
			    "!failed to reduce number of interrupts");
			goto fail;
		}
	}

	/*
	 * Alloc & register I/O queue pairs
	 */
	nvme->n_io_queue_len =
	    MIN(nvme->n_io_queue_len, nvme->n_max_queue_entries);
	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-queue-len",
	    nvme->n_io_queue_len);

	for (i = 1; i != nvme->n_ioq_count + 1; i++) {
		if (nvme_alloc_qpair(nvme, nvme->n_io_queue_len,
		    &nvme->n_ioq[i], i) != DDI_SUCCESS) {
			dev_err(nvme->n_dip, CE_WARN,
			    "!unable to allocate I/O qpair %d", i);
			goto fail;
		}

		if (nvme_create_io_qpair(nvme, nvme->n_ioq[i], i) != 0) {
			dev_err(nvme->n_dip, CE_WARN,
			    "!unable to create I/O qpair %d", i);
			goto fail;
		}
	}

	/*
	 * Post more asynchronous event commands to reduce event reporting
	 * latency as suggested by the spec.
	 */
	if (nvme->n_async_event_supported) {
		for (i = 1; i != nvme->n_async_event_limit; i++)
			nvme_async_event(nvme);
	}

	return (DDI_SUCCESS);

fail:
	(void) nvme_reset(nvme, B_FALSE);
	return (DDI_FAILURE);
}
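
/*
 * A note on the CC.MPS arithmetic in nvme_init(): MPS encodes the memory page
 * size as a power of two minus 12, so on a system with 4 KiB pages n_pageshift
 * is 12 and cc_mps is written as 0; a controller whose CAP.MPSMIN is 1
 * (8 KiB minimum pages) would instead get n_pageshift = 13 and cc_mps = 1.
 */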
static uint_t
nvme_intr(caddr_t arg1, caddr_t arg2)
{
	/*LINTED: E_PTR_BAD_CAST_ALIGN*/
	nvme_t *nvme = (nvme_t *)arg1;
	int inum = (int)(uintptr_t)arg2;
	int ccnt = 0;
	int qnum;
	nvme_cmd_t *cmd;

	if (inum >= nvme->n_intr_cnt)
		return (DDI_INTR_UNCLAIMED);

	if (nvme->n_dead)
		return (nvme->n_intr_type == DDI_INTR_TYPE_FIXED ?
		    DDI_INTR_UNCLAIMED : DDI_INTR_CLAIMED);

	/*
	 * The interrupt vector a queue uses is calculated as queue_idx %
	 * intr_cnt in nvme_create_io_qpair(). Iterate through the queue array
	 * in steps of n_intr_cnt to process all queues using this vector.
	 */
	for (qnum = inum;
	    qnum < nvme->n_ioq_count + 1 && nvme->n_ioq[qnum] != NULL;
	    qnum += nvme->n_intr_cnt) {
		while ((cmd = nvme_retrieve_cmd(nvme, nvme->n_ioq[qnum]))) {
			taskq_dispatch_ent((taskq_t *)cmd->nc_nvme->n_cmd_taskq,
			    cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent);
			ccnt++;
		}
	}

	return (ccnt > 0 ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED);
}
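
/*
 * Example of the vector-to-queue mapping used above: with n_intr_cnt = 4 and
 * 8 I/O queue pairs, the handler for vector 1 services queues 1 and 5, while
 * vector 0 services the admin queue (index 0) and queue 4.
 */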
static void
nvme_release_interrupts(nvme_t *nvme)
{
	int i;

	for (i = 0; i < nvme->n_intr_cnt; i++) {
		if (nvme->n_inth[i] == NULL)
			break;

		if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK)
			(void) ddi_intr_block_disable(&nvme->n_inth[i], 1);
		else
			(void) ddi_intr_disable(nvme->n_inth[i]);

		(void) ddi_intr_remove_handler(nvme->n_inth[i]);
		(void) ddi_intr_free(nvme->n_inth[i]);
	}

	kmem_free(nvme->n_inth, nvme->n_inth_sz);
	nvme->n_inth = NULL;
	nvme->n_inth_sz = 0;

	nvme->n_progress &= ~NVME_INTERRUPTS;
}
static int
nvme_setup_interrupts(nvme_t *nvme, int intr_type, int nqpairs)
{
	int nintrs, navail, count;
	int ret;
	int i;

	if (nvme->n_intr_types == 0) {
		ret = ddi_intr_get_supported_types(nvme->n_dip,
		    &nvme->n_intr_types);
		if (ret != DDI_SUCCESS) {
			dev_err(nvme->n_dip, CE_WARN,
			    "!%s: ddi_intr_get_supported types failed",
			    __func__);
			return (ret);
		}

		if (get_hwenv() == HW_VMWARE)
			nvme->n_intr_types &= ~DDI_INTR_TYPE_MSIX;
	}

	if ((nvme->n_intr_types & intr_type) == 0)
		return (DDI_FAILURE);

	ret = ddi_intr_get_nintrs(nvme->n_dip, intr_type, &nintrs);
	if (ret != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_nintrs failed",
		    __func__);
		return (ret);
	}

	ret = ddi_intr_get_navail(nvme->n_dip, intr_type, &navail);
	if (ret != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_navail failed",
		    __func__);
		return (ret);
	}

	/* We want at most one interrupt per queue pair. */
	if (navail > nqpairs)
		navail = nqpairs;

	nvme->n_inth_sz = sizeof (ddi_intr_handle_t) * navail;
	nvme->n_inth = kmem_zalloc(nvme->n_inth_sz, KM_SLEEP);

	ret = ddi_intr_alloc(nvme->n_dip, nvme->n_inth, intr_type, 0, navail,
	    &count, 0);
	if (ret != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_alloc failed",
		    __func__);
		goto fail;
	}

	nvme->n_intr_cnt = count;

	ret = ddi_intr_get_pri(nvme->n_inth[0], &nvme->n_intr_pri);
	if (ret != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_pri failed",
		    __func__);
		goto fail;
	}

	for (i = 0; i < count; i++) {
		ret = ddi_intr_add_handler(nvme->n_inth[i], nvme_intr,
		    (void *)nvme, (void *)(uintptr_t)i);
		if (ret != DDI_SUCCESS) {
			dev_err(nvme->n_dip, CE_WARN,
			    "!%s: ddi_intr_add_handler failed", __func__);
			goto fail;
		}
	}

	(void) ddi_intr_get_cap(nvme->n_inth[0], &nvme->n_intr_cap);

	for (i = 0; i < count; i++) {
		if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK)
			ret = ddi_intr_block_enable(&nvme->n_inth[i], 1);
		else
			ret = ddi_intr_enable(nvme->n_inth[i]);

		if (ret != DDI_SUCCESS) {
			dev_err(nvme->n_dip, CE_WARN,
			    "!%s: enabling interrupt %d failed", __func__, i);
			goto fail;
		}
	}

	nvme->n_intr_type = intr_type;

	nvme->n_progress |= NVME_INTERRUPTS;

	return (DDI_SUCCESS);

fail:
	nvme_release_interrupts(nvme);

	return (ret);
}
static int
nvme_fm_errcb(dev_info_t *dip, ddi_fm_error_t *fm_error, const void *arg)
{
	_NOTE(ARGUNUSED(arg));

	pci_ereport_post(dip, fm_error, NULL);
	return (fm_error->fme_status);
}
static int
nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	nvme_t *nvme;
	int instance;
	int nregs;
	off_t regsize;
	int i;
	char name[32];

	if (cmd != DDI_ATTACH)
		return (DDI_FAILURE);

	instance = ddi_get_instance(dip);

	if (ddi_soft_state_zalloc(nvme_state, instance) != DDI_SUCCESS)
		return (DDI_FAILURE);

	nvme = ddi_get_soft_state(nvme_state, instance);
	ddi_set_driver_private(dip, nvme);
	nvme->n_dip = dip;

	mutex_init(&nvme->n_minor.nm_mutex, NULL, MUTEX_DRIVER, NULL);

	nvme->n_strict_version = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "strict-version", 1) == 1 ? B_TRUE : B_FALSE;
	nvme->n_ignore_unknown_vendor_status = ddi_prop_get_int(DDI_DEV_T_ANY,
	    dip, DDI_PROP_DONTPASS, "ignore-unknown-vendor-status", 0) == 1 ?
	    B_TRUE : B_FALSE;
	nvme->n_admin_queue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "admin-queue-len", NVME_DEFAULT_ADMIN_QUEUE_LEN);
	nvme->n_io_queue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "io-queue-len", NVME_DEFAULT_IO_QUEUE_LEN);
	nvme->n_async_event_limit = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "async-event-limit",
	    NVME_DEFAULT_ASYNC_EVENT_LIMIT);
	nvme->n_write_cache_enabled = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "volatile-write-cache-enable", 1) != 0 ?
	    B_TRUE : B_FALSE;
	nvme->n_min_block_size = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "min-phys-block-size",
	    NVME_DEFAULT_MIN_BLOCK_SIZE);

	if (!ISP2(nvme->n_min_block_size) ||
	    (nvme->n_min_block_size < NVME_DEFAULT_MIN_BLOCK_SIZE)) {
		dev_err(dip, CE_WARN, "!min-phys-block-size %s, "
		    "using default %d", ISP2(nvme->n_min_block_size) ?
		    "too low" : "not a power of 2",
		    NVME_DEFAULT_MIN_BLOCK_SIZE);
		nvme->n_min_block_size = NVME_DEFAULT_MIN_BLOCK_SIZE;
	}

	if (nvme->n_admin_queue_len < NVME_MIN_ADMIN_QUEUE_LEN)
		nvme->n_admin_queue_len = NVME_MIN_ADMIN_QUEUE_LEN;
	else if (nvme->n_admin_queue_len > NVME_MAX_ADMIN_QUEUE_LEN)
		nvme->n_admin_queue_len = NVME_MAX_ADMIN_QUEUE_LEN;

	if (nvme->n_io_queue_len < NVME_MIN_IO_QUEUE_LEN)
		nvme->n_io_queue_len = NVME_MIN_IO_QUEUE_LEN;

	if (nvme->n_async_event_limit < 1)
		nvme->n_async_event_limit = NVME_DEFAULT_ASYNC_EVENT_LIMIT;

	nvme->n_reg_acc_attr = nvme_reg_acc_attr;
	nvme->n_queue_dma_attr = nvme_queue_dma_attr;
	nvme->n_prp_dma_attr = nvme_prp_dma_attr;
	nvme->n_sgl_dma_attr = nvme_sgl_dma_attr;

	/*
	 * Setup FMA support.
	 */
	nvme->n_fm_cap = ddi_getprop(DDI_DEV_T_ANY, dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "fm-capable",
	    DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE |
	    DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE);

	ddi_fm_init(dip, &nvme->n_fm_cap, &nvme->n_fm_ibc);

	if (nvme->n_fm_cap) {
		if (nvme->n_fm_cap & DDI_FM_ACCCHK_CAPABLE)
			nvme->n_reg_acc_attr.devacc_attr_access =
			    DDI_FLAGERR_ACC;

		if (nvme->n_fm_cap & DDI_FM_DMACHK_CAPABLE) {
			nvme->n_prp_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
			nvme->n_sgl_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
		}

		if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) ||
		    DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
			pci_ereport_setup(dip);

		if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
			ddi_fm_handler_register(dip, nvme_fm_errcb,
			    (void *)nvme);
	}

	nvme->n_progress |= NVME_FMA_INIT;

	/*
	 * The spec defines several register sets. Only the controller
	 * registers (set 1) are currently used.
	 */
	if (ddi_dev_nregs(dip, &nregs) == DDI_FAILURE ||
	    ddi_dev_regsize(dip, 1, &regsize) == DDI_FAILURE)
		goto fail;

	if (ddi_regs_map_setup(dip, 1, &nvme->n_regs, 0, regsize,
	    &nvme->n_reg_acc_attr, &nvme->n_regh) != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "!failed to map regset 1");
		goto fail;
	}

	nvme->n_progress |= NVME_REGS_MAPPED;

	/*
	 * Create taskq for command completion.
	 */
	(void) snprintf(name, sizeof (name), "%s%d_cmd_taskq",
	    ddi_driver_name(dip), ddi_get_instance(dip));
	nvme->n_cmd_taskq = ddi_taskq_create(dip, name, MIN(UINT16_MAX, ncpus),
	    TASKQ_DEFAULTPRI, 0);
	if (nvme->n_cmd_taskq == NULL) {
		dev_err(dip, CE_WARN, "!failed to create cmd taskq");
		goto fail;
	}

	/*
	 * Create PRP DMA cache
	 */
	(void) snprintf(name, sizeof (name), "%s%d_prp_cache",
	    ddi_driver_name(dip), ddi_get_instance(dip));
	nvme->n_prp_cache = kmem_cache_create(name, sizeof (nvme_dma_t),
	    0, nvme_prp_dma_constructor, nvme_prp_dma_destructor,
	    NULL, (void *)nvme, NULL, 0);

	if (nvme_init(nvme) != DDI_SUCCESS)
		goto fail;

	/*
	 * Attach the blkdev driver for each namespace.
	 */
	for (i = 0; i != nvme->n_namespace_count; i++) {
		if (ddi_create_minor_node(nvme->n_dip, nvme->n_ns[i].ns_name,
		    S_IFCHR, NVME_MINOR(ddi_get_instance(nvme->n_dip), i + 1),
		    DDI_NT_NVME_ATTACHMENT_POINT, 0) != DDI_SUCCESS) {
			dev_err(dip, CE_WARN,
			    "!failed to create minor node for namespace %d", i);
			goto fail;
		}

		if (nvme->n_ns[i].ns_ignore)
			continue;

		nvme->n_ns[i].ns_bd_hdl = bd_alloc_handle(&nvme->n_ns[i],
		    &nvme_bd_ops, &nvme->n_prp_dma_attr, KM_SLEEP);

		if (nvme->n_ns[i].ns_bd_hdl == NULL) {
			dev_err(dip, CE_WARN,
			    "!failed to get blkdev handle for namespace %d", i);
			goto fail;
		}

		if (bd_attach_handle(dip, nvme->n_ns[i].ns_bd_hdl)
		    != DDI_SUCCESS) {
			dev_err(dip, CE_WARN,
			    "!failed to attach blkdev handle for namespace %d",
			    i);
			goto fail;
		}
	}

	if (ddi_create_minor_node(dip, "devctl", S_IFCHR,
	    NVME_MINOR(ddi_get_instance(dip), 0), DDI_NT_NVME_NEXUS, 0)
	    != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "nvme_attach: "
		    "cannot create devctl minor node");
		goto fail;
	}

	return (DDI_SUCCESS);

fail:
	/* attach successful anyway so that FMA can retire the device */
	if (nvme->n_dead)
		return (DDI_SUCCESS);

	(void) nvme_detach(dip, DDI_DETACH);

	return (DDI_FAILURE);
}
static int
nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	int instance, i;
	nvme_t *nvme;

	if (cmd != DDI_DETACH)
		return (DDI_FAILURE);

	instance = ddi_get_instance(dip);

	nvme = ddi_get_soft_state(nvme_state, instance);

	if (nvme == NULL)
		return (DDI_FAILURE);

	ddi_remove_minor_node(dip, "devctl");
	mutex_destroy(&nvme->n_minor.nm_mutex);

	if (nvme->n_ns) {
		for (i = 0; i != nvme->n_namespace_count; i++) {
			ddi_remove_minor_node(dip, nvme->n_ns[i].ns_name);
			mutex_destroy(&nvme->n_ns[i].ns_minor.nm_mutex);

			if (nvme->n_ns[i].ns_bd_hdl) {
				(void) bd_detach_handle(
				    nvme->n_ns[i].ns_bd_hdl);
				bd_free_handle(nvme->n_ns[i].ns_bd_hdl);
			}

			if (nvme->n_ns[i].ns_idns)
				kmem_free(nvme->n_ns[i].ns_idns,
				    sizeof (nvme_identify_nsid_t));
			if (nvme->n_ns[i].ns_devid)
				strfree(nvme->n_ns[i].ns_devid);
		}

		kmem_free(nvme->n_ns, sizeof (nvme_namespace_t) *
		    nvme->n_namespace_count);
	}

	if (nvme->n_progress & NVME_INTERRUPTS)
		nvme_release_interrupts(nvme);

	if (nvme->n_cmd_taskq)
		ddi_taskq_wait(nvme->n_cmd_taskq);

	if (nvme->n_ioq_count > 0) {
		for (i = 1; i != nvme->n_ioq_count + 1; i++) {
			if (nvme->n_ioq[i] != NULL) {
				/* TODO: send destroy queue commands */
				nvme_free_qpair(nvme->n_ioq[i]);
			}
		}

		kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *) *
		    (nvme->n_ioq_count + 1));
	}

	if (nvme->n_prp_cache != NULL) {
		kmem_cache_destroy(nvme->n_prp_cache);
	}

	if (nvme->n_progress & NVME_REGS_MAPPED) {
		nvme_shutdown(nvme, NVME_CC_SHN_NORMAL, B_FALSE);
		(void) nvme_reset(nvme, B_FALSE);
	}

	if (nvme->n_cmd_taskq)
		ddi_taskq_destroy(nvme->n_cmd_taskq);

	if (nvme->n_progress & NVME_CTRL_LIMITS)
		sema_destroy(&nvme->n_abort_sema);

	if (nvme->n_progress & NVME_ADMIN_QUEUE)
		nvme_free_qpair(nvme->n_adminq);

	if (nvme->n_idctl)
		kmem_free(nvme->n_idctl, NVME_IDENTIFY_BUFSIZE);

	if (nvme->n_progress & NVME_REGS_MAPPED)
		ddi_regs_map_free(&nvme->n_regh);

	if (nvme->n_progress & NVME_FMA_INIT) {
		if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
			ddi_fm_handler_unregister(nvme->n_dip);

		if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) ||
		    DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
			pci_ereport_teardown(nvme->n_dip);

		ddi_fm_fini(nvme->n_dip);
	}

	if (nvme->n_vendor != NULL)
		strfree(nvme->n_vendor);

	if (nvme->n_product != NULL)
		strfree(nvme->n_product);

	ddi_soft_state_free(nvme_state, instance);

	return (DDI_SUCCESS);
}
static int
nvme_quiesce(dev_info_t *dip)
{
	int instance;
	nvme_t *nvme;

	instance = ddi_get_instance(dip);

	nvme = ddi_get_soft_state(nvme_state, instance);

	if (nvme == NULL)
		return (DDI_FAILURE);

	nvme_shutdown(nvme, NVME_CC_SHN_ABRUPT, B_TRUE);

	(void) nvme_reset(nvme, B_TRUE);

	return (DDI_FAILURE);
}
static int
nvme_fill_prp(nvme_cmd_t *cmd, bd_xfer_t *xfer)
{
	nvme_t *nvme = cmd->nc_nvme;
	int nprp_page, nprp;
	uint64_t *prp;

	if (xfer->x_ndmac == 0)
		return (DDI_FAILURE);

	cmd->nc_sqe.sqe_dptr.d_prp[0] = xfer->x_dmac.dmac_laddress;
	ddi_dma_nextcookie(xfer->x_dmah, &xfer->x_dmac);

	if (xfer->x_ndmac == 1) {
		cmd->nc_sqe.sqe_dptr.d_prp[1] = 0;
		return (DDI_SUCCESS);
	} else if (xfer->x_ndmac == 2) {
		cmd->nc_sqe.sqe_dptr.d_prp[1] = xfer->x_dmac.dmac_laddress;
		return (DDI_SUCCESS);
	}

	xfer->x_ndmac--;

	nprp_page = nvme->n_pagesize / sizeof (uint64_t) - 1;
	ASSERT(nprp_page > 0);
	nprp = (xfer->x_ndmac + nprp_page - 1) / nprp_page;

	/*
	 * We currently don't support chained PRPs and set up our DMA
	 * attributes to reflect that. If we still get an I/O request
	 * that needs a chained PRP something is very wrong.
	 */
	VERIFY(nprp == 1);

	cmd->nc_dma = kmem_cache_alloc(nvme->n_prp_cache, KM_SLEEP);
	bzero(cmd->nc_dma->nd_memp, cmd->nc_dma->nd_len);

	cmd->nc_sqe.sqe_dptr.d_prp[1] = cmd->nc_dma->nd_cookie.dmac_laddress;

	/*LINTED: E_PTR_BAD_CAST_ALIGN*/
	for (prp = (uint64_t *)cmd->nc_dma->nd_memp;
	    xfer->x_ndmac > 0;
	    prp++, xfer->x_ndmac--) {
		*prp = xfer->x_dmac.dmac_laddress;
		ddi_dma_nextcookie(xfer->x_dmah, &xfer->x_dmac);
	}

	(void) ddi_dma_sync(cmd->nc_dma->nd_dmah, 0, cmd->nc_dma->nd_len,
	    DDI_DMA_SYNC_FORDEV);
	return (DDI_SUCCESS);
}
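
/*
 * With 4 KiB pages a PRP list page holds 512 64-bit entries; nvme_init()
 * limits n_max_data_transfer_size so that a single list page is always
 * enough, which is why chained PRP lists never need to be built here.
 */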
static nvme_cmd_t *
nvme_create_nvm_cmd(nvme_namespace_t *ns, uint8_t opc, bd_xfer_t *xfer)
{
	nvme_t *nvme = ns->ns_nvme;
	nvme_cmd_t *cmd;

	/*
	 * Blkdev only sets BD_XFER_POLL when dumping, so don't sleep.
	 */
	cmd = nvme_alloc_cmd(nvme, (xfer->x_flags & BD_XFER_POLL) ?
	    KM_NOSLEEP : KM_SLEEP);

	if (cmd == NULL)
		return (NULL);

	cmd->nc_sqe.sqe_opc = opc;
	cmd->nc_callback = nvme_bd_xfer_done;
	cmd->nc_xfer = xfer;

	switch (opc) {
	case NVME_OPC_NVM_WRITE:
	case NVME_OPC_NVM_READ:
		VERIFY(xfer->x_nblks <= 0x10000);

		cmd->nc_sqe.sqe_nsid = ns->ns_id;

		cmd->nc_sqe.sqe_cdw10 = xfer->x_blkno & 0xffffffffu;
		cmd->nc_sqe.sqe_cdw11 = (xfer->x_blkno >> 32);
		cmd->nc_sqe.sqe_cdw12 = (uint16_t)(xfer->x_nblks - 1);

		if (nvme_fill_prp(cmd, xfer) != DDI_SUCCESS)
			goto fail;
		break;

	case NVME_OPC_NVM_FLUSH:
		cmd->nc_sqe.sqe_nsid = ns->ns_id;
		break;

	default:
		goto fail;
	}

	return (cmd);

fail:
	nvme_free_cmd(cmd);
	return (NULL);
}
static void
nvme_bd_xfer_done(void *arg)
{
	nvme_cmd_t *cmd = arg;
	bd_xfer_t *xfer = cmd->nc_xfer;
	int error = 0;

	error = nvme_check_cmd_status(cmd);
	nvme_free_cmd(cmd);

	bd_xfer_done(xfer, error);
}
static void
nvme_bd_driveinfo(void *arg, bd_drive_t *drive)
{
	nvme_namespace_t *ns = arg;
	nvme_t *nvme = ns->ns_nvme;

	/*
	 * blkdev maintains one queue size per instance (namespace),
	 * but all namespaces share the I/O queues.
	 * TODO: need to figure out a sane default, or use per-NS I/O queues,
	 * or change blkdev to handle EAGAIN
	 */
	drive->d_qsize = nvme->n_ioq_count * nvme->n_io_queue_len
	    / nvme->n_namespace_count;

	/*
	 * d_maxxfer is not set, which means the value is taken from the DMA
	 * attributes specified to bd_alloc_handle.
	 */

	drive->d_removable = B_FALSE;
	drive->d_hotpluggable = B_FALSE;

	bcopy(ns->ns_eui64, drive->d_eui64, sizeof (drive->d_eui64));
	drive->d_target = ns->ns_id;
	drive->d_lun = 0;

	drive->d_model = nvme->n_idctl->id_model;
	drive->d_model_len = sizeof (nvme->n_idctl->id_model);
	drive->d_vendor = nvme->n_vendor;
	drive->d_vendor_len = strlen(nvme->n_vendor);
	drive->d_product = nvme->n_product;
	drive->d_product_len = strlen(nvme->n_product);
	drive->d_serial = nvme->n_idctl->id_serial;
	drive->d_serial_len = sizeof (nvme->n_idctl->id_serial);
	drive->d_revision = nvme->n_idctl->id_fwrev;
	drive->d_revision_len = sizeof (nvme->n_idctl->id_fwrev);
}
static int
nvme_bd_mediainfo(void *arg, bd_media_t *media)
{
	nvme_namespace_t *ns = arg;

	media->m_nblks = ns->ns_block_count;
	media->m_blksize = ns->ns_block_size;
	media->m_readonly = B_FALSE;
	media->m_solidstate = B_TRUE;

	media->m_pblksize = ns->ns_best_block_size;

	return (0);
}
static int
nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc)
{
	nvme_t *nvme = ns->ns_nvme;
	nvme_cmd_t *cmd;
	nvme_qpair_t *ioq;
	boolean_t poll;
	int ret;

	if (nvme->n_dead)
		return (EIO);

	cmd = nvme_create_nvm_cmd(ns, opc, xfer);
	if (cmd == NULL)
		return (ENOMEM);

	cmd->nc_sqid = (CPU->cpu_id % nvme->n_ioq_count) + 1;
	ASSERT(cmd->nc_sqid <= nvme->n_ioq_count);
	ioq = nvme->n_ioq[cmd->nc_sqid];

	/*
	 * Get the polling flag before submitting the command. The command may
	 * complete immediately after it was submitted, which means we must
	 * treat both cmd and xfer as if they have been freed already.
	 */
	poll = (xfer->x_flags & BD_XFER_POLL) != 0;

	ret = nvme_submit_io_cmd(ioq, cmd);

	if (ret != 0)
		return (ret);

	if (!poll)
		return (0);

	do {
		cmd = nvme_retrieve_cmd(nvme, ioq);
		if (cmd != NULL)
			nvme_bd_xfer_done(cmd);
		else
			drv_usecwait(10);
	} while (ioq->nq_active_cmds != 0);

	return (0);
}
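
/*
 * I/O queue selection above simply hashes on the submitting CPU: with
 * n_ioq_count = 4, a thread running on CPU 5 submits to queue
 * (5 % 4) + 1 = 2. Queue 0 is reserved for admin commands.
 */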
static int
nvme_bd_read(void *arg, bd_xfer_t *xfer)
{
	nvme_namespace_t *ns = arg;

	return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_READ));
}
static int
nvme_bd_write(void *arg, bd_xfer_t *xfer)
{
	nvme_namespace_t *ns = arg;

	return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_WRITE));
}
static int
nvme_bd_sync(void *arg, bd_xfer_t *xfer)
{
	nvme_namespace_t *ns = arg;

	if (ns->ns_nvme->n_dead)
		return (EIO);

	/*
	 * If the volatile write cache is not present or not enabled the FLUSH
	 * command is a no-op, so we can take a shortcut here.
	 */
	if (!ns->ns_nvme->n_write_cache_present) {
		bd_xfer_done(xfer, ENOTSUP);
		return (0);
	}

	if (!ns->ns_nvme->n_write_cache_enabled) {
		bd_xfer_done(xfer, 0);
		return (0);
	}

	return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_FLUSH));
}
static int
nvme_bd_devid(void *arg, dev_info_t *devinfo, ddi_devid_t *devid)
{
	nvme_namespace_t *ns = arg;

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	if (*(uint64_t *)ns->ns_eui64 != 0) {
		return (ddi_devid_init(devinfo, DEVID_SCSI3_WWN,
		    sizeof (ns->ns_eui64), ns->ns_eui64, devid));
	} else {
		return (ddi_devid_init(devinfo, DEVID_ENCAP,
		    strlen(ns->ns_devid), ns->ns_devid, devid));
	}
}
static int
nvme_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
{
	_NOTE(ARGUNUSED(cred_p));
	minor_t minor = getminor(*devp);
	nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor));
	int nsid = NVME_MINOR_NSID(minor);
	nvme_minor_state_t *nm;
	int rv = 0;

	if (otyp != OTYP_CHR)
		return (EINVAL);

	if (nvme == NULL)
		return (ENXIO);

	if (nsid > nvme->n_namespace_count)
		return (ENXIO);

	nm = nsid == 0 ? &nvme->n_minor : &nvme->n_ns[nsid - 1].ns_minor;

	mutex_enter(&nm->nm_mutex);
	if (nm->nm_oexcl) {
		rv = EBUSY;
		goto out;
	}

	if (flag & FEXCL) {
		if (nm->nm_ocnt != 0) {
			rv = EBUSY;
			goto out;
		}
		nm->nm_oexcl = B_TRUE;
	}

	nm->nm_ocnt++;

out:
	mutex_exit(&nm->nm_mutex);
	return (rv);
}
static int
nvme_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
{
	_NOTE(ARGUNUSED(cred_p));
	_NOTE(ARGUNUSED(flag));
	minor_t minor = getminor(dev);
	nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor));
	int nsid = NVME_MINOR_NSID(minor);
	nvme_minor_state_t *nm;

	if (otyp != OTYP_CHR)
		return (ENXIO);

	if (nvme == NULL)
		return (ENXIO);

	if (nsid > nvme->n_namespace_count)
		return (ENXIO);

	nm = nsid == 0 ? &nvme->n_minor : &nvme->n_ns[nsid - 1].ns_minor;

	mutex_enter(&nm->nm_mutex);
	if (nm->nm_oexcl)
		nm->nm_oexcl = B_FALSE;

	ASSERT(nm->nm_ocnt > 0);
	nm->nm_ocnt--;
	mutex_exit(&nm->nm_mutex);

	return (0);
}
static int
nvme_ioctl_identify(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
    cred_t *cred_p)
{
	_NOTE(ARGUNUSED(cred_p));
	int rv = 0;
	void *idctl;

	if ((mode & FREAD) == 0)
		return (EPERM);

	if (nioc->n_len < NVME_IDENTIFY_BUFSIZE)
		return (EINVAL);

	if ((rv = nvme_identify(nvme, B_TRUE, nsid, (void **)&idctl)) != 0)
		return (rv);

	if (ddi_copyout(idctl, (void *)nioc->n_buf, NVME_IDENTIFY_BUFSIZE, mode)
	    != 0)
		rv = EFAULT;

	kmem_free(idctl, NVME_IDENTIFY_BUFSIZE);

	return (rv);
}
static int
nvme_ioctl_capabilities(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc,
    int mode, cred_t *cred_p)
{
	_NOTE(ARGUNUSED(nsid, cred_p));
	int rv = 0;
	nvme_reg_cap_t cap = { 0 };
	nvme_capabilities_t nc;

	if ((mode & FREAD) == 0)
		return (EPERM);

	if (nioc->n_len < sizeof (nc))
		return (EINVAL);

	cap.r = nvme_get64(nvme, NVME_REG_CAP);

	/*
	 * The MPSMIN and MPSMAX fields in the CAP register use 0 to
	 * specify the base page size of 4k (1<<12), so add 12 here to
	 * get the real page size value.
	 */
	nc.mpsmax = 1 << (12 + cap.b.cap_mpsmax);
	nc.mpsmin = 1 << (12 + cap.b.cap_mpsmin);

	if (ddi_copyout(&nc, (void *)nioc->n_buf, sizeof (nc), mode) != 0)
		rv = EFAULT;

	return (rv);
}
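
/*
 * Example of the CAP.MPSMIN/MPSMAX conversion above: a controller reporting
 * MPSMIN = 0 and MPSMAX = 4 supports memory page sizes from 4096 (1 << 12)
 * to 65536 (1 << 16) bytes.
 */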
static int
nvme_ioctl_get_logpage(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc,
    int mode, cred_t *cred_p)
{
	_NOTE(ARGUNUSED(cred_p));
	void *log = NULL;
	size_t bufsize = 0;
	int rv = 0;

	if ((mode & FREAD) == 0)
		return (EPERM);

	switch (nioc->n_arg) {
	case NVME_LOGPAGE_ERROR:
		if (nsid != 0)
			return (EINVAL);
		break;

	case NVME_LOGPAGE_HEALTH:
		if (nsid != 0 && nvme->n_idctl->id_lpa.lp_smart == 0)
			return (EINVAL);

		if (nsid == 0)
			nsid = (uint32_t)-1;

		break;

	case NVME_LOGPAGE_FWSLOT:
		if (nsid != 0)
			return (EINVAL);
		break;

	default:
		return (EINVAL);
	}

	if (nvme_get_logpage(nvme, B_TRUE, &log, &bufsize, nioc->n_arg, nsid)
	    != DDI_SUCCESS)
		return (EIO);

	if (nioc->n_len < bufsize) {
		kmem_free(log, bufsize);
		return (EINVAL);
	}

	if (ddi_copyout(log, (void *)nioc->n_buf, bufsize, mode) != 0)
		rv = EFAULT;

	nioc->n_len = bufsize;
	kmem_free(log, bufsize);

	return (rv);
}
static int
nvme_ioctl_get_features(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc,
    int mode, cred_t *cred_p)
{
	_NOTE(ARGUNUSED(cred_p));
	void *buf = NULL;
	size_t bufsize = 0;
	uint32_t res = 0;
	uint8_t feature;
	int rv = 0;

	if ((mode & FREAD) == 0)
		return (EPERM);

	if ((nioc->n_arg >> 32) > 0xff)
		return (EINVAL);

	feature = (uint8_t)(nioc->n_arg >> 32);

	switch (feature) {
	case NVME_FEAT_ARBITRATION:
	case NVME_FEAT_POWER_MGMT:
	case NVME_FEAT_TEMPERATURE:
	case NVME_FEAT_ERROR:
	case NVME_FEAT_NQUEUES:
	case NVME_FEAT_INTR_COAL:
	case NVME_FEAT_WRITE_ATOM:
	case NVME_FEAT_ASYNC_EVENT:
	case NVME_FEAT_PROGRESS:
		if (nsid != 0)
			return (EINVAL);
		break;

	case NVME_FEAT_INTR_VECT:
		if (nsid != 0)
			return (EINVAL);

		res = nioc->n_arg & 0xffffffffUL;
		if (res >= nvme->n_intr_cnt)
			return (EINVAL);
		break;

	case NVME_FEAT_LBA_RANGE:
		if (nvme->n_lba_range_supported == B_FALSE)
			return (EINVAL);

		if (nsid == 0 ||
		    nsid > nvme->n_namespace_count)
			return (EINVAL);

		break;

	case NVME_FEAT_WRITE_CACHE:
		if (nsid != 0)
			return (EINVAL);

		if (!nvme->n_write_cache_present)
			return (EINVAL);

		break;

	case NVME_FEAT_AUTO_PST:
		if (nsid != 0)
			return (EINVAL);

		if (!nvme->n_auto_pst_supported)
			return (EINVAL);

		break;

	default:
		return (EINVAL);
	}

	rv = nvme_get_features(nvme, B_TRUE, nsid, feature, &res, &buf,
	    &bufsize);
	if (rv != 0)
		return (rv);

	if (nioc->n_len < bufsize) {
		kmem_free(buf, bufsize);
		return (EINVAL);
	}

	if (buf && ddi_copyout(buf, (void*)nioc->n_buf, bufsize, mode) != 0)
		rv = EFAULT;

	kmem_free(buf, bufsize);

	nioc->n_len = bufsize;

	return (rv);
}
static int
nvme_ioctl_intr_cnt(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
    cred_t *cred_p)
{
	_NOTE(ARGUNUSED(nsid, mode, cred_p));

	if ((mode & FREAD) == 0)
		return (EPERM);

	nioc->n_arg = nvme->n_intr_cnt;

	return (0);
}
static int
nvme_ioctl_version(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
    cred_t *cred_p)
{
	_NOTE(ARGUNUSED(nsid, cred_p));
	int rv = 0;

	if ((mode & FREAD) == 0)
		return (EPERM);

	if (nioc->n_len < sizeof (nvme->n_version))
		return (EINVAL);

	if (ddi_copyout(&nvme->n_version, (void *)nioc->n_buf,
	    sizeof (nvme->n_version), mode) != 0)
		rv = EFAULT;

	return (rv);
}
static int
nvme_ioctl_format(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
    cred_t *cred_p)
{
	_NOTE(ARGUNUSED(mode));
	nvme_format_nvm_t frmt = { 0 };
	int c_nsid = nsid != 0 ? nsid - 1 : 0;

	if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0)
		return (EPERM);

	frmt.r = nioc->n_arg & 0xffffffff;

	/*
	 * Check whether the FORMAT NVM command is supported.
	 */
	if (nvme->n_idctl->id_oacs.oa_format == 0)
		return (EINVAL);

	/*
	 * Don't allow format or secure erase of individual namespace if that
	 * would cause a format or secure erase of all namespaces.
	 */
	if (nsid != 0 && nvme->n_idctl->id_fna.fn_format != 0)
		return (EINVAL);

	if (nsid != 0 && frmt.b.fm_ses != NVME_FRMT_SES_NONE &&
	    nvme->n_idctl->id_fna.fn_sec_erase != 0)
		return (EINVAL);

	/*
	 * Don't allow formatting with Protection Information.
	 */
	if (frmt.b.fm_pi != 0 || frmt.b.fm_pil != 0 || frmt.b.fm_ms != 0)
		return (EINVAL);

	/*
	 * Don't allow formatting using an illegal LBA format, or any LBA
	 * format that uses metadata.
	 */
	if (frmt.b.fm_lbaf > nvme->n_ns[c_nsid].ns_idns->id_nlbaf ||
	    nvme->n_ns[c_nsid].ns_idns->id_lbaf[frmt.b.fm_lbaf].lbaf_ms != 0)
		return (EINVAL);

	/*
	 * Don't allow formatting using an illegal Secure Erase setting.
	 */
	if (frmt.b.fm_ses > NVME_FRMT_MAX_SES ||
	    (frmt.b.fm_ses == NVME_FRMT_SES_CRYPTO &&
	    nvme->n_idctl->id_fna.fn_crypt_erase == 0))
		return (EINVAL);

	if (nsid == 0)
		nsid = (uint32_t)-1;

	return (nvme_format_nvm(nvme, B_TRUE, nsid, frmt.b.fm_lbaf, B_FALSE, 0,
	    B_FALSE, frmt.b.fm_ses));
}
static int
nvme_ioctl_detach(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
    cred_t *cred_p)
{
	_NOTE(ARGUNUSED(nioc, mode));
	int rv = 0;

	if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0)
		return (EPERM);

	if (nsid == 0)
		return (EINVAL);

	rv = bd_detach_handle(nvme->n_ns[nsid - 1].ns_bd_hdl);
	if (rv != DDI_SUCCESS)
		rv = EBUSY;

	return (rv);
}
static int
nvme_ioctl_attach(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
    cred_t *cred_p)
{
	_NOTE(ARGUNUSED(nioc, mode));
	nvme_identify_nsid_t *idns;
	int rv = 0;

	if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0)
		return (EPERM);

	if (nsid == 0)
		return (EINVAL);

	/*
	 * Identify namespace again, free old identify data.
	 */
	idns = nvme->n_ns[nsid - 1].ns_idns;
	if (nvme_init_ns(nvme, nsid) != DDI_SUCCESS)
		return (EIO);

	kmem_free(idns, sizeof (nvme_identify_nsid_t));

	rv = bd_attach_handle(nvme->n_dip, nvme->n_ns[nsid - 1].ns_bd_hdl);
	if (rv != DDI_SUCCESS)
		rv = EBUSY;

	return (rv);
}
static int
nvme_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p,
    int *rval_p)
{
	_NOTE(ARGUNUSED(rval_p));
	minor_t minor = getminor(dev);
	nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor));
	int nsid = NVME_MINOR_NSID(minor);
	int rv = 0;
	nvme_ioctl_t nioc;

	int (*nvme_ioctl[])(nvme_t *, int, nvme_ioctl_t *, int, cred_t *) = {
		NULL,
		nvme_ioctl_identify,
		nvme_ioctl_identify,
		nvme_ioctl_capabilities,
		nvme_ioctl_get_logpage,
		nvme_ioctl_get_features,
		nvme_ioctl_intr_cnt,
		nvme_ioctl_version,
		nvme_ioctl_format,
		nvme_ioctl_detach,
		nvme_ioctl_attach
	};

	if (nvme == NULL)
		return (ENXIO);

	if (nsid > nvme->n_namespace_count)
		return (ENXIO);

	if (IS_DEVCTL(cmd))
		return (ndi_devctl_ioctl(nvme->n_dip, cmd, arg, mode, 0));

#ifdef _MULTI_DATAMODEL
	switch (ddi_model_convert_from(mode & FMODELS)) {
	case DDI_MODEL_ILP32: {
		nvme_ioctl32_t nioc32;
		if (ddi_copyin((void*)arg, &nioc32, sizeof (nvme_ioctl32_t),
		    mode) != 0)
			return (EFAULT);
		nioc.n_len = nioc32.n_len;
		nioc.n_buf = nioc32.n_buf;
		nioc.n_arg = nioc32.n_arg;
		break;
	}
	case DDI_MODEL_NONE:
#endif
		if (ddi_copyin((void*)arg, &nioc, sizeof (nvme_ioctl_t), mode)
		    != 0)
			return (EFAULT);

#ifdef _MULTI_DATAMODEL
		break;
	}
#endif

	if (nvme->n_dead && cmd != NVME_IOC_DETACH)
		return (EIO);

	if (cmd == NVME_IOC_IDENTIFY_CTRL) {
		/*
		 * This makes NVME_IOC_IDENTIFY_CTRL work the same on devctl
		 * and attachment point nodes.
		 */
		nsid = 0;
	} else if (cmd == NVME_IOC_IDENTIFY_NSID && nsid == 0) {
		/*
		 * This makes NVME_IOC_IDENTIFY_NSID work on a devctl node, it
		 * will always return identify data for namespace 1.
		 */
		nsid = 1;
	}

	if (IS_NVME_IOC(cmd) && nvme_ioctl[NVME_IOC_CMD(cmd)] != NULL)
		rv = nvme_ioctl[NVME_IOC_CMD(cmd)](nvme, nsid, &nioc, mode,
		    cred_p);
	else
		rv = EINVAL;

#ifdef _MULTI_DATAMODEL
	switch (ddi_model_convert_from(mode & FMODELS)) {
	case DDI_MODEL_ILP32: {
		nvme_ioctl32_t nioc32;

		nioc32.n_len = (size32_t)nioc.n_len;
		nioc32.n_buf = (uintptr32_t)nioc.n_buf;
		nioc32.n_arg = nioc.n_arg;

		if (ddi_copyout(&nioc32, (void *)arg, sizeof (nvme_ioctl32_t),
		    mode) != 0)
			return (EFAULT);
		break;
	}
	case DDI_MODEL_NONE:
#endif
		if (ddi_copyout(&nioc, (void *)arg, sizeof (nvme_ioctl_t), mode)
		    != 0)
			return (EFAULT);

#ifdef _MULTI_DATAMODEL