Merge remote-tracking branch 'origin/master'
[unleashed/lotheac.git] / usr / src / uts / common / io / blkdev / blkdev.c
blobd3b96c9f8a2d9cc3a219d4e1da660607d40c51d6
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved.
24 * Copyright 2012 Alexey Zaytsev <alexey.zaytsev@gmail.com> All rights reserved.
25 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
26 * Copyright 2017 The MathWorks, Inc. All rights reserved.
29 #include <sys/types.h>
30 #include <sys/ksynch.h>
31 #include <sys/kmem.h>
32 #include <sys/file.h>
33 #include <sys/errno.h>
34 #include <sys/open.h>
35 #include <sys/buf.h>
36 #include <sys/uio.h>
37 #include <sys/aio_req.h>
38 #include <sys/cred.h>
39 #include <sys/modctl.h>
40 #include <sys/cmlb.h>
41 #include <sys/conf.h>
42 #include <sys/devops.h>
43 #include <sys/list.h>
44 #include <sys/sysmacros.h>
45 #include <sys/dkio.h>
46 #include <sys/vtoc.h>
47 #include <sys/scsi/scsi.h> /* for DTYPE_DIRECT */
48 #include <sys/kstat.h>
49 #include <sys/fs/dv_node.h>
50 #include <sys/ddi.h>
51 #include <sys/sunddi.h>
52 #include <sys/note.h>
53 #include <sys/blkdev.h>
54 #include <sys/scsi/impl/inquiry.h>
/*
 * Minor number layout: every instance owns BD_MAXPART consecutive minor
 * numbers.  The quotient of the minor by BD_MAXPART selects the instance
 * and the remainder selects the partition within that instance.
 */
#define	BD_MAXPART	64
#define	BDINST(dev)	(getminor((dev)) / BD_MAXPART)
#define	BDPART(dev)	(getminor((dev)) % BD_MAXPART)
60 typedef struct bd bd_t;
61 typedef struct bd_xfer_impl bd_xfer_impl_t;
63 struct bd {
64 void *d_private;
65 dev_info_t *d_dip;
66 kmutex_t d_ocmutex;
67 kmutex_t d_iomutex;
68 kmutex_t *d_errmutex;
69 kmutex_t d_statemutex;
70 kcondvar_t d_statecv;
71 enum dkio_state d_state;
72 cmlb_handle_t d_cmlbh;
73 unsigned d_open_lyr[BD_MAXPART]; /* open count */
74 uint64_t d_open_excl; /* bit mask indexed by partition */
75 uint64_t d_open_reg[OTYPCNT]; /* bit mask */
77 uint32_t d_qsize;
78 uint32_t d_qactive;
79 uint32_t d_maxxfer;
80 uint32_t d_blkshift;
81 uint32_t d_pblkshift;
82 uint64_t d_numblks;
83 ddi_devid_t d_devid;
85 kmem_cache_t *d_cache;
86 list_t d_runq;
87 list_t d_waitq;
88 kstat_t *d_ksp;
89 kstat_io_t *d_kiop;
90 kstat_t *d_errstats;
91 struct bd_errstats *d_kerr;
93 boolean_t d_rdonly;
94 boolean_t d_ssd;
95 boolean_t d_removable;
96 boolean_t d_hotpluggable;
97 boolean_t d_use_dma;
99 ddi_dma_attr_t d_dma;
100 bd_ops_t d_ops;
101 bd_handle_t d_handle;
104 struct bd_handle {
105 bd_ops_t h_ops;
106 ddi_dma_attr_t *h_dma;
107 dev_info_t *h_parent;
108 dev_info_t *h_child;
109 void *h_private;
110 bd_t *h_bd;
111 char *h_name;
112 char h_addr[30]; /* enough for w%0.16x,%X */
115 struct bd_xfer_impl {
116 bd_xfer_t i_public;
117 list_node_t i_linkage;
118 bd_t *i_bd;
119 buf_t *i_bp;
120 uint_t i_num_win;
121 uint_t i_cur_win;
122 off_t i_offset;
123 int (*i_func)(void *, bd_xfer_t *);
124 uint32_t i_blkshift;
125 size_t i_len;
126 size_t i_resid;
/*
 * Shorthand for reaching the members of the embedded public bd_xfer_t
 * through a bd_xfer_impl_t.
 */
#define	i_dmah		i_public.x_dmah
#define	i_dmac		i_public.x_dmac
#define	i_ndmac		i_public.x_ndmac
#define	i_kaddr		i_public.x_kaddr
#define	i_nblks		i_public.x_nblks
#define	i_blkno		i_public.x_blkno
#define	i_flags		i_public.x_flags
139 * Private prototypes.
142 static void bd_prop_update_inqstring(dev_info_t *, char *, char *, size_t);
143 static void bd_create_inquiry_props(dev_info_t *, bd_drive_t *);
144 static void bd_create_errstats(bd_t *, int, bd_drive_t *);
145 static void bd_errstats_setstr(kstat_named_t *, char *, size_t, char *);
146 static void bd_init_errstats(bd_t *, bd_drive_t *);
148 static int bd_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
149 static int bd_attach(dev_info_t *, ddi_attach_cmd_t);
150 static int bd_detach(dev_info_t *, ddi_detach_cmd_t);
152 static int bd_open(dev_t *, int, int, cred_t *);
153 static int bd_close(dev_t, int, int, cred_t *);
154 static int bd_strategy(struct buf *);
155 static int bd_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
156 static int bd_dump(dev_t, caddr_t, daddr_t, int);
157 static int bd_read(dev_t, struct uio *, cred_t *);
158 static int bd_write(dev_t, struct uio *, cred_t *);
159 static int bd_aread(dev_t, struct aio_req *, cred_t *);
160 static int bd_awrite(dev_t, struct aio_req *, cred_t *);
161 static int bd_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
162 caddr_t, int *);
164 static int bd_tg_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t,
165 void *);
166 static int bd_tg_getinfo(dev_info_t *, int, void *, void *);
167 static int bd_xfer_ctor(void *, void *, int);
168 static void bd_xfer_dtor(void *, void *);
169 static void bd_sched(bd_t *);
170 static void bd_submit(bd_t *, bd_xfer_impl_t *);
171 static void bd_runq_exit(bd_xfer_impl_t *, int);
172 static void bd_update_state(bd_t *);
173 static int bd_check_state(bd_t *, enum dkio_state *);
174 static int bd_flush_write_cache(bd_t *, struct dk_callback *);
175 static int bd_check_uio(dev_t, struct uio *);
177 struct cmlb_tg_ops bd_tg_ops = {
178 TG_DK_OPS_VERSION_1,
179 bd_tg_rdwr,
180 bd_tg_getinfo,
183 static struct cb_ops bd_cb_ops = {
184 bd_open, /* open */
185 bd_close, /* close */
186 bd_strategy, /* strategy */
187 nodev, /* print */
188 bd_dump, /* dump */
189 bd_read, /* read */
190 bd_write, /* write */
191 bd_ioctl, /* ioctl */
192 nodev, /* devmap */
193 nodev, /* mmap */
194 nodev, /* segmap */
195 nochpoll, /* poll */
196 bd_prop_op, /* cb_prop_op */
197 0, /* streamtab */
198 D_64BIT | D_MP, /* Driver comaptibility flag */
199 CB_REV, /* cb_rev */
200 bd_aread, /* async read */
201 bd_awrite /* async write */
204 struct dev_ops bd_dev_ops = {
205 DEVO_REV, /* devo_rev, */
206 0, /* refcnt */
207 bd_getinfo, /* getinfo */
208 nulldev, /* identify */
209 nulldev, /* probe */
210 bd_attach, /* attach */
211 bd_detach, /* detach */
212 nodev, /* reset */
213 &bd_cb_ops, /* driver operations */
214 NULL, /* bus operations */
215 NULL, /* power */
216 ddi_quiesce_not_needed, /* quiesce */
219 static struct modldrv modldrv = {
220 &mod_driverops,
221 "Generic Block Device",
222 &bd_dev_ops,
225 static struct modlinkage modlinkage = {
226 MODREV_1, { &modldrv, NULL }
229 static void *bd_state;
230 static krwlock_t bd_lock;
233 _init(void)
235 int rv;
237 rv = ddi_soft_state_init(&bd_state, sizeof (struct bd), 2);
238 if (rv != DDI_SUCCESS) {
239 return (rv);
241 rw_init(&bd_lock, NULL, RW_DRIVER, NULL);
242 rv = mod_install(&modlinkage);
243 if (rv != DDI_SUCCESS) {
244 rw_destroy(&bd_lock);
245 ddi_soft_state_fini(&bd_state);
247 return (rv);
251 _fini(void)
253 int rv;
255 rv = mod_remove(&modlinkage);
256 if (rv == DDI_SUCCESS) {
257 rw_destroy(&bd_lock);
258 ddi_soft_state_fini(&bd_state);
260 return (rv);
264 _info(struct modinfo *modinfop)
266 return (mod_info(&modlinkage, modinfop));
269 static int
270 bd_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
272 bd_t *bd;
273 minor_t inst;
275 _NOTE(ARGUNUSED(dip));
277 inst = BDINST((dev_t)arg);
279 switch (cmd) {
280 case DDI_INFO_DEVT2DEVINFO:
281 bd = ddi_get_soft_state(bd_state, inst);
282 if (bd == NULL) {
283 return (DDI_FAILURE);
285 *resultp = (void *)bd->d_dip;
286 break;
288 case DDI_INFO_DEVT2INSTANCE:
289 *resultp = (void *)(intptr_t)inst;
290 break;
292 default:
293 return (DDI_FAILURE);
295 return (DDI_SUCCESS);
298 static void
299 bd_prop_update_inqstring(dev_info_t *dip, char *name, char *data, size_t len)
301 int ilen;
302 char *data_string;
304 ilen = scsi_ascii_inquiry_len(data, len);
305 ASSERT3U(ilen, <=, len);
306 if (ilen <= 0)
307 return;
308 /* ensure null termination */
309 data_string = kmem_zalloc(ilen + 1, KM_SLEEP);
310 bcopy(data, data_string, ilen);
311 (void) ndi_prop_update_string(DDI_DEV_T_NONE, dip, name, data_string);
312 kmem_free(data_string, ilen + 1);
315 static void
316 bd_create_inquiry_props(dev_info_t *dip, bd_drive_t *drive)
318 if (drive->d_vendor_len > 0)
319 bd_prop_update_inqstring(dip, INQUIRY_VENDOR_ID,
320 drive->d_vendor, drive->d_vendor_len);
322 if (drive->d_product_len > 0)
323 bd_prop_update_inqstring(dip, INQUIRY_PRODUCT_ID,
324 drive->d_product, drive->d_product_len);
326 if (drive->d_serial_len > 0)
327 bd_prop_update_inqstring(dip, INQUIRY_SERIAL_NO,
328 drive->d_serial, drive->d_serial_len);
330 if (drive->d_revision_len > 0)
331 bd_prop_update_inqstring(dip, INQUIRY_REVISION_ID,
332 drive->d_revision, drive->d_revision_len);
335 static void
336 bd_create_errstats(bd_t *bd, int inst, bd_drive_t *drive)
338 char ks_module[KSTAT_STRLEN];
339 char ks_name[KSTAT_STRLEN];
340 int ndata = sizeof (struct bd_errstats) / sizeof (kstat_named_t);
342 if (bd->d_errstats != NULL)
343 return;
345 (void) snprintf(ks_module, sizeof (ks_module), "%serr",
346 ddi_driver_name(bd->d_dip));
347 (void) snprintf(ks_name, sizeof (ks_name), "%s%d,err",
348 ddi_driver_name(bd->d_dip), inst);
350 bd->d_errstats = kstat_create(ks_module, inst, ks_name, "device_error",
351 KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_PERSISTENT);
353 if (bd->d_errstats == NULL) {
355 * Even if we cannot create the kstat, we create a
356 * scratch kstat. The reason for this is to ensure
357 * that we can update the kstat all of the time,
358 * without adding an extra branch instruction.
360 bd->d_kerr = kmem_zalloc(sizeof (struct bd_errstats),
361 KM_SLEEP);
362 bd->d_errmutex = kmem_zalloc(sizeof (kmutex_t), KM_SLEEP);
363 mutex_init(bd->d_errmutex, NULL, MUTEX_DRIVER, NULL);
364 } else {
365 if (bd->d_errstats->ks_lock == NULL) {
366 bd->d_errstats->ks_lock = kmem_zalloc(sizeof (kmutex_t),
367 KM_SLEEP);
368 mutex_init(bd->d_errstats->ks_lock, NULL, MUTEX_DRIVER,
369 NULL);
372 bd->d_errmutex = bd->d_errstats->ks_lock;
373 bd->d_kerr = (struct bd_errstats *)bd->d_errstats->ks_data;
376 kstat_named_init(&bd->d_kerr->bd_softerrs, "Soft Errors",
377 KSTAT_DATA_UINT32);
378 kstat_named_init(&bd->d_kerr->bd_harderrs, "Hard Errors",
379 KSTAT_DATA_UINT32);
380 kstat_named_init(&bd->d_kerr->bd_transerrs, "Transport Errors",
381 KSTAT_DATA_UINT32);
383 if (drive->d_model_len > 0) {
384 kstat_named_init(&bd->d_kerr->bd_model, "Model",
385 KSTAT_DATA_STRING);
386 } else {
387 kstat_named_init(&bd->d_kerr->bd_vid, "Vendor",
388 KSTAT_DATA_STRING);
389 kstat_named_init(&bd->d_kerr->bd_pid, "Product",
390 KSTAT_DATA_STRING);
393 kstat_named_init(&bd->d_kerr->bd_revision, "Revision",
394 KSTAT_DATA_STRING);
395 kstat_named_init(&bd->d_kerr->bd_serial, "Serial No",
396 KSTAT_DATA_STRING);
397 kstat_named_init(&bd->d_kerr->bd_capacity, "Size",
398 KSTAT_DATA_ULONGLONG);
399 kstat_named_init(&bd->d_kerr->bd_rq_media_err, "Media Error",
400 KSTAT_DATA_UINT32);
401 kstat_named_init(&bd->d_kerr->bd_rq_ntrdy_err, "Device Not Ready",
402 KSTAT_DATA_UINT32);
403 kstat_named_init(&bd->d_kerr->bd_rq_nodev_err, "No Device",
404 KSTAT_DATA_UINT32);
405 kstat_named_init(&bd->d_kerr->bd_rq_recov_err, "Recoverable",
406 KSTAT_DATA_UINT32);
407 kstat_named_init(&bd->d_kerr->bd_rq_illrq_err, "Illegal Request",
408 KSTAT_DATA_UINT32);
409 kstat_named_init(&bd->d_kerr->bd_rq_pfa_err,
410 "Predictive Failure Analysis", KSTAT_DATA_UINT32);
412 bd->d_errstats->ks_private = bd;
414 kstat_install(bd->d_errstats);
417 static void
418 bd_errstats_setstr(kstat_named_t *k, char *str, size_t len, char *alt)
420 char *tmp;
422 if (KSTAT_NAMED_STR_PTR(k) == NULL) {
423 if (len > 0) {
424 tmp = kmem_alloc(len + 1, KM_SLEEP);
425 (void) strlcpy(tmp, str, len + 1);
426 } else {
427 tmp = alt;
430 kstat_named_setstr(k, tmp);
434 static void
435 bd_init_errstats(bd_t *bd, bd_drive_t *drive)
437 struct bd_errstats *est = bd->d_kerr;
439 mutex_enter(bd->d_errmutex);
441 if (drive->d_model_len > 0 &&
442 KSTAT_NAMED_STR_PTR(&est->bd_model) == NULL) {
443 bd_errstats_setstr(&est->bd_model, drive->d_model,
444 drive->d_model_len, NULL);
445 } else {
446 bd_errstats_setstr(&est->bd_vid, drive->d_vendor,
447 drive->d_vendor_len, "Unknown ");
448 bd_errstats_setstr(&est->bd_pid, drive->d_product,
449 drive->d_product_len, "Unknown ");
452 bd_errstats_setstr(&est->bd_revision, drive->d_revision,
453 drive->d_revision_len, "0001");
454 bd_errstats_setstr(&est->bd_serial, drive->d_serial,
455 drive->d_serial_len, "0 ");
457 mutex_exit(bd->d_errmutex);
460 static int
461 bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
463 int inst;
464 bd_handle_t hdl;
465 bd_t *bd;
466 bd_drive_t drive;
467 int rv;
468 char name[16];
469 char kcache[32];
471 switch (cmd) {
472 case DDI_ATTACH:
473 break;
474 case DDI_RESUME:
475 /* We don't do anything native for suspend/resume */
476 return (DDI_SUCCESS);
477 default:
478 return (DDI_FAILURE);
481 inst = ddi_get_instance(dip);
482 hdl = ddi_get_parent_data(dip);
484 (void) snprintf(name, sizeof (name), "%s%d",
485 ddi_driver_name(dip), ddi_get_instance(dip));
486 (void) snprintf(kcache, sizeof (kcache), "%s_xfer", name);
488 if (hdl == NULL) {
489 cmn_err(CE_WARN, "%s: missing parent data!", name);
490 return (DDI_FAILURE);
493 if (ddi_soft_state_zalloc(bd_state, inst) != DDI_SUCCESS) {
494 cmn_err(CE_WARN, "%s: unable to zalloc soft state!", name);
495 return (DDI_FAILURE);
497 bd = ddi_get_soft_state(bd_state, inst);
499 if (hdl->h_dma) {
500 bd->d_dma = *(hdl->h_dma);
501 bd->d_dma.dma_attr_granular =
502 max(DEV_BSIZE, bd->d_dma.dma_attr_granular);
503 bd->d_use_dma = B_TRUE;
505 if (bd->d_maxxfer &&
506 (bd->d_maxxfer != bd->d_dma.dma_attr_maxxfer)) {
507 cmn_err(CE_WARN,
508 "%s: inconsistent maximum transfer size!",
509 name);
510 /* We force it */
511 bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
512 } else {
513 bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
515 } else {
516 bd->d_use_dma = B_FALSE;
517 if (bd->d_maxxfer == 0) {
518 bd->d_maxxfer = 1024 * 1024;
521 bd->d_ops = hdl->h_ops;
522 bd->d_private = hdl->h_private;
523 bd->d_blkshift = 9; /* 512 bytes, to start */
525 if (bd->d_maxxfer % DEV_BSIZE) {
526 cmn_err(CE_WARN, "%s: maximum transfer misaligned!", name);
527 bd->d_maxxfer &= ~(DEV_BSIZE - 1);
529 if (bd->d_maxxfer < DEV_BSIZE) {
530 cmn_err(CE_WARN, "%s: maximum transfer size too small!", name);
531 ddi_soft_state_free(bd_state, inst);
532 return (DDI_FAILURE);
535 bd->d_dip = dip;
536 bd->d_handle = hdl;
537 hdl->h_bd = bd;
538 ddi_set_driver_private(dip, bd);
540 mutex_init(&bd->d_iomutex, NULL, MUTEX_DRIVER, NULL);
541 mutex_init(&bd->d_ocmutex, NULL, MUTEX_DRIVER, NULL);
542 mutex_init(&bd->d_statemutex, NULL, MUTEX_DRIVER, NULL);
543 cv_init(&bd->d_statecv, NULL, CV_DRIVER, NULL);
545 list_create(&bd->d_waitq, sizeof (bd_xfer_impl_t),
546 offsetof(struct bd_xfer_impl, i_linkage));
547 list_create(&bd->d_runq, sizeof (bd_xfer_impl_t),
548 offsetof(struct bd_xfer_impl, i_linkage));
550 bd->d_cache = kmem_cache_create(kcache, sizeof (bd_xfer_impl_t), 8,
551 bd_xfer_ctor, bd_xfer_dtor, NULL, bd, NULL, 0);
553 bd->d_ksp = kstat_create(ddi_driver_name(dip), inst, NULL, "disk",
554 KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
555 if (bd->d_ksp != NULL) {
556 bd->d_ksp->ks_lock = &bd->d_iomutex;
557 kstat_install(bd->d_ksp);
558 bd->d_kiop = bd->d_ksp->ks_data;
559 } else {
561 * Even if we cannot create the kstat, we create a
562 * scratch kstat. The reason for this is to ensure
563 * that we can update the kstat all of the time,
564 * without adding an extra branch instruction.
566 bd->d_kiop = kmem_zalloc(sizeof (kstat_io_t), KM_SLEEP);
569 cmlb_alloc_handle(&bd->d_cmlbh);
571 bd->d_state = DKIO_NONE;
573 bzero(&drive, sizeof (drive));
574 bd->d_ops.o_drive_info(bd->d_private, &drive);
575 bd->d_qsize = drive.d_qsize;
576 bd->d_removable = drive.d_removable;
577 bd->d_hotpluggable = drive.d_hotpluggable;
579 if (drive.d_maxxfer && drive.d_maxxfer < bd->d_maxxfer)
580 bd->d_maxxfer = drive.d_maxxfer;
582 bd_create_inquiry_props(dip, &drive);
584 bd_create_errstats(bd, inst, &drive);
585 bd_init_errstats(bd, &drive);
586 bd_update_state(bd);
588 rv = cmlb_attach(dip, &bd_tg_ops, DTYPE_DIRECT,
589 bd->d_removable, bd->d_hotpluggable,
590 /*LINTED: E_BAD_PTR_CAST_ALIGN*/
591 *(uint64_t *)drive.d_eui64 != 0 ? DDI_NT_BLOCK_BLKDEV :
592 drive.d_lun >= 0 ? DDI_NT_BLOCK_CHAN : DDI_NT_BLOCK,
593 CMLB_FAKE_LABEL_ONE_PARTITION, bd->d_cmlbh, 0);
594 if (rv != 0) {
595 cmlb_free_handle(&bd->d_cmlbh);
596 kmem_cache_destroy(bd->d_cache);
597 mutex_destroy(&bd->d_iomutex);
598 mutex_destroy(&bd->d_ocmutex);
599 mutex_destroy(&bd->d_statemutex);
600 cv_destroy(&bd->d_statecv);
601 list_destroy(&bd->d_waitq);
602 list_destroy(&bd->d_runq);
603 if (bd->d_ksp != NULL) {
604 kstat_delete(bd->d_ksp);
605 bd->d_ksp = NULL;
606 } else {
607 kmem_free(bd->d_kiop, sizeof (kstat_io_t));
609 ddi_soft_state_free(bd_state, inst);
610 return (DDI_FAILURE);
613 if (bd->d_ops.o_devid_init != NULL) {
614 rv = bd->d_ops.o_devid_init(bd->d_private, dip, &bd->d_devid);
615 if (rv == DDI_SUCCESS) {
616 if (ddi_devid_register(dip, bd->d_devid) !=
617 DDI_SUCCESS) {
618 cmn_err(CE_WARN,
619 "%s: unable to register devid", name);
625 * Add a zero-length attribute to tell the world we support
626 * kernel ioctls (for layered drivers). Also set up properties
627 * used by HAL to identify removable media.
629 (void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
630 DDI_KERNEL_IOCTL, NULL, 0);
631 if (bd->d_removable) {
632 (void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
633 "removable-media", NULL, 0);
635 if (bd->d_hotpluggable) {
636 (void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
637 "hotpluggable", NULL, 0);
640 ddi_report_dev(dip);
642 return (DDI_SUCCESS);
645 static int
646 bd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
648 bd_t *bd;
650 bd = ddi_get_driver_private(dip);
652 switch (cmd) {
653 case DDI_DETACH:
654 break;
655 case DDI_SUSPEND:
656 /* We don't suspend, but our parent does */
657 return (DDI_SUCCESS);
658 default:
659 return (DDI_FAILURE);
661 if (bd->d_ksp != NULL) {
662 kstat_delete(bd->d_ksp);
663 bd->d_ksp = NULL;
664 } else {
665 kmem_free(bd->d_kiop, sizeof (kstat_io_t));
668 if (bd->d_errstats != NULL) {
669 kstat_delete(bd->d_errstats);
670 bd->d_errstats = NULL;
671 } else {
672 kmem_free(bd->d_kerr, sizeof (struct bd_errstats));
673 mutex_destroy(bd->d_errmutex);
676 cmlb_detach(bd->d_cmlbh, 0);
677 cmlb_free_handle(&bd->d_cmlbh);
678 if (bd->d_devid)
679 ddi_devid_free(bd->d_devid);
680 kmem_cache_destroy(bd->d_cache);
681 mutex_destroy(&bd->d_iomutex);
682 mutex_destroy(&bd->d_ocmutex);
683 mutex_destroy(&bd->d_statemutex);
684 cv_destroy(&bd->d_statecv);
685 list_destroy(&bd->d_waitq);
686 list_destroy(&bd->d_runq);
687 ddi_soft_state_free(bd_state, ddi_get_instance(dip));
688 return (DDI_SUCCESS);
691 static int
692 bd_xfer_ctor(void *buf, void *arg, int kmflag)
694 bd_xfer_impl_t *xi;
695 bd_t *bd = arg;
696 int (*dcb)(caddr_t);
698 if (kmflag == KM_PUSHPAGE || kmflag == KM_SLEEP) {
699 dcb = DDI_DMA_SLEEP;
700 } else {
701 dcb = DDI_DMA_DONTWAIT;
704 xi = buf;
705 bzero(xi, sizeof (*xi));
706 xi->i_bd = bd;
708 if (bd->d_use_dma) {
709 if (ddi_dma_alloc_handle(bd->d_dip, &bd->d_dma, dcb, NULL,
710 &xi->i_dmah) != DDI_SUCCESS) {
711 return (-1);
715 return (0);
718 static void
719 bd_xfer_dtor(void *buf, void *arg)
721 bd_xfer_impl_t *xi = buf;
723 _NOTE(ARGUNUSED(arg));
725 if (xi->i_dmah)
726 ddi_dma_free_handle(&xi->i_dmah);
727 xi->i_dmah = NULL;
730 static bd_xfer_impl_t *
731 bd_xfer_alloc(bd_t *bd, struct buf *bp, int (*func)(void *, bd_xfer_t *),
732 int kmflag)
734 bd_xfer_impl_t *xi;
735 int rv = 0;
736 int status;
737 unsigned dir;
738 int (*cb)(caddr_t);
739 size_t len;
740 uint32_t shift;
742 if (kmflag == KM_SLEEP) {
743 cb = DDI_DMA_SLEEP;
744 } else {
745 cb = DDI_DMA_DONTWAIT;
748 xi = kmem_cache_alloc(bd->d_cache, kmflag);
749 if (xi == NULL) {
750 bioerror(bp, ENOMEM);
751 return (NULL);
754 ASSERT(bp);
756 xi->i_bp = bp;
757 xi->i_func = func;
758 xi->i_blkno = bp->b_lblkno >> (bd->d_blkshift - DEV_BSHIFT);
760 if (bp->b_bcount == 0) {
761 xi->i_len = 0;
762 xi->i_nblks = 0;
763 xi->i_kaddr = NULL;
764 xi->i_resid = 0;
765 xi->i_num_win = 0;
766 goto done;
769 if (bp->b_flags & B_READ) {
770 dir = DDI_DMA_READ;
771 xi->i_func = bd->d_ops.o_read;
772 } else {
773 dir = DDI_DMA_WRITE;
774 xi->i_func = bd->d_ops.o_write;
777 shift = bd->d_blkshift;
778 xi->i_blkshift = shift;
780 if (!bd->d_use_dma) {
781 bp_mapin(bp);
782 rv = 0;
783 xi->i_offset = 0;
784 xi->i_num_win =
785 (bp->b_bcount + (bd->d_maxxfer - 1)) / bd->d_maxxfer;
786 xi->i_cur_win = 0;
787 xi->i_len = min(bp->b_bcount, bd->d_maxxfer);
788 xi->i_nblks = xi->i_len >> shift;
789 xi->i_kaddr = bp->b_un.b_addr;
790 xi->i_resid = bp->b_bcount;
791 } else {
794 * We have to use consistent DMA if the address is misaligned.
796 if (((bp->b_flags & (B_PAGEIO | B_REMAPPED)) != B_PAGEIO) &&
797 ((uintptr_t)bp->b_un.b_addr & 0x7)) {
798 dir |= DDI_DMA_CONSISTENT | DDI_DMA_PARTIAL;
799 } else {
800 dir |= DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
803 status = ddi_dma_buf_bind_handle(xi->i_dmah, bp, dir, cb,
804 NULL, &xi->i_dmac, &xi->i_ndmac);
805 switch (status) {
806 case DDI_DMA_MAPPED:
807 xi->i_num_win = 1;
808 xi->i_cur_win = 0;
809 xi->i_offset = 0;
810 xi->i_len = bp->b_bcount;
811 xi->i_nblks = xi->i_len >> shift;
812 xi->i_resid = bp->b_bcount;
813 rv = 0;
814 break;
815 case DDI_DMA_PARTIAL_MAP:
816 xi->i_cur_win = 0;
818 if ((ddi_dma_numwin(xi->i_dmah, &xi->i_num_win) !=
819 DDI_SUCCESS) ||
820 (ddi_dma_getwin(xi->i_dmah, 0, &xi->i_offset,
821 &len, &xi->i_dmac, &xi->i_ndmac) !=
822 DDI_SUCCESS) ||
823 (P2PHASE(len, (1U << shift)) != 0)) {
824 (void) ddi_dma_unbind_handle(xi->i_dmah);
825 rv = EFAULT;
826 goto done;
828 xi->i_len = len;
829 xi->i_nblks = xi->i_len >> shift;
830 xi->i_resid = bp->b_bcount;
831 rv = 0;
832 break;
833 case DDI_DMA_NORESOURCES:
834 rv = EAGAIN;
835 goto done;
836 case DDI_DMA_TOOBIG:
837 rv = EINVAL;
838 goto done;
839 case DDI_DMA_NOMAPPING:
840 case DDI_DMA_INUSE:
841 default:
842 rv = EFAULT;
843 goto done;
847 done:
848 if (rv != 0) {
849 kmem_cache_free(bd->d_cache, xi);
850 bioerror(bp, rv);
851 return (NULL);
854 return (xi);
857 static void
858 bd_xfer_free(bd_xfer_impl_t *xi)
860 if (xi->i_dmah) {
861 (void) ddi_dma_unbind_handle(xi->i_dmah);
863 kmem_cache_free(xi->i_bd->d_cache, xi);
866 static int
867 bd_open(dev_t *devp, int flag, int otyp, cred_t *credp)
869 dev_t dev = *devp;
870 bd_t *bd;
871 minor_t part;
872 minor_t inst;
873 uint64_t mask;
874 boolean_t ndelay;
875 int rv;
876 diskaddr_t nblks;
877 diskaddr_t lba;
879 _NOTE(ARGUNUSED(credp));
881 part = BDPART(dev);
882 inst = BDINST(dev);
884 if (otyp >= OTYPCNT)
885 return (EINVAL);
887 ndelay = (flag & (FNDELAY | FNONBLOCK)) ? B_TRUE : B_FALSE;
890 * Block any DR events from changing the set of registered
891 * devices while we function.
893 rw_enter(&bd_lock, RW_READER);
894 if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
895 rw_exit(&bd_lock);
896 return (ENXIO);
899 mutex_enter(&bd->d_ocmutex);
901 ASSERT(part < 64);
902 mask = (1U << part);
904 bd_update_state(bd);
906 if (cmlb_validate(bd->d_cmlbh, 0, 0) != 0) {
908 /* non-blocking opens are allowed to succeed */
909 if (!ndelay) {
910 rv = ENXIO;
911 goto done;
913 } else if (cmlb_partinfo(bd->d_cmlbh, part, &nblks, &lba,
914 NULL, NULL, 0) == 0) {
917 * We read the partinfo, verify valid ranges. If the
918 * partition is invalid, and we aren't blocking or
919 * doing a raw access, then fail. (Non-blocking and
920 * raw accesses can still succeed to allow a disk with
921 * bad partition data to opened by format and fdisk.)
923 if ((!nblks) && ((!ndelay) || (otyp != OTYP_CHR))) {
924 rv = ENXIO;
925 goto done;
927 } else if (!ndelay) {
929 * cmlb_partinfo failed -- invalid partition or no
930 * disk label.
932 rv = ENXIO;
933 goto done;
936 if ((flag & FWRITE) && bd->d_rdonly) {
937 rv = EROFS;
938 goto done;
941 if ((bd->d_open_excl) & (mask)) {
942 rv = EBUSY;
943 goto done;
945 if (flag & FEXCL) {
946 if (bd->d_open_lyr[part]) {
947 rv = EBUSY;
948 goto done;
950 for (int i = 0; i < OTYP_LYR; i++) {
951 if (bd->d_open_reg[i] & mask) {
952 rv = EBUSY;
953 goto done;
958 if (otyp == OTYP_LYR) {
959 bd->d_open_lyr[part]++;
960 } else {
961 bd->d_open_reg[otyp] |= mask;
963 if (flag & FEXCL) {
964 bd->d_open_excl |= mask;
967 rv = 0;
968 done:
969 mutex_exit(&bd->d_ocmutex);
970 rw_exit(&bd_lock);
972 return (rv);
975 static int
976 bd_close(dev_t dev, int flag, int otyp, cred_t *credp)
978 bd_t *bd;
979 minor_t inst;
980 minor_t part;
981 uint64_t mask;
982 boolean_t last = B_TRUE;
984 _NOTE(ARGUNUSED(flag));
985 _NOTE(ARGUNUSED(credp));
987 part = BDPART(dev);
988 inst = BDINST(dev);
990 ASSERT(part < 64);
991 mask = (1U << part);
993 rw_enter(&bd_lock, RW_READER);
995 if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
996 rw_exit(&bd_lock);
997 return (ENXIO);
1000 mutex_enter(&bd->d_ocmutex);
1001 if (bd->d_open_excl & mask) {
1002 bd->d_open_excl &= ~mask;
1004 if (otyp == OTYP_LYR) {
1005 bd->d_open_lyr[part]--;
1006 } else {
1007 bd->d_open_reg[otyp] &= ~mask;
1009 for (int i = 0; i < 64; i++) {
1010 if (bd->d_open_lyr[part]) {
1011 last = B_FALSE;
1014 for (int i = 0; last && (i < OTYP_LYR); i++) {
1015 if (bd->d_open_reg[i]) {
1016 last = B_FALSE;
1019 mutex_exit(&bd->d_ocmutex);
1021 if (last) {
1022 cmlb_invalidate(bd->d_cmlbh, 0);
1024 rw_exit(&bd_lock);
1026 return (0);
1029 static int
1030 bd_dump(dev_t dev, caddr_t caddr, daddr_t blkno, int nblk)
1032 minor_t inst;
1033 minor_t part;
1034 diskaddr_t pstart;
1035 diskaddr_t psize;
1036 bd_t *bd;
1037 bd_xfer_impl_t *xi;
1038 buf_t *bp;
1039 int rv;
1040 uint32_t shift;
1041 daddr_t d_blkno;
1042 int d_nblk;
1044 rw_enter(&bd_lock, RW_READER);
1046 part = BDPART(dev);
1047 inst = BDINST(dev);
1049 if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1050 rw_exit(&bd_lock);
1051 return (ENXIO);
1053 shift = bd->d_blkshift;
1054 d_blkno = blkno >> (shift - DEV_BSHIFT);
1055 d_nblk = nblk >> (shift - DEV_BSHIFT);
1057 * do cmlb, but do it synchronously unless we already have the
1058 * partition (which we probably should.)
1060 if (cmlb_partinfo(bd->d_cmlbh, part, &psize, &pstart, NULL, NULL,
1061 (void *)1)) {
1062 rw_exit(&bd_lock);
1063 return (ENXIO);
1066 if ((d_blkno + d_nblk) > psize) {
1067 rw_exit(&bd_lock);
1068 return (EINVAL);
1070 bp = getrbuf(KM_NOSLEEP);
1071 if (bp == NULL) {
1072 rw_exit(&bd_lock);
1073 return (ENOMEM);
1076 bp->b_bcount = nblk << DEV_BSHIFT;
1077 bp->b_resid = bp->b_bcount;
1078 bp->b_lblkno = blkno;
1079 bp->b_un.b_addr = caddr;
1081 xi = bd_xfer_alloc(bd, bp, bd->d_ops.o_write, KM_NOSLEEP);
1082 if (xi == NULL) {
1083 rw_exit(&bd_lock);
1084 freerbuf(bp);
1085 return (ENOMEM);
1087 xi->i_blkno = d_blkno + pstart;
1088 xi->i_flags = BD_XFER_POLL;
1089 bd_submit(bd, xi);
1090 rw_exit(&bd_lock);
1093 * Generally, we should have run this entirely synchronously
1094 * at this point and the biowait call should be a no-op. If
1095 * it didn't happen this way, it's a bug in the underlying
1096 * driver not honoring BD_XFER_POLL.
1098 (void) biowait(bp);
1099 rv = geterror(bp);
1100 freerbuf(bp);
1101 return (rv);
1104 void
1105 bd_minphys(struct buf *bp)
1107 minor_t inst;
1108 bd_t *bd;
1109 inst = BDINST(bp->b_edev);
1111 bd = ddi_get_soft_state(bd_state, inst);
1114 * In a non-debug kernel, bd_strategy will catch !bd as
1115 * well, and will fail nicely.
1117 ASSERT(bd);
1119 if (bp->b_bcount > bd->d_maxxfer)
1120 bp->b_bcount = bd->d_maxxfer;
1123 static int
1124 bd_check_uio(dev_t dev, struct uio *uio)
1126 bd_t *bd;
1127 uint32_t shift;
1129 if ((bd = ddi_get_soft_state(bd_state, BDINST(dev))) == NULL) {
1130 return (ENXIO);
1133 shift = bd->d_blkshift;
1134 if ((P2PHASE(uio->uio_loffset, (1U << shift)) != 0) ||
1135 (P2PHASE(uio->uio_iov->iov_len, (1U << shift)) != 0)) {
1136 return (EINVAL);
1139 return (0);
1142 static int
1143 bd_read(dev_t dev, struct uio *uio, cred_t *credp)
1145 _NOTE(ARGUNUSED(credp));
1146 int ret = bd_check_uio(dev, uio);
1147 if (ret != 0) {
1148 return (ret);
1150 return (physio(bd_strategy, NULL, dev, B_READ, bd_minphys, uio));
1153 static int
1154 bd_write(dev_t dev, struct uio *uio, cred_t *credp)
1156 _NOTE(ARGUNUSED(credp));
1157 int ret = bd_check_uio(dev, uio);
1158 if (ret != 0) {
1159 return (ret);
1161 return (physio(bd_strategy, NULL, dev, B_WRITE, bd_minphys, uio));
1164 static int
1165 bd_aread(dev_t dev, struct aio_req *aio, cred_t *credp)
1167 _NOTE(ARGUNUSED(credp));
1168 int ret = bd_check_uio(dev, aio->aio_uio);
1169 if (ret != 0) {
1170 return (ret);
1172 return (aphysio(bd_strategy, anocancel, dev, B_READ, bd_minphys, aio));
1175 static int
1176 bd_awrite(dev_t dev, struct aio_req *aio, cred_t *credp)
1178 _NOTE(ARGUNUSED(credp));
1179 int ret = bd_check_uio(dev, aio->aio_uio);
1180 if (ret != 0) {
1181 return (ret);
1183 return (aphysio(bd_strategy, anocancel, dev, B_WRITE, bd_minphys, aio));
1186 static int
1187 bd_strategy(struct buf *bp)
1189 minor_t inst;
1190 minor_t part;
1191 bd_t *bd;
1192 diskaddr_t p_lba;
1193 diskaddr_t p_nblks;
1194 diskaddr_t b_nblks;
1195 bd_xfer_impl_t *xi;
1196 uint32_t shift;
1197 int (*func)(void *, bd_xfer_t *);
1198 diskaddr_t lblkno;
1200 part = BDPART(bp->b_edev);
1201 inst = BDINST(bp->b_edev);
1203 ASSERT(bp);
1205 bp->b_resid = bp->b_bcount;
1207 if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1208 bioerror(bp, ENXIO);
1209 biodone(bp);
1210 return (0);
1213 if (cmlb_partinfo(bd->d_cmlbh, part, &p_nblks, &p_lba,
1214 NULL, NULL, 0)) {
1215 bioerror(bp, ENXIO);
1216 biodone(bp);
1217 return (0);
1220 shift = bd->d_blkshift;
1221 lblkno = bp->b_lblkno >> (shift - DEV_BSHIFT);
1222 if ((P2PHASE(bp->b_lblkno, (1U << (shift - DEV_BSHIFT))) != 0) ||
1223 (P2PHASE(bp->b_bcount, (1U << shift)) != 0) ||
1224 (lblkno > p_nblks)) {
1225 bioerror(bp, EINVAL);
1226 biodone(bp);
1227 return (0);
1229 b_nblks = bp->b_bcount >> shift;
1230 if ((lblkno == p_nblks) || (bp->b_bcount == 0)) {
1231 biodone(bp);
1232 return (0);
1235 if ((b_nblks + lblkno) > p_nblks) {
1236 bp->b_resid = ((lblkno + b_nblks - p_nblks) << shift);
1237 bp->b_bcount -= bp->b_resid;
1238 } else {
1239 bp->b_resid = 0;
1241 func = (bp->b_flags & B_READ) ? bd->d_ops.o_read : bd->d_ops.o_write;
1243 xi = bd_xfer_alloc(bd, bp, func, KM_NOSLEEP);
1244 if (xi == NULL) {
1245 xi = bd_xfer_alloc(bd, bp, func, KM_PUSHPAGE);
1247 if (xi == NULL) {
1248 /* bd_request_alloc will have done bioerror */
1249 biodone(bp);
1250 return (0);
1252 xi->i_blkno = lblkno + p_lba;
1254 bd_submit(bd, xi);
1256 return (0);
1259 static int
1260 bd_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, int *rvalp)
1262 minor_t inst;
1263 uint16_t part;
1264 bd_t *bd;
1265 void *ptr = (void *)arg;
1266 int rv;
1268 part = BDPART(dev);
1269 inst = BDINST(dev);
1271 if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1272 return (ENXIO);
1275 rv = cmlb_ioctl(bd->d_cmlbh, dev, cmd, arg, flag, credp, rvalp, 0);
1276 if (rv != ENOTTY)
1277 return (rv);
1279 if (rvalp != NULL) {
1280 /* the return value of the ioctl is 0 by default */
1281 *rvalp = 0;
1284 switch (cmd) {
1285 case DKIOCGMEDIAINFO: {
1286 struct dk_minfo minfo;
1288 /* make sure our state information is current */
1289 bd_update_state(bd);
1290 bzero(&minfo, sizeof (minfo));
1291 minfo.dki_media_type = DK_FIXED_DISK;
1292 minfo.dki_lbsize = (1U << bd->d_blkshift);
1293 minfo.dki_capacity = bd->d_numblks;
1294 if (ddi_copyout(&minfo, ptr, sizeof (minfo), flag)) {
1295 return (EFAULT);
1297 return (0);
1299 case DKIOCGMEDIAINFOEXT: {
1300 struct dk_minfo_ext miext;
1302 /* make sure our state information is current */
1303 bd_update_state(bd);
1304 bzero(&miext, sizeof (miext));
1305 miext.dki_media_type = DK_FIXED_DISK;
1306 miext.dki_lbsize = (1U << bd->d_blkshift);
1307 miext.dki_pbsize = (1U << bd->d_pblkshift);
1308 miext.dki_capacity = bd->d_numblks;
1309 if (ddi_copyout(&miext, ptr, sizeof (miext), flag)) {
1310 return (EFAULT);
1312 return (0);
1314 case DKIOCINFO: {
1315 struct dk_cinfo cinfo;
1316 bzero(&cinfo, sizeof (cinfo));
1317 cinfo.dki_ctype = DKC_BLKDEV;
1318 cinfo.dki_cnum = ddi_get_instance(ddi_get_parent(bd->d_dip));
1319 (void) snprintf(cinfo.dki_cname, sizeof (cinfo.dki_cname),
1320 "%s", ddi_driver_name(ddi_get_parent(bd->d_dip)));
1321 (void) snprintf(cinfo.dki_dname, sizeof (cinfo.dki_dname),
1322 "%s", ddi_driver_name(bd->d_dip));
1323 cinfo.dki_unit = inst;
1324 cinfo.dki_flags = DKI_FMTVOL;
1325 cinfo.dki_partition = part;
1326 cinfo.dki_maxtransfer = bd->d_maxxfer / DEV_BSIZE;
1327 cinfo.dki_addr = 0;
1328 cinfo.dki_slave = 0;
1329 cinfo.dki_space = 0;
1330 cinfo.dki_prio = 0;
1331 cinfo.dki_vec = 0;
1332 if (ddi_copyout(&cinfo, ptr, sizeof (cinfo), flag)) {
1333 return (EFAULT);
1335 return (0);
1337 case DKIOCREMOVABLE: {
1338 int i;
1339 i = bd->d_removable ? 1 : 0;
1340 if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1341 return (EFAULT);
1343 return (0);
1345 case DKIOCHOTPLUGGABLE: {
1346 int i;
1347 i = bd->d_hotpluggable ? 1 : 0;
1348 if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1349 return (EFAULT);
1351 return (0);
1353 case DKIOCREADONLY: {
1354 int i;
1355 i = bd->d_rdonly ? 1 : 0;
1356 if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1357 return (EFAULT);
1359 return (0);
1361 case DKIOCSOLIDSTATE: {
1362 int i;
1363 i = bd->d_ssd ? 1 : 0;
1364 if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1365 return (EFAULT);
1367 return (0);
1369 case DKIOCSTATE: {
1370 enum dkio_state state;
1371 if (ddi_copyin(ptr, &state, sizeof (state), flag)) {
1372 return (EFAULT);
1374 if ((rv = bd_check_state(bd, &state)) != 0) {
1375 return (rv);
1377 if (ddi_copyout(&state, ptr, sizeof (state), flag)) {
1378 return (EFAULT);
1380 return (0);
1382 case DKIOCFLUSHWRITECACHE: {
1383 struct dk_callback *dkc = NULL;
1385 if (flag & FKIOCTL)
1386 dkc = (void *)arg;
1388 rv = bd_flush_write_cache(bd, dkc);
1389 return (rv);
1392 default:
1393 break;
1396 return (ENOTTY);
1399 static int
1400 bd_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
1401 char *name, caddr_t valuep, int *lengthp)
1403 bd_t *bd;
1405 bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1406 if (bd == NULL)
1407 return (ddi_prop_op(dev, dip, prop_op, mod_flags,
1408 name, valuep, lengthp));
1410 return (cmlb_prop_op(bd->d_cmlbh, dev, dip, prop_op, mod_flags, name,
1411 valuep, lengthp, BDPART(dev), 0));
1415 static int
1416 bd_tg_rdwr(dev_info_t *dip, uchar_t cmd, void *bufaddr, diskaddr_t start,
1417 size_t length, void *tg_cookie)
1419 bd_t *bd;
1420 buf_t *bp;
1421 bd_xfer_impl_t *xi;
1422 int rv;
1423 int (*func)(void *, bd_xfer_t *);
1424 int kmflag;
1427 * If we are running in polled mode (such as during dump(9e)
1428 * execution), then we cannot sleep for kernel allocations.
1430 kmflag = tg_cookie ? KM_NOSLEEP : KM_SLEEP;
1432 bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1434 if (P2PHASE(length, (1U << bd->d_blkshift)) != 0) {
1435 /* We can only transfer whole blocks at a time! */
1436 return (EINVAL);
1439 if ((bp = getrbuf(kmflag)) == NULL) {
1440 return (ENOMEM);
1443 switch (cmd) {
1444 case TG_READ:
1445 bp->b_flags = B_READ;
1446 func = bd->d_ops.o_read;
1447 break;
1448 case TG_WRITE:
1449 bp->b_flags = B_WRITE;
1450 func = bd->d_ops.o_write;
1451 break;
1452 default:
1453 freerbuf(bp);
1454 return (EINVAL);
1457 bp->b_un.b_addr = bufaddr;
1458 bp->b_bcount = length;
1459 xi = bd_xfer_alloc(bd, bp, func, kmflag);
1460 if (xi == NULL) {
1461 rv = geterror(bp);
1462 freerbuf(bp);
1463 return (rv);
1465 xi->i_flags = tg_cookie ? BD_XFER_POLL : 0;
1466 xi->i_blkno = start;
1467 bd_submit(bd, xi);
1468 (void) biowait(bp);
1469 rv = geterror(bp);
1470 freerbuf(bp);
1472 return (rv);
1475 static int
1476 bd_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
1478 bd_t *bd;
1480 _NOTE(ARGUNUSED(tg_cookie));
1481 bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1483 switch (cmd) {
1484 case TG_GETPHYGEOM:
1485 case TG_GETVIRTGEOM:
1487 * We don't have any "geometry" as such, let cmlb
1488 * fabricate something.
1490 return (ENOTTY);
1492 case TG_GETCAPACITY:
1493 bd_update_state(bd);
1494 *(diskaddr_t *)arg = bd->d_numblks;
1495 return (0);
1497 case TG_GETBLOCKSIZE:
1498 *(uint32_t *)arg = (1U << bd->d_blkshift);
1499 return (0);
1501 case TG_GETATTR:
1503 * It turns out that cmlb really doesn't do much for
1504 * non-writable media, but lets make the information
1505 * available for it in case it does more in the
1506 * future. (The value is currently used for
1507 * triggering special behavior for CD-ROMs.)
1509 bd_update_state(bd);
1510 ((tg_attribute_t *)arg)->media_is_writable =
1511 bd->d_rdonly ? B_FALSE : B_TRUE;
1512 ((tg_attribute_t *)arg)->media_is_solid_state = bd->d_ssd;
1513 ((tg_attribute_t *)arg)->media_is_rotational = B_FALSE;
1514 return (0);
1516 default:
1517 return (EINVAL);
1522 static void
1523 bd_sched(bd_t *bd)
1525 bd_xfer_impl_t *xi;
1526 struct buf *bp;
1527 int rv;
1529 mutex_enter(&bd->d_iomutex);
1531 while ((bd->d_qactive < bd->d_qsize) &&
1532 ((xi = list_remove_head(&bd->d_waitq)) != NULL)) {
1533 bd->d_qactive++;
1534 kstat_waitq_to_runq(bd->d_kiop);
1535 list_insert_tail(&bd->d_runq, xi);
1538 * Submit the job to the driver. We drop the I/O mutex
1539 * so that we can deal with the case where the driver
1540 * completion routine calls back into us synchronously.
1543 mutex_exit(&bd->d_iomutex);
1545 rv = xi->i_func(bd->d_private, &xi->i_public);
1546 if (rv != 0) {
1547 bp = xi->i_bp;
1548 bioerror(bp, rv);
1549 biodone(bp);
1551 atomic_inc_32(&bd->d_kerr->bd_transerrs.value.ui32);
1553 mutex_enter(&bd->d_iomutex);
1554 bd->d_qactive--;
1555 kstat_runq_exit(bd->d_kiop);
1556 list_remove(&bd->d_runq, xi);
1557 bd_xfer_free(xi);
1558 } else {
1559 mutex_enter(&bd->d_iomutex);
1563 mutex_exit(&bd->d_iomutex);
1566 static void
1567 bd_submit(bd_t *bd, bd_xfer_impl_t *xi)
1569 mutex_enter(&bd->d_iomutex);
1570 list_insert_tail(&bd->d_waitq, xi);
1571 kstat_waitq_enter(bd->d_kiop);
1572 mutex_exit(&bd->d_iomutex);
1574 bd_sched(bd);
1577 static void
1578 bd_runq_exit(bd_xfer_impl_t *xi, int err)
1580 bd_t *bd = xi->i_bd;
1581 buf_t *bp = xi->i_bp;
1583 mutex_enter(&bd->d_iomutex);
1584 bd->d_qactive--;
1585 kstat_runq_exit(bd->d_kiop);
1586 list_remove(&bd->d_runq, xi);
1587 mutex_exit(&bd->d_iomutex);
1589 if (err == 0) {
1590 if (bp->b_flags & B_READ) {
1591 bd->d_kiop->reads++;
1592 bd->d_kiop->nread += (bp->b_bcount - xi->i_resid);
1593 } else {
1594 bd->d_kiop->writes++;
1595 bd->d_kiop->nwritten += (bp->b_bcount - xi->i_resid);
1598 bd_sched(bd);
1601 static void
1602 bd_update_state(bd_t *bd)
1604 enum dkio_state state = DKIO_INSERTED;
1605 boolean_t docmlb = B_FALSE;
1606 bd_media_t media;
1608 bzero(&media, sizeof (media));
1610 mutex_enter(&bd->d_statemutex);
1611 if (bd->d_ops.o_media_info(bd->d_private, &media) != 0) {
1612 bd->d_numblks = 0;
1613 state = DKIO_EJECTED;
1614 goto done;
1617 if ((media.m_blksize < 512) ||
1618 (!ISP2(media.m_blksize)) ||
1619 (P2PHASE(bd->d_maxxfer, media.m_blksize))) {
1620 cmn_err(CE_WARN, "%s%d: Invalid media block size (%d)",
1621 ddi_driver_name(bd->d_dip), ddi_get_instance(bd->d_dip),
1622 media.m_blksize);
1624 * We can't use the media, treat it as not present.
1626 state = DKIO_EJECTED;
1627 bd->d_numblks = 0;
1628 goto done;
1631 if (((1U << bd->d_blkshift) != media.m_blksize) ||
1632 (bd->d_numblks != media.m_nblks)) {
1633 /* Device size changed */
1634 docmlb = B_TRUE;
1637 bd->d_blkshift = ddi_ffs(media.m_blksize) - 1;
1638 bd->d_pblkshift = bd->d_blkshift;
1639 bd->d_numblks = media.m_nblks;
1640 bd->d_rdonly = media.m_readonly;
1641 bd->d_ssd = media.m_solidstate;
1644 * Only use the supplied physical block size if it is non-zero,
1645 * greater or equal to the block size, and a power of 2. Ignore it
1646 * if not, it's just informational and we can still use the media.
1648 if ((media.m_pblksize != 0) &&
1649 (media.m_pblksize >= media.m_blksize) &&
1650 (ISP2(media.m_pblksize)))
1651 bd->d_pblkshift = ddi_ffs(media.m_pblksize) - 1;
1653 done:
1654 if (state != bd->d_state) {
1655 bd->d_state = state;
1656 cv_broadcast(&bd->d_statecv);
1657 docmlb = B_TRUE;
1659 mutex_exit(&bd->d_statemutex);
1661 bd->d_kerr->bd_capacity.value.ui64 = bd->d_numblks << bd->d_blkshift;
1663 if (docmlb) {
1664 if (state == DKIO_INSERTED) {
1665 (void) cmlb_validate(bd->d_cmlbh, 0, 0);
1666 } else {
1667 cmlb_invalidate(bd->d_cmlbh, 0);
1672 static int
1673 bd_check_state(bd_t *bd, enum dkio_state *state)
1675 clock_t when;
1677 for (;;) {
1679 bd_update_state(bd);
1681 mutex_enter(&bd->d_statemutex);
1683 if (bd->d_state != *state) {
1684 *state = bd->d_state;
1685 mutex_exit(&bd->d_statemutex);
1686 break;
1689 when = drv_usectohz(1000000);
1690 if (cv_reltimedwait_sig(&bd->d_statecv, &bd->d_statemutex,
1691 when, TR_CLOCK_TICK) == 0) {
1692 mutex_exit(&bd->d_statemutex);
1693 return (EINTR);
1696 mutex_exit(&bd->d_statemutex);
1699 return (0);
1702 static int
1703 bd_flush_write_cache_done(struct buf *bp)
1705 struct dk_callback *dc = (void *)bp->b_private;
1707 (*dc->dkc_callback)(dc->dkc_cookie, geterror(bp));
1708 kmem_free(dc, sizeof (*dc));
1709 freerbuf(bp);
1710 return (0);
1713 static int
1714 bd_flush_write_cache(bd_t *bd, struct dk_callback *dkc)
1716 buf_t *bp;
1717 struct dk_callback *dc;
1718 bd_xfer_impl_t *xi;
1719 int rv;
1721 if (bd->d_ops.o_sync_cache == NULL) {
1722 return (ENOTSUP);
1724 if ((bp = getrbuf(KM_SLEEP)) == NULL) {
1725 return (ENOMEM);
1727 bp->b_resid = 0;
1728 bp->b_bcount = 0;
1730 xi = bd_xfer_alloc(bd, bp, bd->d_ops.o_sync_cache, KM_SLEEP);
1731 if (xi == NULL) {
1732 rv = geterror(bp);
1733 freerbuf(bp);
1734 return (rv);
1737 /* Make an asynchronous flush, but only if there is a callback */
1738 if (dkc != NULL && dkc->dkc_callback != NULL) {
1739 /* Make a private copy of the callback structure */
1740 dc = kmem_alloc(sizeof (*dc), KM_SLEEP);
1741 *dc = *dkc;
1742 bp->b_private = dc;
1743 bp->b_iodone = bd_flush_write_cache_done;
1745 bd_submit(bd, xi);
1746 return (0);
1749 /* In case there is no callback, perform a synchronous flush */
1750 bd_submit(bd, xi);
1751 (void) biowait(bp);
1752 rv = geterror(bp);
1753 freerbuf(bp);
1755 return (rv);
1759 * Nexus support.
1762 bd_bus_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop,
1763 void *arg, void *result)
1765 bd_handle_t hdl;
1767 switch (ctlop) {
1768 case DDI_CTLOPS_REPORTDEV:
1769 cmn_err(CE_CONT, "?Block device: %s@%s, %s%d\n",
1770 ddi_node_name(rdip), ddi_get_name_addr(rdip),
1771 ddi_driver_name(rdip), ddi_get_instance(rdip));
1772 return (DDI_SUCCESS);
1774 case DDI_CTLOPS_INITCHILD:
1775 hdl = ddi_get_parent_data((dev_info_t *)arg);
1776 if (hdl == NULL) {
1777 return (DDI_NOT_WELL_FORMED);
1779 ddi_set_name_addr((dev_info_t *)arg, hdl->h_addr);
1780 return (DDI_SUCCESS);
1782 case DDI_CTLOPS_UNINITCHILD:
1783 ddi_set_name_addr((dev_info_t *)arg, NULL);
1784 ndi_prop_remove_all((dev_info_t *)arg);
1785 return (DDI_SUCCESS);
1787 default:
1788 return (ddi_ctlops(dip, rdip, ctlop, arg, result));
1793 * Functions for device drivers.
1795 bd_handle_t
1796 bd_alloc_handle(void *private, bd_ops_t *ops, ddi_dma_attr_t *dma, int kmflag)
1798 bd_handle_t hdl;
1800 hdl = kmem_zalloc(sizeof (*hdl), kmflag);
1801 if (hdl != NULL) {
1802 hdl->h_ops = *ops;
1803 hdl->h_dma = dma;
1804 hdl->h_private = private;
1807 return (hdl);
1810 void
1811 bd_free_handle(bd_handle_t hdl)
1813 kmem_free(hdl, sizeof (*hdl));
1817 bd_attach_handle(dev_info_t *dip, bd_handle_t hdl)
1819 dev_info_t *child;
1820 bd_drive_t drive = { 0 };
1823 * It's not an error if bd_attach_handle() is called on a handle that
1824 * already is attached. We just ignore the request to attach and return.
1825 * This way drivers using blkdev don't have to keep track about blkdev
1826 * state, they can just call this function to make sure it attached.
1828 if (hdl->h_child != NULL) {
1829 return (DDI_SUCCESS);
1832 /* if drivers don't override this, make it assume none */
1833 drive.d_lun = -1;
1834 hdl->h_ops.o_drive_info(hdl->h_private, &drive);
1836 hdl->h_parent = dip;
1837 hdl->h_name = "blkdev";
1839 /*LINTED: E_BAD_PTR_CAST_ALIGN*/
1840 if (*(uint64_t *)drive.d_eui64 != 0) {
1841 if (drive.d_lun >= 0) {
1842 (void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
1843 "w%02X%02X%02X%02X%02X%02X%02X%02X,%X",
1844 drive.d_eui64[0], drive.d_eui64[1],
1845 drive.d_eui64[2], drive.d_eui64[3],
1846 drive.d_eui64[4], drive.d_eui64[5],
1847 drive.d_eui64[6], drive.d_eui64[7], drive.d_lun);
1848 } else {
1849 (void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
1850 "w%02X%02X%02X%02X%02X%02X%02X%02X",
1851 drive.d_eui64[0], drive.d_eui64[1],
1852 drive.d_eui64[2], drive.d_eui64[3],
1853 drive.d_eui64[4], drive.d_eui64[5],
1854 drive.d_eui64[6], drive.d_eui64[7]);
1856 } else {
1857 if (drive.d_lun >= 0) {
1858 (void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
1859 "%X,%X", drive.d_target, drive.d_lun);
1860 } else {
1861 (void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
1862 "%X", drive.d_target);
1866 if (ndi_devi_alloc(dip, hdl->h_name, (pnode_t)DEVI_SID_NODEID,
1867 &child) != NDI_SUCCESS) {
1868 cmn_err(CE_WARN, "%s%d: unable to allocate node %s@%s",
1869 ddi_driver_name(dip), ddi_get_instance(dip),
1870 "blkdev", hdl->h_addr);
1871 return (DDI_FAILURE);
1874 ddi_set_parent_data(child, hdl);
1875 hdl->h_child = child;
1877 if (ndi_devi_online(child, 0) == NDI_FAILURE) {
1878 cmn_err(CE_WARN, "%s%d: failed bringing node %s@%s online",
1879 ddi_driver_name(dip), ddi_get_instance(dip),
1880 hdl->h_name, hdl->h_addr);
1881 (void) ndi_devi_free(child);
1882 return (DDI_FAILURE);
1885 return (DDI_SUCCESS);
1889 bd_detach_handle(bd_handle_t hdl)
1891 int circ;
1892 int rv;
1893 char *devnm;
1896 * It's not an error if bd_detach_handle() is called on a handle that
1897 * already is detached. We just ignore the request to detach and return.
1898 * This way drivers using blkdev don't have to keep track about blkdev
1899 * state, they can just call this function to make sure it detached.
1901 if (hdl->h_child == NULL) {
1902 return (DDI_SUCCESS);
1904 ndi_devi_enter(hdl->h_parent, &circ);
1905 if (i_ddi_node_state(hdl->h_child) < DS_INITIALIZED) {
1906 rv = ddi_remove_child(hdl->h_child, 0);
1907 } else {
1908 devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP);
1909 (void) ddi_deviname(hdl->h_child, devnm);
1910 (void) devfs_clean(hdl->h_parent, devnm + 1, DV_CLEAN_FORCE);
1911 rv = ndi_devi_unconfig_one(hdl->h_parent, devnm + 1, NULL,
1912 NDI_DEVI_REMOVE | NDI_UNCONFIG);
1913 kmem_free(devnm, MAXNAMELEN + 1);
1915 if (rv == 0) {
1916 hdl->h_child = NULL;
1919 ndi_devi_exit(hdl->h_parent, circ);
1920 return (rv == NDI_SUCCESS ? DDI_SUCCESS : DDI_FAILURE);
1923 void
1924 bd_xfer_done(bd_xfer_t *xfer, int err)
1926 bd_xfer_impl_t *xi = (void *)xfer;
1927 buf_t *bp = xi->i_bp;
1928 int rv = DDI_SUCCESS;
1929 bd_t *bd = xi->i_bd;
1930 size_t len;
1932 if (err != 0) {
1933 bd_runq_exit(xi, err);
1934 atomic_inc_32(&bd->d_kerr->bd_harderrs.value.ui32);
1936 bp->b_resid += xi->i_resid;
1937 bd_xfer_free(xi);
1938 bioerror(bp, err);
1939 biodone(bp);
1940 return;
1943 xi->i_cur_win++;
1944 xi->i_resid -= xi->i_len;
1946 if (xi->i_resid == 0) {
1947 /* Job completed succcessfully! */
1948 bd_runq_exit(xi, 0);
1950 bd_xfer_free(xi);
1951 biodone(bp);
1952 return;
1955 xi->i_blkno += xi->i_nblks;
1957 if (bd->d_use_dma) {
1958 /* More transfer still pending... advance to next DMA window. */
1959 rv = ddi_dma_getwin(xi->i_dmah, xi->i_cur_win,
1960 &xi->i_offset, &len, &xi->i_dmac, &xi->i_ndmac);
1961 } else {
1962 /* Advance memory window. */
1963 xi->i_kaddr += xi->i_len;
1964 xi->i_offset += xi->i_len;
1965 len = min(bp->b_bcount - xi->i_offset, bd->d_maxxfer);
1969 if ((rv != DDI_SUCCESS) ||
1970 (P2PHASE(len, (1U << xi->i_blkshift)) != 0)) {
1971 bd_runq_exit(xi, EFAULT);
1973 bp->b_resid += xi->i_resid;
1974 bd_xfer_free(xi);
1975 bioerror(bp, EFAULT);
1976 biodone(bp);
1977 return;
1979 xi->i_len = len;
1980 xi->i_nblks = len >> xi->i_blkshift;
1982 /* Submit next window to hardware. */
1983 rv = xi->i_func(bd->d_private, &xi->i_public);
1984 if (rv != 0) {
1985 bd_runq_exit(xi, rv);
1987 atomic_inc_32(&bd->d_kerr->bd_transerrs.value.ui32);
1989 bp->b_resid += xi->i_resid;
1990 bd_xfer_free(xi);
1991 bioerror(bp, rv);
1992 biodone(bp);
1996 void
1997 bd_error(bd_xfer_t *xfer, int error)
1999 bd_xfer_impl_t *xi = (void *)xfer;
2000 bd_t *bd = xi->i_bd;
2002 switch (error) {
2003 case BD_ERR_MEDIA:
2004 atomic_inc_32(&bd->d_kerr->bd_rq_media_err.value.ui32);
2005 break;
2006 case BD_ERR_NTRDY:
2007 atomic_inc_32(&bd->d_kerr->bd_rq_ntrdy_err.value.ui32);
2008 break;
2009 case BD_ERR_NODEV:
2010 atomic_inc_32(&bd->d_kerr->bd_rq_nodev_err.value.ui32);
2011 break;
2012 case BD_ERR_RECOV:
2013 atomic_inc_32(&bd->d_kerr->bd_rq_recov_err.value.ui32);
2014 break;
2015 case BD_ERR_ILLRQ:
2016 atomic_inc_32(&bd->d_kerr->bd_rq_illrq_err.value.ui32);
2017 break;
2018 case BD_ERR_PFA:
2019 atomic_inc_32(&bd->d_kerr->bd_rq_pfa_err.value.ui32);
2020 break;
2021 default:
2022 cmn_err(CE_PANIC, "bd_error: unknown error type %d", error);
2023 break;
2027 void
2028 bd_state_change(bd_handle_t hdl)
2030 bd_t *bd;
2032 if ((bd = hdl->h_bd) != NULL) {
2033 bd_update_state(bd);
2037 void
2038 bd_mod_init(struct dev_ops *devops)
2040 static struct bus_ops bd_bus_ops = {
2041 BUSO_REV, /* busops_rev */
2042 nullbusmap, /* bus_map */
2043 NULL, /* bus_get_intrspec (OBSOLETE) */
2044 NULL, /* bus_add_intrspec (OBSOLETE) */
2045 NULL, /* bus_remove_intrspec (OBSOLETE) */
2046 i_ddi_map_fault, /* bus_map_fault */
2047 NULL, /* bus_dma_map (OBSOLETE) */
2048 ddi_dma_allochdl, /* bus_dma_allochdl */
2049 ddi_dma_freehdl, /* bus_dma_freehdl */
2050 ddi_dma_bindhdl, /* bus_dma_bindhdl */
2051 ddi_dma_unbindhdl, /* bus_dma_unbindhdl */
2052 ddi_dma_flush, /* bus_dma_flush */
2053 ddi_dma_win, /* bus_dma_win */
2054 ddi_dma_mctl, /* bus_dma_ctl */
2055 bd_bus_ctl, /* bus_ctl */
2056 ddi_bus_prop_op, /* bus_prop_op */
2057 NULL, /* bus_get_eventcookie */
2058 NULL, /* bus_add_eventcall */
2059 NULL, /* bus_remove_eventcall */
2060 NULL, /* bus_post_event */
2061 NULL, /* bus_intr_ctl (OBSOLETE) */
2062 NULL, /* bus_config */
2063 NULL, /* bus_unconfig */
2064 NULL, /* bus_fm_init */
2065 NULL, /* bus_fm_fini */
2066 NULL, /* bus_fm_access_enter */
2067 NULL, /* bus_fm_access_exit */
2068 NULL, /* bus_power */
2069 NULL, /* bus_intr_op */
2072 devops->devo_bus_ops = &bd_bus_ops;
2075 * NB: The device driver is free to supply its own
2076 * character entry device support.
2080 void
2081 bd_mod_fini(struct dev_ops *devops)
2083 devops->devo_bus_ops = NULL;