/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright 2010 Robert Milkowski
 *
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright (c) 2024, Klara, Inc.
 */

/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
/*
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 *	/dev/zvol/<pool_name>/<dataset_name>
 *
 * Volumes are persistent through reboot.  No user command needs to be
 * run before opening and using a device.
 *
 * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
 * in the system, except when they are exposed as plain character devices
 * (volmode=dev).
 */
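/*
 * Illustrative usage sketch (not part of the driver): a volume created with
 * "zfs create -V 10G tank/vol" appears as /dev/zvol/tank/vol and can be used
 * like any other disk, e.g. "newfs /dev/zvol/tank/vol".  The dataset name
 * "tank/vol" is hypothetical.
 */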
#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/spa_impl.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/byteorder.h>
#include <sys/sunddi.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/queue.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/zil_impl.h>
#include <sys/dataset_kstats.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/filio.h>
#include <sys/freebsd_event.h>

#include <geom/geom.h>

#include <sys/zvol_impl.h>

#include "zfs_namecheck.h"
#define	ZVOL_DUMPSIZE		"dumpsize"

#ifdef ZVOL_LOCK_DEBUG
#define	ZVOL_RW_READER		RW_WRITER
#define	ZVOL_RW_READ_HELD	RW_WRITE_HELD
#else
#define	ZVOL_RW_READER		RW_READER
#define	ZVOL_RW_READ_HELD	RW_READ_HELD
#endif
enum zvol_geom_state {
	ZVOL_GEOM_UNINIT,
	ZVOL_GEOM_STOPPED,
	ZVOL_GEOM_RUNNING,
};

struct zvol_state_os {
#define	zso_dev		_zso_state._zso_dev
#define	zso_geom	_zso_state._zso_geom
	union {
		/* volmode=dev */
		struct zvol_state_dev {
			struct cdev *zsd_cdev;
			struct selinfo zsd_selinfo;
		} _zso_dev;

		/* volmode=geom */
		struct zvol_state_geom {
			struct g_provider *zsg_provider;
			struct bio_queue_head zsg_queue;
			struct mtx zsg_queue_mtx;
			enum zvol_geom_state zsg_state;
		} _zso_geom;
	} _zso_state;
	int zso_dying;
};

static uint32_t zvol_minors;
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
    "Expose as GEOM providers (1), device files (2) or neither");
static boolean_t zpool_on_zvol = B_FALSE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
    "Allow zpools to use zvols as vdevs (DANGEROUS)");

/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;

SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
    &zvol_unmap_enabled, 0, "Enable UNMAP functionality");

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS / 2;
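/*
 * Illustrative tuning sketch (assumption, not part of the driver): the knobs
 * above surface as vfs.zfs.vol.* sysctls, e.g.
 *
 *	sysctl vfs.zfs.vol.mode=2		# expose new zvols as cdevs
 *	sysctl vfs.zfs.vol.unmap_enabled=0	# ignore BIO_DELETE/UNMAP
 *
 * CTLFLAG_RWTUN also allows setting them as loader tunables in
 * /boot/loader.conf.
 */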
static void zvol_ensure_zilog(zvol_state_t *zv);

static d_open_t		zvol_cdev_open;
static d_close_t	zvol_cdev_close;
static d_ioctl_t	zvol_cdev_ioctl;
static d_read_t		zvol_cdev_read;
static d_write_t	zvol_cdev_write;
static d_strategy_t	zvol_geom_bio_strategy;
static d_kqfilter_t	zvol_cdev_kqfilter;
static struct cdevsw zvol_cdevsw = {
	.d_name =	"zvol",
	.d_version =	D_VERSION,
	.d_flags =	D_DISK | D_TRACKCLOSE,
	.d_open =	zvol_cdev_open,
	.d_close =	zvol_cdev_close,
	.d_ioctl =	zvol_cdev_ioctl,
	.d_read =	zvol_cdev_read,
	.d_write =	zvol_cdev_write,
	.d_strategy =	zvol_geom_bio_strategy,
	.d_kqfilter =	zvol_cdev_kqfilter,
};
static void zvol_filter_detach(struct knote *kn);
static int zvol_filter_vnode(struct knote *kn, long hint);

static struct filterops zvol_filterops_vnode = {
	.f_isfd = 1,
	.f_detach = zvol_filter_detach,
	.f_event = zvol_filter_vnode,
};
extern uint_t zfs_geom_probe_vdev_key;

struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_worker(void *arg);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
/* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */
/*
 * GEOM mode implementation
 */

static int
zvol_geom_open(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
		/*
		 * If zfs_geom_probe_vdev_key is set, that means that zfs is
		 * attempting to probe geom providers while looking for a
		 * replacement for a missing VDEV.  In this case, the
		 * spa_namespace_lock will not be held, but it is still illegal
		 * to use a zvol as a vdev.  Deadlocks can result if another
		 * thread has spa_namespace_lock.
		 */
		return (SET_ERROR(EOPNOTSUPP));
	}

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	/*
	 * Obtain a copy of private under zvol_state_lock to make sure either
	 * the result of zvol free code setting private to NULL is observed,
	 * or the zv is protected from being freed because of the positive
	 * zv_open_count.
	 */
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flag & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_zv_locked;
		pp->mediasize = zv->zv_volsize;
		pp->stripeoffset = 0;
		pp->stripesize = zv->zv_volblocksize;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
	if (flag & O_EXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count += count;
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}
static int
zvol_geom_close(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;
	int new_open_count;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);

	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	new_open_count = zv->zv_open_count - count;
	if (new_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			new_open_count = zv->zv_open_count - count;
			if (new_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count = new_open_count;
	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}
static void
zvol_geom_run(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_error_provider(pp, 0);

	kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
	    "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
}
static void
zvol_geom_destroy(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_topology_assert();

	mutex_enter(&zv->zv_state_lock);
	VERIFY3S(zsg->zsg_state, ==, ZVOL_GEOM_RUNNING);
	mutex_exit(&zv->zv_state_lock);
	zsg->zsg_provider = NULL;
	g_wither_geom(pp->geom, ENXIO);
}
void
zvol_wait_close(zvol_state_t *zv)
{

	if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
		return;
	mutex_enter(&zv->zv_state_lock);
	zv->zv_zso->zso_dying = B_TRUE;

	if (zv->zv_open_count)
		msleep(zv, &zv->zv_state_lock,
		    PRIBIO, "zvol:dying", 10*hz);
	mutex_exit(&zv->zv_state_lock);
}
static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
	int count, error, flags;

	g_topology_assert();

	/*
	 * To make it easier we expect either open or close, but not both
	 * at the same time.
	 */
	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
	    (acr <= 0 && acw <= 0 && ace <= 0),
	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
	    pp->name, acr, acw, ace));

	if (pp->private == NULL) {
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		return (pp->error);
	}

	/*
	 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
	 * ace != 0, because GEOM already handles that and handles it a bit
	 * differently. GEOM allows for multiple read/exclusive consumers and
	 * ZFS allows only one exclusive consumer, no matter if it is reader or
	 * writer. I like better the way GEOM works so I'll leave it for GEOM
	 * to decide what to do.
	 */

	count = acr + acw + ace;
	if (count == 0)
		return (0);

	flags = 0;
	if (acr != 0 || ace != 0)
		flags |= FREAD;
	if (acw != 0)
		flags |= FWRITE;

	if (count > 0)
		error = zvol_geom_open(pp, flags, count);
	else
		error = zvol_geom_close(pp, flags, -count);
	return (error);
}
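/*
 * Illustrative sketch (assumption, not in the original source): a consumer
 * calling g_access(cp, 1, 1, 0) arrives here as acr=1, acw=1, ace=0, so
 * count is 2 and flags become FREAD | FWRITE, routing to zvol_geom_open().
 * The matching g_access(cp, -1, -1, 0) makes count -2, routing to
 * zvol_geom_close() with the count negated back to 2.
 */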
static void
zvol_geom_worker(void *arg)
{
	zvol_state_t *zv = arg;
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct bio *bp;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	for (;;) {
		mtx_lock(&zsg->zsg_queue_mtx);
		bp = bioq_takefirst(&zsg->zsg_queue);
		if (bp == NULL) {
			if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
				zsg->zsg_state = ZVOL_GEOM_RUNNING;
				wakeup(&zsg->zsg_state);
				mtx_unlock(&zsg->zsg_queue_mtx);
				kthread_exit();
			}
			msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
			    PRIBIO | PDROP, "zvol:io", 0);
			continue;
		}
		mtx_unlock(&zsg->zsg_queue_mtx);
		zvol_geom_bio_strategy(bp);
	}
}
static void
zvol_geom_bio_start(struct bio *bp)
{
	zvol_state_t *zv = bp->bio_to->private;
	struct zvol_state_geom *zsg;
	boolean_t first;

	if (zv == NULL) {
		g_io_deliver(bp, ENXIO);
		return;
	}
	if (bp->bio_cmd == BIO_GETATTR) {
		if (zvol_geom_bio_getattr(bp))
			g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

	if (!THREAD_CAN_SLEEP()) {
		zsg = &zv->zv_zso->zso_geom;
		mtx_lock(&zsg->zsg_queue_mtx);
		first = (bioq_first(&zsg->zsg_queue) == NULL);
		bioq_insert_tail(&zsg->zsg_queue, bp);
		mtx_unlock(&zsg->zsg_queue_mtx);
		if (first)
			wakeup_one(&zsg->zsg_queue);
		return;
	}

	zvol_geom_bio_strategy(bp);
}
static int
zvol_geom_bio_getattr(struct bio *bp)
{
	zvol_state_t *zv;

	zv = bp->bio_to->private;
	ASSERT3P(zv, !=, NULL);

	spa_t *spa = dmu_objset_spa(zv->zv_objset);
	uint64_t refd, avail, usedobjs, availobjs;

	if (g_handleattr_int(bp, "GEOM::candelete", 1))
		return (0);
	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
		avail = metaslab_class_get_space(spa_normal_class(spa));
		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksavail",
		    avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
		refd = metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
			return (0);
	}
	return (1);
}
static void
zvol_filter_detach(struct knote *kn)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;

	zv = kn->kn_hook;
	zsd = &zv->zv_zso->zso_dev;

	knlist_remove(&zsd->zsd_selinfo.si_note, kn, 0);
}
static int
zvol_filter_vnode(struct knote *kn, long hint)
{
	kn->kn_fflags |= kn->kn_sfflags & hint;

	return (kn->kn_fflags != 0);
}
static int
zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;

	zv = dev->si_drv2;
	zsd = &zv->zv_zso->zso_dev;

	if (kn->kn_filter != EVFILT_VNODE)
		return (EINVAL);

	/* XXX: extend support for other NOTE_* events */
	if (kn->kn_sfflags != NOTE_ATTRIB)
		return (EINVAL);

	kn->kn_fop = &zvol_filterops_vnode;
	kn->kn_hook = zv;
	knlist_add(&zsd->zsd_selinfo.si_note, kn, 0);

	return (0);
}
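/*
 * Illustrative userland sketch (not part of the driver): this filter lets a
 * consumer watch a volmode=dev zvol for attribute (size) changes, since
 * zvol_os_update_volsize() posts NOTE_ATTRIB below.  Roughly:
 *
 *	int kq = kqueue();
 *	int fd = open("/dev/zvol/tank/vol", O_RDONLY);	// hypothetical path
 *	struct kevent ev;
 *	EV_SET(&ev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR, NOTE_ATTRIB, 0, 0);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);	// register
 *	kevent(kq, NULL, 0, &ev, 1, NULL);	// fires when the zvol resizes
 */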
static void
zvol_geom_bio_strategy(struct bio *bp)
{
	zvol_state_t *zv;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t doread = B_FALSE;
	boolean_t is_dumpified;
	boolean_t commit = B_FALSE;

	if (bp->bio_to)
		zv = bp->bio_to->private;
	else
		zv = bp->bio_dev->si_drv2;

	if (zv == NULL) {
		error = SET_ERROR(ENXIO);
		goto out;
	}

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);

	if (zv->zv_flags & ZVOL_REMOVING) {
		error = SET_ERROR(ENXIO);
		goto resume;
	}

	switch (bp->bio_cmd) {
	case BIO_READ:
		doread = B_TRUE;
		break;
	case BIO_WRITE:
	case BIO_FLUSH:
	case BIO_DELETE:
		if (zv->zv_flags & ZVOL_RDONLY) {
			error = SET_ERROR(EROFS);
			goto resume;
		}
		zvol_ensure_zilog(zv);
		if (bp->bio_cmd == BIO_FLUSH) {
			/* A flush is just a commit of the ZIL. */
			commit = B_TRUE;
			goto sync;
		}
		break;
	default:
		error = SET_ERROR(EOPNOTSUPP);
		goto resume;
	}

	off = bp->bio_offset;
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT3P(os, !=, NULL);

	addr = bp->bio_data;
	resid = bp->bio_length;

	if (resid > 0 && off >= volsize) {
		error = SET_ERROR(EIO);
		goto resume;
	}

	is_dumpified = B_FALSE;
	commit = !doread && !is_dumpified &&
	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
	    doread ? RL_READER : RL_WRITER);

	if (bp->bio_cmd == BIO_DELETE) {
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			zvol_log_truncate(zv, tx, off, resid);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    off, resid);
			resid = 0;
		}
		goto unlock;
	}
	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
		if (doread) {
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, commit);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
unlock:
	zfs_rangelock_exit(lr);

	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length && off > volsize)
		error = SET_ERROR(EINVAL);

	switch (bp->bio_cmd) {
	case BIO_READ:
		dataset_kstats_update_read_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_WRITE:
		dataset_kstats_update_write_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	default:
		break;
	}

sync:
	if (commit)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
resume:
	rw_exit(&zv->zv_suspend_lock);
out:
	if (bp->bio_to)
		g_io_deliver(bp, error);
	else
		biofinish(bp, NULL, error);
}
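/*
 * Worked sizing example (assumption, not in the original source): with the
 * usual 64 MB DMU_MAX_ACCESS, zvol_maxphys is 32 MB, so a 100 MB BIO_WRITE
 * is carved by the while loop above into four transactions
 * (32 + 32 + 32 + 4 MB), each with its own dmu_tx_create()/dmu_tx_assign()/
 * dmu_tx_commit() cycle and its own ZIL record.
 */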
/*
 * Character device mode implementation
 */

static int
zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	zfs_uio_t uio;

	zfs_uio_init(&uio, uio_s);

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;
	/*
	 * uio_loffset == volsize isn't an error as
	 * it's required for EOF processing.
	 */
	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	ssize_t start_resid = zfs_uio_resid(&uio);
	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_READER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);

		/* Don't read past the end. */
		if (bytes > volsize - zfs_uio_offset(&uio))
			bytes = volsize - zfs_uio_offset(&uio);

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);
	int64_t nread = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
	rw_exit(&zv->zv_suspend_lock);

	return (error);
}
static int
zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t commit;
	zfs_uio_t uio;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;

	zfs_uio_init(&uio, uio_s);

	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	ssize_t start_resid = zfs_uio_resid(&uio);
	commit = (ioflag & IO_SYNC) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	zvol_ensure_zilog(zv);

	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_WRITER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
		uint64_t off = zfs_uio_offset(&uio);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* Don't write past the end. */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, commit);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);
	int64_t nwritten = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	if (commit)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	rw_exit(&zv->zv_suspend_lock);

	return (error);
}
static int
zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	/*
	 * Obtain a copy of si_drv2 under zvol_state_lock to make sure either
	 * the result of zvol free code setting si_drv2 to NULL is observed,
	 * or the zv is protected from being freed because of the positive
	 * zv_open_count.
	 */
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flags & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_zv_locked;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
	if (flags & O_EXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count++;
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}
static int
zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count--;

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}
static int
zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
    int fflag, struct thread *td)
{
	zvol_state_t *zv;
	zfs_locked_range_t *lr;
	off_t offset, length;
	int error;
	boolean_t sync;

	zv = dev->si_drv2;

	error = 0;
	KASSERT(zv->zv_open_count > 0,
	    ("Device with zero access count in %s", __func__));

	switch (cmd) {
	case DIOCGSECTORSIZE:
		*(uint32_t *)data = DEV_BSIZE;
		break;
	case DIOCGMEDIASIZE:
		*(off_t *)data = zv->zv_volsize;
		break;
	case DIOCGFLUSH:
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		if (zv->zv_zilog != NULL)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGDELETE:
		if (!zvol_unmap_enabled)
			break;

		offset = ((off_t *)data)[0];
		length = ((off_t *)data)[1];
		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
		    offset < 0 || offset >= zv->zv_volsize ||
		    length <= 0) {
			printf("%s: offset=%jd length=%jd\n", __func__, offset,
			    length);
			error = SET_ERROR(EINVAL);
			break;
		}
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		zvol_ensure_zilog(zv);
		lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
		    RL_WRITER);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			sync = B_FALSE;
			dmu_tx_abort(tx);
		} else {
			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
			zvol_log_truncate(zv, tx, offset, length);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    offset, length);
		}
		zfs_rangelock_exit(lr);
		if (sync)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGSTRIPESIZE:
		*(off_t *)data = zv->zv_volblocksize;
		break;
	case DIOCGSTRIPEOFFSET:
		*(off_t *)data = 0;
		break;
	case DIOCGATTR: {
		spa_t *spa = dmu_objset_spa(zv->zv_objset);
		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
		uint64_t refd, avail, usedobjs, availobjs;

		if (strcmp(arg->name, "GEOM::candelete") == 0)
			arg->value.i = 1;
		else if (strcmp(arg->name, "blocksavail") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "blocksused") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = refd / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
			avail = metaslab_class_get_space(spa_normal_class(spa));
			avail -= metaslab_class_get_alloc(
			    spa_normal_class(spa));
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksused") == 0) {
			refd = metaslab_class_get_alloc(spa_normal_class(spa));
			arg->value.off = refd / DEV_BSIZE;
		} else
			error = SET_ERROR(ENOIOCTL);
		break;
	}
	case FIOSEEKHOLE:
	case FIOSEEKDATA: {
		off_t *off = (off_t *)data;
		uint64_t noff;
		boolean_t hole;

		hole = (cmd == FIOSEEKHOLE);
		noff = *off;
		lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX,
		    RL_READER);
		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
		zfs_rangelock_exit(lr);
		*off = noff;
		break;
	}
	default:
		error = SET_ERROR(ENOIOCTL);
	}

	return (error);
}
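/*
 * Illustrative userland sketch (not part of the driver): the DIOCGDELETE
 * handler above expects two off_t values, a byte offset and a length, both
 * DEV_BSIZE-aligned:
 *
 *	off_t args[2] = { 0, 1024 * 1024 };		// first 1 MB
 *	int fd = open("/dev/zvol/tank/vol", O_RDWR);	// hypothetical path
 *	if (ioctl(fd, DIOCGDELETE, args) == -1)
 *		err(1, "DIOCGDELETE");
 *
 * This is how userland asks the zvol to free (unmap) a range.
 */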
static void
zvol_ensure_zilog(zvol_state_t *zv)
{
	ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

	/*
	 * Open a ZIL if this is the first time we have written to this
	 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
	 * than zv_state_lock so that we don't need to acquire an
	 * additional lock in this path.
	 */
	if (zv->zv_zilog == NULL) {
		if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
		}
		if (zv->zv_zilog == NULL) {
			zv->zv_zilog = zil_open(zv->zv_objset,
			    zvol_get_data, &zv->zv_kstat.dk_zil_sums);
			zv->zv_flags |= ZVOL_WRITTEN_TO;
			/* replay / destroy done in zvol_os_create_minor() */
			VERIFY0(zv->zv_zilog->zl_header->zh_flags &
			    ZIL_REPLAY_NEEDED);
		}
		rw_downgrade(&zv->zv_suspend_lock);
	}
}
int
zvol_os_is_zvol(const char *device)
{
	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
}
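/*
 * Illustrative sketch (assumption): with ZVOL_DIR expanding to the /dev/zvol
 * prefix, zvol_os_is_zvol("/dev/zvol/tank/vol") returns nonzero while
 * zvol_os_is_zvol("/dev/ada0") returns zero; "tank/vol" is hypothetical.
 */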
void
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/* Move to a new hashtable entry. */
	zv->zv_hash = zvol_name_hash(newname);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;
		struct g_geom *gp;

		g_topology_lock();
		gp = pp->geom;
		ASSERT3P(gp, !=, NULL);

		zsg->zsg_provider = NULL;
		g_wither_provider(pp, ENXIO);

		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = zv->zv_volsize;
		pp->private = zv;
		zsg->zsg_provider = pp;
		g_error_provider(pp, 0);
		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		dev = zsd->zsd_cdev;
		if (dev != NULL) {
			destroy_dev(dev);
			dev = zsd->zsd_cdev = NULL;
			if (zv->zv_open_count > 0) {
				zv->zv_flags &= ~ZVOL_EXCL;
				zv->zv_open_count = 0;
				/* XXX need suspend lock but lock order */
				zvol_last_close(zv);
			}
		}

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
		    == 0) {
			dev->si_iosize_max = maxphys;
			zsd->zsd_cdev = dev;
		}
	}
	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
	dataset_kstats_rename(&zv->zv_kstat, newname);
}
/*
 * Remove minor node for the specified volume.
 */
void
zvol_os_free(zvol_state_t *zv)
{
	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);

	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp __maybe_unused = zsg->zsg_provider;

		ASSERT3P(pp->private, ==, NULL);

		g_topology_lock();
		zvol_geom_destroy(zv);
		g_topology_unlock();
		mtx_destroy(&zsg->zsg_queue_mtx);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL) {
			ASSERT3P(dev->si_drv2, ==, NULL);
			destroy_dev(dev);
			knlist_clear(&zsd->zsd_selinfo.si_note, 0);
			knlist_destroy(&zsd->zsd_selinfo.si_note);
		}
	}

	mutex_destroy(&zv->zv_state_lock);
	cv_destroy(&zv->zv_removing_cv);
	dataset_kstats_destroy(&zv->zv_kstat);
	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
}
/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 */
int
zvol_os_create_minor(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t volmode, hash;
	int error;
	bool replayed_zil = B_FALSE;

	ZFS_LOG(1, "Creating ZVOL %s...", name);
	hash = zvol_name_hash(name);
	if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		return (SET_ERROR(EEXIST));
	}

	DROP_GIANT();

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	/* Lie and say we're read-only. */
	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	error = dsl_prop_get_integer(name,
	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
	if (error || volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;
	error = 0;

	/*
	 * zvol_alloc equivalent ...
	 */
	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
	zv->zv_hash = hash;
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_volmode = volmode;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp;
		struct g_geom *gp;

		zsg->zsg_state = ZVOL_GEOM_UNINIT;
		mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);

		g_topology_lock();
		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
		gp->start = zvol_geom_bio_start;
		gp->access = zvol_geom_access;
		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = 0;
		pp->private = zv;

		zsg->zsg_provider = pp;
		bioq_init(&zsg->zsg_queue);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name)
		    == 0) {
			dev->si_iosize_max = maxphys;
			zsd->zsd_cdev = dev;
			knlist_init_sx(&zsd->zsd_selinfo.si_note,
			    &zv->zv_state_lock);
		}
	}
	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);

	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
	if (error)
		goto out_dmu_objset_disown;
	ASSERT3P(zv->zv_zilog, ==, NULL);
	zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
		else
			replayed_zil = zil_replay(os, zv, zvol_replay_vector);
	}
	if (replayed_zil)
		zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;

	/* TODO: prefetch for geom tasting */

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);

	if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
		zvol_geom_run(zv);
		g_topology_unlock();
	}
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		zvol_minors++;
		rw_exit(&zvol_state_lock);
		ZFS_LOG(1, "ZVOL %s created.", name);
	}

	PICKUP_GIANT();
	return (error);
}
void
zvol_os_clear_private(zvol_state_t *zv)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		if (pp->private == NULL) /* already cleared */
			return;

		mtx_lock(&zsg->zsg_queue_mtx);
		zsg->zsg_state = ZVOL_GEOM_STOPPED;
		pp->private = NULL;
		wakeup_one(&zsg->zsg_queue);
		while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
			msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
			    0, "zvol:w", 0);
		mtx_unlock(&zsg->zsg_queue_mtx);
		ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL)
			dev->si_drv2 = NULL;
	}
}
int
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	zv->zv_volsize = volsize;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		g_topology_lock();

		if (pp->private == NULL) {
			g_topology_unlock();
			return (SET_ERROR(ENXIO));
		}

		/*
		 * Do not invoke resize event when initial size was zero.
		 * ZVOL initializes the size on first open, this is not
		 * real resizing.
		 */
		if (pp->mediasize == 0)
			pp->mediasize = zv->zv_volsize;
		else
			g_resize_provider(pp, zv->zv_volsize);

		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;

		KNOTE_UNLOCKED(&zsd->zsd_selinfo.si_note, NOTE_ATTRIB);
	}
	return (0);
}
void
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
{
	// XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags);
}

void
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
{
	// XXX? set_capacity(zv->zv_zso->zvo_disk, capacity);
}

boolean_t
zvol_busy(void)
{
	return (zvol_minors != 0);
}