/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright 2010 Robert Milkowski
 *
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright (c) 2024, Klara, Inc.
 */
/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */

/*
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 * /dev/zvol/<pool_name>/<dataset_name>
 *
 * Volumes are persistent through reboot.  No user command needs to be
 * run before opening and using a device.
 *
 * On FreeBSD, ZVOLs are GEOM providers like any other storage device
 * in the system, except when they are exposed as plain character
 * devices (volmode=dev).
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/disk.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/byteorder.h>
#include <sys/sunddi.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/queue.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/zil.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zil_impl.h>
#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/filio.h>
#include <sys/freebsd_event.h>

#include <geom/geom.h>
#include <sys/zvol_impl.h>

#include "zfs_namecheck.h"

#define	ZVOL_DUMPSIZE	"dumpsize"

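/*
 * With ZVOL_LOCK_DEBUG defined, reader acquisitions of zv_suspend_lock are
 * promoted to writer acquisitions, presumably so that lock-ordering and
 * recursion bugs surface more readily in testing.  (This rationale is an
 * editor's inference from the definitions below; the flag is not set in
 * normal builds.)
 */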
#ifdef ZVOL_LOCK_DEBUG
#define	ZVOL_RW_READER		RW_WRITER
#define	ZVOL_RW_READ_HELD	RW_WRITE_HELD
#else
#define	ZVOL_RW_READER		RW_READER
#define	ZVOL_RW_READ_HELD	RW_READ_HELD
#endif

enum zvol_geom_state {
	ZVOL_GEOM_UNINIT,
	ZVOL_GEOM_STOPPED,
	ZVOL_GEOM_RUNNING,
};

struct zvol_state_os {
#define	zso_dev		_zso_state._zso_dev
#define	zso_geom	_zso_state._zso_geom
	union {
		/* volmode=dev */
		struct zvol_state_dev {
			struct cdev *zsd_cdev;
			struct selinfo zsd_selinfo;
		} _zso_dev;

		/* volmode=geom */
		struct zvol_state_geom {
			struct g_provider *zsg_provider;
			struct bio_queue_head zsg_queue;
			struct mtx zsg_queue_mtx;
			enum zvol_geom_state zsg_state;
		} _zso_geom;
	} _zso_state;
	int zso_dying;
};

static uint32_t zvol_minors;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
    "Expose as GEOM providers (1), device files (2) or neither");
static boolean_t zpool_on_zvol = B_FALSE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
    "Allow zpools to use zvols as vdevs (DANGEROUS)");

/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;

SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
    &zvol_unmap_enabled, 0, "Enable UNMAP functionality");

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS / 2;

static void zvol_ensure_zilog(zvol_state_t *zv);

static d_open_t		zvol_cdev_open;
static d_close_t	zvol_cdev_close;
static d_ioctl_t	zvol_cdev_ioctl;
static d_read_t		zvol_cdev_read;
static d_write_t	zvol_cdev_write;
static d_strategy_t	zvol_geom_bio_strategy;
static d_kqfilter_t	zvol_cdev_kqfilter;

static struct cdevsw zvol_cdevsw = {
	.d_name =	"zvol",
	.d_version =	D_VERSION,
	.d_flags =	D_DISK | D_TRACKCLOSE,
	.d_open =	zvol_cdev_open,
	.d_close =	zvol_cdev_close,
	.d_ioctl =	zvol_cdev_ioctl,
	.d_read =	zvol_cdev_read,
	.d_write =	zvol_cdev_write,
	.d_strategy =	zvol_geom_bio_strategy,
	.d_kqfilter =	zvol_cdev_kqfilter,
};

static void zvol_filter_detach(struct knote *kn);
static int zvol_filter_vnode(struct knote *kn, long hint);

static struct filterops zvol_filterops_vnode = {
	.f_isfd = 1,
	.f_detach = zvol_filter_detach,
	.f_event = zvol_filter_vnode,
};

extern uint_t zfs_geom_probe_vdev_key;

struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_worker(void *arg);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
/* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */

/*
 * GEOM mode implementation
 */

static int
zvol_geom_open(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
		/*
		 * If zfs_geom_probe_vdev_key is set, that means that zfs is
		 * attempting to probe geom providers while looking for a
		 * replacement for a missing VDEV.  In this case, the
		 * spa_namespace_lock will not be held, but it is still illegal
		 * to use a zvol as a vdev.  Deadlocks can result if another
		 * thread has spa_namespace_lock.
		 */
		return (SET_ERROR(EOPNOTSUPP));
	}

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	/*
	 * Obtain a copy of private under zvol_state_lock to make sure either
	 * the result of zvol free code setting private to NULL is observed,
	 * or the zv is protected from being freed because of the positive
	 * zv_open_count.
	 */
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flag & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_zv_locked;
		pp->mediasize = zv->zv_volsize;
		pp->stripeoffset = 0;
		pp->stripesize = zv->zv_volblocksize;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
	if (flag & O_EXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count += count;
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

static int
zvol_geom_close(struct g_provider *pp, int flag, int count)
{
	(void) flag;
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;
	int new_open_count;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);

	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	new_open_count = zv->zv_open_count - count;
	if (new_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			new_open_count = zv->zv_open_count - count;
			if (new_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count = new_open_count;
	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

static void
zvol_geom_run(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_error_provider(pp, 0);

	kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
	    "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
}

static void
zvol_geom_destroy(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_topology_assert();

	mutex_enter(&zv->zv_state_lock);
	VERIFY3S(zsg->zsg_state, ==, ZVOL_GEOM_RUNNING);
	mutex_exit(&zv->zv_state_lock);
	zsg->zsg_provider = NULL;
	g_wither_geom(pp->geom, ENXIO);
}

void
zvol_wait_close(zvol_state_t *zv)
{
	if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
		return;
	mutex_enter(&zv->zv_state_lock);
	zv->zv_zso->zso_dying = B_TRUE;

	if (zv->zv_open_count)
		msleep(zv, &zv->zv_state_lock,
		    PRIBIO, "zvol:dying", 10*hz);
	mutex_exit(&zv->zv_state_lock);
}

static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
	int count, error, flags;

	g_topology_assert();

	/*
	 * To make it easier we expect either open or close, but not both
	 * at the same time.
	 */
	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
	    (acr <= 0 && acw <= 0 && ace <= 0),
	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
	    pp->name, acr, acw, ace));

	if (pp->private == NULL) {
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		return (pp->error);
	}

	/*
	 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
	 * ace != 0, because GEOM already handles that and handles it a bit
	 * differently. GEOM allows for multiple read/exclusive consumers and
	 * ZFS allows only one exclusive consumer, no matter if it is reader or
	 * writer. I like better the way GEOM works so I'll leave it for GEOM
	 * to decide what to do.
	 */

	count = acr + acw + ace;
	if (count == 0)
		return (0);

	flags = 0;
	if (acr != 0 || ace != 0)
		flags |= FREAD;
	if (acw != 0)
		flags |= FWRITE;

	g_topology_unlock();
	if (count > 0)
		error = zvol_geom_open(pp, flags, count);
	else
		error = zvol_geom_close(pp, flags, -count);
	g_topology_lock();
	return (error);
}

static void
zvol_geom_worker(void *arg)
{
	zvol_state_t *zv = arg;
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct bio *bp;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	for (;;) {
		mtx_lock(&zsg->zsg_queue_mtx);
		bp = bioq_takefirst(&zsg->zsg_queue);
		if (bp == NULL) {
			if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
				zsg->zsg_state = ZVOL_GEOM_RUNNING;
				wakeup(&zsg->zsg_state);
				mtx_unlock(&zsg->zsg_queue_mtx);
				kthread_exit();
			}
			msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
			    PRIBIO | PDROP, "zvol:io", 0);
			continue;
		}
		mtx_unlock(&zsg->zsg_queue_mtx);
		zvol_geom_bio_strategy(bp);
	}
}

static void
zvol_geom_bio_start(struct bio *bp)
{
	zvol_state_t *zv = bp->bio_to->private;
	struct zvol_state_geom *zsg;
	boolean_t first;

	if (zv == NULL) {
		g_io_deliver(bp, ENXIO);
		return;
	}
	if (bp->bio_cmd == BIO_GETATTR) {
		if (zvol_geom_bio_getattr(bp))
			g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

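	/*
	 * If the caller cannot sleep (e.g. a GEOM direct-dispatch context),
	 * defer the bio to the worker thread, which may block in the DMU.
	 * Wake the worker only when the queue was empty; it drains the
	 * whole queue before sleeping again.
	 */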
	if (!THREAD_CAN_SLEEP()) {
		zsg = &zv->zv_zso->zso_geom;
		mtx_lock(&zsg->zsg_queue_mtx);
		first = (bioq_first(&zsg->zsg_queue) == NULL);
		bioq_insert_tail(&zsg->zsg_queue, bp);
		mtx_unlock(&zsg->zsg_queue_mtx);
		if (first)
			wakeup_one(&zsg->zsg_queue);
		return;
	}

	zvol_geom_bio_strategy(bp);
}

static int
zvol_geom_bio_getattr(struct bio *bp)
{
	zvol_state_t *zv;

	zv = bp->bio_to->private;
	ASSERT3P(zv, !=, NULL);

	spa_t *spa = dmu_objset_spa(zv->zv_objset);
	uint64_t refd, avail, usedobjs, availobjs;

	if (g_handleattr_int(bp, "GEOM::candelete", 1))
		return (0);
	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
		avail = metaslab_class_get_space(spa_normal_class(spa));
		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksavail",
		    avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
		refd = metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
			return (0);
	}
	return (1);
}

static void
zvol_filter_detach(struct knote *kn)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;

	zv = kn->kn_hook;
	zsd = &zv->zv_zso->zso_dev;

	knlist_remove(&zsd->zsd_selinfo.si_note, kn, 0);
}

static int
zvol_filter_vnode(struct knote *kn, long hint)
{
	kn->kn_fflags |= kn->kn_sfflags & hint;

	return (kn->kn_fflags != 0);
}

static int
zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;

	zv = dev->si_drv2;
	zsd = &zv->zv_zso->zso_dev;

	if (kn->kn_filter != EVFILT_VNODE)
		return (EINVAL);

	/* XXX: extend support for other NOTE_* events */
	if (kn->kn_sfflags != NOTE_ATTRIB)
		return (EINVAL);

	kn->kn_fop = &zvol_filterops_vnode;
	kn->kn_hook = zv;
	knlist_add(&zsd->zsd_selinfo.si_note, kn, 0);

	return (0);
}

static void
zvol_geom_bio_strategy(struct bio *bp)
{
	zvol_state_t *zv;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t doread = B_FALSE;
	boolean_t is_dumpified;
	boolean_t commit;

	if (bp->bio_to)
		zv = bp->bio_to->private;
	else
		zv = bp->bio_dev->si_drv2;

	if (zv == NULL) {
		error = SET_ERROR(ENXIO);
		goto out;
	}

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);

	if (zv->zv_flags & ZVOL_REMOVING) {
		error = SET_ERROR(ENXIO);
		goto resume;
	}

	switch (bp->bio_cmd) {
	case BIO_READ:
		doread = B_TRUE;
		break;
	case BIO_WRITE:
	case BIO_FLUSH:
	case BIO_DELETE:
		if (zv->zv_flags & ZVOL_RDONLY) {
			error = SET_ERROR(EROFS);
			goto resume;
		}
		zvol_ensure_zilog(zv);
		if (bp->bio_cmd == BIO_FLUSH)
			goto commit;
		break;
	default:
		error = SET_ERROR(EOPNOTSUPP);
		goto resume;
	}

	off = bp->bio_offset;
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT3P(os, !=, NULL);

	addr = bp->bio_data;
	resid = bp->bio_length;

	if (resid > 0 && off >= volsize) {
		error = SET_ERROR(EIO);
		goto resume;
	}

	is_dumpified = B_FALSE;
	commit = !doread && !is_dumpified &&
	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
	    doread ? RL_READER : RL_WRITER);

	if (bp->bio_cmd == BIO_DELETE) {
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			zvol_log_truncate(zv, tx, off, resid);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    off, resid);
			resid = 0;
		}
		goto unlock;
	}
	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
		if (doread) {
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, commit);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
unlock:
	zfs_rangelock_exit(lr);

	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length && off > volsize)
		error = SET_ERROR(EINVAL);

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		break;
	case BIO_READ:
		dataset_kstats_update_read_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_WRITE:
		dataset_kstats_update_write_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_DELETE:
		break;
	default:
		break;
	}

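	/*
	 * BIO_FLUSH jumps straight to the commit label above (see the
	 * goto in the first switch), so every flush request reaches
	 * zil_commit() regardless of the dataset's os_sync setting.
	 */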
	if (commit) {
commit:
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	}
resume:
	rw_exit(&zv->zv_suspend_lock);
out:
	if (bp->bio_to)
		g_io_deliver(bp, error);
	else
		biofinish(bp, NULL, error);
}

/*
 * Character device mode implementation
 */

static int
zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	zfs_uio_t uio;

	zfs_uio_init(&uio, uio_s);

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;
	/*
	 * uio_loffset == volsize isn't an error as
	 * it's required for EOF processing.
	 */
	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	ssize_t start_resid = zfs_uio_resid(&uio);
	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_READER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);

		/* Don't read past the end. */
		if (bytes > volsize - zfs_uio_offset(&uio))
			bytes = volsize - zfs_uio_offset(&uio);

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);
	int64_t nread = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
	rw_exit(&zv->zv_suspend_lock);

	return (error);
}

static int
zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t commit;
	zfs_uio_t uio;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;

	zfs_uio_init(&uio, uio_s);

	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	ssize_t start_resid = zfs_uio_resid(&uio);
	commit = (ioflag & IO_SYNC) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	zvol_ensure_zilog(zv);

	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_WRITER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
		uint64_t off = zfs_uio_offset(&uio);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* Don't write past the end. */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, commit);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);
	int64_t nwritten = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	if (commit)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	rw_exit(&zv->zv_suspend_lock);

	return (error);
}

static int
zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	/*
	 * Obtain a copy of si_drv2 under zvol_state_lock to make sure either
	 * the result of zvol free code setting si_drv2 to NULL is observed,
	 * or the zv is protected from being freed because of the positive
	 * zv_open_count.
	 */
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flags & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_zv_locked;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
	if (flags & O_EXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count++;
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

static int
zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count--;

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

static int
zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
    int fflag, struct thread *td)
{
	zvol_state_t *zv;
	zfs_locked_range_t *lr;
	off_t offset, length;
	int error;
	boolean_t sync;

	zv = dev->si_drv2;

	error = 0;
	KASSERT(zv->zv_open_count > 0,
	    ("Device with zero access count in %s", __func__));

	switch (cmd) {
	case DIOCGSECTORSIZE:
		*(uint32_t *)data = DEV_BSIZE;
		break;
	case DIOCGMEDIASIZE:
		*(off_t *)data = zv->zv_volsize;
		break;
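	/*
	 * DIOCGFLUSH commits the ZIL only if one has been opened, i.e.
	 * the volume has been written to; otherwise there is nothing
	 * outstanding to flush.
	 */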
	case DIOCGFLUSH:
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		if (zv->zv_zilog != NULL)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGDELETE:
		if (!zvol_unmap_enabled)
			break;

		offset = ((off_t *)data)[0];
		length = ((off_t *)data)[1];
		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
		    offset < 0 || offset >= zv->zv_volsize ||
		    length <= 0) {
			printf("%s: offset=%jd length=%jd\n", __func__, offset,
			    length);
			error = SET_ERROR(EINVAL);
			break;
		}
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		zvol_ensure_zilog(zv);
		lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
		    RL_WRITER);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			sync = FALSE;
			dmu_tx_abort(tx);
		} else {
			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
			zvol_log_truncate(zv, tx, offset, length);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    offset, length);
		}
		zfs_rangelock_exit(lr);
		if (sync)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGSTRIPESIZE:
		*(off_t *)data = zv->zv_volblocksize;
		break;
	case DIOCGSTRIPEOFFSET:
		*(off_t *)data = 0;
		break;
	case DIOCGATTR: {
		spa_t *spa = dmu_objset_spa(zv->zv_objset);
		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
		uint64_t refd, avail, usedobjs, availobjs;

		if (strcmp(arg->name, "GEOM::candelete") == 0)
			arg->value.i = 1;
		else if (strcmp(arg->name, "blocksavail") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "blocksused") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = refd / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
			avail = metaslab_class_get_space(spa_normal_class(spa));
			avail -= metaslab_class_get_alloc(
			    spa_normal_class(spa));
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksused") == 0) {
			refd = metaslab_class_get_alloc(spa_normal_class(spa));
			arg->value.off = refd / DEV_BSIZE;
		} else
			error = SET_ERROR(ENOIOCTL);
		break;
	}
	case FIOSEEKHOLE:
	case FIOSEEKDATA: {
		off_t *off = (off_t *)data;
		uint64_t noff;
		boolean_t hole;

		hole = (cmd == FIOSEEKHOLE);
		noff = *off;
		lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX,
		    RL_READER);
		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
		zfs_rangelock_exit(lr);
		*off = noff;
		break;
	}
	default:
		error = SET_ERROR(ENOIOCTL);
	}

	return (error);
}

/*
 * Misc. helpers
 */

static void
zvol_ensure_zilog(zvol_state_t *zv)
{
	ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

	/*
	 * Open a ZIL if this is the first time we have written to this
	 * zvol.  We protect zv->zv_zilog with zv_suspend_lock rather
	 * than zv_state_lock so that we don't need to acquire an
	 * additional lock in this path.
	 */
	if (zv->zv_zilog == NULL) {
		if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
		}
		if (zv->zv_zilog == NULL) {
			zv->zv_zilog = zil_open(zv->zv_objset,
			    zvol_get_data, &zv->zv_kstat.dk_zil_sums);
			zv->zv_flags |= ZVOL_WRITTEN_TO;
			/* replay / destroy done in zvol_os_create_minor() */
			VERIFY0(zv->zv_zilog->zl_header->zh_flags &
			    ZIL_REPLAY_NEEDED);
		}
		rw_downgrade(&zv->zv_suspend_lock);
	}
}

boolean_t
zvol_os_is_zvol(const char *device)
{
	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
}

void
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/* Move to a new hashtable entry. */
	zv->zv_hash = zvol_name_hash(newname);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;
		struct g_geom *gp;

		g_topology_lock();
		gp = pp->geom;
		ASSERT3P(gp, !=, NULL);

		zsg->zsg_provider = NULL;
		g_wither_provider(pp, ENXIO);

		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = zv->zv_volsize;
		pp->private = zv;
		zsg->zsg_provider = pp;
		g_error_provider(pp, 0);
		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		dev = zsd->zsd_cdev;
		if (dev != NULL) {
			destroy_dev(dev);
			dev = zsd->zsd_cdev = NULL;
			if (zv->zv_open_count > 0) {
				zv->zv_flags &= ~ZVOL_EXCL;
				zv->zv_open_count = 0;
				/* XXX need suspend lock but lock order */
				zvol_last_close(zv);
			}
		}

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
		    == 0) {
			dev->si_iosize_max = maxphys;
			zsd->zsd_cdev = dev;
		}
	}
	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
	dataset_kstats_rename(&zv->zv_kstat, newname);
}

/*
 * Remove minor node for the specified volume.
 */
void
zvol_os_free(zvol_state_t *zv)
{
	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);

	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp __maybe_unused = zsg->zsg_provider;

		ASSERT3P(pp->private, ==, NULL);

		g_topology_lock();
		zvol_geom_destroy(zv);
		g_topology_unlock();
		mtx_destroy(&zsg->zsg_queue_mtx);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL) {
			ASSERT3P(dev->si_drv2, ==, NULL);
			destroy_dev(dev);
			knlist_clear(&zsd->zsd_selinfo.si_note, 0);
			knlist_destroy(&zsd->zsd_selinfo.si_note);
		}
	}

	mutex_destroy(&zv->zv_state_lock);
	cv_destroy(&zv->zv_removing_cv);
	dataset_kstats_destroy(&zv->zv_kstat);
	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	zvol_minors--;
}

/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 */
int
zvol_os_create_minor(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t volmode, hash;
	int error;
	bool replayed_zil = B_FALSE;

	ZFS_LOG(1, "Creating ZVOL %s...", name);
	hash = zvol_name_hash(name);
	if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		return (SET_ERROR(EEXIST));
	}

	DROP_GIANT();

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	/* Lie and say we're read-only. */
	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	error = dsl_prop_get_integer(name,
	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
	if (error || volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;
	error = 0;

	/*
	 * zvol_alloc equivalent ...
	 */
	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
	zv->zv_hash = hash;
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_volmode = volmode;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp;
		struct g_geom *gp;

		zsg->zsg_state = ZVOL_GEOM_UNINIT;
		mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);

		g_topology_lock();
		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
		gp->start = zvol_geom_bio_start;
		gp->access = zvol_geom_access;
		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = 0;
		pp->private = zv;

		zsg->zsg_provider = pp;
		bioq_init(&zsg->zsg_queue);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name)
		    == 0) {
			dev->si_iosize_max = maxphys;
			zsd->zsd_cdev = dev;
			knlist_init_sx(&zsd->zsd_selinfo.si_note,
			    &zv->zv_state_lock);
		}
	}
	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);

	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
	if (error)
		goto out_dmu_objset_disown;
	ASSERT3P(zv->zv_zilog, ==, NULL);
	zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
		else
			replayed_zil = zil_replay(os, zv, zvol_replay_vector);
	}
	if (replayed_zil)
		zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;

	/* TODO: prefetch for geom tasting */

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);

	if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
		zvol_geom_run(zv);
		g_topology_unlock();
	}
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		zvol_minors++;
		rw_exit(&zvol_state_lock);
		ZFS_LOG(1, "ZVOL %s created.", name);
	}
	PICKUP_GIANT();
	return (error);
}

void
zvol_os_clear_private(zvol_state_t *zv)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		if (pp->private == NULL)	/* already cleared */
			return;

		mtx_lock(&zsg->zsg_queue_mtx);
		zsg->zsg_state = ZVOL_GEOM_STOPPED;
		pp->private = NULL;
		wakeup_one(&zsg->zsg_queue);
		while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
			msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
			    0, "zvol:w", 0);
		mtx_unlock(&zsg->zsg_queue_mtx);
		ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL)
			dev->si_drv2 = NULL;
	}
}

int
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	zv->zv_volsize = volsize;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		g_topology_lock();

		if (pp->private == NULL) {
			g_topology_unlock();
			return (SET_ERROR(ENXIO));
		}

		/*
		 * Do not invoke resize event when initial size was zero.
		 * ZVOL initializes the size on first open, this is not
		 * real resizing.
		 */
		if (pp->mediasize == 0)
			pp->mediasize = zv->zv_volsize;
		else
			g_resize_provider(pp, zv->zv_volsize);

		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;

		KNOTE_UNLOCKED(&zsd->zsd_selinfo.si_note, NOTE_ATTRIB);
	}
	return (0);
}

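/*
 * The two stubs below appear to be placeholders carried over from the
 * Linux implementation, where set_disk_ro() and set_capacity() update
 * the kernel disk object (zvo_disk is a Linux zvol_state_os field).
 * On FreeBSD, read-only state is tracked via ZVOL_RDONLY and size via
 * the provider's mediasize, so there is nothing to do here.
 */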
void
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
{
	// XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags);
}

void
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
{
	// XXX? set_capacity(zv->zv_zso->zvo_disk, capacity);
}

/*
 * Public interfaces
 */

boolean_t
zvol_busy(void)
{
	return (zvol_minors != 0);
}

int
zvol_init(void)
{
	zvol_init_impl();
	return (0);
}

void
zvol_fini(void)
{
	zvol_fini_impl();
}