/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright 2010 Robert Milkowski
 *
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
/*
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 * /dev/zvol/<pool_name>/<dataset_name>
 *
 * Volumes are persistent through reboot.  No user command needs to be
 * run before opening and using a device.
 *
 * On FreeBSD, ZVOLs are simply GEOM providers like any other storage device
 * in the system, except when they are exposed as plain character devices
 * (volmode=dev).
 */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/disk.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/byteorder.h>
#include <sys/sunddi.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/queue.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/zil.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zil_impl.h>
#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/filio.h>
#include <sys/freebsd_event.h>

#include <geom/geom.h>
#include <sys/zvol_impl.h>

#include "zfs_namecheck.h"
#define	ZVOL_DUMPSIZE		"dumpsize"

#ifdef ZVOL_LOCK_DEBUG
#define	ZVOL_RW_READER		RW_WRITER
#define	ZVOL_RW_READ_HELD	RW_WRITE_HELD
#else
#define	ZVOL_RW_READER		RW_READER
#define	ZVOL_RW_READ_HELD	RW_READ_HELD
#endif
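
/*
 * Note: with ZVOL_LOCK_DEBUG defined, every ZVOL_RW_READER acquisition is
 * silently promoted to RW_WRITER.  Presumably this is a debugging aid: it
 * serializes all paths that would normally share the suspend lock, making
 * races hidden by concurrent readers reproducible, and it lets the
 * ZVOL_RW_READ_HELD assertions check for genuine (write) ownership.
 */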

enum zvol_geom_state {
	ZVOL_GEOM_UNINIT,
	ZVOL_GEOM_STOPPED,
	ZVOL_GEOM_RUNNING,
};

struct zvol_state_os {
#define	zso_dev		_zso_state._zso_dev
#define	zso_geom	_zso_state._zso_geom
	union {
		/* volmode=dev */
		struct zvol_state_dev {
			struct cdev *zsd_cdev;
			uint64_t zsd_sync_cnt;
			struct selinfo zsd_selinfo;
		} _zso_dev;

		/* volmode=geom */
		struct zvol_state_geom {
			struct g_provider *zsg_provider;
			struct bio_queue_head zsg_queue;
			struct mtx zsg_queue_mtx;
			enum zvol_geom_state zsg_state;
		} _zso_geom;
	} _zso_state;
	int zso_dying;
};

static uint32_t zvol_minors;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
	"Expose as GEOM providers (1), device files (2) or neither");
static boolean_t zpool_on_zvol = B_FALSE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
	"Allow zpools to use zvols as vdevs (DANGEROUS)");

/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;

SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
	&zvol_unmap_enabled, 0, "Enable UNMAP functionality");

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS / 2;
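
/*
 * The same DMU_MAX_ACCESS / 2 bound reappears below as DMU_MAX_ACCESS >> 1
 * in zvol_cdev_read()/zvol_cdev_write(): large requests are split into
 * chunks no bigger than this, so a single transaction never exceeds what
 * the DMU is willing to access at once.
 */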

static void zvol_ensure_zilog(zvol_state_t *zv);

static d_open_t		zvol_cdev_open;
static d_close_t	zvol_cdev_close;
static d_ioctl_t	zvol_cdev_ioctl;
static d_read_t		zvol_cdev_read;
static d_write_t	zvol_cdev_write;
static d_strategy_t	zvol_geom_bio_strategy;
static d_kqfilter_t	zvol_cdev_kqfilter;

static struct cdevsw zvol_cdevsw = {
	.d_name =	"zvol",
	.d_version =	D_VERSION,
	.d_flags =	D_DISK | D_TRACKCLOSE,
	.d_open =	zvol_cdev_open,
	.d_close =	zvol_cdev_close,
	.d_ioctl =	zvol_cdev_ioctl,
	.d_read =	zvol_cdev_read,
	.d_write =	zvol_cdev_write,
	.d_strategy =	zvol_geom_bio_strategy,
	.d_kqfilter =	zvol_cdev_kqfilter,
};
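
/*
 * Note that both personalities funnel block I/O into the same routine:
 * zvol_geom_bio_strategy() is installed as d_strategy here and is also
 * called from the GEOM start path.  It tells the two apart by whether
 * bio_to (GEOM) or bio_dev (cdev) is set on the bio.
 */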

static void zvol_filter_detach(struct knote *kn);
static int zvol_filter_vnode(struct knote *kn, long hint);

static struct filterops zvol_filterops_vnode = {
	.f_isfd = 1,
	.f_detach = zvol_filter_detach,
	.f_event = zvol_filter_vnode,
};

extern uint_t zfs_geom_probe_vdev_key;

struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_worker(void *arg);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
/* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */

/*
 * GEOM mode implementation
 */

static int
zvol_geom_open(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
		/*
		 * If zfs_geom_probe_vdev_key is set, that means that zfs is
		 * attempting to probe geom providers while looking for a
		 * replacement for a missing VDEV.  In this case, the
		 * spa_namespace_lock will not be held, but it is still illegal
		 * to use a zvol as a vdev.  Deadlocks can result if another
		 * thread has spa_namespace_lock.
		 */
		return (SET_ERROR(EOPNOTSUPP));
	}

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	/*
	 * Obtain a copy of private under zvol_state_lock to make sure either
	 * the result of zvol free code setting private to NULL is observed,
	 * or the zv is protected from being freed because of the positive
	 * zv_open_count.
	 */
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flag & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_zv_locked;
		pp->mediasize = zv->zv_volsize;
		pp->stripeoffset = 0;
		pp->stripesize = zv->zv_volblocksize;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
	if (flag & O_EXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count += count;
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

static int
zvol_geom_close(struct g_provider *pp, int flag, int count)
{
	(void) flag;
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;
	int new_open_count;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);

	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	new_open_count = zv->zv_open_count - count;
	if (new_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			new_open_count = zv->zv_open_count - count;
			if (new_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count = new_open_count;
	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

static void
zvol_geom_run(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_error_provider(pp, 0);

	kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
	    "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
}

static void
zvol_geom_destroy(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_topology_assert();

	mutex_enter(&zv->zv_state_lock);
	VERIFY3S(zsg->zsg_state, ==, ZVOL_GEOM_RUNNING);
	mutex_exit(&zv->zv_state_lock);
	zsg->zsg_provider = NULL;
	g_wither_geom(pp->geom, ENXIO);
}

void
zvol_wait_close(zvol_state_t *zv)
{

	if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
		return;
	mutex_enter(&zv->zv_state_lock);
	zv->zv_zso->zso_dying = B_TRUE;

	if (zv->zv_open_count)
		msleep(zv, &zv->zv_state_lock,
		    PRIBIO, "zvol:dying", 10*hz);
	mutex_exit(&zv->zv_state_lock);
}

static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
	int count, error, flags;

	g_topology_assert();

	/*
	 * To make it easier we expect either open or close, but not both
	 * at the same time.
	 */
	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
	    (acr <= 0 && acw <= 0 && ace <= 0),
	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
	    pp->name, acr, acw, ace));

	if (pp->private == NULL) {
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		return (pp->error);
	}

	/*
	 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
	 * ace != 0, because GEOM already handles that and handles it a bit
	 * differently.  GEOM allows for multiple read/exclusive consumers and
	 * ZFS allows only one exclusive consumer, no matter if it is reader or
	 * writer.  I like better the way GEOM works so I'll leave it for GEOM
	 * to decide what to do.
	 */

	count = acr + acw + ace;
	if (count == 0)
		return (0);

	flags = 0;
	if (acr != 0 || ace != 0)
		flags |= FREAD;
	if (acw != 0)
		flags |= FWRITE;

	g_topology_unlock();
	if (count > 0)
		error = zvol_geom_open(pp, flags, count);
	else
		error = zvol_geom_close(pp, flags, -count);
	g_topology_lock();
	return (error);
}
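
/*
 * For example, a consumer requesting read-write access passes the deltas
 * acr=1, acw=1, ace=0; count is then 2 and zvol_geom_open() is called
 * with FREAD|FWRITE.  Dropping that access later arrives as acr=-1,
 * acw=-1, ace=0, so count is -2 and zvol_geom_close() is called with
 * -count = 2, balancing zv_open_count exactly.
 */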

static void
zvol_geom_worker(void *arg)
{
	zvol_state_t *zv = arg;
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct bio *bp;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	for (;;) {
		mtx_lock(&zsg->zsg_queue_mtx);
		bp = bioq_takefirst(&zsg->zsg_queue);
		if (bp == NULL) {
			if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
				zsg->zsg_state = ZVOL_GEOM_RUNNING;
				wakeup(&zsg->zsg_state);
				mtx_unlock(&zsg->zsg_queue_mtx);
				kthread_exit();
			}
			msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
			    PRIBIO | PDROP, "zvol:io", 0);
			continue;
		}
		mtx_unlock(&zsg->zsg_queue_mtx);
		zvol_geom_bio_strategy(bp);
	}
}
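
/*
 * The worker exists because zvol_geom_bio_strategy() may sleep (range
 * locks, transaction assignment), while GEOM may hand us bios from a
 * context that must not.  zvol_geom_bio_start() below therefore defers
 * to this thread whenever THREAD_CAN_SLEEP() is false.  On teardown,
 * zvol_os_clear_private() sets zsg_state to ZVOL_GEOM_STOPPED and the
 * worker acknowledges by flipping it back to ZVOL_GEOM_RUNNING before
 * exiting.
 */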

static void
zvol_geom_bio_start(struct bio *bp)
{
	zvol_state_t *zv = bp->bio_to->private;
	struct zvol_state_geom *zsg;
	boolean_t first;

	if (zv == NULL) {
		g_io_deliver(bp, ENXIO);
		return;
	}
	if (bp->bio_cmd == BIO_GETATTR) {
		if (zvol_geom_bio_getattr(bp))
			g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

	if (!THREAD_CAN_SLEEP()) {
		zsg = &zv->zv_zso->zso_geom;
		mtx_lock(&zsg->zsg_queue_mtx);
		first = (bioq_first(&zsg->zsg_queue) == NULL);
		bioq_insert_tail(&zsg->zsg_queue, bp);
		mtx_unlock(&zsg->zsg_queue_mtx);
		if (first)
			wakeup_one(&zsg->zsg_queue);
		return;
	}

	zvol_geom_bio_strategy(bp);
}

static int
zvol_geom_bio_getattr(struct bio *bp)
{
	zvol_state_t *zv;

	zv = bp->bio_to->private;
	ASSERT3P(zv, !=, NULL);

	spa_t *spa = dmu_objset_spa(zv->zv_objset);
	uint64_t refd, avail, usedobjs, availobjs;

	if (g_handleattr_int(bp, "GEOM::candelete", 1))
		return (0);
	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
		avail = metaslab_class_get_space(spa_normal_class(spa));
		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksavail",
		    avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
		refd = metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
			return (0);
	}
	return (1);
}
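
/*
 * Return convention: 0 means the attribute was recognized and the reply
 * was already delivered by g_handleattr_*(); 1 means it was not ours,
 * and zvol_geom_bio_start() completes the bio with EOPNOTSUPP.
 */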

static void
zvol_filter_detach(struct knote *kn)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;

	zv = kn->kn_hook;
	zsd = &zv->zv_zso->zso_dev;

	knlist_remove(&zsd->zsd_selinfo.si_note, kn, 0);
}

static int
zvol_filter_vnode(struct knote *kn, long hint)
{
	kn->kn_fflags |= kn->kn_sfflags & hint;

	return (kn->kn_fflags != 0);
}

static int
zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;

	zv = dev->si_drv2;
	zsd = &zv->zv_zso->zso_dev;

	if (kn->kn_filter != EVFILT_VNODE)
		return (EINVAL);

	/* XXX: extend support for other NOTE_* events */
	if (kn->kn_sfflags != NOTE_ATTRIB)
		return (EINVAL);

	kn->kn_fop = &zvol_filterops_vnode;
	kn->kn_hook = zv;
	knlist_add(&zsd->zsd_selinfo.si_note, kn, 0);

	return (0);
}
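
/*
 * The only event currently posted to this knlist is NOTE_ATTRIB, fired
 * from zvol_os_update_volsize() when a volmode=dev zvol is resized, so a
 * kevent(2) consumer can watch the device node to learn about capacity
 * changes.
 */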

static void
zvol_geom_bio_strategy(struct bio *bp)
{
	zvol_state_t *zv;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t doread = B_FALSE;
	boolean_t is_dumpified;
	boolean_t sync;

	if (bp->bio_to)
		zv = bp->bio_to->private;
	else
		zv = bp->bio_dev->si_drv2;

	if (zv == NULL) {
		error = SET_ERROR(ENXIO);
		goto out;
	}

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);

	switch (bp->bio_cmd) {
	case BIO_READ:
		doread = B_TRUE;
		break;
	case BIO_WRITE:
	case BIO_FLUSH:
	case BIO_DELETE:
		if (zv->zv_flags & ZVOL_RDONLY) {
			error = SET_ERROR(EROFS);
			goto resume;
		}
		zvol_ensure_zilog(zv);
		if (bp->bio_cmd == BIO_FLUSH)
			goto sync;
		break;
	default:
		error = SET_ERROR(EOPNOTSUPP);
		goto resume;
	}

	off = bp->bio_offset;
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT3P(os, !=, NULL);

	addr = bp->bio_data;
	resid = bp->bio_length;

	if (resid > 0 && off >= volsize) {
		error = SET_ERROR(EIO);
		goto resume;
	}

	is_dumpified = B_FALSE;
	sync = !doread && !is_dumpified &&
	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
	    doread ? RL_READER : RL_WRITER);

	if (bp->bio_cmd == BIO_DELETE) {
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			zvol_log_truncate(zv, tx, off, resid, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    off, resid);
			resid = 0;
		}
		goto unlock;
	}
	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
		if (doread) {
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, sync);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
unlock:
	zfs_rangelock_exit(lr);

	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length && off > volsize)
		error = SET_ERROR(EINVAL);

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		break;
	case BIO_READ:
		dataset_kstats_update_read_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_WRITE:
		dataset_kstats_update_write_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_DELETE:
		break;
	default:
		break;
	}

	if (sync) {
sync:
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	}
resume:
	rw_exit(&zv->zv_suspend_lock);
out:
	if (bp->bio_to)
		g_io_deliver(bp, error);
	else
		biofinish(bp, NULL, error);
}

/*
 * Character device mode implementation
 */

static int
zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	zfs_uio_t uio;

	zfs_uio_init(&uio, uio_s);

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;
	/*
	 * uio_loffset == volsize isn't an error as
	 * it's required for EOF processing.
	 */
	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	ssize_t start_resid = zfs_uio_resid(&uio);
	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_READER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);

		/* Don't read past the end. */
		if (bytes > volsize - zfs_uio_offset(&uio))
			bytes = volsize - zfs_uio_offset(&uio);

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);
	int64_t nread = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);

	return (error);
}

static int
zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t sync;
	zfs_uio_t uio;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;

	zfs_uio_init(&uio, uio_s);

	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	ssize_t start_resid = zfs_uio_resid(&uio);
	sync = (ioflag & IO_SYNC) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	zvol_ensure_zilog(zv);

	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_WRITER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
		uint64_t off = zfs_uio_offset(&uio);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* Don't write past the end. */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, sync);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);
	int64_t nwritten = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	rw_exit(&zv->zv_suspend_lock);
	return (error);
}

static int
zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	/*
	 * Obtain a copy of si_drv2 under zvol_state_lock to make sure either
	 * the result of zvol free code setting si_drv2 to NULL is observed,
	 * or the zv is protected from being freed because of the positive
	 * zv_open_count.
	 */
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flags & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_zv_locked;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
	if (flags & O_EXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count++;
	if (flags & O_SYNC) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt++;
		if (zsd->zsd_sync_cnt == 1 &&
		    (zv->zv_flags & ZVOL_WRITTEN_TO) != 0)
			zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
	}
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

static int
zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count--;
	if (flags & O_SYNC) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt--;
	}

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

static int
zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
    int fflag, struct thread *td)
{
	zvol_state_t *zv;
	zfs_locked_range_t *lr;
	off_t offset, length;
	int error;
	boolean_t sync;

	zv = dev->si_drv2;

	error = 0;
	KASSERT(zv->zv_open_count > 0,
	    ("Device with zero access count in %s", __func__));

	switch (cmd) {
	case DIOCGSECTORSIZE:
		*(uint32_t *)data = DEV_BSIZE;
		break;
	case DIOCGMEDIASIZE:
		*(off_t *)data = zv->zv_volsize;
		break;
	case DIOCGFLUSH:
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		if (zv->zv_zilog != NULL)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGDELETE:
		if (!zvol_unmap_enabled)
			break;

		offset = ((off_t *)data)[0];
		length = ((off_t *)data)[1];
		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
		    offset < 0 || offset >= zv->zv_volsize ||
		    length <= 0) {
			printf("%s: offset=%jd length=%jd\n", __func__, offset,
			    length);
			error = SET_ERROR(EINVAL);
			break;
		}
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		zvol_ensure_zilog(zv);
		lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
		    RL_WRITER);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			sync = FALSE;
			dmu_tx_abort(tx);
		} else {
			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
			zvol_log_truncate(zv, tx, offset, length, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    offset, length);
		}
		zfs_rangelock_exit(lr);
		if (sync)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGSTRIPESIZE:
		*(off_t *)data = zv->zv_volblocksize;
		break;
	case DIOCGSTRIPEOFFSET:
		*(off_t *)data = 0;
		break;
	case DIOCGATTR: {
		spa_t *spa = dmu_objset_spa(zv->zv_objset);
		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
		uint64_t refd, avail, usedobjs, availobjs;

		if (strcmp(arg->name, "GEOM::candelete") == 0)
			arg->value.i = 1;
		else if (strcmp(arg->name, "blocksavail") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "blocksused") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = refd / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
			avail = metaslab_class_get_space(spa_normal_class(spa));
			avail -= metaslab_class_get_alloc(
			    spa_normal_class(spa));
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksused") == 0) {
			refd = metaslab_class_get_alloc(spa_normal_class(spa));
			arg->value.off = refd / DEV_BSIZE;
		} else
			error = SET_ERROR(ENOIOCTL);
		break;
	}
	case FIOSEEKHOLE:
	case FIOSEEKDATA: {
		off_t *off = (off_t *)data;
		uint64_t noff;
		boolean_t hole;

		hole = (cmd == FIOSEEKHOLE);
		noff = *off;
		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
		*off = noff;
		break;
	}
	default:
		error = SET_ERROR(ENOIOCTL);
	}

	return (error);
}
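
/*
 * A hypothetical userland sketch of the DIOCGDELETE contract above (the
 * ioctl argument is a pair of byte counts, both DEV_BSIZE-aligned):
 *
 *	off_t arg[2];
 *	arg[0] = offset;	// start of the range to unmap
 *	arg[1] = length;	// bytes to unmap, must be > 0
 *	if (ioctl(fd, DIOCGDELETE, arg) == -1)
 *		err(1, "DIOCGDELETE");
 */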

/*
 * Misc. helpers
 */

static void
zvol_ensure_zilog(zvol_state_t *zv)
{
	ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

	/*
	 * Open a ZIL if this is the first time we have written to this
	 * zvol.  We protect zv->zv_zilog with zv_suspend_lock rather
	 * than zv_state_lock so that we don't need to acquire an
	 * additional lock in this path.
	 */
	if (zv->zv_zilog == NULL) {
		if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
		}
		if (zv->zv_zilog == NULL) {
			zv->zv_zilog = zil_open(zv->zv_objset,
			    zvol_get_data, &zv->zv_kstat.dk_zil_sums);
			zv->zv_flags |= ZVOL_WRITTEN_TO;
			/* replay / destroy done in zvol_os_create_minor() */
			VERIFY0(zv->zv_zilog->zl_header->zh_flags &
			    ZIL_REPLAY_NEEDED);
		}
		rw_downgrade(&zv->zv_suspend_lock);
	}
}
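
/*
 * Every write-capable entry point (BIO_WRITE/BIO_DELETE/BIO_FLUSH in the
 * strategy routine, zvol_cdev_write(), and DIOCGDELETE) calls this while
 * holding zv_suspend_lock as reader; the double-checked upgrade to writer
 * above makes the lazy zil_open() safe against concurrent first writers.
 */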

boolean_t
zvol_os_is_zvol(const char *device)
{
	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
}
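
/*
 * ZVOL_DIR is the /dev/zvol prefix described at the top of this file, so
 * a path such as /dev/zvol/tank/vol is classified as a zvol purely by
 * name.
 */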

void
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/* Move to a new hashtable entry. */
	zv->zv_hash = zvol_name_hash(newname);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;
		struct g_geom *gp;

		g_topology_lock();
		gp = pp->geom;
		ASSERT3P(gp, !=, NULL);

		zsg->zsg_provider = NULL;
		g_wither_provider(pp, ENXIO);

		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = zv->zv_volsize;
		pp->private = zv;
		zsg->zsg_provider = pp;
		g_error_provider(pp, 0);
		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		dev = zsd->zsd_cdev;
		if (dev != NULL) {
			destroy_dev(dev);
			dev = zsd->zsd_cdev = NULL;
			if (zv->zv_open_count > 0) {
				zv->zv_flags &= ~ZVOL_EXCL;
				zv->zv_open_count = 0;
				/* XXX need suspend lock but lock order */
				zvol_last_close(zv);
			}
		}

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
		    == 0) {
#if __FreeBSD_version > 1300130
			dev->si_iosize_max = maxphys;
#else
			dev->si_iosize_max = MAXPHYS;
#endif
			zsd->zsd_cdev = dev;
		}
	}
	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
}

/*
 * Remove minor node for the specified volume.
 */
void
zvol_os_free(zvol_state_t *zv)
{
	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);

	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp __maybe_unused = zsg->zsg_provider;

		ASSERT3P(pp->private, ==, NULL);

		g_topology_lock();
		zvol_geom_destroy(zv);
		g_topology_unlock();
		mtx_destroy(&zsg->zsg_queue_mtx);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL) {
			ASSERT3P(dev->si_drv2, ==, NULL);
			destroy_dev(dev);
			knlist_clear(&zsd->zsd_selinfo.si_note, 0);
			knlist_destroy(&zsd->zsd_selinfo.si_note);
		}
	}

	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);
	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	zvol_minors--;
}

/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 */
int
zvol_os_create_minor(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t volmode, hash;
	int error;

	ZFS_LOG(1, "Creating ZVOL %s...", name);
	hash = zvol_name_hash(name);
	if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		return (SET_ERROR(EEXIST));
	}

	DROP_GIANT();

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	/* Lie and say we're read-only. */
	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	error = dsl_prop_get_integer(name,
	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
	if (error || volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;
	error = 0;

	/*
	 * zvol_alloc equivalent ...
	 */
	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
	zv->zv_hash = hash;
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_volmode = volmode;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp;
		struct g_geom *gp;

		zsg->zsg_state = ZVOL_GEOM_UNINIT;
		mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);

		g_topology_lock();
		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
		gp->start = zvol_geom_bio_start;
		gp->access = zvol_geom_access;
		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = 0;
		pp->private = zv;

		zsg->zsg_provider = pp;
		bioq_init(&zsg->zsg_queue);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name)
		    == 0) {
#if __FreeBSD_version > 1300130
			dev->si_iosize_max = maxphys;
#else
			dev->si_iosize_max = MAXPHYS;
#endif
			zsd->zsd_cdev = dev;
			knlist_init_sx(&zsd->zsd_selinfo.si_note,
			    &zv->zv_state_lock);
		}
	}
	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);

	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
	if (error)
		goto out_dmu_objset_disown;
	ASSERT3P(zv->zv_zilog, ==, NULL);
	zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			zil_destroy(zv->zv_zilog, B_FALSE);
		else
			zil_replay(os, zv, zvol_replay_vector);
	}
	zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;

	/* TODO: prefetch for geom tasting */

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);

	if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
		zvol_geom_run(zv);
		g_topology_unlock();
	}
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		zvol_minors++;
		rw_exit(&zvol_state_lock);
		ZFS_LOG(1, "ZVOL %s created.", name);
	}
	PICKUP_GIANT();
	return (error);
}
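
/*
 * Note the ZIL lifecycle here: the log is opened and replayed (or
 * destroyed when replay is disabled) once at minor creation, then closed
 * again.  The first subsequent write reopens it lazily through
 * zvol_ensure_zilog(), which is why that function can VERIFY that no
 * replay is still needed.
 */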

void
zvol_os_clear_private(zvol_state_t *zv)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		if (pp->private == NULL) /* already cleared */
			return;

		mtx_lock(&zsg->zsg_queue_mtx);
		zsg->zsg_state = ZVOL_GEOM_STOPPED;
		pp->private = NULL;
		wakeup_one(&zsg->zsg_queue);
		while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
			msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
			    0, "zvol:w", 0);
		mtx_unlock(&zsg->zsg_queue_mtx);
		ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL)
			dev->si_drv2 = NULL;
	}
}

int
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	zv->zv_volsize = volsize;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		g_topology_lock();

		if (pp->private == NULL) {
			g_topology_unlock();
			return (SET_ERROR(ENXIO));
		}

		/*
		 * Do not invoke resize event when initial size was zero.
		 * ZVOL initializes the size on first open, this is not
		 * real resizing.
		 */
		if (pp->mediasize == 0)
			pp->mediasize = zv->zv_volsize;
		else
			g_resize_provider(pp, zv->zv_volsize);

		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;

		KNOTE_UNLOCKED(&zsd->zsd_selinfo.si_note, NOTE_ATTRIB);
	}
	return (0);
}
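
/*
 * The two personalities surface a resize differently: GEOM consumers get
 * a g_resize_provider() event, while volmode=dev consumers get a
 * NOTE_ATTRIB kevent via the knlist registered in zvol_cdev_kqfilter().
 */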

void
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
{
	// XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags);
}

void
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
{
	// XXX? set_capacity(zv->zv_zso->zvo_disk, capacity);
}

/*
 * Public interfaces
 */

boolean_t
zvol_busy(void)
{
	return (zvol_minors != 0);
}

int
zvol_init(void)
{
	zvol_init_impl();
	return (0);
}

void
zvol_fini(void)
{
	zvol_fini_impl();
}