Allow disabling of unmapped I/O on FreeBSD
module/os/freebsd/zfs/vdev_geom.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
 */

#include <sys/zfs_context.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/file.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_os.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <vm/vm_page.h>
#include <geom/geom.h>
#include <geom/geom_disk.h>
#include <geom/geom_int.h>

#ifndef g_topology_locked
#define	g_topology_locked()	sx_xlocked(&topology_lock)
#endif

/*
 * Virtual device vector for GEOM.
 */

static g_attrchanged_t vdev_geom_attrchanged;
struct g_class zfs_vdev_class = {
	.name = "ZFS::VDEV",
	.version = G_VERSION,
	.attrchanged = vdev_geom_attrchanged,
};

struct consumer_vdev_elem {
	SLIST_ENTRY(consumer_vdev_elem)	elems;
	vdev_t	*vd;
};

SLIST_HEAD(consumer_priv_t, consumer_vdev_elem);
/* BEGIN CSTYLED */
_Static_assert(sizeof (((struct g_consumer *)NULL)->private)
    == sizeof (struct consumer_priv_t*),
    "consumer_priv_t* can't be stored in g_consumer.private");
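/*
 * Note: the code below stores the SLIST head directly inside the
 * g_consumer's "private" pointer field (see the casts of &cp->private
 * throughout this file), which is why the size check above must hold.
 */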

DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);

SYSCTL_DECL(_vfs_zfs_vdev);
/* Don't send BIO_FLUSH. */
static int vdev_geom_bio_flush_disable;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN,
	&vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
/* Don't send BIO_DELETE. */
static int vdev_geom_bio_delete_disable;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN,
	&vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
/* END CSTYLED */
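
/*
 * Both knobs are CTLFLAG_RWTUN, so they can be changed at runtime with
 * sysctl(8) or preset as boot-time tunables, e.g.:
 *
 *	sysctl vfs.zfs.vdev.bio_flush_disable=1
 *	vfs.zfs.vdev.bio_delete_disable="1"	(in /boot/loader.conf)
 */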

/* Declare local functions */
static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);

/*
 * Thread local storage used to indicate when a thread is probing geoms
 * for their guids. If NULL, this thread is not tasting geoms. If non NULL,
 * it is looking for a replacement for the vdev_t* that is its value.
 */
uint_t zfs_geom_probe_vdev_key;

static void
vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp,
    boolean_t do_null_update)
{
	boolean_t needs_update = B_FALSE;
	char *physpath;
	int error, physpath_len;

	physpath_len = MAXPATHLEN;
	physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
	error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
	if (error == 0) {
		char *old_physpath;

		/* g_topology lock ensures that vdev has not been closed */
		g_topology_assert();
		old_physpath = vd->vdev_physpath;
		vd->vdev_physpath = spa_strdup(physpath);

		if (old_physpath != NULL) {
			needs_update = (strcmp(old_physpath,
			    vd->vdev_physpath) != 0);
			spa_strfree(old_physpath);
		} else
			needs_update = do_null_update;
	}
	g_free(physpath);

	/*
	 * If the physical path changed, update the config.
	 * Only request an update for previously unset physpaths if
	 * requested by the caller.
	 */
	if (needs_update)
		spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE);
}

static void
vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
{
	struct consumer_priv_t *priv;
	struct consumer_vdev_elem *elem;

	priv = (struct consumer_priv_t *)&cp->private;
	if (SLIST_EMPTY(priv))
		return;

	SLIST_FOREACH(elem, priv, elems) {
		vdev_t *vd = elem->vd;
		if (strcmp(attr, "GEOM::physpath") == 0) {
			vdev_geom_set_physpath(vd, cp, /* null_update */B_TRUE);
			return;
		}
	}
}

static void
vdev_geom_resize(struct g_consumer *cp)
{
	struct consumer_priv_t *priv;
	struct consumer_vdev_elem *elem;
	spa_t *spa;
	vdev_t *vd;

	priv = (struct consumer_priv_t *)&cp->private;
	if (SLIST_EMPTY(priv))
		return;

	SLIST_FOREACH(elem, priv, elems) {
		vd = elem->vd;
		if (vd->vdev_state != VDEV_STATE_HEALTHY)
			continue;
		spa = vd->vdev_spa;
		if (!spa->spa_autoexpand)
			continue;
		vdev_online(spa, vd->vdev_guid, ZFS_ONLINE_EXPAND, NULL);
	}
}

static void
vdev_geom_orphan(struct g_consumer *cp)
{
	struct consumer_priv_t *priv;
	// cppcheck-suppress uninitvar
	struct consumer_vdev_elem *elem;

	g_topology_assert();

	priv = (struct consumer_priv_t *)&cp->private;
	if (SLIST_EMPTY(priv))
		/* Vdev close in progress. Ignore the event. */
		return;

	/*
	 * Orphan callbacks occur from the GEOM event thread.
	 * Concurrent with this call, new I/O requests may be
	 * working their way through GEOM about to find out
	 * (only once executed by the g_down thread) that we've
	 * been orphaned from our disk provider. These I/Os
	 * must be retired before we can detach our consumer.
	 * This is most easily achieved by acquiring the
	 * SPA ZIO configuration lock as a writer, but doing
	 * so with the GEOM topology lock held would cause
	 * a lock order reversal. Instead, rely on the SPA's
	 * async removal support to invoke a close on this
	 * vdev once it is safe to do so.
	 */
	// cppcheck-suppress All
	SLIST_FOREACH(elem, priv, elems) {
		// cppcheck-suppress uninitvar
		vdev_t *vd = elem->vd;

		vd->vdev_remove_wanted = B_TRUE;
		spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
	}
}

static struct g_consumer *
vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	int error;

	g_topology_assert();

	ZFS_LOG(1, "Attaching to %s.", pp->name);

	if (sanity) {
		if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
			ZFS_LOG(1, "Failing attach of %s. "
			    "Incompatible sectorsize %d\n",
			    pp->name, pp->sectorsize);
			return (NULL);
		} else if (pp->mediasize < SPA_MINDEVSIZE) {
			ZFS_LOG(1, "Failing attach of %s. "
			    "Incompatible mediasize %ju\n",
			    pp->name, pp->mediasize);
			return (NULL);
		}
	}

	/* Do we have geom already? No? Create one. */
	LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
		if (gp->flags & G_GEOM_WITHER)
			continue;
		if (strcmp(gp->name, "zfs::vdev") != 0)
			continue;
		break;
	}
	if (gp == NULL) {
		gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
		gp->orphan = vdev_geom_orphan;
		gp->attrchanged = vdev_geom_attrchanged;
		gp->resize = vdev_geom_resize;
		cp = g_new_consumer(gp);
		error = g_attach(cp, pp);
		if (error != 0) {
			ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
			    __LINE__, error);
			vdev_geom_detach(cp, B_FALSE);
			return (NULL);
		}
		error = g_access(cp, 1, 0, 1);
		if (error != 0) {
			ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__,
			    __LINE__, error);
			vdev_geom_detach(cp, B_FALSE);
			return (NULL);
		}
		ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
	} else {
		/* Check if we are already connected to this provider. */
		LIST_FOREACH(cp, &gp->consumer, consumer) {
			if (cp->provider == pp) {
				ZFS_LOG(1, "Found consumer for %s.", pp->name);
				break;
			}
		}
		if (cp == NULL) {
			cp = g_new_consumer(gp);
			error = g_attach(cp, pp);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
				    __func__, __LINE__, error);
				vdev_geom_detach(cp, B_FALSE);
				return (NULL);
			}
			error = g_access(cp, 1, 0, 1);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
				    __func__, __LINE__, error);
				vdev_geom_detach(cp, B_FALSE);
				return (NULL);
			}
			ZFS_LOG(1, "Created consumer for %s.", pp->name);
		} else {
			error = g_access(cp, 1, 0, 1);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
				    __func__, __LINE__, error);
				return (NULL);
			}
			ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
		}
	}

	if (vd != NULL)
		vd->vdev_tsd = cp;
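
	/*
	 * G_CF_DIRECT_SEND and G_CF_DIRECT_RECEIVE ask GEOM to dispatch
	 * this consumer's requests and completions directly where the
	 * topology permits, rather than queueing them through the
	 * g_down/g_up threads.
	 */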
	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
	return (cp);
}

static void
vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
{
	struct g_geom *gp;

	g_topology_assert();

	ZFS_LOG(1, "Detaching from %s.",
	    cp->provider && cp->provider->name ? cp->provider->name : "NULL");

	gp = cp->geom;
	if (open_for_read)
		g_access(cp, -1, 0, -1);
	/* Destroy consumer on last close. */
	if (cp->acr == 0 && cp->ace == 0) {
		if (cp->acw > 0)
			g_access(cp, 0, -cp->acw, 0);
		if (cp->provider != NULL) {
			ZFS_LOG(1, "Destroying consumer for %s.",
			    cp->provider->name ? cp->provider->name : "NULL");
			g_detach(cp);
		}
		g_destroy_consumer(cp);
	}
	/* Destroy geom if there are no consumers left. */
	if (LIST_EMPTY(&gp->consumer)) {
		ZFS_LOG(1, "Destroyed geom %s.", gp->name);
		g_wither_geom(gp, ENXIO);
	}
}

static void
vdev_geom_close_locked(vdev_t *vd)
{
	struct g_consumer *cp;
	struct consumer_priv_t *priv;
	struct consumer_vdev_elem *elem, *elem_temp;

	g_topology_assert();

	cp = vd->vdev_tsd;
	vd->vdev_delayed_close = B_FALSE;
	if (cp == NULL)
		return;

	ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
	KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__));
	priv = (struct consumer_priv_t *)&cp->private;
	vd->vdev_tsd = NULL;
	SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) {
		if (elem->vd == vd) {
			SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems);
			g_free(elem);
		}
	}

	vdev_geom_detach(cp, B_TRUE);
}

/*
 * Issue one or more bios to the vdev in parallel.
 * cmds, datas, offsets, errors, and sizes are arrays of length ncmds. Each IO
 * operation is described by parallel entries from each array. There may be
 * more bios actually issued than entries in the array.
 */
static void
vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
    off_t *sizes, int *errors, int ncmds)
{
	struct bio **bios;
	uint8_t *p;
	off_t off, maxio, s, end;
	int i, n_bios, j;
	size_t bios_size;

#if __FreeBSD_version > 1300130
	maxio = maxphys - (maxphys % cp->provider->sectorsize);
#else
	maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
#endif
	n_bios = 0;

	/* How many bios are required for all commands ? */
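	/*
	 * (sizes[i] + maxio - 1) / maxio is a ceiling division: each
	 * command needs one bio per maxio-sized chunk.
	 */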
	for (i = 0; i < ncmds; i++)
		n_bios += (sizes[i] + maxio - 1) / maxio;

	/* Allocate memory for the bios */
	bios_size = n_bios * sizeof (struct bio *);
	bios = kmem_zalloc(bios_size, KM_SLEEP);

	/* Prepare and issue all of the bios */
	for (i = j = 0; i < ncmds; i++) {
		off = offsets[i];
		p = datas[i];
		s = sizes[i];
		end = off + s;
		ASSERT0(off % cp->provider->sectorsize);
		ASSERT0(s % cp->provider->sectorsize);

		for (; off < end; off += maxio, p += maxio, s -= maxio, j++) {
			bios[j] = g_alloc_bio();
			bios[j]->bio_cmd = cmds[i];
			bios[j]->bio_done = NULL;
			bios[j]->bio_offset = off;
			bios[j]->bio_length = MIN(s, maxio);
			bios[j]->bio_data = (caddr_t)p;
			g_io_request(bios[j], cp);
		}
	}
	ASSERT3S(j, ==, n_bios);

	/* Wait for all of the bios to complete, and clean them up */
	for (i = j = 0; i < ncmds; i++) {
		off = offsets[i];
		s = sizes[i];
		end = off + s;

		for (; off < end; off += maxio, s -= maxio, j++) {
			errors[i] = biowait(bios[j], "vdev_geom_io") ||
			    errors[i];
			g_destroy_bio(bios[j]);
		}
	}
	kmem_free(bios, bios_size);
}
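
/*
 * Illustrative sketch only: a single synchronous sector-aligned read
 * through the helper above would look roughly like
 *
 *	int cmd = BIO_READ, err = 0;
 *	vdev_geom_io(cp, &cmd, &buf, &off, &size, &err, 1);
 *
 * vdev_geom_read_config() below uses the same pattern to batch all of
 * its label reads into one call.
 */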

/*
 * Read the vdev config from a device. Return the number of valid labels that
 * were found. The vdev config will be returned in config if and only if at
 * least one valid label was found.
 */
static int
vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp)
{
	struct g_provider *pp;
	nvlist_t *config;
	vdev_phys_t *vdev_lists[VDEV_LABELS];
	char *buf;
	size_t buflen;
	uint64_t psize, state, txg;
	off_t offsets[VDEV_LABELS];
	off_t size;
	off_t sizes[VDEV_LABELS];
	int cmds[VDEV_LABELS];
	int errors[VDEV_LABELS];
	int l, nlabels;

	g_topology_assert_not();

	pp = cp->provider;
	ZFS_LOG(1, "Reading config from %s...", pp->name);

	psize = pp->mediasize;
	psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
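
	/*
	 * The expression below rounds sizeof (vdev_phys_t) up to a multiple
	 * of the provider's sector size, since reads must be issued in
	 * whole sectors.
	 */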
	size = sizeof (*vdev_lists[0]) + pp->sectorsize -
	    ((sizeof (*vdev_lists[0]) - 1) % pp->sectorsize) - 1;

	buflen = sizeof (vdev_lists[0]->vp_nvlist);

	/* Create all of the IO requests */
	for (l = 0; l < VDEV_LABELS; l++) {
		cmds[l] = BIO_READ;
		vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
		offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
		sizes[l] = size;
		errors[l] = 0;
		ASSERT0(offsets[l] % pp->sectorsize);
	}

	/* Issue the IO requests */
	vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors,
	    VDEV_LABELS);

	/* Parse the labels */
	config = *configp = NULL;
	nlabels = 0;
	for (l = 0; l < VDEV_LABELS; l++) {
		if (errors[l] != 0)
			continue;

		buf = vdev_lists[l]->vp_nvlist;

		if (nvlist_unpack(buf, buflen, &config, 0) != 0)
			continue;

		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state > POOL_STATE_L2CACHE) {
			nvlist_free(config);
			continue;
		}

		if (state != POOL_STATE_SPARE &&
		    state != POOL_STATE_L2CACHE &&
		    (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0)) {
			nvlist_free(config);
			continue;
		}

		if (*configp != NULL)
			nvlist_free(*configp);
		*configp = config;
		nlabels++;
	}

	/* Free the label storage */
	for (l = 0; l < VDEV_LABELS; l++)
		kmem_free(vdev_lists[l], size);

	return (nlabels);
}

static void
resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
{
	nvlist_t **new_configs;
	uint64_t i;

	if (id < *count)
		return;
	new_configs = kmem_zalloc((id + 1) * sizeof (nvlist_t *),
	    KM_SLEEP);
	for (i = 0; i < *count; i++)
		new_configs[i] = (*configs)[i];
	if (*configs != NULL)
		kmem_free(*configs, *count * sizeof (void *));
	*configs = new_configs;
	*count = id + 1;
}
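
/*
 * Validate a label config against the pool name (and, once known, the
 * pool guid) being searched for, and for each top-level vdev id keep
 * only the config with the highest txg; everything else is freed.
 */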
static void
process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
    const char *name, uint64_t *known_pool_guid)
{
	nvlist_t *vdev_tree;
	uint64_t pool_guid;
	uint64_t vdev_guid;
	uint64_t id, txg, known_txg;
	char *pname;

	if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
	    strcmp(pname, name) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
		goto ignore;

	if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
		goto ignore;

	txg = fnvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG);

	if (*known_pool_guid != 0) {
		if (pool_guid != *known_pool_guid)
			goto ignore;
	} else
		*known_pool_guid = pool_guid;

	resize_configs(configs, count, id);

	if ((*configs)[id] != NULL) {
		known_txg = fnvlist_lookup_uint64((*configs)[id],
		    ZPOOL_CONFIG_POOL_TXG);
		if (txg <= known_txg)
			goto ignore;
		nvlist_free((*configs)[id]);
	}

	(*configs)[id] = cfg;
	return;

ignore:
	nvlist_free(cfg);
}

int
vdev_geom_read_pool_label(const char *name,
    nvlist_t ***configs, uint64_t *count)
{
	struct g_class *mp;
	struct g_geom *gp;
	struct g_provider *pp;
	struct g_consumer *zcp;
	nvlist_t *vdev_cfg;
	uint64_t pool_guid;
	int nlabels;

	DROP_GIANT();
	g_topology_lock();

	*configs = NULL;
	*count = 0;
	pool_guid = 0;
	LIST_FOREACH(mp, &g_classes, class) {
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				if (pp->flags & G_PF_WITHER)
					continue;
				zcp = vdev_geom_attach(pp, NULL, B_TRUE);
				if (zcp == NULL)
					continue;
				g_topology_unlock();
				nlabels = vdev_geom_read_config(zcp, &vdev_cfg);
				g_topology_lock();
				vdev_geom_detach(zcp, B_TRUE);
				if (nlabels == 0)
					continue;
				ZFS_LOG(1, "successfully read vdev config");

				process_vdev_config(configs, count,
				    vdev_cfg, name, &pool_guid);
			}
		}
	}
	g_topology_unlock();
	PICKUP_GIANT();

	return (*count > 0 ? 0 : ENOENT);
}

enum match {
	NO_MATCH = 0,		/* No matching labels found */
	TOPGUID_MATCH = 1,	/* Labels match top guid, not vdev guid */
	ZERO_MATCH = 1,		/* Should never be returned */
	ONE_MATCH = 2,		/* 1 label matching the vdev_guid */
	TWO_MATCH = 3,		/* 2 labels matching the vdev_guid */
	THREE_MATCH = 4,	/* 3 labels matching the vdev_guid */
	FULL_MATCH = 5		/* all labels match the vdev_guid */
};

static enum match
vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
{
	nvlist_t *config;
	uint64_t pool_guid, top_guid, vdev_guid;
	struct g_consumer *cp;
	int nlabels;

	cp = vdev_geom_attach(pp, NULL, B_TRUE);
	if (cp == NULL) {
		ZFS_LOG(1, "Unable to attach tasting instance to %s.",
		    pp->name);
		return (NO_MATCH);
	}
	g_topology_unlock();
	nlabels = vdev_geom_read_config(cp, &config);
	g_topology_lock();
	vdev_geom_detach(cp, B_TRUE);
	if (nlabels == 0) {
		ZFS_LOG(1, "Unable to read config from %s.", pp->name);
		return (NO_MATCH);
	}

	pool_guid = 0;
	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid);
	top_guid = 0;
	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid);
	vdev_guid = 0;
	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
	nvlist_free(config);

	/*
	 * Check that the label's pool guid matches the desired guid.
	 * Inactive spares and L2ARCs do not have any pool guid in the label.
	 */
	if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) {
		ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.",
		    pp->name,
		    (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid);
		return (NO_MATCH);
	}

	/*
	 * Check that the label's vdev guid matches the desired guid.
	 * The second condition handles possible race on vdev detach, when
	 * remaining vdev receives GUID of destroyed top level mirror vdev.
	 */
	if (vdev_guid == vd->vdev_guid) {
		ZFS_LOG(1, "guids match for provider %s.", pp->name);
		return (ZERO_MATCH + nlabels);
	} else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) {
		ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name);
		return (TOPGUID_MATCH);
	}
	ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.",
	    pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid);
	return (NO_MATCH);
}

static struct g_consumer *
vdev_geom_attach_by_guids(vdev_t *vd)
{
	struct g_class *mp;
	struct g_geom *gp;
	struct g_provider *pp, *best_pp;
	struct g_consumer *cp;
	const char *vdpath;
	enum match match, best_match;

	g_topology_assert();

	vdpath = vd->vdev_path + sizeof ("/dev/") - 1;
	cp = NULL;
	best_pp = NULL;
	best_match = NO_MATCH;
	LIST_FOREACH(mp, &g_classes, class) {
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				match = vdev_attach_ok(vd, pp);
				if (match > best_match) {
					best_match = match;
					best_pp = pp;
				} else if (match == best_match) {
					if (strcmp(pp->name, vdpath) == 0) {
						best_pp = pp;
					}
				}
				if (match == FULL_MATCH)
					goto out;
			}
		}
	}

out:
	if (best_pp) {
		cp = vdev_geom_attach(best_pp, vd, B_TRUE);
		if (cp == NULL) {
			printf("ZFS WARNING: Unable to attach to %s.\n",
			    best_pp->name);
		}
	}
	return (cp);
}

static struct g_consumer *
vdev_geom_open_by_guids(vdev_t *vd)
{
	struct g_consumer *cp;
	char *buf;
	size_t len;

	g_topology_assert();

	ZFS_LOG(1, "Searching by guids [%ju:%ju].",
	    (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
	cp = vdev_geom_attach_by_guids(vd);
	if (cp != NULL) {
		len = strlen(cp->provider->name) + strlen("/dev/") + 1;
		buf = kmem_alloc(len, KM_SLEEP);

		snprintf(buf, len, "/dev/%s", cp->provider->name);
		spa_strfree(vd->vdev_path);
		vd->vdev_path = buf;

		ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid, cp->provider->name);
	} else {
		ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid);
	}
	return (cp);
}

static struct g_consumer *
vdev_geom_open_by_path(vdev_t *vd, int check_guid)
{
	struct g_provider *pp;
	struct g_consumer *cp;

	g_topology_assert();

	cp = NULL;
	pp = g_provider_by_name(vd->vdev_path + sizeof ("/dev/") - 1);
	if (pp != NULL) {
		ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
		if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH)
			cp = vdev_geom_attach(pp, vd, B_FALSE);
	}

	return (cp);
}

static int
vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	struct g_provider *pp;
	struct g_consumer *cp;
	int error, has_trim;
	uint16_t rate;

	/*
	 * Set the TLS to indicate downstack that we
	 * should not access zvols
	 */
	VERIFY0(tsd_set(zfs_geom_probe_vdev_key, vd));

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || strncmp(vd->vdev_path, "/dev/", 5) != 0) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	/*
	 * Reopen the device if it's not currently open. Otherwise,
	 * just update the physical size of the device.
	 */
	if ((cp = vd->vdev_tsd) != NULL) {
		ASSERT(vd->vdev_reopening);
		goto skip_open;
	}

	DROP_GIANT();
	g_topology_lock();
	error = 0;

	if (vd->vdev_spa->spa_is_splitting ||
	    ((vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
	    (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
	    vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)))) {
		/*
		 * We are dealing with a vdev that hasn't been previously
		 * opened (since boot), and we are not loading an
		 * existing pool configuration. This looks like a
		 * vdev add operation to a new or existing pool.
		 * Assume the user really wants to do this, and find
		 * GEOM provider by its name, ignoring GUID mismatches.
		 *
		 * XXPOLICY: It would be safer to only allow a device
		 *           that is unlabeled or labeled but missing
		 *           GUID information to be opened in this fashion,
		 *           unless we are doing a split, in which case we
		 *           should allow any guid.
		 */
		cp = vdev_geom_open_by_path(vd, 0);
	} else {
		/*
		 * Try using the recorded path for this device, but only
		 * accept it if its label data contains the expected GUIDs.
		 */
		cp = vdev_geom_open_by_path(vd, 1);
		if (cp == NULL) {
			/*
			 * The device at vd->vdev_path doesn't have the
			 * expected GUIDs. The disks might have merely
			 * moved around so try all other GEOM providers
			 * to find one with the right GUIDs.
			 */
			cp = vdev_geom_open_by_guids(vd);
		}
	}

	/* Clear the TLS now that tasting is done */
	VERIFY0(tsd_set(zfs_geom_probe_vdev_key, NULL));

	if (cp == NULL) {
		ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path);
		error = ENOENT;
	} else {
		struct consumer_priv_t *priv;
		struct consumer_vdev_elem *elem;
		int spamode;

		priv = (struct consumer_priv_t *)&cp->private;
		if (cp->private == NULL)
			SLIST_INIT(priv);
		elem = g_malloc(sizeof (*elem), M_WAITOK|M_ZERO);
		elem->vd = vd;
		SLIST_INSERT_HEAD(priv, elem, elems);

		spamode = spa_mode(vd->vdev_spa);
		if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
		    !ISP2(cp->provider->sectorsize)) {
			ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
			    cp->provider->name);

			vdev_geom_close_locked(vd);
			error = EINVAL;
			cp = NULL;
		} else if (cp->acw == 0 && (spamode & FWRITE) != 0) {
			int i;

			for (i = 0; i < 5; i++) {
				error = g_access(cp, 0, 1, 0);
				if (error == 0)
					break;
				g_topology_unlock();
				tsleep(vd, 0, "vdev", hz / 2);
				g_topology_lock();
			}
			if (error != 0) {
				printf("ZFS WARNING: Unable to open %s for "
				    "writing (error=%d).\n",
				    cp->provider->name, error);
				vdev_geom_close_locked(vd);
				cp = NULL;
			}
		}
	}

	/* Fetch initial physical path information for this device. */
	if (cp != NULL) {
		vdev_geom_attrchanged(cp, "GEOM::physpath");

		/* Set other GEOM characteristics */
		vdev_geom_set_physpath(vd, cp, /* do_null_update */B_FALSE);
	}

	g_topology_unlock();
	PICKUP_GIANT();
	if (cp == NULL) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		vdev_dbgmsg(vd, "vdev_geom_open: failed to open [error=%d]",
		    error);
		return (error);
	}
skip_open:
	pp = cp->provider;

	/*
	 * Determine the actual size of the device.
	 */
	*max_psize = *psize = pp->mediasize;

	/*
	 * Determine the device's minimum transfer size and preferred
	 * transfer size.
	 */
	*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
	*physical_ashift = 0;
	if (pp->stripesize && pp->stripesize > (1 << *logical_ashift) &&
	    ISP2(pp->stripesize) && pp->stripesize <= (1 << ASHIFT_MAX) &&
	    pp->stripeoffset == 0)
		*physical_ashift = highbit(pp->stripesize) - 1;
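
	/*
	 * For example, a provider with 512-byte sectors and an 8192-byte
	 * stripe yields a logical ashift of 9 and a physical ashift of 13,
	 * since highbit() returns the 1-based position of the most
	 * significant bit set.
	 */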

	/*
	 * Clear the nowritecache settings, so that on a vdev_reopen()
	 * we will try again.
	 */
	vd->vdev_nowritecache = B_FALSE;

	/* Inform the ZIO pipeline that we are non-rotational. */
	error = g_getattr("GEOM::rotation_rate", cp, &rate);
	if (error == 0 && rate == DISK_RR_NON_ROTATING)
		vd->vdev_nonrot = B_TRUE;
	else
		vd->vdev_nonrot = B_FALSE;

	/* Set when device reports it supports TRIM. */
	error = g_getattr("GEOM::candelete", cp, &has_trim);
	vd->vdev_has_trim = (error == 0 && has_trim);

	/* Set when device reports it supports secure TRIM. */
	/* unavailable on FreeBSD */
	vd->vdev_has_securetrim = B_FALSE;

	return (0);
}

static void
vdev_geom_close(vdev_t *vd)
{
	struct g_consumer *cp;
	boolean_t locked;

	cp = vd->vdev_tsd;

	DROP_GIANT();
	locked = g_topology_locked();
	if (!locked)
		g_topology_lock();

	if (!vd->vdev_reopening ||
	    (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 ||
	    (cp->provider != NULL && cp->provider->error != 0))))
		vdev_geom_close_locked(vd);

	if (!locked)
		g_topology_unlock();
	PICKUP_GIANT();
}

static void
vdev_geom_io_intr(struct bio *bp)
{
	vdev_t *vd;
	zio_t *zio;

	zio = bp->bio_caller1;
	vd = zio->io_vd;
	zio->io_error = bp->bio_error;
	if (zio->io_error == 0 && bp->bio_resid != 0)
		zio->io_error = SET_ERROR(EIO);

	switch (zio->io_error) {
	case ENOTSUP:
		/*
		 * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
		 * that future attempts will never succeed. In this case
		 * we set a persistent flag so that we don't bother with
		 * requests in the future.
		 */
		switch (bp->bio_cmd) {
		case BIO_FLUSH:
			vd->vdev_nowritecache = B_TRUE;
			break;
		case BIO_DELETE:
			break;
		}
		break;
	case ENXIO:
		if (!vd->vdev_remove_wanted) {
			/*
			 * If provider's error is set we assume it is being
			 * removed.
			 */
			if (bp->bio_to->error != 0) {
				vd->vdev_remove_wanted = B_TRUE;
				spa_async_request(zio->io_spa,
				    SPA_ASYNC_REMOVE);
			} else if (!vd->vdev_delayed_close) {
				vd->vdev_delayed_close = B_TRUE;
			}
		}
		break;
	}

	/*
	 * We have to split bio freeing into two parts, because the ABD code
	 * cannot be called in this context and vdev_op_io_done is not called
	 * for ZIO_TYPE_IOCTL zio-s.
	 */
	if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
		g_destroy_bio(bp);
		zio->io_bio = NULL;
	}
	zio_delay_interrupt(zio);
}

struct vdev_geom_check_unmapped_cb_state {
	int	pages;
	uint_t	end;
};

/*
 * Callback to check the ABD segment size/alignment and count the pages.
 * GEOM requires data buffer to look virtually contiguous. It means only
 * the first page of the buffer may not start and only the last may not
 * end on a page boundary. All other physical pages must be full.
 */
static int
vdev_geom_check_unmapped_cb(void *buf, size_t len, void *priv)
{
	struct vdev_geom_check_unmapped_cb_state *s = priv;
	vm_offset_t off = (vm_offset_t)buf & PAGE_MASK;

	if (s->pages != 0 && off != 0)
		return (1);
	if (s->end != 0)
		return (1);
	s->end = (off + len) & PAGE_MASK;
	s->pages += (off + len + PAGE_MASK) >> PAGE_SHIFT;
	return (0);
}
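
/*
 * For instance (assuming 4 KB pages), a scattered ABD made of whole,
 * page-aligned segments passes: s->end stays 0 between calls. A buffer
 * whose first segment ends mid-page leaves s->end non-zero, so the next
 * call returns 1 and aborts the iteration.
 */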

/*
 * Check whether we can use unmapped I/O for this ZIO on this device to
 * avoid data copying between scattered and/or gang ABD buffer and linear.
 */
static int
vdev_geom_check_unmapped(zio_t *zio, struct g_consumer *cp)
{
	struct vdev_geom_check_unmapped_cb_state s;

	/* If unmapped I/O is administratively disabled, respect that. */
	if (!unmapped_buf_allowed)
		return (0);

	/* If the buffer is already linear, then nothing to do here. */
	if (abd_is_linear(zio->io_abd))
		return (0);

	/*
	 * If unmapped I/O is not supported by the GEOM provider,
	 * then we can't do anything and have to copy the data.
	 */
	if ((cp->provider->flags & G_PF_ACCEPT_UNMAPPED) == 0)
		return (0);

	/* Check the buffer chunks sizes/alignments and count pages. */
	s.pages = s.end = 0;
	if (abd_iterate_func(zio->io_abd, 0, zio->io_size,
	    vdev_geom_check_unmapped_cb, &s))
		return (0);
	return (s.pages);
}
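
/*
 * unmapped_buf_allowed is the kernel-wide switch tested above; on FreeBSD
 * it is normally set from the vfs.unmapped_buf_allowed loader tunable,
 * which is what allows unmapped I/O to be disabled administratively.
 */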

/*
 * Callback to translate the ABD segment into array of physical pages.
 */
static int
vdev_geom_fill_unmap_cb(void *buf, size_t len, void *priv)
{
	struct bio *bp = priv;
	vm_offset_t addr = (vm_offset_t)buf;
	vm_offset_t end = addr + len;

	if (bp->bio_ma_n == 0)
		bp->bio_ma_offset = addr & PAGE_MASK;
	do {
		bp->bio_ma[bp->bio_ma_n++] =
		    PHYS_TO_VM_PAGE(pmap_kextract(addr));
		addr += PAGE_SIZE;
	} while (addr < end);
	return (0);
}

static void
vdev_geom_io_start(zio_t *zio)
{
	vdev_t *vd;
	struct g_consumer *cp;
	struct bio *bp;

	vd = zio->io_vd;

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:
		/* XXPOLICY */
		if (!vdev_readable(vd)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		} else {
			switch (zio->io_cmd) {
			case DKIOCFLUSHWRITECACHE:
				if (zfs_nocacheflush ||
				    vdev_geom_bio_flush_disable)
					break;
				if (vd->vdev_nowritecache) {
					zio->io_error = SET_ERROR(ENOTSUP);
					break;
				}
				goto sendreq;
			default:
				zio->io_error = SET_ERROR(ENOTSUP);
			}
		}

		zio_execute(zio);
		return;
	case ZIO_TYPE_TRIM:
		if (!vdev_geom_bio_delete_disable) {
			goto sendreq;
		}
		zio_execute(zio);
		return;
	default:
		;
		/* PASSTHROUGH --- placate compiler */
	}
sendreq:
	ASSERT(zio->io_type == ZIO_TYPE_READ ||
	    zio->io_type == ZIO_TYPE_WRITE ||
	    zio->io_type == ZIO_TYPE_TRIM ||
	    zio->io_type == ZIO_TYPE_IOCTL);

	cp = vd->vdev_tsd;
	if (cp == NULL) {
		zio->io_error = SET_ERROR(ENXIO);
		zio_interrupt(zio);
		return;
	}
	bp = g_alloc_bio();
	bp->bio_caller1 = zio;
	switch (zio->io_type) {
	case ZIO_TYPE_READ:
	case ZIO_TYPE_WRITE:
		zio->io_target_timestamp = zio_handle_io_delay(zio);
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		if (zio->io_type == ZIO_TYPE_READ)
			bp->bio_cmd = BIO_READ;
		else
			bp->bio_cmd = BIO_WRITE;

		/*
		 * If possible, represent scattered and/or gang ABD buffer to
		 * GEOM as an array of physical pages. It allows to satisfy
		 * requirement of virtually contiguous buffer without copying.
		 */
		int pgs = vdev_geom_check_unmapped(zio, cp);
		if (pgs > 0) {
			bp->bio_ma = malloc(sizeof (struct vm_page *) * pgs,
			    M_DEVBUF, M_WAITOK);
			bp->bio_ma_n = 0;
			bp->bio_ma_offset = 0;
			abd_iterate_func(zio->io_abd, 0, zio->io_size,
			    vdev_geom_fill_unmap_cb, bp);
			bp->bio_data = unmapped_buf;
			bp->bio_flags |= BIO_UNMAPPED;
		} else {
			if (zio->io_type == ZIO_TYPE_READ) {
				bp->bio_data = abd_borrow_buf(zio->io_abd,
				    zio->io_size);
			} else {
				bp->bio_data = abd_borrow_buf_copy(zio->io_abd,
				    zio->io_size);
			}
		}
		break;
	case ZIO_TYPE_TRIM:
		bp->bio_cmd = BIO_DELETE;
		bp->bio_data = NULL;
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		break;
	case ZIO_TYPE_IOCTL:
		bp->bio_cmd = BIO_FLUSH;
		bp->bio_data = NULL;
		bp->bio_offset = cp->provider->mediasize;
		bp->bio_length = 0;
		break;
	default:
		panic("invalid zio->io_type: %d\n", zio->io_type);
	}
	bp->bio_done = vdev_geom_io_intr;
	zio->io_bio = bp;

	g_io_request(bp, cp);
}

static void
vdev_geom_io_done(zio_t *zio)
{
	struct bio *bp = zio->io_bio;

	if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
		ASSERT3P(bp, ==, NULL);
		return;
	}

	if (bp == NULL) {
		ASSERT3S(zio->io_error, ==, ENXIO);
		return;
	}

	if (bp->bio_ma != NULL) {
		free(bp->bio_ma, M_DEVBUF);
	} else {
		if (zio->io_type == ZIO_TYPE_READ) {
			abd_return_buf_copy(zio->io_abd, bp->bio_data,
			    zio->io_size);
		} else {
			abd_return_buf(zio->io_abd, bp->bio_data,
			    zio->io_size);
		}
	}

	g_destroy_bio(bp);
	zio->io_bio = NULL;
}

static void
vdev_geom_hold(vdev_t *vd)
{
}

static void
vdev_geom_rele(vdev_t *vd)
{
}

vdev_ops_t vdev_disk_ops = {
	.vdev_op_init = NULL,
	.vdev_op_fini = NULL,
	.vdev_op_open = vdev_geom_open,
	.vdev_op_close = vdev_geom_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_min_asize = vdev_default_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_geom_io_start,
	.vdev_op_io_done = vdev_geom_io_done,
	.vdev_op_state_change = NULL,
	.vdev_op_need_resilver = NULL,
	.vdev_op_hold = vdev_geom_hold,
	.vdev_op_rele = vdev_geom_rele,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_rebuild_asize = NULL,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = NULL,
	.vdev_op_nparity = NULL,
	.vdev_op_ndisks = NULL,
	.vdev_op_type = VDEV_TYPE_DISK,	/* name of this vdev type */
	.vdev_op_leaf = B_TRUE		/* leaf vdev */
};