/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
 */
#include <sys/zfs_context.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_os.h>
#include <sys/fs/zfs.h>
#include <vm/vm_page.h>
#include <geom/geom.h>
#include <geom/geom_disk.h>
#include <geom/geom_int.h>

#ifndef g_topology_locked
#define	g_topology_locked()	sx_xlocked(&topology_lock)
#endif
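/*
 * On FreeBSD versions that do not provide g_topology_locked(), the fallback
 * above approximates it by asking whether the topology sx lock is held
 * exclusively, which is how the GEOM topology lock is implemented.
 */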
/*
 * Virtual device vector for GEOM.
 */

static g_attrchanged_t vdev_geom_attrchanged;
struct g_class zfs_vdev_class = {
	.name = "ZFS::VDEV",
	.version = G_VERSION,
	.attrchanged = vdev_geom_attrchanged,
};
struct consumer_vdev_elem {
	SLIST_ENTRY(consumer_vdev_elem)	elems;
	vdev_t				*vd;
};

SLIST_HEAD(consumer_priv_t, consumer_vdev_elem);
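/*
 * The per-consumer private data is the head of a list of vdevs sharing the
 * consumer; it is stored directly in the g_consumer.private pointer field,
 * which the assertion below verifies is large enough to hold it.
 */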
_Static_assert(sizeof (((struct g_consumer *)NULL)->private)
    == sizeof (struct consumer_priv_t *),
    "consumer_priv_t* can't be stored in g_consumer.private");
DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);

SYSCTL_DECL(_vfs_zfs_vdev);
/* Don't send BIO_FLUSH. */
static int vdev_geom_bio_flush_disable;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN,
    &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");

/* Don't send BIO_DELETE. */
static int vdev_geom_bio_delete_disable;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN,
    &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
/* Declare local functions */
static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);
/*
 * Thread-local storage used to indicate when a thread is probing geoms
 * for their guids. If NULL, this thread is not tasting geoms. If non-NULL,
 * it is looking for a replacement for the vdev_t* that is its value.
 */
uint_t zfs_geom_probe_vdev_key;
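/*
 * vdev_geom_open() sets this key around tasting (via tsd_set()) and clears
 * it once tasting is done, signaling downstack that zvols must not be
 * accessed during the probe.
 */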
static void
vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp,
    boolean_t do_null_update)
{
	boolean_t needs_update = B_FALSE;
	char *physpath;
	int error, physpath_len;

	physpath_len = MAXPATHLEN;
	physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
	error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
	if (error == 0) {
		char *old_physpath;

		/* g_topology lock ensures that vdev has not been closed */
		old_physpath = vd->vdev_physpath;
		vd->vdev_physpath = spa_strdup(physpath);

		if (old_physpath != NULL) {
			needs_update = (strcmp(old_physpath,
			    vd->vdev_physpath) != 0);
			spa_strfree(old_physpath);
		} else
			needs_update = do_null_update;
	}
	g_free(physpath);

	/*
	 * If the physical path changed, update the config.
	 * Only request an update for previously unset physpaths if
	 * requested by the caller.
	 */
	if (needs_update)
		spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE);
}
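/*
 * GEOM attribute-change callback: when a provider's physical path changes,
 * refresh the recorded physpath for every vdev attached to this consumer.
 */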
static void
vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
{
	struct consumer_priv_t *priv;
	struct consumer_vdev_elem *elem;

	priv = (struct consumer_priv_t *)&cp->private;
	if (SLIST_EMPTY(priv))
		return;

	SLIST_FOREACH(elem, priv, elems) {
		vdev_t *vd = elem->vd;
		if (strcmp(attr, "GEOM::physpath") == 0) {
			vdev_geom_set_physpath(vd, cp, /* null_update */B_TRUE);
			return;
		}
	}
}
static void
vdev_geom_resize(struct g_consumer *cp)
{
	struct consumer_priv_t *priv;
	struct consumer_vdev_elem *elem;
	spa_t *spa;
	vdev_t *vd;

	priv = (struct consumer_priv_t *)&cp->private;
	if (SLIST_EMPTY(priv))
		return;

	SLIST_FOREACH(elem, priv, elems) {
		vd = elem->vd;
		if (vd->vdev_state != VDEV_STATE_HEALTHY)
			continue;
		spa = vd->vdev_spa;
		if (!spa->spa_autoexpand)
			continue;
		vdev_online(spa, vd->vdev_guid, ZFS_ONLINE_EXPAND, NULL);
	}
}
static void
vdev_geom_orphan(struct g_consumer *cp)
{
	struct consumer_priv_t *priv;
	// cppcheck-suppress uninitvar
	struct consumer_vdev_elem *elem;

	g_topology_assert();

	priv = (struct consumer_priv_t *)&cp->private;
	if (SLIST_EMPTY(priv))
		/* Vdev close in progress.  Ignore the event. */
		return;

	/*
	 * Orphan callbacks occur from the GEOM event thread.
	 * Concurrent with this call, new I/O requests may be
	 * working their way through GEOM about to find out
	 * (only once executed by the g_down thread) that we've
	 * been orphaned from our disk provider.  These I/Os
	 * must be retired before we can detach our consumer.
	 * This is most easily achieved by acquiring the
	 * SPA ZIO configuration lock as a writer, but doing
	 * so with the GEOM topology lock held would cause
	 * a lock order reversal.  Instead, rely on the SPA's
	 * async removal support to invoke a close on this
	 * vdev once it is safe to do so.
	 */
	// cppcheck-suppress All
	SLIST_FOREACH(elem, priv, elems) {
		// cppcheck-suppress uninitvar
		vdev_t *vd = elem->vd;

		vd->vdev_remove_wanted = B_TRUE;
		spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
	}
}
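/*
 * Attach a consumer to the given provider, creating the "zfs::vdev" geom on
 * first use.  The consumer is opened r1w0e1; when sanity is set, providers
 * whose sector size or media size a vdev could never use are rejected.
 */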
static struct g_consumer *
vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	int error;

	g_topology_assert();

	ZFS_LOG(1, "Attaching to %s.", pp->name);

	if (sanity) {
		if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
			ZFS_LOG(1, "Failing attach of %s. "
			    "Incompatible sectorsize %d\n",
			    pp->name, pp->sectorsize);
			return (NULL);
		} else if (pp->mediasize < SPA_MINDEVSIZE) {
			ZFS_LOG(1, "Failing attach of %s. "
			    "Incompatible mediasize %ju\n",
			    pp->name, pp->mediasize);
			return (NULL);
		}
	}

	/* Do we have geom already? No? Create one. */
	LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
		if (gp->flags & G_GEOM_WITHER)
			continue;
		if (strcmp(gp->name, "zfs::vdev") != 0)
			continue;
		break;
	}
	if (gp == NULL) {
		gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
		gp->orphan = vdev_geom_orphan;
		gp->attrchanged = vdev_geom_attrchanged;
		gp->resize = vdev_geom_resize;
		cp = g_new_consumer(gp);
		error = g_attach(cp, pp);
		if (error != 0) {
			ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
			    __LINE__, error);
			vdev_geom_detach(cp, B_FALSE);
			return (NULL);
		}
		error = g_access(cp, 1, 0, 1);
		if (error != 0) {
			ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__,
			    __LINE__, error);
			vdev_geom_detach(cp, B_FALSE);
			return (NULL);
		}
		ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
	} else {
		/* Check if we are already connected to this provider. */
		LIST_FOREACH(cp, &gp->consumer, consumer) {
			if (cp->provider == pp) {
				ZFS_LOG(1, "Found consumer for %s.", pp->name);
				break;
			}
		}
		if (cp == NULL) {
			cp = g_new_consumer(gp);
			error = g_attach(cp, pp);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
				    __func__, __LINE__, error);
				vdev_geom_detach(cp, B_FALSE);
				return (NULL);
			}
			error = g_access(cp, 1, 0, 1);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
				    __func__, __LINE__, error);
				vdev_geom_detach(cp, B_FALSE);
				return (NULL);
			}
			ZFS_LOG(1, "Created consumer for %s.", pp->name);
		} else {
			error = g_access(cp, 1, 0, 1);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
				    __func__, __LINE__, error);
				return (NULL);
			}
			ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
		}
	}

	if (vd != NULL)
		vd->vdev_tsd = cp;

	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
	return (cp);
}
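/*
 * Drop one r1e1 reference on the consumer and, on the last close, tear the
 * consumer (and, once unreferenced, the geom) back down.
 */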
static void
vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
{
	struct g_geom *gp;

	g_topology_assert();

	ZFS_LOG(1, "Detaching from %s.",
	    cp->provider && cp->provider->name ? cp->provider->name : "NULL");

	gp = cp->geom;
	if (open_for_read)
		g_access(cp, -1, 0, -1);
	/* Destroy consumer on last close. */
	if (cp->acr == 0 && cp->ace == 0) {
		if (cp->acw > 0)
			g_access(cp, 0, -cp->acw, 0);
		if (cp->provider != NULL) {
			ZFS_LOG(1, "Destroying consumer for %s.",
			    cp->provider->name ? cp->provider->name : "NULL");
			g_detach(cp);
		}
		g_destroy_consumer(cp);
	}
	/* Destroy geom if there are no consumers left. */
	if (LIST_EMPTY(&gp->consumer)) {
		ZFS_LOG(1, "Destroyed geom %s.", gp->name);
		g_wither_geom(gp, ENXIO);
	}
}
static void
vdev_geom_close_locked(vdev_t *vd)
{
	struct g_consumer *cp;
	struct consumer_priv_t *priv;
	struct consumer_vdev_elem *elem, *elem_temp;

	g_topology_assert();

	cp = vd->vdev_tsd;
	vd->vdev_delayed_close = B_FALSE;
	if (cp == NULL)
		return;

	ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
	KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__));
	priv = (struct consumer_priv_t *)&cp->private;
	vd->vdev_tsd = NULL;
	SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) {
		if (elem->vd == vd) {
			SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems);
			g_free(elem);
		}
	}

	vdev_geom_detach(cp, B_TRUE);
}
/*
 * Issue one or more bios to the vdev in parallel.
 * cmds, datas, offsets, errors, and sizes are arrays of length ncmds.  Each
 * I/O operation is described by parallel entries from each array.  More bios
 * may actually be issued than there are array entries, because commands
 * larger than the provider's maximum I/O size are split.
 */
static void
vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
    off_t *sizes, int *errors, int ncmds)
{
	struct bio **bios;
	uint8_t *p;
	off_t off, maxio, s, end;
	int i, n_bios, j;
	size_t bios_size;

#if __FreeBSD_version > 1300130
	maxio = maxphys - (maxphys % cp->provider->sectorsize);
#else
	maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
#endif
	n_bios = 0;

	/* How many bios are required for all commands? */
	for (i = 0; i < ncmds; i++)
		n_bios += (sizes[i] + maxio - 1) / maxio;
	/* Allocate memory for the bios */
	bios_size = n_bios * sizeof (struct bio *);
	bios = kmem_zalloc(bios_size, KM_SLEEP);

	/* Prepare and issue all of the bios */
	for (i = j = 0; i < ncmds; i++) {
		off = offsets[i];
		p = datas[i];
		s = sizes[i];
		end = off + s;
		ASSERT0(off % cp->provider->sectorsize);
		ASSERT0(s % cp->provider->sectorsize);

		for (; off < end; off += maxio, p += maxio, s -= maxio, j++) {
			bios[j] = g_alloc_bio();
			bios[j]->bio_cmd = cmds[i];
			bios[j]->bio_done = NULL;
			bios[j]->bio_offset = off;
			bios[j]->bio_length = MIN(s, maxio);
			bios[j]->bio_data = (caddr_t)p;
			g_io_request(bios[j], cp);
		}
	}
	ASSERT3S(j, ==, n_bios);

	/* Wait for all of the bios to complete, and clean them up */
	for (i = j = 0; i < ncmds; i++) {
		off = offsets[i];
		s = sizes[i];
		end = off + s;

		for (; off < end; off += maxio, s -= maxio, j++) {
			errors[i] = biowait(bios[j], "vdev_geom_io") ||
			    errors[i];
			g_destroy_bio(bios[j]);
		}
	}
	kmem_free(bios, bios_size);
}
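/*
 * Example: vdev_geom_read_config() below drives this with four parallel
 * BIO_READ commands, one per label, and then inspects errors[] per label.
 */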
/*
 * Read the vdev config from a device.  Return the number of valid labels that
 * were found.  The vdev config will be returned in config if and only if at
 * least one valid label was found.
 */
static int
vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp)
{
	struct g_provider *pp;
	nvlist_t *config;
	vdev_phys_t *vdev_lists[VDEV_LABELS];
	char *buf;
	size_t buflen;
	uint64_t psize, state, txg;
	off_t offsets[VDEV_LABELS];
	off_t size;
	off_t sizes[VDEV_LABELS];
	int cmds[VDEV_LABELS];
	int errors[VDEV_LABELS];
	int l, nlabels;

	g_topology_assert_not();

	pp = cp->provider;
	ZFS_LOG(1, "Reading config from %s...", pp->name);

	psize = pp->mediasize;
	psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));

	size = sizeof (*vdev_lists[0]) + pp->sectorsize -
	    ((sizeof (*vdev_lists[0]) - 1) % pp->sectorsize) - 1;

	buflen = sizeof (vdev_lists[0]->vp_nvlist);

	/* Create all of the IO requests */
	for (l = 0; l < VDEV_LABELS; l++) {
		cmds[l] = BIO_READ;
		vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
		offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
		sizes[l] = size;
		errors[l] = 0;
		ASSERT0(offsets[l] % pp->sectorsize);
	}

	/* Issue the IO requests */
	vdev_geom_io(cp, cmds, (void **)vdev_lists, offsets, sizes, errors,
	    VDEV_LABELS);

	/* Parse the labels */
	config = *configp = NULL;
	nlabels = 0;
	for (l = 0; l < VDEV_LABELS; l++) {
		if (errors[l] != 0)
			continue;

		buf = vdev_lists[l]->vp_nvlist;

		if (nvlist_unpack(buf, buflen, &config, 0) != 0)
			continue;

		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state > POOL_STATE_L2CACHE) {
			nvlist_free(config);
			continue;
		}

		if (state != POOL_STATE_SPARE &&
		    state != POOL_STATE_L2CACHE &&
		    (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0)) {
			nvlist_free(config);
			continue;
		}

		if (*configp != NULL)
			nvlist_free(*configp);
		*configp = config;
		nlabels++;
	}

	/* Free the label storage */
	for (l = 0; l < VDEV_LABELS; l++)
		kmem_free(vdev_lists[l], size);

	return (nlabels);
}
static void
resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
{
	nvlist_t **new_configs;
	uint64_t i;

	if (id < *count)
		return;
	new_configs = kmem_zalloc((id + 1) * sizeof (nvlist_t *),
	    KM_SLEEP);
	for (i = 0; i < *count; i++)
		new_configs[i] = (*configs)[i];
	if (*configs != NULL)
		kmem_free(*configs, *count * sizeof (void *));
	*configs = new_configs;
	*count = id + 1;
}
static void
process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
    const char *name, uint64_t *known_pool_guid)
{
	nvlist_t *vdev_tree;
	uint64_t pool_guid;
	uint64_t vdev_guid;
	uint64_t id, txg, known_txg;
	char *pname;

	if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
	    strcmp(pname, name) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
		goto ignore;

	if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
		goto ignore;

	txg = fnvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG);

	if (*known_pool_guid != 0) {
		if (pool_guid != *known_pool_guid)
			goto ignore;
	} else
		*known_pool_guid = pool_guid;

	resize_configs(configs, count, id);

	if ((*configs)[id] != NULL) {
		known_txg = fnvlist_lookup_uint64((*configs)[id],
		    ZPOOL_CONFIG_POOL_TXG);
		if (txg <= known_txg)
			goto ignore;
		nvlist_free((*configs)[id]);
	}

	(*configs)[id] = cfg;
	return;

ignore:
	nvlist_free(cfg);
}
int
vdev_geom_read_pool_label(const char *name,
    nvlist_t ***configs, uint64_t *count)
{
	struct g_class *mp;
	struct g_geom *gp;
	struct g_provider *pp;
	struct g_consumer *zcp;
	nvlist_t *vdev_cfg;
	uint64_t pool_guid;
	int nlabels;

	DROP_GIANT();
	g_topology_lock();

	*configs = NULL;
	*count = 0;
	pool_guid = 0;
	LIST_FOREACH(mp, &g_classes, class) {
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				if (pp->flags & G_PF_WITHER)
					continue;
				zcp = vdev_geom_attach(pp, NULL, B_TRUE);
				if (zcp == NULL)
					continue;
				g_topology_unlock();
				nlabels = vdev_geom_read_config(zcp,
				    &vdev_cfg);
				g_topology_lock();
				vdev_geom_detach(zcp, B_TRUE);
				if (nlabels == 0)
					continue;
				ZFS_LOG(1, "successfully read vdev config");

				process_vdev_config(configs, count,
				    vdev_cfg, name, &pool_guid);
			}
		}
	}
	g_topology_unlock();
	PICKUP_GIANT();

	return (*count > 0 ? 0 : ENOENT);
}
enum match {
	NO_MATCH	= 0,	/* No matching labels found */
	TOPGUID_MATCH	= 1,	/* Labels match top guid, not vdev guid */
	ZERO_MATCH	= 1,	/* Should never be returned */
	ONE_MATCH	= 2,	/* 1 label matching the vdev_guid */
	TWO_MATCH	= 3,	/* 2 labels matching the vdev_guid */
	THREE_MATCH	= 4,	/* 3 labels matching the vdev_guid */
	FULL_MATCH	= 5	/* all labels match the vdev_guid */
};
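/*
 * vdev_attach_ok() returns ZERO_MATCH + nlabels when the vdev guid matches,
 * so providers with more intact labels rank strictly higher; the ordering of
 * this enum is what vdev_geom_attach_by_guids() relies on to pick best_pp.
 */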
static enum match
vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
{
	nvlist_t *config;
	uint64_t pool_guid, top_guid, vdev_guid;
	struct g_consumer *cp;
	int nlabels;

	cp = vdev_geom_attach(pp, NULL, B_TRUE);
	if (cp == NULL) {
		ZFS_LOG(1, "Unable to attach tasting instance to %s.",
		    pp->name);
		return (NO_MATCH);
	}
	g_topology_unlock();
	nlabels = vdev_geom_read_config(cp, &config);
	g_topology_lock();
	vdev_geom_detach(cp, B_TRUE);
	if (nlabels == 0) {
		ZFS_LOG(1, "Unable to read config from %s.", pp->name);
		return (NO_MATCH);
	}

	pool_guid = 0;
	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid);
	top_guid = 0;
	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid);
	vdev_guid = 0;
	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
	nvlist_free(config);

	/*
	 * Check that the label's pool guid matches the desired guid.
	 * Inactive spares and L2ARCs do not have any pool guid in the label.
	 */
	if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) {
		ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.",
		    pp->name,
		    (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid);
		return (NO_MATCH);
	}

	/*
	 * Check that the label's vdev guid matches the desired guid.
	 * The second condition handles a possible race on vdev detach, when
	 * the remaining vdev receives the GUID of the destroyed top-level
	 * mirror vdev.
	 */
	if (vdev_guid == vd->vdev_guid) {
		ZFS_LOG(1, "guids match for provider %s.", pp->name);
		return (ZERO_MATCH + nlabels);
	} else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) {
		ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name);
		return (TOPGUID_MATCH);
	}
	ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.",
	    pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid);
	return (NO_MATCH);
}
static struct g_consumer *
vdev_geom_attach_by_guids(vdev_t *vd)
{
	struct g_class *mp;
	struct g_geom *gp;
	struct g_provider *pp, *best_pp;
	struct g_consumer *cp;
	const char *vdpath;
	enum match match, best_match;

	g_topology_assert();

	vdpath = vd->vdev_path + sizeof ("/dev/") - 1;
	cp = NULL;
	best_pp = NULL;
	best_match = NO_MATCH;
	LIST_FOREACH(mp, &g_classes, class) {
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				match = vdev_attach_ok(vd, pp);
				if (match > best_match) {
					best_match = match;
					best_pp = pp;
				} else if (match == best_match) {
					if (strcmp(pp->name, vdpath) == 0) {
						best_pp = pp;
					}
				}
				if (match == FULL_MATCH)
					goto out;
			}
		}
	}

out:
	if (best_pp != NULL) {
		cp = vdev_geom_attach(best_pp, vd, B_TRUE);
		if (cp == NULL) {
			printf("ZFS WARNING: Unable to attach to %s.\n",
			    best_pp->name);
		}
	}
	return (cp);
}
static struct g_consumer *
vdev_geom_open_by_guids(vdev_t *vd)
{
	struct g_consumer *cp;
	char *buf;
	size_t len;

	g_topology_assert();

	ZFS_LOG(1, "Searching by guids [%ju:%ju].",
	    (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
	cp = vdev_geom_attach_by_guids(vd);
	if (cp != NULL) {
		len = strlen(cp->provider->name) + strlen("/dev/") + 1;
		buf = kmem_alloc(len, KM_SLEEP);

		snprintf(buf, len, "/dev/%s", cp->provider->name);
		spa_strfree(vd->vdev_path);
		vd->vdev_path = buf;

		ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid, cp->provider->name);
	} else {
		ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid);
	}

	return (cp);
}
static struct g_consumer *
vdev_geom_open_by_path(vdev_t *vd, int check_guid)
{
	struct g_provider *pp;
	struct g_consumer *cp;

	g_topology_assert();

	cp = NULL;
	pp = g_provider_by_name(vd->vdev_path + sizeof ("/dev/") - 1);
	if (pp != NULL) {
		ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
		if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH)
			cp = vdev_geom_attach(pp, vd, B_FALSE);
	}

	return (cp);
}
static int
vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	struct g_provider *pp;
	struct g_consumer *cp;
	int error, has_trim;
	uint16_t rate;

	/*
	 * Set the TLS to indicate downstack that we
	 * should not access zvols
	 */
	VERIFY0(tsd_set(zfs_geom_probe_vdev_key, vd));

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || strncmp(vd->vdev_path, "/dev/", 5) != 0) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	/*
	 * Reopen the device if it's not currently open. Otherwise,
	 * just update the physical size of the device.
	 */
	if ((cp = vd->vdev_tsd) != NULL) {
		ASSERT(vd->vdev_reopening);
		goto skip_open;
	}

	DROP_GIANT();
	g_topology_lock();
	error = 0;

	if (vd->vdev_spa->spa_is_splitting ||
	    ((vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
	    (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
	    vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)))) {
		/*
		 * We are dealing with a vdev that hasn't been previously
		 * opened (since boot), and we are not loading an
		 * existing pool configuration.  This looks like a
		 * vdev add operation to a new or existing pool.
		 * Assume the user really wants to do this, and find
		 * GEOM provider by its name, ignoring GUID mismatches.
		 *
		 * XXPOLICY: It would be safer to only allow a device
		 *           that is unlabeled or labeled but missing
		 *           GUID information to be opened in this fashion,
		 *           unless we are doing a split, in which case we
		 *           should allow any guid.
		 */
		cp = vdev_geom_open_by_path(vd, 0);
	} else {
		/*
		 * Try using the recorded path for this device, but only
		 * accept it if its label data contains the expected GUIDs.
		 */
		cp = vdev_geom_open_by_path(vd, 1);
		if (cp == NULL) {
			/*
			 * The device at vd->vdev_path doesn't have the
			 * expected GUIDs. The disks might have merely
			 * moved around so try all other GEOM providers
			 * to find one with the right GUIDs.
			 */
			cp = vdev_geom_open_by_guids(vd);
		}
	}

	/* Clear the TLS now that tasting is done */
	VERIFY0(tsd_set(zfs_geom_probe_vdev_key, NULL));

	if (cp == NULL) {
		ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path);
		error = ENOENT;
	} else {
		struct consumer_priv_t *priv;
		struct consumer_vdev_elem *elem;
		int spamode;

		priv = (struct consumer_priv_t *)&cp->private;
		if (cp->private == NULL)
			SLIST_INIT(priv);
		elem = g_malloc(sizeof (*elem), M_WAITOK|M_ZERO);
		elem->vd = vd;
		SLIST_INSERT_HEAD(priv, elem, elems);

		spamode = spa_mode(vd->vdev_spa);
		if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
		    !ISP2(cp->provider->sectorsize)) {
			ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
			    cp->provider->name);

			vdev_geom_close_locked(vd);
			error = EINVAL;
			cp = NULL;
		} else if (cp->acw == 0 && (spamode & FWRITE) != 0) {
			int i;

			for (i = 0; i < 5; i++) {
				error = g_access(cp, 0, 1, 0);
				if (error == 0)
					break;
				g_topology_unlock();
				tsleep(vd, 0, "vdev", hz / 2);
				g_topology_lock();
			}
			if (error != 0) {
				printf("ZFS WARNING: Unable to open %s for "
				    "writing (error=%d).\n",
				    cp->provider->name, error);
				vdev_geom_close_locked(vd);
				cp = NULL;
			}
		}
	}

	/* Fetch initial physical path information for this device. */
	if (cp != NULL) {
		vdev_geom_attrchanged(cp, "GEOM::physpath");

		/* Set other GEOM characteristics */
		vdev_geom_set_physpath(vd, cp, /* do_null_update */B_FALSE);
	}

	g_topology_unlock();
	PICKUP_GIANT();
	if (cp == NULL) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		vdev_dbgmsg(vd, "vdev_geom_open: failed to open [error=%d]",
		    error);
		return (error);
	}
skip_open:
	pp = cp->provider;

	/*
	 * Determine the actual size of the device.
	 */
	*max_psize = *psize = pp->mediasize;

	/*
	 * Determine the device's minimum transfer size and preferred
	 * sector size.
	 */
	*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
	*physical_ashift = 0;
	if (pp->stripesize && pp->stripesize > (1 << *logical_ashift) &&
	    ISP2(pp->stripesize) && pp->stripesize <= (1 << ASHIFT_MAX) &&
	    pp->stripeoffset == 0)
		*physical_ashift = highbit(pp->stripesize) - 1;

	/*
	 * Clear the nowritecache settings, so that on a vdev_reopen()
	 * we will try again.
	 */
	vd->vdev_nowritecache = B_FALSE;

	/* Inform the ZIO pipeline that we are non-rotational. */
	error = g_getattr("GEOM::rotation_rate", cp, &rate);
	if (error == 0 && rate == DISK_RR_NON_ROTATING)
		vd->vdev_nonrot = B_TRUE;
	else
		vd->vdev_nonrot = B_FALSE;

	/* Set when device reports it supports TRIM. */
	error = g_getattr("GEOM::candelete", cp, &has_trim);
	vd->vdev_has_trim = (error == 0 && has_trim);

	/* Set when device reports it supports secure TRIM. */
	/* unavailable on FreeBSD */
	vd->vdev_has_securetrim = B_FALSE;

	return (0);
}
static void
vdev_geom_close(vdev_t *vd)
{
	struct g_consumer *cp;
	boolean_t locked;

	cp = vd->vdev_tsd;

	DROP_GIANT();
	locked = g_topology_locked();
	if (!locked)
		g_topology_lock();

	if (!vd->vdev_reopening ||
	    (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 ||
	    (cp->provider != NULL && cp->provider->error != 0))))
		vdev_geom_close_locked(vd);

	if (!locked)
		g_topology_unlock();
	PICKUP_GIANT();
}
static void
vdev_geom_io_intr(struct bio *bp)
{
	vdev_t *vd;
	zio_t *zio;

	zio = bp->bio_caller1;
	vd = zio->io_vd;
	zio->io_error = bp->bio_error;
	if (zio->io_error == 0 && bp->bio_resid != 0)
		zio->io_error = SET_ERROR(EIO);

	switch (zio->io_error) {
	case ENOTSUP:
		/*
		 * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
		 * that future attempts will never succeed. In this case
		 * we set a persistent flag so that we don't bother with
		 * requests in the future.
		 */
		switch (bp->bio_cmd) {
		case BIO_FLUSH:
			vd->vdev_nowritecache = B_TRUE;
			break;
		case BIO_DELETE:
			break;
		}
		break;
	case ENXIO:
		if (!vd->vdev_remove_wanted) {
			/*
			 * If the provider's error is set we assume it is
			 * being removed.
			 */
			if (bp->bio_to->error != 0) {
				vd->vdev_remove_wanted = B_TRUE;
				spa_async_request(zio->io_spa,
				    SPA_ASYNC_REMOVE);
			} else if (!vd->vdev_delayed_close) {
				vd->vdev_delayed_close = B_TRUE;
			}
		}
		break;
	}

	/*
	 * We have to split bio freeing into two parts, because the ABD code
	 * cannot be called in this context and vdev_op_io_done is not called
	 * for ZIO_TYPE_IOCTL zio-s.
	 */
	if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
		g_destroy_bio(bp);
		zio->io_bio = NULL;
	}
	zio_delay_interrupt(zio);
}
struct vdev_geom_check_unmapped_cb_state {
	int	pages;
	uint_t	end;
};

/*
 * Callback to check the ABD segment size/alignment and count the pages.
 * GEOM requires the data buffer to look virtually contiguous.  That means
 * only the first page of the buffer may start, and only the last may end,
 * off a page boundary.  All other physical pages must be full.
 */
static int
vdev_geom_check_unmapped_cb(void *buf, size_t len, void *priv)
{
	struct vdev_geom_check_unmapped_cb_state *s = priv;
	vm_offset_t off = (vm_offset_t)buf & PAGE_MASK;

	if (s->pages != 0 && off != 0)
		return (1);
	if (s->end != 0)
		return (1);
	s->end = (off + len) & PAGE_MASK;
	s->pages += (off + len + PAGE_MASK) >> PAGE_SHIFT;
	return (0);
}
/*
 * Check whether we can use unmapped I/O for this ZIO on this device to
 * avoid data copying between scattered and/or gang ABD buffers and linear.
 */
static int
vdev_geom_check_unmapped(zio_t *zio, struct g_consumer *cp)
{
	struct vdev_geom_check_unmapped_cb_state s;

	/* If unmapped I/O is administratively disabled, respect that. */
	if (!unmapped_buf_allowed)
		return (0);

	/* If the buffer is already linear, then nothing to do here. */
	if (abd_is_linear(zio->io_abd))
		return (0);

	/*
	 * If unmapped I/O is not supported by the GEOM provider,
	 * then we can't do anything and have to copy the data.
	 */
	if ((cp->provider->flags & G_PF_ACCEPT_UNMAPPED) == 0)
		return (0);

	/* Check the buffer chunks sizes/alignments and count pages. */
	s.pages = s.end = 0;
	if (abd_iterate_func(zio->io_abd, 0, zio->io_size,
	    vdev_geom_check_unmapped_cb, &s))
		return (0);
	return (s.pages);
}
/*
 * Callback to translate an ABD segment into an array of physical pages.
 */
static int
vdev_geom_fill_unmap_cb(void *buf, size_t len, void *priv)
{
	struct bio *bp = priv;
	vm_offset_t addr = (vm_offset_t)buf;
	vm_offset_t end = addr + len;

	if (bp->bio_ma_n == 0)
		bp->bio_ma_offset = addr & PAGE_MASK;
	do {
		bp->bio_ma[bp->bio_ma_n++] =
		    PHYS_TO_VM_PAGE(pmap_kextract(addr));
		addr += PAGE_SIZE - (addr & PAGE_MASK);
	} while (addr < end);
	return (0);
}
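/*
 * bio_ma is sized from the page count returned by vdev_geom_check_unmapped(),
 * and bio_ma_offset records where the data starts within the first page.
 */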
static void
vdev_geom_io_start(zio_t *zio)
{
	vdev_t *vd;
	struct g_consumer *cp;
	struct bio *bp;

	vd = zio->io_vd;

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:
		/* XXPOLICY */
		if (!vdev_readable(vd)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		} else {
			switch (zio->io_cmd) {
			case DKIOCFLUSHWRITECACHE:
				if (zfs_nocacheflush ||
				    vdev_geom_bio_flush_disable)
					break;
				if (vd->vdev_nowritecache) {
					zio->io_error = SET_ERROR(ENOTSUP);
					break;
				}
				goto sendreq;
			default:
				zio->io_error = SET_ERROR(ENOTSUP);
			}
		}

		zio_execute(zio);
		return;
	case ZIO_TYPE_TRIM:
		if (!vdev_geom_bio_delete_disable) {
			goto sendreq;
		}
		zio_execute(zio);
		return;
	default:
		;
		/* PASSTHROUGH --- placate compiler */
	}
sendreq:
	ASSERT(zio->io_type == ZIO_TYPE_READ ||
	    zio->io_type == ZIO_TYPE_WRITE ||
	    zio->io_type == ZIO_TYPE_TRIM ||
	    zio->io_type == ZIO_TYPE_IOCTL);

	cp = vd->vdev_tsd;
	if (cp == NULL) {
		zio->io_error = SET_ERROR(ENXIO);
		zio_interrupt(zio);
		return;
	}
	bp = g_alloc_bio();
	bp->bio_caller1 = zio;
	switch (zio->io_type) {
	case ZIO_TYPE_READ:
	case ZIO_TYPE_WRITE:
		zio->io_target_timestamp = zio_handle_io_delay(zio);
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		if (zio->io_type == ZIO_TYPE_READ)
			bp->bio_cmd = BIO_READ;
		else
			bp->bio_cmd = BIO_WRITE;

		/*
		 * If possible, represent a scattered and/or gang ABD buffer
		 * to GEOM as an array of physical pages.  This satisfies
		 * GEOM's requirement of a virtually contiguous buffer
		 * without copying.
		 */
		int pgs = vdev_geom_check_unmapped(zio, cp);
		if (pgs > 0) {
			bp->bio_ma = malloc(sizeof (struct vm_page *) * pgs,
			    M_DEVBUF, M_WAITOK);
			bp->bio_ma_n = 0;
			bp->bio_ma_offset = 0;
			abd_iterate_func(zio->io_abd, 0, zio->io_size,
			    vdev_geom_fill_unmap_cb, bp);
			bp->bio_data = unmapped_buf;
			bp->bio_flags |= BIO_UNMAPPED;
		} else {
			if (zio->io_type == ZIO_TYPE_READ) {
				bp->bio_data = abd_borrow_buf(zio->io_abd,
				    zio->io_size);
			} else {
				bp->bio_data = abd_borrow_buf_copy(zio->io_abd,
				    zio->io_size);
			}
		}
		break;
	case ZIO_TYPE_TRIM:
		bp->bio_cmd = BIO_DELETE;
		bp->bio_data = NULL;
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		break;
	case ZIO_TYPE_IOCTL:
		bp->bio_cmd = BIO_FLUSH;
		bp->bio_data = NULL;
		bp->bio_offset = cp->provider->mediasize;
		bp->bio_length = 0;
		break;
	default:
		panic("invalid zio->io_type: %d\n", zio->io_type);
	}

	bp->bio_done = vdev_geom_io_intr;
	zio->io_bio = bp;

	g_io_request(bp, cp);
}
static void
vdev_geom_io_done(zio_t *zio)
{
	struct bio *bp = zio->io_bio;

	if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
		ASSERT3P(bp, ==, NULL);
		return;
	}

	if (bp == NULL) {
		ASSERT3S(zio->io_error, ==, ENXIO);
		return;
	}

	if (bp->bio_ma != NULL) {
		free(bp->bio_ma, M_DEVBUF);
	} else {
		if (zio->io_type == ZIO_TYPE_READ) {
			abd_return_buf_copy(zio->io_abd, bp->bio_data,
			    zio->io_size);
		} else {
			abd_return_buf(zio->io_abd, bp->bio_data,
			    zio->io_size);
		}
	}

	g_destroy_bio(bp);
	zio->io_bio = NULL;
}
static void
vdev_geom_hold(vdev_t *vd)
{
}

static void
vdev_geom_rele(vdev_t *vd)
{
}
vdev_ops_t vdev_disk_ops = {
	.vdev_op_init = NULL,
	.vdev_op_fini = NULL,
	.vdev_op_open = vdev_geom_open,
	.vdev_op_close = vdev_geom_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_min_asize = vdev_default_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_geom_io_start,
	.vdev_op_io_done = vdev_geom_io_done,
	.vdev_op_state_change = NULL,
	.vdev_op_need_resilver = NULL,
	.vdev_op_hold = vdev_geom_hold,
	.vdev_op_rele = vdev_geom_rele,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_rebuild_asize = NULL,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = NULL,
	.vdev_op_nparity = NULL,
	.vdev_op_ndisks = NULL,
	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
	.vdev_op_leaf = B_TRUE			/* leaf vdev */
};