4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
32 #include <sys/efi_partition.h>
41 * Append partition suffix to an otherwise fully qualified device path.
42 * This is used to generate the full path as it is stored in
43 * ZPOOL_CONFIG_PATH for whole disk devices. On success the new length
44 * of 'path' is returned; on error a negative value is returned.
/*
 * zfs_append_partition(): append a first-partition suffix to 'path' in place.
 *
 * path    - NUL-terminated device path; extended with strcat().
 * max_len - size limit that the two length checks below compare against.
 *
 * NOTE(review): this view is missing lines (the statements guarded by the
 * two length checks, the else keywords and the final return are not visible).
 */
47 zfs_append_partition(char *path
, size_t max_len
)
49 int len
= strlen(path
);
/* Paths that start with UDISK_ROOT or ZVOL_ROOT take the "-part1" suffix. */
51 if ((strncmp(path
, UDISK_ROOT
, strlen(UDISK_ROOT
)) == 0) ||
52 (strncmp(path
, ZVOL_ROOT
, strlen(ZVOL_ROOT
)) == 0)) {
/* "-part1" is 6 characters; ">=" also accounts for the terminating NUL. */
53 if (len
+ 6 >= max_len
)
56 (void) strcat(path
, "-part1");
/* The remaining suffixes are at most 2 characters plus the NUL. */
59 if (len
+ 2 >= max_len
)
/*
 * A path whose last character is a digit gets "p1".
 * NOTE(review): path[len - 1] is read without a visible len > 0 check, and
 * a plain char is handed to isdigit(); confirm that an empty path cannot
 * reach this line and consider casting to unsigned char.
 */
62 if (isdigit(path
[len
-1])) {
63 (void) strcat(path
, "p1");
/* Presumably the else branch (keyword not visible): plain "1" suffix. */
66 (void) strcat(path
, "1");
75 * Remove partition suffix from a vdev path. Partition suffixes may take three
76 * forms: "-partX", "pX", or "X", where X is a string of digits. The second
77 * case only occurs when the suffix is preceded by a digit, e.g. "md0p0". The
78 * third case only occurs when preceded by a string matching the regular
79 * expression "^([hsv]|xv)d[a-z]+", i.e. a scsi, ide, virtio or xen disk.
81 * caller must free the returned string
/*
 * zfs_strip_partition(): work on a strdup() copy of 'path' and locate a
 * trailing partition suffix in it.
 *
 * Locals: 'tmp' is the private copy, 'part' marks where a candidate suffix
 * begins and 'd' points at the characters expected to be partition digits.
 *
 * NOTE(review): this view is missing lines (any strdup() failure check, the
 * assignments to 'd' in the first two branches, the second half of the
 * h/s/v test, the truncation itself and the return are not visible).
 */
84 zfs_strip_partition(const char *path
)
86 char *tmp
= strdup(path
);
87 char *part
= NULL
, *d
= NULL
;
/* Form 1: a "-part" marker that is not at the very start of the name. */
91 if ((part
= strstr(tmp
, "-part")) && part
!= tmp
) {
/* Form 2: the last 'p', at index 2 or later, preceded by a digit. */
93 } else if ((part
= strrchr(tmp
, 'p')) &&
94 part
> tmp
+ 1 && isdigit(*(part
-1))) {
/* Form 3: names starting with 'h', 's' or 'v' (rest of test not visible). */
96 } else if ((tmp
[0] == 'h' || tmp
[0] == 's' || tmp
[0] == 'v') &&
/*
 * Skip the letters that follow the two-character prefix; 'part' is only
 * assigned once at least one letter has been skipped.
 */
98 for (d
= &tmp
[2]; isalpha(*d
); part
= ++d
) { }
/* Form 3, "xvd" variant: same scan after the three-character prefix. */
99 } else if (strncmp("xvd", tmp
, 3) == 0) {
100 for (d
= &tmp
[3]; isalpha(*d
); part
= ++d
) { }
/* Only examine the tail when a suffix start was found and it is non-empty. */
102 if (part
&& d
&& *d
!= '\0') {
/* Advance 'd' past every digit of the candidate partition number. */
103 for (; isdigit(*d
); d
++) { }
112 * Same as zfs_strip_partition, but allows "/dev/" to be in the pathname
117 * Returned string must be freed.
/*
 * zfs_strip_partition_path(): duplicate 'path', run zfs_strip_partition() on
 * the component after the last '/', and copy the stripped name back over
 * that component inside the duplicate.
 *
 * The strlcpy() size is the length of the original last component plus one,
 * so the write stays inside the duplicated string.
 *
 * NOTE(review): this view is missing lines (local declarations, failure
 * checks for strdup() and zfs_strip_partition(), the free() and the return
 * are not visible).
 * NOTE(review): strrchr() returns NULL when 'path' has no '/'; adding 1 to
 * that result is invalid. No guard is visible here, so confirm that every
 * caller passes a path containing a '/'.
 */
120 zfs_strip_partition_path(const char *path
)
122 char *newpath
= strdup(path
);
129 /* Point to "sda1" part of "/dev/sda1" */
130 sd_offset
= strrchr(newpath
, '/') + 1;
132 /* Get our new name "sda" */
133 new_sd
= zfs_strip_partition(sd_offset
);
139 /* Paste the "sda" where "sda1" was */
140 strlcpy(sd_offset
, new_sd
, strlen(sd_offset
) + 1);
142 /* Free temporary "sda" */
149 * Strip the unwanted portion of a device path.
/*
 * zfs_strip_path(): if 'path' begins with one of the directories returned by
 * zpool_default_search_paths() followed by a '/', return a pointer into
 * 'path' just past that '/'. The result aliases the argument; nothing is
 * allocated in the visible code.
 *
 * NOTE(review): this view is missing lines (the declaration of spath_count
 * and the return used when no search path matches are not visible).
 */
152 zfs_strip_path(const char *path
)
155 const char *const *spaths
= zpool_default_search_paths(&spath_count
);
/* First matching search path wins. */
157 for (size_t i
= 0; i
< spath_count
; ++i
)
/* Prefix match, and the prefix must end exactly at a '/' separator. */
158 if (strncmp(path
, spaths
[i
], strlen(spaths
[i
])) == 0 &&
159 path
[strlen(spaths
[i
])] == '/')
/* Skip the prefix and the separator. */
160 return (path
+ strlen(spaths
[i
]) + 1);
166 * Read the contents of a sysfs file into an allocated buffer and remove the
169 * This is useful for reading sysfs files that return a single string. Return
170 * an allocated string pointer on success, NULL otherwise. Returned buffer
171 * must be freed by the user.
/*
 * zfs_read_sysfs_file(): open 'filepath' for reading, fetch its first line
 * with fgets() into a 4096-byte stack buffer and test for a trailing
 * newline (per the in-line comment, in order to remove it).
 *
 * NOTE(review): this view is missing lines (the fopen() failure check, the
 * newline removal, the copy of 'buf' into the returned string and fclose()
 * are not visible).
 * NOTE(review): buf[len - 1] is read without a visible len > 0 check;
 * fgets() can hand back an empty string when the file starts with a NUL
 * byte, which would index one byte before the buffer.
 */
174 zfs_read_sysfs_file(char *filepath
)
176 char buf
[4096]; /* all sysfs files report 4k size */
179 FILE *fp
= fopen(filepath
, "r");
/* fgets() returns its buffer argument on success. */
183 if (fgets(buf
, sizeof (buf
), fp
) == buf
) {
186 /* Remove the last newline (if any) */
187 size_t len
= strlen(buf
);
188 if (buf
[len
- 1] == '\n') {
200 * Given a dev name like "nvme0n1", return the full PCI slot sysfs path to
201 * the drive (in /sys/bus/pci/slots).
205 * returns: "/sys/bus/pci/slots/0"
207 * 'dev' must be an NVMe device.
209 * Returned string must be freed. Returns NULL on error or no sysfs path.
/*
 * zfs_get_pci_slots_sys_path(): map a device name to its directory under
 * /sys/bus/pci/slots/.
 *
 * Visible steps:
 *  1. Keep only the part of 'dev_name' after the last '/'.
 *  2. Compare the first 4 characters against "nvme".
 *  3. Read /sys/block/<dev>/device/address into 'address1' and locate its
 *     last '.' (the existing comment says the string is cut there).
 *  4. Walk /sys/bus/pci/slots/, considering only entries accepted by
 *     zfs_isnumber(), and read each <slot>/address file into 'address2'.
 *  5. On the first strcmp() match, build "/sys/bus/pci/slots/<slot>" with
 *     asprintf().
 *
 * NOTE(review): this view is missing lines (declarations of tmp, dp, ep and
 * path, all NULL checks, the free() calls for both address strings,
 * closedir() and the return are not visible), so ownership of 'address1'
 * and 'address2' cannot be confirmed from here.
 */
212 zfs_get_pci_slots_sys_path(const char *dev_name
)
216 char *address1
= NULL
;
217 char *address2
= NULL
;
219 char buf
[MAXPATHLEN
];
222 /* If they preface 'dev' with a path (like "/dev") then strip it off */
223 tmp
= strrchr(dev_name
, '/');
225 dev_name
= tmp
+ 1; /* +1 since we want the chr after '/' */
227 if (strncmp("nvme", dev_name
, 4) != 0)
230 (void) snprintf(buf
, sizeof (buf
), "/sys/block/%s/device/address",
233 address1
= zfs_read_sysfs_file(buf
);
238 * /sys/block/nvme0n1/device/address format will
239 * be "0000:01:00.0" while /sys/bus/pci/slots/0/address will be
240 * "0000:01:00". Just NULL terminate at the '.' so they match.
242 tmp
= strrchr(address1
, '.');
246 dp
= opendir("/sys/bus/pci/slots/");
253 * Look through all the /sys/bus/pci/slots/ subdirs
255 while ((ep
= readdir(dp
))) {
257 * We only care about directory names that are a single number.
258 * Sometimes there's other directories like
259 * "/sys/bus/pci/slots/0-3/" in there - skip those.
261 if (!zfs_isnumber(ep
->d_name
))
264 (void) snprintf(buf
, sizeof (buf
),
265 "/sys/bus/pci/slots/%s/address", ep
->d_name
);
267 address2
= zfs_read_sysfs_file(buf
);
271 if (strcmp(address1
, address2
) == 0) {
272 /* Addresses match, we're all done */
274 if (asprintf(&path
, "/sys/bus/pci/slots/%s",
290 * Given a dev name like "sda", return the full enclosure sysfs path to
291 * the disk. You can also pass in the name with "/dev" prepended
292 * to it (like /dev/sda). This works for both JBODs and NVMe PCI devices.
294 * For example, disk "sda" in enclosure slot 1:
296 * returns: "/sys/class/enclosure/1:0:3:0/Slot 1"
300 * dev_name: "nvme0n1"
301 * returns: "/sys/bus/pci/slots/0"
303 * 'dev' must be a non-devicemapper device.
305 * Returned string must be freed. Returns NULL on error.
/*
 * zfs_get_enclosure_sysfs_path(): find the enclosure sysfs path for a disk.
 *
 * Visible steps:
 *  1. Test 'dev_name' for NULL, then keep only the part after the last '/'.
 *  2. Build "/sys/block/<dev>/device" with asprintf() into 'tmp1'. Note that
 *     'tmp1' first holds the strrchr() result and is then reused as the
 *     asprintf() output.
 *  3. Scan that directory for an entry whose name contains
 *     "enclosure_device", build its full name in 'tmp2' and readlink() it
 *     into 'buf'; a result of -1 or one that fills the buffer is rejected.
 *  4. Take the link text from the substring "enclosure" onwards and build
 *     "/sys/class/<that text>" with asprintf().
 *  5. Otherwise fall back to zfs_get_pci_slots_sys_path().
 *
 * NOTE(review): this view is missing lines (declarations, opendir(), the
 * explicit NUL termination of 'buf', the free() calls for tmp1 and tmp2,
 * closedir() and the return are not visible).
 */
308 zfs_get_enclosure_sysfs_path(const char *dev_name
)
312 char buf
[MAXPATHLEN
];
320 if (dev_name
== NULL
)
323 /* If they preface 'dev' with a path (like "/dev") then strip it off */
324 tmp1
= strrchr(dev_name
, '/');
326 dev_name
= tmp1
+ 1; /* +1 since we want the chr after '/' */
328 tmpsize
= asprintf(&tmp1
, "/sys/block/%s/device", dev_name
);
329 if (tmpsize
== -1 || tmp1
== NULL
) {
339 * Look though all sysfs entries in /sys/block/<dev>/device for
340 * the enclosure symlink.
342 while ((ep
= readdir(dp
))) {
343 /* Ignore everything that's not our enclosure_device link */
344 if (strstr(ep
->d_name
, "enclosure_device") == NULL
)
349 if (asprintf(&tmp2
, "%s/%s", tmp1
, ep
->d_name
) == -1) {
354 size
= readlink(tmp2
, buf
, sizeof (buf
));
356 /* Did readlink fail or crop the link name? */
357 if (size
== -1 || size
>= sizeof (buf
))
361 * We got a valid link. readlink() doesn't terminate strings
362 * so we have to do it.
367 * Our link will look like:
369 * "../../../../port-11:1:2/..STUFF../enclosure/1:0:3:0/SLOT 1"
371 * We want to grab the "enclosure/1:0:3:0/SLOT 1" part
373 tmp3
= strstr(buf
, "enclosure");
379 if (asprintf(&path
, "/sys/class/%s", tmp3
) == -1) {
380 /* If asprintf() fails, 'path' is undefined */
395 * This particular disk isn't in a JBOD. It could be an NVMe
396 * drive. If so, look up the NVMe device's path in
397 * /sys/bus/pci/slots/. Within that directory is a 'attention'
398 * file which controls the NVMe fault LED.
400 path
= zfs_get_pci_slots_sys_path(dev_name
);
407 * Allocate and return the underlying device name for a device mapper device.
409 * For example, dm_name = "/dev/dm-0" could return "/dev/sda". Symlinks to a
410 * DM device (like /dev/disk/by-vdev/A0) are also allowed.
412 * If the DM device has multiple underlying devices (like with multipath
413 * DM devices), then favor underlying devices that have a symlink
414 * back to their enclosure device in sysfs. This will be useful for the
415 * zedlet scripts that toggle the fault LED.
417 * Returns an underlying device name, or NULL on error or no match. If dm_name
418 * is not a DM device then return NULL.
420 * NOTE: The returned name string must be *freed*.
/*
 * dm_get_underlying_path(): pick an underlying device for a device-mapper
 * node.
 *
 * Visible steps:
 *  1. Resolve 'dm_name' with realpath() (allocating form, second argument
 *     NULL) and keep the component after the last '/'.
 *  2. Build "/sys/block/<dm>/slaves/" with asprintf(); 'tmp' first holds the
 *     strrchr() result and is then reused as the asprintf() output.
 *  3. For directory entries whose d_type is not DT_DIR: remember a copy of
 *     the name in 'first_path', call zfs_get_enclosure_sysfs_path() on the
 *     name, and build "/dev/<name>" as the result.
 *  4. After the loop, if no result was chosen but 'first_path' is set,
 *     build "/dev/<first_path>" instead.
 *
 * NOTE(review): this view is missing lines (declarations, opendir(), the
 * conditions guarding the strdup() and the enclosure hit, the loop exit,
 * the free() calls for realp, tmp and first_path, closedir() and the return
 * are not visible).
 */
423 dm_get_underlying_path(const char *dm_name
)
431 char *first_path
= NULL
;
432 char *enclosure_path
;
437 /* dm name may be a symlink (like /dev/disk/by-vdev/A0) */
438 realp
= realpath(dm_name
, NULL
);
443 * If they preface 'dev' with a path (like "/dev") then strip it off.
444 * We just want the 'dm-N' part.
446 tmp
= strrchr(realp
, '/');
448 dev_str
= tmp
+ 1; /* +1 since we want the chr after '/' */
452 if (asprintf(&tmp
, "/sys/block/%s/slaves/", dev_str
) == -1) {
462 * A device-mapper device can have multiple paths to it (multipath).
463 * Favor paths that have a symlink back to their enclosure device.
464 * We have to do this since some enclosures may only provide a symlink
465 * back for one underlying path to a disk and not the other.
467 * If no paths have links back to their enclosure, then just return the
470 while ((ep
= readdir(dp
))) {
471 if (ep
->d_type
!= DT_DIR
) { /* skip "." and ".." dirs */
473 first_path
= strdup(ep
->d_name
);
476 zfs_get_enclosure_sysfs_path(ep
->d_name
);
481 if (asprintf(&path
, "/dev/%s", ep
->d_name
) == -1)
483 free(enclosure_path
);
494 if (!path
&& first_path
) {
496 * None of the underlying paths had a link back to their
497 * enclosure devices. Throw up out hands and return the first
500 if (asprintf(&path
, "/dev/%s", first_path
) == -1)
509 * Return B_TRUE if device is a device mapper or multipath device.
510 * Return B_FALSE if not.
/*
 * zfs_dev_is_dm(): ask dm_get_underlying_path() about 'dev_name'; the result
 * is stored in 'tmp'.
 *
 * NOTE(review): this view is missing lines (the declaration of tmp, the test
 * on it, the free() of the returned string and the return statements are
 * not visible).
 */
513 zfs_dev_is_dm(const char *dev_name
)
517 tmp
= dm_get_underlying_path(dev_name
);
526 * By "whole disk" we mean an entire physical disk (something we can
527 * label, toggle the write cache on, etc.) as opposed to the full
528 * capacity of a pseudo-device such as lofi or did. We act as if we
529 * are labeling the disk, which should be a pretty good test of whether
530 * it's a viable device or not. Returns B_TRUE if it is and B_FALSE if
/*
 * zfs_dev_is_whole_disk(): open 'dev_name' read-only (with O_DIRECT and
 * O_CLOEXEC) and try efi_alloc_and_init() for EFI_NUMPAR partitions on the
 * descriptor; a non-zero result from that call selects the failure branch.
 *
 * NOTE(review): this view is missing lines (the declaration of fd, the
 * bodies of both failure branches, the release of 'label', close() and the
 * return statements are not visible).
 */
534 zfs_dev_is_whole_disk(const char *dev_name
)
536 struct dk_gpt
*label
= NULL
;
539 if ((fd
= open(dev_name
, O_RDONLY
| O_DIRECT
| O_CLOEXEC
)) < 0)
542 if (efi_alloc_and_init(fd
, EFI_NUMPAR
, &label
) != 0) {
554 * Lookup the underlying device for a device name
556 * Often you'll have a symlink to a device, a partition device,
557 * or a multipath device, and want to look up the underlying device.
558 * This function returns the underlying device name. If the device
559 * name is already the underlying device, then just return the same
560 * name. If the device is a DM device with multiple underlying devices
561 * then return the first one.
565 * 1. /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 -> ../../sda
566 * dev_name: /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001
569 * 2. /dev/mapper/mpatha (made up of /dev/sda and /dev/sdb)
570 * dev_name: /dev/mapper/mpatha
571 * returns: /dev/sda (first device)
573 * 3. /dev/sda (already the underlying device)
577 * 4. /dev/dm-3 (mapped to /dev/sda)
578 * dev_name: /dev/dm-3
581 * 5. /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 -> ../../sdb9
582 * dev_name: /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9
585 * 6. /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a -> ../dev/sda2
586 * dev_name: /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a
589 * Returns underlying device name, or NULL on error or no match.
591 * NOTE: The returned name string must be *freed*.
/*
 * zfs_get_underlying_path(): resolve 'dev_name' to its underlying device.
 *
 * Visible steps:
 *  1. Test 'dev_name' for NULL.
 *  2. Try dm_get_underlying_path() first.
 *  3. For a non-DM name (per the in-line comment), resolve symlinks with the
 *     allocating form of realpath().
 *  4. Pass the intermediate string to zfs_strip_partition_path() to obtain
 *     'name'.
 *
 * NOTE(review): this view is missing lines (declarations, the test that
 * selects the realpath() fallback, the NULL check before stripping, the
 * free() of 'tmp' and the return are not visible).
 */
594 zfs_get_underlying_path(const char *dev_name
)
599 if (dev_name
== NULL
)
602 tmp
= dm_get_underlying_path(dev_name
);
604 /* dev_name not a DM device, so just un-symlinkize it */
606 tmp
= realpath(dev_name
, NULL
);
609 name
= zfs_strip_partition_path(tmp
);
620 * A disk is considered a multipath whole disk when:
621 * DEVNAME key value has "dm-"
622 * DM_UUID key exists and starts with 'mpath-'
623 * ID_PART_TABLE_TYPE key does not exist or is not gpt
624 * ID_FS_LABEL key does not exist (disk isn't labeled)
/*
 * is_mpath_udev_sane(): read four udev properties of 'dev' (DEVNAME,
 * ID_PART_TABLE_TYPE, DM_UUID, ID_FS_LABEL) and combine them in one test.
 *
 * Visible clauses, all joined with &&:
 *  - DEVNAME is set and its first 8 characters are "/dev/dm-";
 *  - ID_PART_TABLE_TYPE is unset or is not "gpt";
 *  - DM_UUID is set and its first 6 characters are "mpath-".
 *
 * NOTE(review): this view is missing lines (the clause that uses 'label'
 * and both return statements are not visible).
 */
627 is_mpath_udev_sane(struct udev_device
*dev
)
629 const char *devname
, *type
, *uuid
, *label
;
631 devname
= udev_device_get_property_value(dev
, "DEVNAME");
632 type
= udev_device_get_property_value(dev
, "ID_PART_TABLE_TYPE");
633 uuid
= udev_device_get_property_value(dev
, "DM_UUID");
634 label
= udev_device_get_property_value(dev
, "ID_FS_LABEL");
636 if ((devname
!= NULL
&& strncmp(devname
, "/dev/dm-", 8) == 0) &&
637 ((type
== NULL
) || (strcmp(type
, "gpt") != 0)) &&
638 ((uuid
!= NULL
) && (strncmp(uuid
, "mpath-", 6) == 0)) &&
647 * Check if a disk is a multipath "blank" disk:
649 * 1. The disk has udev values that suggest it's a multipath disk
650 * 2. The disk is not currently labeled with a filesystem of any type
651 * 3. There are no partitions on the disk
/*
 * is_mpath_whole_disk(): libudev-based variant.
 *
 * Visible steps:
 *  1. Resolve 'path' with realpath() into the MAXPATHLEN stack buffer
 *     'nodepath' and take the component after its last '/'.
 *  2. Require that component to start with "dm-".
 *  3. Create a udev context with udev_new() and look the device up in the
 *     "block" subsystem.
 *  4. Run is_mpath_udev_sane() on the device, then drop the device
 *     reference with udev_device_unref().
 *
 * NOTE(review): this view is missing lines (declarations of udev and
 * sysname, the bodies of the early exits, any udev_unref() of the context
 * and the return are not visible); it is not clear from here which branch
 * the first udev_device_unref() call belongs to.
 */
654 is_mpath_whole_disk(const char *path
)
657 struct udev_device
*dev
= NULL
;
658 char nodepath
[MAXPATHLEN
];
661 if (realpath(path
, nodepath
) == NULL
)
663 sysname
= strrchr(nodepath
, '/') + 1;
664 if (strncmp(sysname
, "dm-", 3) != 0)
666 if ((udev
= udev_new()) == NULL
)
668 if ((dev
= udev_device_new_from_subsystem_sysname(udev
, "block",
670 udev_device_unref(dev
);
674 /* Sanity check some udev values */
675 boolean_t is_sane
= is_mpath_udev_sane(dev
);
676 udev_device_unref(dev
);
681 #else /* HAVE_LIBUDEV */
/*
 * is_mpath_whole_disk(): alternative definition selected by the surrounding
 * preprocessor "else" branch, i.e. when HAVE_LIBUDEV is not defined.
 *
 * NOTE(review): only the signature is visible in this view; the body and
 * its return value cannot be documented from here.
 */
684 is_mpath_whole_disk(const char *path
)
690 #endif /* HAVE_LIBUDEV */