4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
25 * Copyright (c) 2016, 2017 Intel Corporation.
26 * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
30 * Functions to convert between a list of vdevs and an nvlist representing the
31 * configuration. Each entry in the list can be one of:
34 * disk=(path=..., devid=...)
43 * While the underlying implementation supports it, group vdevs cannot contain
44 * other group vdevs. All userland verification of devices is contained within
45 * this file. If successful, the nvlist returned can be passed directly to the
46 * kernel; we've done as much verification as possible in userland.
48 * Hot spares are a special case, and passed down as an array of disk vdevs, at
49 * the same level as the root of the vdev tree.
51 * The only function exported by this file is 'make_root_vdev'. The
52 * function performs several passes:
54 * 1. Construct the vdev specification. Performs syntax validation and
55 * makes sure each device is valid.
56 * 2. Check for devices in use. Uses libblkid to make sure that no
57 * devices are already in use. Some can be overridden using the 'force'
58 * flag, others cannot.
59 * 3. Check for replication errors if the 'force' flag is not specified.
60 * Validates that the replication level is consistent across the
62 * 4. Call libzfs to label any whole disks with an EFI label.
70 #include <libnvpair.h>
77 #include "zpool_util.h"
78 #include <sys/zfs_context.h>
80 #include <scsi/scsi.h>
82 #include <sys/efi_partition.h>
84 #include <sys/mntent.h>
85 #include <uuid/uuid.h>
86 #include <blkid/blkid.h>
88 typedef struct vdev_disk_db_entry
92 } vdev_disk_db_entry_t
;
95 * Database of block devices that lie about physical sector sizes. The
96 * identification string must be precisely 24 characters to avoid false
99 static vdev_disk_db_entry_t vdev_disk_database
[] = {
100 {"ATA ADATA SSD S396 3", 8192},
101 {"ATA APPLE SSD SM128E", 8192},
102 {"ATA APPLE SSD SM256E", 8192},
103 {"ATA APPLE SSD SM512E", 8192},
104 {"ATA APPLE SSD SM768E", 8192},
105 {"ATA C400-MTFDDAC064M", 8192},
106 {"ATA C400-MTFDDAC128M", 8192},
107 {"ATA C400-MTFDDAC256M", 8192},
108 {"ATA C400-MTFDDAC512M", 8192},
109 {"ATA Corsair Force 3 ", 8192},
110 {"ATA Corsair Force GS", 8192},
111 {"ATA INTEL SSDSA2CT04", 8192},
112 {"ATA INTEL SSDSA2BZ10", 8192},
113 {"ATA INTEL SSDSA2BZ20", 8192},
114 {"ATA INTEL SSDSA2BZ30", 8192},
115 {"ATA INTEL SSDSA2CW04", 8192},
116 {"ATA INTEL SSDSA2CW08", 8192},
117 {"ATA INTEL SSDSA2CW12", 8192},
118 {"ATA INTEL SSDSA2CW16", 8192},
119 {"ATA INTEL SSDSA2CW30", 8192},
120 {"ATA INTEL SSDSA2CW60", 8192},
121 {"ATA INTEL SSDSC2CT06", 8192},
122 {"ATA INTEL SSDSC2CT12", 8192},
123 {"ATA INTEL SSDSC2CT18", 8192},
124 {"ATA INTEL SSDSC2CT24", 8192},
125 {"ATA INTEL SSDSC2CW06", 8192},
126 {"ATA INTEL SSDSC2CW12", 8192},
127 {"ATA INTEL SSDSC2CW18", 8192},
128 {"ATA INTEL SSDSC2CW24", 8192},
129 {"ATA INTEL SSDSC2CW48", 8192},
130 {"ATA KINGSTON SH100S3", 8192},
131 {"ATA KINGSTON SH103S3", 8192},
132 {"ATA M4-CT064M4SSD2 ", 8192},
133 {"ATA M4-CT128M4SSD2 ", 8192},
134 {"ATA M4-CT256M4SSD2 ", 8192},
135 {"ATA M4-CT512M4SSD2 ", 8192},
136 {"ATA OCZ-AGILITY2 ", 8192},
137 {"ATA OCZ-AGILITY3 ", 8192},
138 {"ATA OCZ-VERTEX2 3.5 ", 8192},
139 {"ATA OCZ-VERTEX3 ", 8192},
140 {"ATA OCZ-VERTEX3 LT ", 8192},
141 {"ATA OCZ-VERTEX3 MI ", 8192},
142 {"ATA OCZ-VERTEX4 ", 8192},
143 {"ATA SAMSUNG MZ7WD120", 8192},
144 {"ATA SAMSUNG MZ7WD240", 8192},
145 {"ATA SAMSUNG MZ7WD480", 8192},
146 {"ATA SAMSUNG MZ7WD960", 8192},
147 {"ATA SAMSUNG SSD 830 ", 8192},
148 {"ATA Samsung SSD 840 ", 8192},
149 {"ATA SanDisk SSD U100", 8192},
150 {"ATA TOSHIBA THNSNH06", 8192},
151 {"ATA TOSHIBA THNSNH12", 8192},
152 {"ATA TOSHIBA THNSNH25", 8192},
153 {"ATA TOSHIBA THNSNH51", 8192},
154 {"ATA APPLE SSD TS064C", 4096},
155 {"ATA APPLE SSD TS128C", 4096},
156 {"ATA APPLE SSD TS256C", 4096},
157 {"ATA APPLE SSD TS512C", 4096},
158 {"ATA INTEL SSDSA2M040", 4096},
159 {"ATA INTEL SSDSA2M080", 4096},
160 {"ATA INTEL SSDSA2M160", 4096},
161 {"ATA INTEL SSDSC2MH12", 4096},
162 {"ATA INTEL SSDSC2MH25", 4096},
163 {"ATA OCZ CORE_SSD ", 4096},
164 {"ATA OCZ-VERTEX ", 4096},
165 {"ATA SAMSUNG MCCOE32G", 4096},
166 {"ATA SAMSUNG MCCOE64G", 4096},
167 {"ATA SAMSUNG SSD PM80", 4096},
168 /* Flash drives optimized for 4KB IOs on larger pages */
169 {"ATA INTEL SSDSC2BA10", 4096},
170 {"ATA INTEL SSDSC2BA20", 4096},
171 {"ATA INTEL SSDSC2BA40", 4096},
172 {"ATA INTEL SSDSC2BA80", 4096},
173 {"ATA INTEL SSDSC2BB08", 4096},
174 {"ATA INTEL SSDSC2BB12", 4096},
175 {"ATA INTEL SSDSC2BB16", 4096},
176 {"ATA INTEL SSDSC2BB24", 4096},
177 {"ATA INTEL SSDSC2BB30", 4096},
178 {"ATA INTEL SSDSC2BB40", 4096},
179 {"ATA INTEL SSDSC2BB48", 4096},
180 {"ATA INTEL SSDSC2BB60", 4096},
181 {"ATA INTEL SSDSC2BB80", 4096},
182 {"ATA INTEL SSDSC2BW24", 4096},
183 {"ATA INTEL SSDSC2BW48", 4096},
184 {"ATA INTEL SSDSC2BP24", 4096},
185 {"ATA INTEL SSDSC2BP48", 4096},
186 {"NA SmrtStorSDLKAE9W", 4096},
187 {"NVMe Amazon EC2 NVMe ", 4096},
188 /* Imported from Open Solaris */
189 {"ATA MARVELL SD88SA02", 4096},
190 /* Advanced format Hard drives */
191 {"ATA Hitachi HDS5C303", 4096},
192 {"ATA SAMSUNG HD204UI ", 4096},
193 {"ATA ST2000DL004 HD20", 4096},
194 {"ATA WDC WD10EARS-00M", 4096},
195 {"ATA WDC WD10EARS-00S", 4096},
196 {"ATA WDC WD10EARS-00Z", 4096},
197 {"ATA WDC WD15EARS-00M", 4096},
198 {"ATA WDC WD15EARS-00S", 4096},
199 {"ATA WDC WD15EARS-00Z", 4096},
200 {"ATA WDC WD20EARS-00M", 4096},
201 {"ATA WDC WD20EARS-00S", 4096},
202 {"ATA WDC WD20EARS-00Z", 4096},
203 {"ATA WDC WD1600BEVT-0", 4096},
204 {"ATA WDC WD2500BEVT-0", 4096},
205 {"ATA WDC WD3200BEVT-0", 4096},
206 {"ATA WDC WD5000BEVT-0", 4096},
210 #define INQ_REPLY_LEN 96
211 #define INQ_CMD_LEN 6
213 static const int vdev_disk_database_size
=
214 sizeof (vdev_disk_database
) / sizeof (vdev_disk_database
[0]);
217 check_sector_size_database(char *path
, int *sector_size
)
219 unsigned char inq_buff
[INQ_REPLY_LEN
];
220 unsigned char sense_buffer
[32];
221 unsigned char inq_cmd_blk
[INQ_CMD_LEN
] =
222 {INQUIRY
, 0, 0, 0, INQ_REPLY_LEN
, 0};
228 /* Prepare INQUIRY command */
229 memset(&io_hdr
, 0, sizeof (sg_io_hdr_t
));
230 io_hdr
.interface_id
= 'S';
231 io_hdr
.cmd_len
= sizeof (inq_cmd_blk
);
232 io_hdr
.mx_sb_len
= sizeof (sense_buffer
);
233 io_hdr
.dxfer_direction
= SG_DXFER_FROM_DEV
;
234 io_hdr
.dxfer_len
= INQ_REPLY_LEN
;
235 io_hdr
.dxferp
= inq_buff
;
236 io_hdr
.cmdp
= inq_cmd_blk
;
237 io_hdr
.sbp
= sense_buffer
;
238 io_hdr
.timeout
= 10; /* 10 milliseconds is ample time */
240 if ((fd
= open(path
, O_RDONLY
|O_DIRECT
)) < 0)
243 error
= ioctl(fd
, SG_IO
, (unsigned long) &io_hdr
);
250 if ((io_hdr
.info
& SG_INFO_OK_MASK
) != SG_INFO_OK
)
253 for (i
= 0; i
< vdev_disk_database_size
; i
++) {
254 if (memcmp(inq_buff
+ 8, vdev_disk_database
[i
].id
, 24))
257 *sector_size
= vdev_disk_database
[i
].sector_size
;
265 check_slice(const char *path
, blkid_cache cache
, int force
, boolean_t isspare
)
270 /* No valid type detected; the device is safe to use */
271 value
= blkid_get_tag_value(cache
, "TYPE", path
);
276 * If libblkid detects a ZFS device, we check the device
277 * using check_file() to see if it's safe. The one safe
278 * case is a spare device shared between multiple pools.
280 if (strcmp(value
, "zfs_member") == 0) {
281 err
= check_file(path
, force
, isspare
);
287 vdev_error(gettext("%s contains a filesystem of "
288 "type '%s'\n"), path
, value
);
298 * Validate that a disk, including all of its partitions, is safe to use.
300 * For EFI labeled disks this can be done relatively easily with the libefi
301 * library. The partition numbers are extracted from the label and used
302 * to generate the expected /dev/ paths. Each partition can then be
303 * checked for conflicts.
305 * For non-EFI labeled disks (MBR/EBR/etc) the same process is possible
306 * but due to the lack of readily available libraries this scanning is
307 * not implemented. Instead only the device path as given is checked.
310 check_disk(const char *path
, blkid_cache cache
, int force
,
311 boolean_t isspare
, boolean_t iswholedisk
)
314 char slice_path
[MAXPATHLEN
];
317 int flags
= O_RDONLY
|O_DIRECT
;
320 return (check_slice(path
, cache
, force
, isspare
));
322 /* only spares can be shared, other devices require exclusive access */
326 if ((fd
= open(path
, flags
)) < 0) {
327 char *value
= blkid_get_tag_value(cache
, "TYPE", path
);
328 (void) fprintf(stderr
, gettext("%s is in use and contains "
329 "a %s filesystem.\n"), path
, value
? value
: "unknown");
335 * Expected to fail for non-EFI labeled disks. Just check the device
336 * as given and do not attempt to detect and scan partitions.
338 err
= efi_alloc_and_read(fd
, &vtoc
);
341 return (check_slice(path
, cache
, force
, isspare
));
345 * The primary efi partition label is damaged however the secondary
346 * label at the end of the device is intact. Rather than use this
347 * label we should play it safe and treat this as a non efi device.
349 if (vtoc
->efi_flags
& EFI_GPT_PRIMARY_CORRUPT
) {
354 /* Partitions will now be created using the backup */
357 vdev_error(gettext("%s contains a corrupt primary "
358 "EFI label.\n"), path
);
363 for (i
= 0; i
< vtoc
->efi_nparts
; i
++) {
365 if (vtoc
->efi_parts
[i
].p_tag
== V_UNASSIGNED
||
366 uuid_is_null((uchar_t
*)&vtoc
->efi_parts
[i
].p_guid
))
369 if (strncmp(path
, UDISK_ROOT
, strlen(UDISK_ROOT
)) == 0)
370 (void) snprintf(slice_path
, sizeof (slice_path
),
371 "%s%s%d", path
, "-part", i
+1);
373 (void) snprintf(slice_path
, sizeof (slice_path
),
374 "%s%s%d", path
, isdigit(path
[strlen(path
)-1]) ?
377 err
= check_slice(slice_path
, cache
, force
, isspare
);
389 check_device(const char *path
, boolean_t force
,
390 boolean_t isspare
, boolean_t iswholedisk
)
395 error
= blkid_get_cache(&cache
, NULL
);
397 (void) fprintf(stderr
, gettext("unable to access the blkid "
402 error
= check_disk(path
, cache
, force
, isspare
, iswholedisk
);
403 blkid_put_cache(cache
);
409 after_zpool_upgrade(zpool_handle_t
*zhp
)
415 check_file(const char *file
, boolean_t force
, boolean_t isspare
)
417 return (check_file_generic(file
, force
, isspare
));
421 * Read from a sysfs file and return an allocated string. Removes
422 * the newline from the end of the string if there is one.
424 * Returns a string on success (which must be freed), or NULL on error.
426 static char *zpool_sysfs_gets(char *path
)
432 fd
= open(path
, O_RDONLY
);
436 if (fstat(fd
, &statbuf
) != 0) {
441 buf
= calloc(statbuf
.st_size
+ 1, sizeof (*buf
));
448 * Note, we can read fewer bytes than st_size, and that's ok. Sysfs
449 * files will report their size is 4k even if they only return a small
452 count
= read(fd
, buf
, statbuf
.st_size
);
454 /* Error doing read() or we overran the buffer */
460 /* Remove trailing newline */
461 if (count
> 0 && buf
[count
- 1] == '\n')
470 * Write a string to a sysfs file.
472 * Returns 0 on success, non-zero otherwise.
474 static int zpool_sysfs_puts(char *path
, char *str
)
478 file
= fopen(path
, "w");
483 if (fputs(str
, file
) < 0) {
491 /* Given a vdev nvlist_t, rescan its enclosure sysfs path */
493 rescan_vdev_config_dev_sysfs_path(nvlist_t
*vdev_nv
)
495 update_vdev_config_dev_sysfs_path(vdev_nv
,
496 fnvlist_lookup_string(vdev_nv
, ZPOOL_CONFIG_PATH
),
497 ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH
);
501 * Given a power string: "on", "off", "1", or "0", return 0 if it's an
502 * off value, 1 if it's an on value, and -1 if the value is unrecognized.
504 static int zpool_power_parse_value(char *str
)
506 if ((strcmp(str
, "off") == 0) || (strcmp(str
, "0") == 0))
509 if ((strcmp(str
, "on") == 0) || (strcmp(str
, "1") == 0))
516 * Given a vdev string return an allocated string containing the sysfs path to
517 * its power control file. Also do a check if the power control file really
518 * exists and has correct permissions.
520 * Example returned strings:
522 * /sys/class/enclosure/0:0:122:0/10/power_status
523 * /sys/bus/pci/slots/10/power
525 * Returns allocated string on success (which must be freed), NULL on failure.
528 zpool_power_sysfs_path(zpool_handle_t
*zhp
, char *vdev
)
530 const char *enc_sysfs_dir
= NULL
;
532 nvlist_t
*vdev_nv
= zpool_find_vdev(zhp
, vdev
, NULL
, NULL
, NULL
);
534 if (vdev_nv
== NULL
) {
538 /* Make sure we're getting the updated enclosure sysfs path */
539 rescan_vdev_config_dev_sysfs_path(vdev_nv
);
541 if (nvlist_lookup_string(vdev_nv
, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH
,
542 &enc_sysfs_dir
) != 0) {
546 if (asprintf(&path
, "%s/power_status", enc_sysfs_dir
) == -1)
549 if (access(path
, W_OK
) != 0) {
552 /* No HDD 'power_status' file, maybe it's NVMe? */
553 if (asprintf(&path
, "%s/power", enc_sysfs_dir
) == -1) {
557 if (access(path
, R_OK
| W_OK
) != 0) {
558 /* Not NVMe either */
568 * Given a path to a sysfs power control file, return B_TRUE if you should use
569 * "on/off" words to control it, or B_FALSE otherwise ("0/1" to control).
572 zpool_power_use_word(char *sysfs_path
)
574 if (strcmp(&sysfs_path
[strlen(sysfs_path
) - strlen("power_status")],
575 "power_status") == 0) {
582 * Check the sysfs power control value for a vdev.
587 * -1 - Error or unsupported
590 zpool_power_current_state(zpool_handle_t
*zhp
, char *vdev
)
595 char *path
= zpool_power_sysfs_path(zhp
, vdev
);
599 val
= zpool_sysfs_gets(path
);
605 rc
= zpool_power_parse_value(val
);
612 * Turn on or off the slot to a device
614 * Device path is the full path to the device (like /dev/sda or /dev/sda1).
618 * ENOTSUP: Power control not supported for OS
619 * EBADSLT: Couldn't read current power state
620 * ENOENT: No sysfs path to power control
621 * EIO: Couldn't write sysfs power value
622 * EBADE: Sysfs power value didn't change
625 zpool_power(zpool_handle_t
*zhp
, char *vdev
, boolean_t turn_on
)
632 rc
= zpool_power_current_state(zhp
, vdev
);
637 /* Already correct value? */
638 if (rc
== (int)turn_on
)
641 sysfs_path
= zpool_power_sysfs_path(zhp
, vdev
);
642 if (sysfs_path
== NULL
)
645 if (zpool_power_use_word(sysfs_path
)) {
646 val
= turn_on
? "on" : "off";
648 val
= turn_on
? "1" : "0";
651 rc
= zpool_sysfs_puts(sysfs_path
, (char *)val
);
659 * Wait up to 30 seconds for sysfs power value to change after
662 timeout_ms
= zpool_getenv_int("ZPOOL_POWER_ON_SLOT_TIMEOUT_MS", 30000);
663 for (int i
= 0; i
< MAX(1, timeout_ms
/ 200); i
++) {
664 rc
= zpool_power_current_state(zhp
, vdev
);
665 if (rc
== (int)turn_on
)
666 return (0); /* success */
668 fsleep(0.200); /* 200ms */
671 /* sysfs value never changed */