4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License Version 1.0 (CDDL-1.0).
6 * You can obtain a copy of the license from the top-level file
7 * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
8 * You may not use this file except in compliance with the license.
14 * Copyright (c) 2016, 2017, Intel Corporation.
21 #include <libnvpair.h>
29 #include <sys/sysevent/eventdefs.h>
30 #include <sys/sysevent/dev.h>
33 #include "zed_disk_event.h"
34 #include "agents/zfs_agents.h"
37 * Portions of ZED need to see disk events for disks belonging to ZFS pools.
38 * A libudev monitor is established to monitor block device actions and pass
39 * them on to internal ZED logic modules. Initially, zfs_mod.c is the only
40 * consumer and is the Linux equivalent for the illumos syseventd ZFS SLM
41 * module responsible for handling disk events for ZFS.
46 struct udev_monitor
*g_mon
;
49 #define DEV_BYID_PATH "/dev/disk/by-id/"
51 /* 64MB is minimum usable disk for ZFS */
52 #define MINIMUM_SECTORS 131072ULL
56 * Post disk event to SLM module
58 * occurs in the context of monitor thread
61 zed_udev_event(const char *class, const char *subclass
, nvlist_t
*nvl
)
66 zed_log_msg(LOG_INFO
, "zed_disk_event:");
67 zed_log_msg(LOG_INFO
, "\tclass: %s", class);
68 zed_log_msg(LOG_INFO
, "\tsubclass: %s", subclass
);
69 if (nvlist_lookup_string(nvl
, DEV_NAME
, &strval
) == 0)
70 zed_log_msg(LOG_INFO
, "\t%s: %s", DEV_NAME
, strval
);
71 if (nvlist_lookup_string(nvl
, DEV_PATH
, &strval
) == 0)
72 zed_log_msg(LOG_INFO
, "\t%s: %s", DEV_PATH
, strval
);
73 if (nvlist_lookup_string(nvl
, DEV_IDENTIFIER
, &strval
) == 0)
74 zed_log_msg(LOG_INFO
, "\t%s: %s", DEV_IDENTIFIER
, strval
);
75 if (nvlist_lookup_boolean(nvl
, DEV_IS_PART
) == B_TRUE
)
76 zed_log_msg(LOG_INFO
, "\t%s: B_TRUE", DEV_IS_PART
);
77 if (nvlist_lookup_string(nvl
, DEV_PHYS_PATH
, &strval
) == 0)
78 zed_log_msg(LOG_INFO
, "\t%s: %s", DEV_PHYS_PATH
, strval
);
79 if (nvlist_lookup_uint64(nvl
, DEV_SIZE
, &numval
) == 0)
80 zed_log_msg(LOG_INFO
, "\t%s: %llu", DEV_SIZE
, numval
);
81 if (nvlist_lookup_uint64(nvl
, DEV_PARENT_SIZE
, &numval
) == 0)
82 zed_log_msg(LOG_INFO
, "\t%s: %llu", DEV_PARENT_SIZE
, numval
);
83 if (nvlist_lookup_uint64(nvl
, ZFS_EV_POOL_GUID
, &numval
) == 0)
84 zed_log_msg(LOG_INFO
, "\t%s: %llu", ZFS_EV_POOL_GUID
, numval
);
85 if (nvlist_lookup_uint64(nvl
, ZFS_EV_VDEV_GUID
, &numval
) == 0)
86 zed_log_msg(LOG_INFO
, "\t%s: %llu", ZFS_EV_VDEV_GUID
, numval
);
88 (void) zfs_agent_post_event(class, subclass
, nvl
);
92 * dev_event_nvlist: place event schema into an nv pair list
94 * NAME VALUE (example)
95 * -------------- --------------------------------------------------------
97 * DEV_PATH /devices/pci0000:00/0000:00:03.0/0000:04:00.0/host0/...
98 * DEV_IDENTIFIER ata-Hitachi_HTS725050A9A362_100601PCG420VLJ37DMC
99 * DEV_PHYS_PATH pci-0000:04:00.0-sas-0x4433221101000000-lun-0
101 * DEV_SIZE 500107862016
102 * ZFS_EV_POOL_GUID 17523635698032189180
103 * ZFS_EV_VDEV_GUID 14663607734290803088
106 dev_event_nvlist(struct udev_device
*dev
)
110 const char *value
, *path
;
113 if (nvlist_alloc(&nvl
, NV_UNIQUE_NAME
, 0) != 0)
116 if (zfs_device_get_devid(dev
, strval
, sizeof (strval
)) == 0)
117 (void) nvlist_add_string(nvl
, DEV_IDENTIFIER
, strval
);
118 if (zfs_device_get_physical(dev
, strval
, sizeof (strval
)) == 0)
119 (void) nvlist_add_string(nvl
, DEV_PHYS_PATH
, strval
);
120 if ((path
= udev_device_get_devnode(dev
)) != NULL
)
121 (void) nvlist_add_string(nvl
, DEV_NAME
, path
);
122 if ((value
= udev_device_get_devpath(dev
)) != NULL
)
123 (void) nvlist_add_string(nvl
, DEV_PATH
, value
);
124 value
= udev_device_get_devtype(dev
);
125 if ((value
!= NULL
&& strcmp("partition", value
) == 0) ||
126 (udev_device_get_property_value(dev
, "ID_PART_ENTRY_NUMBER")
128 (void) nvlist_add_boolean(nvl
, DEV_IS_PART
);
130 if ((value
= udev_device_get_sysattr_value(dev
, "size")) != NULL
) {
131 uint64_t numval
= DEV_BSIZE
;
133 numval
*= strtoull(value
, NULL
, 10);
134 (void) nvlist_add_uint64(nvl
, DEV_SIZE
, numval
);
137 * If the device has a parent, then get the parent block
138 * device's size as well. For example, /dev/sda1's parent
141 struct udev_device
*parent_dev
= udev_device_get_parent(dev
);
142 if (parent_dev
!= NULL
&&
143 (value
= udev_device_get_sysattr_value(parent_dev
, "size"))
145 uint64_t numval
= DEV_BSIZE
;
147 numval
*= strtoull(value
, NULL
, 10);
148 (void) nvlist_add_uint64(nvl
, DEV_PARENT_SIZE
, numval
);
153 * Grab the pool and vdev guids from blkid cache
155 value
= udev_device_get_property_value(dev
, "ID_FS_UUID");
156 if (value
!= NULL
&& (guid
= strtoull(value
, NULL
, 10)) != 0)
157 (void) nvlist_add_uint64(nvl
, ZFS_EV_POOL_GUID
, guid
);
159 value
= udev_device_get_property_value(dev
, "ID_FS_UUID_SUB");
160 if (value
!= NULL
&& (guid
= strtoull(value
, NULL
, 10)) != 0)
161 (void) nvlist_add_uint64(nvl
, ZFS_EV_VDEV_GUID
, guid
);
164 * Either a vdev guid or a devid must be present for matching
166 if (!nvlist_exists(nvl
, DEV_IDENTIFIER
) &&
167 !nvlist_exists(nvl
, ZFS_EV_VDEV_GUID
)) {
176 * Listen for block device uevents
179 zed_udev_monitor(void *arg
)
181 struct udev_monitor
*mon
= arg
;
185 zed_log_msg(LOG_INFO
, "Waiting for new udev disk events...");
188 struct udev_device
*dev
;
189 const char *action
, *type
, *part
, *sectors
;
190 const char *bus
, *uuid
, *devpath
;
191 const char *class, *subclass
;
193 boolean_t is_zfs
= B_FALSE
;
195 /* allow a cancellation while blocked (recvmsg) */
196 pthread_setcancelstate(PTHREAD_CANCEL_ENABLE
, NULL
);
198 /* blocks at recvmsg until an event occurs */
199 if ((dev
= udev_monitor_receive_device(mon
)) == NULL
) {
200 zed_log_msg(LOG_WARNING
, "zed_udev_monitor: receive "
201 "device error %d", errno
);
205 /* allow all steps to complete before a cancellation */
206 pthread_setcancelstate(PTHREAD_CANCEL_DISABLE
, NULL
);
209 * Strongly typed device is the preferred filter
211 type
= udev_device_get_property_value(dev
, "ID_FS_TYPE");
212 if (type
!= NULL
&& type
[0] != '\0') {
213 if (strcmp(type
, "zfs_member") == 0) {
216 /* not ours, so skip */
217 zed_log_msg(LOG_INFO
, "zed_udev_monitor: skip "
219 udev_device_get_devnode(dev
), type
);
220 udev_device_unref(dev
);
226 * if this is a disk and it is partitioned, then the
227 * zfs label will reside in a DEVTYPE=partition and
228 * we can skip passing this event
230 * Special case: Blank disks are sometimes reported with
231 * an erroneous 'atari' partition, and should not be
232 * excluded from being used as an autoreplace disk:
234 * https://github.com/openzfs/zfs/issues/13497
236 type
= udev_device_get_property_value(dev
, "DEVTYPE");
237 part
= udev_device_get_property_value(dev
,
238 "ID_PART_TABLE_TYPE");
239 if (type
!= NULL
&& type
[0] != '\0' &&
240 strcmp(type
, "disk") == 0 &&
241 part
!= NULL
&& part
[0] != '\0') {
242 const char *devname
=
243 udev_device_get_property_value(dev
, "DEVNAME");
245 if (strcmp(part
, "atari") == 0) {
246 zed_log_msg(LOG_INFO
,
247 "%s: %s is reporting an atari partition, "
248 "but we're going to assume it's a false "
249 "positive and still use it (issue #13497)",
252 zed_log_msg(LOG_INFO
,
253 "%s: skip %s since it has a %s partition "
254 "already", __func__
, devname
, part
);
255 /* skip and wait for partition event */
256 udev_device_unref(dev
);
262 * ignore small partitions
264 sectors
= udev_device_get_property_value(dev
,
265 "ID_PART_ENTRY_SIZE");
267 sectors
= udev_device_get_sysattr_value(dev
, "size");
268 if (sectors
!= NULL
&&
269 strtoull(sectors
, NULL
, 10) < MINIMUM_SECTORS
) {
270 zed_log_msg(LOG_INFO
,
271 "%s: %s sectors %s < %llu (minimum)",
273 udev_device_get_property_value(dev
, "DEVNAME"),
274 sectors
, MINIMUM_SECTORS
);
275 udev_device_unref(dev
);
280 * If the blkid probe didn't find ZFS, then a persistent
281 * device id string is required in the message schema
282 * for matching with vdevs. Preflight here for expected
286 * NVMe devices don't have ID_BUS set (at least on RHEL 7-8),
287 * but they are valid for autoreplace. Add a special case for
288 * them by searching for "/nvme/" in the udev DEVPATH:
290 * DEVPATH=/devices/pci0000:00/0000:00:1e.0/nvme/nvme2/nvme2n1
292 bus
= udev_device_get_property_value(dev
, "ID_BUS");
293 uuid
= udev_device_get_property_value(dev
, "DM_UUID");
294 devpath
= udev_device_get_devpath(dev
);
295 if (!is_zfs
&& (bus
== NULL
&& uuid
== NULL
&&
296 strstr(devpath
, "/nvme/") == NULL
)) {
297 zed_log_msg(LOG_INFO
, "zed_udev_monitor: %s no devid "
298 "source", udev_device_get_devnode(dev
));
299 udev_device_unref(dev
);
303 action
= udev_device_get_action(dev
);
304 if (strcmp(action
, "add") == 0) {
307 } else if (strcmp(action
, "remove") == 0) {
308 class = EC_DEV_REMOVE
;
310 } else if (strcmp(action
, "change") == 0) {
311 class = EC_DEV_STATUS
;
312 subclass
= ESC_DEV_DLE
;
314 zed_log_msg(LOG_WARNING
, "zed_udev_monitor: %s unknown",
316 udev_device_unref(dev
);
321 * Special case an EC_DEV_ADD for multipath devices
323 * When a multipath device is created, udev reports the
326 * 1. "add" event of the dm device for the multipath device
328 * 2. "change" event to create the actual multipath device
329 * symlink (like /dev/mapper/mpatha). The event also
330 * passes back the relevant DM vars we care about, like
332 * 3. Another "change" event identical to #2 (that we ignore).
334 * To get the behavior we want, we treat the "change" event
335 * in #2 as a "add" event; as if "/dev/mapper/mpatha" was
336 * a new disk being added.
338 if (strcmp(class, EC_DEV_STATUS
) == 0 &&
339 udev_device_get_property_value(dev
, "DM_UUID") &&
340 udev_device_get_property_value(dev
, "MPATH_SBIN_PATH")) {
341 tmp
= udev_device_get_devnode(dev
);
342 tmp2
= zfs_get_underlying_path(tmp
);
343 if (tmp
&& tmp2
&& (strcmp(tmp
, tmp2
) != 0)) {
345 * We have a real underlying device, which
346 * means that this multipath "change" event is
349 * If the multipath device and the underlying
350 * dev are the same name (i.e. /dev/dm-5), then
351 * there is no real underlying disk for this
352 * multipath device, and so this "change" event
353 * really is a multipath removal.
358 tmp
= udev_device_get_property_value(dev
,
359 "DM_NR_VALID_PATHS");
360 /* treat as a multipath remove */
361 if (tmp
!= NULL
&& strcmp(tmp
, "0") == 0) {
362 class = EC_DEV_REMOVE
;
370 * Special case an EC_DEV_ADD for scsi_debug devices
372 * These devices require a udevadm trigger command after
373 * creation in order to register the vdev_id scsidebug alias
374 * rule (adds a persistent path (phys_path) used for fault
375 * management automated tests in the ZFS test suite.
377 * After udevadm trigger command, event registers as a "change"
378 * event but needs to instead be handled as another "add" event
379 * to allow for disk labeling and partitioning to occur.
381 if (strcmp(class, EC_DEV_STATUS
) == 0 &&
382 udev_device_get_property_value(dev
, "ID_VDEV") &&
383 udev_device_get_property_value(dev
, "ID_MODEL")) {
384 const char *id_model
, *id_model_sd
= "scsi_debug";
386 id_model
= udev_device_get_property_value(dev
,
388 if (strcmp(id_model
, id_model_sd
) == 0) {
394 if ((nvl
= dev_event_nvlist(dev
)) != NULL
) {
395 zed_udev_event(class, subclass
, nvl
);
399 udev_device_unref(dev
);
406 zed_disk_event_init(void)
410 if ((g_udev
= udev_new()) == NULL
) {
411 zed_log_msg(LOG_WARNING
, "udev_new failed (%d)", errno
);
415 /* Set up a udev monitor for block devices */
416 g_mon
= udev_monitor_new_from_netlink(g_udev
, "udev");
417 udev_monitor_filter_add_match_subsystem_devtype(g_mon
, "block", "disk");
418 udev_monitor_filter_add_match_subsystem_devtype(g_mon
, "block",
420 udev_monitor_enable_receiving(g_mon
);
422 /* Make sure monitoring socket is blocking */
423 fd
= udev_monitor_get_fd(g_mon
);
424 if ((fflags
= fcntl(fd
, F_GETFL
)) & O_NONBLOCK
)
425 (void) fcntl(fd
, F_SETFL
, fflags
& ~O_NONBLOCK
);
427 /* spawn a thread to monitor events */
428 if (pthread_create(&g_mon_tid
, NULL
, zed_udev_monitor
, g_mon
) != 0) {
429 udev_monitor_unref(g_mon
);
431 zed_log_msg(LOG_WARNING
, "pthread_create failed");
435 pthread_setname_np(g_mon_tid
, "udev monitor");
436 zed_log_msg(LOG_INFO
, "zed_disk_event_init");
442 zed_disk_event_fini(void)
444 /* cancel monitor thread at recvmsg() */
445 (void) pthread_cancel(g_mon_tid
);
446 (void) pthread_join(g_mon_tid
, NULL
);
448 /* cleanup udev resources */
449 udev_monitor_unref(g_mon
);
452 zed_log_msg(LOG_INFO
, "zed_disk_event_fini");
457 #include "zed_disk_event.h"
460 zed_disk_event_init(void)
466 zed_disk_event_fini(void)
470 #endif /* HAVE_LIBUDEV */