2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
13 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
17 * Disk Lights Agent (FMA)
19 * This Fault Management Daemon (fmd) module periodically scans the topology
20 * tree, enumerates all disks with associated fault indicators, and then
21 * synchronises the fault status of resources in the FMA Resource Cache with
22 * the indicators. In short: it turns the fault light on for befallen disks.
24 * Presently, we recognise associated fault indicators for disks by looking
25 * for the following structure in the topology tree:
29 * +---- /disk=0 <---------------- our Disk
31 * +---- /bay=N?indicator=fail <---- the Fault Light
32 * \---- /bay=N?indicator=ident
34 * That is: a DISK node will have a parent BAY; that BAY will itself have
35 * child Facility nodes, one of which will be called "fail". If any of the
36 * above does not hold, we simply do nothing for this disk.
41 #include <libnvpair.h>
42 #include <fm/libtopo.h>
43 #include <fm/topo_list.h>
44 #include <fm/topo_hc.h>
45 #include <fm/fmd_api.h>
46 #include <sys/fm/protocol.h>
49 typedef struct disk_lights
{
51 uint64_t dl_poll_interval
;
52 uint64_t dl_coalesce_interval
;
54 boolean_t dl_triggered
;
57 static void disklights_topo(fmd_hdl_t
*, topo_hdl_t
*);
58 static void disklights_recv(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
60 static void disklights_timeout(fmd_hdl_t
*, id_t
, void *);
62 static const fmd_hdl_ops_t fmd_ops
= {
63 disklights_recv
, /* fmdo_recv */
64 disklights_timeout
, /* fmdo_timeout */
65 NULL
, /* fmdo_close */
66 NULL
, /* fmdo_stats */
69 disklights_topo
, /* fmdo_topo */
73 * POLL_INTERVAL is the period after which we perform an unsolicited poll
74 * to ensure we remain in sync with reality.
76 #define DL_PROP_POLL_INTERVAL "poll-interval"
79 * COALESCE_INTERVAL is how long we wait after we are triggered by either a
80 * topology change or a relevant list.* event, in order to allow a series
81 * of events to coalesce.
83 #define DL_PROP_COALESCE_INTERVAL "coalesce-interval"
85 static const fmd_prop_t fmd_props
[] = {
86 { DL_PROP_POLL_INTERVAL
, FMD_TYPE_TIME
, "5min" },
87 { DL_PROP_COALESCE_INTERVAL
, FMD_TYPE_TIME
, "3s" },
91 static const fmd_hdl_info_t fmd_info
= {
99 * Fetch the Facility Node properties (name, type) from the FMRI
100 * for this node, or return -1 if we can't.
103 get_facility_props(topo_hdl_t
*hdl
, tnode_t
*node
, char **facname
,
107 nvlist_t
*fmri
= NULL
, *fnvl
;
108 char *nn
= NULL
, *tt
= NULL
;
110 if (topo_node_resource(node
, &fmri
, &e
) != 0)
113 if (nvlist_lookup_nvlist(fmri
, FM_FMRI_FACILITY
, &fnvl
) != 0)
116 if (nvlist_lookup_string(fnvl
, FM_FMRI_FACILITY_NAME
, &nn
) != 0)
119 if (nvlist_lookup_string(fnvl
, FM_FMRI_FACILITY_TYPE
, &tt
) != 0)
122 *facname
= topo_hdl_strdup(hdl
, nn
);
123 *factype
= topo_hdl_strdup(hdl
, tt
);
131 typedef struct dl_fault_walk_inner
{
134 } dl_fault_walk_inner_t
;
137 dl_fault_walk_inner(topo_hdl_t
*thp
, tnode_t
*node
, void *arg
)
139 dl_fault_walk_inner_t
*fwi
= arg
;
140 char *facname
= NULL
, *factype
= NULL
;
144 * We're only interested in BAY children that are valid Facility Nodes.
146 if (topo_node_flags(node
) != TOPO_NODE_FACILITY
||
147 get_facility_props(thp
, node
, &facname
, &factype
) != 0) {
151 if (strcmp(fwi
->fwi_name
, facname
) != 0)
155 * Attempt to set the LED mode appropriately. If this fails, give up
158 (void) topo_prop_set_uint32(node
, TOPO_PGROUP_FACILITY
, TOPO_LED_MODE
,
159 TOPO_PROP_MUTABLE
, fwi
->fwi_mode
, &err
);
162 topo_hdl_strfree(thp
, facname
);
163 topo_hdl_strfree(thp
, factype
);
164 return (TOPO_WALK_NEXT
);
168 dl_fault_walk_outer(topo_hdl_t
*thp
, tnode_t
*node
, void *arg
)
170 disk_lights_t
*dl
= arg
;
171 dl_fault_walk_inner_t fwi
;
174 nvlist_t
*fmri
= NULL
;
176 bzero(&fwi
, sizeof (fwi
));
179 * We are only looking for DISK nodes in the topology that have a parent
182 if (strcmp(DISK
, topo_node_name(node
)) != 0 ||
183 (pnode
= topo_node_parent(node
)) == NULL
||
184 strcmp(BAY
, topo_node_name(pnode
)) != 0) {
185 return (TOPO_WALK_NEXT
);
189 * Check to see if the Resource this FMRI describes is Faulty:
191 if (topo_node_resource(node
, &fmri
, &err
) != 0)
192 return (TOPO_WALK_NEXT
);
193 has_fault
= fmd_nvl_fmri_has_fault(dl
->dl_fmd
, fmri
,
194 FMD_HAS_FAULT_RESOURCE
, NULL
);
198 * Walk the children of this BAY and flush out our fault status if
199 * we find an appropriate indicator node.
201 fwi
.fwi_name
= "fail";
202 fwi
.fwi_mode
= has_fault
? TOPO_LED_STATE_ON
: TOPO_LED_STATE_OFF
;
203 (void) topo_node_child_walk(thp
, pnode
, dl_fault_walk_inner
, &fwi
,
206 return (TOPO_WALK_NEXT
);
210 * Walk all of the topology nodes looking for DISKs that match the structure
211 * described in the overview. Once we find them, check their fault status
212 * and update their fault indicator accordingly.
215 dl_examine_topo(disk_lights_t
*dl
)
218 topo_hdl_t
*thp
= NULL
;
219 topo_walk_t
*twp
= NULL
;
221 thp
= fmd_hdl_topo_hold(dl
->dl_fmd
, TOPO_VERSION
);
222 if ((twp
= topo_walk_init(thp
, FM_FMRI_SCHEME_HC
, dl_fault_walk_outer
,
223 dl
, &err
)) == NULL
) {
224 fmd_hdl_error(dl
->dl_fmd
, "failed to get topology: %s\n",
229 if (topo_walk_step(twp
, TOPO_WALK_CHILD
) == TOPO_WALK_ERR
) {
230 fmd_hdl_error(dl
->dl_fmd
, "failed to walk topology: %s\n",
239 fmd_hdl_topo_rele(dl
->dl_fmd
, thp
);
243 dl_trigger_enum(disk_lights_t
*dl
)
246 * If we're already on the short-poll coalesce timer, then return
249 if (dl
->dl_triggered
== B_TRUE
)
251 dl
->dl_triggered
= B_TRUE
;
254 * Replace existing poll timer with coalesce timer:
256 if (dl
->dl_timer
!= 0)
257 fmd_timer_remove(dl
->dl_fmd
, dl
->dl_timer
);
258 dl
->dl_timer
= fmd_timer_install(dl
->dl_fmd
, NULL
, NULL
,
259 dl
->dl_coalesce_interval
);
264 disklights_timeout(fmd_hdl_t
*hdl
, id_t id
, void *data
)
266 disk_lights_t
*dl
= fmd_hdl_getspecific(hdl
);
268 dl
->dl_triggered
= B_FALSE
;
273 * Install the long-interval timer for the next poll.
275 dl
->dl_timer
= fmd_timer_install(hdl
, NULL
, NULL
, dl
->dl_poll_interval
);
280 disklights_topo(fmd_hdl_t
*hdl
, topo_hdl_t
*thp
)
282 disk_lights_t
*dl
= fmd_hdl_getspecific(hdl
);
289 disklights_recv(fmd_hdl_t
*hdl
, fmd_event_t
*ep
, nvlist_t
*nvl
,
292 disk_lights_t
*dl
= fmd_hdl_getspecific(hdl
);
298 _fmd_init(fmd_hdl_t
*hdl
)
302 if (fmd_hdl_register(hdl
, FMD_API_VERSION
, &fmd_info
) != 0)
305 dl
= fmd_hdl_zalloc(hdl
, sizeof (*dl
), FMD_SLEEP
);
306 fmd_hdl_setspecific(hdl
, dl
);
309 * Load Configuration:
312 dl
->dl_poll_interval
= fmd_prop_get_int64(hdl
, DL_PROP_POLL_INTERVAL
);
313 dl
->dl_coalesce_interval
= fmd_prop_get_int64(hdl
,
314 DL_PROP_COALESCE_INTERVAL
);
317 * Schedule the initial enumeration:
323 _fmd_fini(fmd_hdl_t
*hdl
)
325 disk_lights_t
*dl
= fmd_hdl_getspecific(hdl
);
327 fmd_hdl_free(hdl
, dl
, sizeof (*dl
));