4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
33 #include <fm/fmd_api.h>
34 #include <sys/fm/protocol.h>
35 #include <sys/systeminfo.h>
36 #include <sys/utsname.h>
39 #include <sys/fm/ldom.h>
/*
 * File-scope state shared by the routines below.
 *
 * init_hdl is the fmd handle that cma_init_alloc() and cma_init_free()
 * hand to fmd_hdl_alloc() and fmd_hdl_free().
 * NOTE(review): the assignment to init_hdl is not visible in this view.
 */
41 static fmd_hdl_t
*init_hdl
;
/*
 * Assigned B_TRUE or B_FALSE by _fmd_init() according to the platform
 * string returned by sysinfo(SI_PLATFORM); B_TRUE is used for "i86pc".
 */
46 boolean_t cma_is_native
;
/*
 * Declared here and defined outside this file; returns a const char *.
 * NOTE(review): presumably the platform name -- confirm against the
 * definition.
 */
49 extern const char *fmd_fmri_get_platform();
/*
 * Module statistics.  _fmd_init() registers this object with
 * fmd_stat_create(), treating it as an array of fmd_stat_t.  Each
 * initializer below supplies a statistic name string, its type
 * (FMD_TYPE_UINT64 for every entry) and a description string.
 *
 * Within this file, bad_flts is incremented by nvl2subr() and
 * cma_recv_list(), and nop_flts is incremented by nvl2subr().
 */
53 cma_stats_t cma_stats
= {
54 { "cpu_flts", FMD_TYPE_UINT64
, "cpu faults resolved" },
55 { "cpu_repairs", FMD_TYPE_UINT64
, "cpu faults repaired" },
56 { "cpu_fails", FMD_TYPE_UINT64
, "cpu faults unresolveable" },
57 { "cpu_blfails", FMD_TYPE_UINT64
, "failed cpu blacklists" },
58 { "cpu_supp", FMD_TYPE_UINT64
, "cpu offlines suppressed" },
59 { "cpu_blsupp", FMD_TYPE_UINT64
, "cpu blacklists suppressed" },
60 { "page_flts", FMD_TYPE_UINT64
, "page faults resolved" },
61 { "page_repairs", FMD_TYPE_UINT64
, "page faults repaired" },
62 { "page_fails", FMD_TYPE_UINT64
, "page faults unresolveable" },
63 { "page_supp", FMD_TYPE_UINT64
, "page retires suppressed" },
64 { "page_nonent", FMD_TYPE_UINT64
, "retires for non-existent fmris" },
65 { "bad_flts", FMD_TYPE_UINT64
, "invalid fault events received" },
66 { "nop_flts", FMD_TYPE_UINT64
, "inapplicable fault events received" },
67 { "auto_flts", FMD_TYPE_UINT64
, "auto-close faults received" }
/*
 * One row of the cma_subrs dispatch table.
 *
 * NOTE(review): nvl2subr() also reads a subr_svers member whose
 * declaration is not visible in this view.
 */
70 typedef struct cma_subscriber
{
/* Fault class pattern, tested with fmd_nvl_class_match() in nvl2subr(). */
71 const char *subr_class
;
/* FMRI scheme name, compared with strcmp() against the ASRU's scheme. */
72 const char *subr_sname
;
/*
 * Handler invoked by cma_recv_list() and cma_recv_one() with the module
 * handle, the fault nvlist, the ASRU nvlist, a case uuid string (NULL
 * from cma_recv_one()) and a trailing argument.  May be NULL, in which
 * case the callers do not invoke it.
 */
74 int (*subr_func
)(fmd_hdl_t
*, nvlist_t
*, nvlist_t
*, const char *,
/*
 * Dispatch table mapping (fault class pattern, ASRU scheme, scheme
 * version) to a retire handler.  nvl2subr() walks it from the first
 * entry and tests each row with fmd_nvl_class_match(), a strcmp() on the
 * scheme name and a version comparison, so the order of rows matters:
 * specific classes are listed ahead of the "fault.cpu.*" wildcards.
 * Rows whose handler is NULL are still matched, but the callers skip the
 * handler call for them.
 *
 * NOTE(review): the embedded text refers to x86, sun4v, opl and "other"
 * platform sections; the preprocessor conditionals that select between
 * them, and the handler field of many rows, are not visible in this view.
 */
78 static const cma_subscriber_t cma_subrs
[] = {
81 * On x86, the ASRUs are expected to be in hc scheme. When
82 * cpumem-retire wants to retire a cpu or mem page, it calls the
83 * methods registered in the topo node to do that. The topo
84 * enumerator, which necessarily knows all the config info that
85 * we'd ever need in deciding what/how to retire etc. This takes
86 * away much of that complexity from the agent into the entity
87 * that knows all config/topo information.
89 { "fault.memory.page", FM_FMRI_SCHEME_HC
, FM_HC_SCHEME_VERSION
,
91 { "fault.memory.page_sb", FM_FMRI_SCHEME_HC
, FM_HC_SCHEME_VERSION
,
93 { "fault.memory.page_ck", FM_FMRI_SCHEME_HC
, FM_HC_SCHEME_VERSION
,
95 { "fault.memory.page_ue", FM_FMRI_SCHEME_HC
, FM_HC_SCHEME_VERSION
,
97 { "fault.memory.generic-x86.page_ce", FM_FMRI_SCHEME_HC
,
98 FM_HC_SCHEME_VERSION
, cma_page_retire
},
99 { "fault.memory.generic-x86.page_ue", FM_FMRI_SCHEME_HC
,
100 FM_HC_SCHEME_VERSION
, cma_page_retire
},
101 { "fault.memory.intel.page_ce", FM_FMRI_SCHEME_HC
,
102 FM_HC_SCHEME_VERSION
, cma_page_retire
},
103 { "fault.memory.intel.page_ue", FM_FMRI_SCHEME_HC
,
104 FM_HC_SCHEME_VERSION
, cma_page_retire
},
105 { "fault.memory.dimm", FM_FMRI_SCHEME_HC
, FM_HC_SCHEME_VERSION
,
107 { "fault.memory.dimm_sb", FM_FMRI_SCHEME_HC
, FM_HC_SCHEME_VERSION
,
109 { "fault.memory.dimm_ck", FM_FMRI_SCHEME_HC
, FM_HC_SCHEME_VERSION
,
111 { "fault.memory.dimm_ue", FM_FMRI_SCHEME_HC
, FM_HC_SCHEME_VERSION
,
113 { "fault.memory.generic-x86.dimm_ce", FM_FMRI_SCHEME_HC
,
114 FM_HC_SCHEME_VERSION
, NULL
},
115 { "fault.memory.generic-x86.dimm_ue", FM_FMRI_SCHEME_HC
,
116 FM_HC_SCHEME_VERSION
, NULL
},
117 { "fault.memory.intel.dimm_ce", FM_FMRI_SCHEME_HC
,
118 FM_HC_SCHEME_VERSION
, NULL
},
119 { "fault.memory.intel.dimm_ue", FM_FMRI_SCHEME_HC
,
120 FM_HC_SCHEME_VERSION
, NULL
},
121 { "fault.memory.intel.fbd.*", FM_FMRI_SCHEME_HC
,
122 FM_HC_SCHEME_VERSION
, NULL
},
123 { "fault.memory.dimm_testfail", FM_FMRI_SCHEME_HC
,
124 FM_HC_SCHEME_VERSION
, NULL
},
125 { "fault.memory.bank", FM_FMRI_SCHEME_HC
, FM_HC_SCHEME_VERSION
,
127 { "fault.memory.datapath", FM_FMRI_SCHEME_HC
, FM_HC_SCHEME_VERSION
,
129 { "fault.cpu.intel.quickpath.mem_scrubbing", FM_FMRI_SCHEME_HC
,
130 FM_HC_SCHEME_VERSION
, cma_page_retire
},
131 { "fault.cpu.intel.quickpath.*", FM_FMRI_SCHEME_HC
,
132 FM_HC_SCHEME_VERSION
, NULL
},
133 { "fault.cpu.generic-x86.mc", FM_FMRI_SCHEME_HC
,
134 FM_HC_SCHEME_VERSION
, NULL
},
135 { "fault.cpu.intel.dma", FM_FMRI_SCHEME_HC
,
136 FM_HC_SCHEME_VERSION
, NULL
},
137 { "fault.cpu.intel.dma", FM_FMRI_SCHEME_CPU
,
138 FM_CPU_SCHEME_VERSION
, NULL
},
141 * The ASRU for cpu faults are in cpu scheme on native and in hc
142 * scheme on xpv. So each cpu fault class needs to be listed twice.
146 * The following faults do NOT retire a cpu thread,
147 * and therefore must be intercepted before
148 * the default "fault.cpu.*" dispatch to cma_cpu_hc_retire.
150 { "fault.cpu.amd.dramchannel", FM_FMRI_SCHEME_HC
, FM_HC_SCHEME_VERSION
,
152 { "fault.cpu.amd.dramchannel", FM_FMRI_SCHEME_CPU
,
153 FM_CPU_SCHEME_VERSION
, NULL
},
154 { "fault.cpu.generic-x86.bus_interconnect_memory", FM_FMRI_SCHEME_HC
,
155 FM_HC_SCHEME_VERSION
, NULL
},
156 { "fault.cpu.generic-x86.bus_interconnect_memory", FM_FMRI_SCHEME_CPU
,
157 FM_CPU_SCHEME_VERSION
, NULL
},
158 { "fault.cpu.generic-x86.bus_interconnect_io", FM_FMRI_SCHEME_HC
,
159 FM_HC_SCHEME_VERSION
, NULL
},
160 { "fault.cpu.generic-x86.bus_interconnect_io", FM_FMRI_SCHEME_CPU
,
161 FM_CPU_SCHEME_VERSION
, NULL
},
162 { "fault.cpu.generic-x86.bus_interconnect", FM_FMRI_SCHEME_HC
,
163 FM_HC_SCHEME_VERSION
, NULL
},
164 { "fault.cpu.generic-x86.bus_interconnect", FM_FMRI_SCHEME_CPU
,
165 FM_CPU_SCHEME_VERSION
, NULL
},
166 { "fault.cpu.intel.bus_interconnect_memory", FM_FMRI_SCHEME_HC
,
167 FM_HC_SCHEME_VERSION
, NULL
},
168 { "fault.cpu.intel.bus_interconnect_memory", FM_FMRI_SCHEME_CPU
,
169 FM_CPU_SCHEME_VERSION
, NULL
},
170 { "fault.cpu.intel.bus_interconnect_io", FM_FMRI_SCHEME_HC
,
171 FM_HC_SCHEME_VERSION
, NULL
},
172 { "fault.cpu.intel.bus_interconnect_io", FM_FMRI_SCHEME_CPU
,
173 FM_CPU_SCHEME_VERSION
, NULL
},
174 { "fault.cpu.intel.bus_interconnect", FM_FMRI_SCHEME_HC
,
175 FM_HC_SCHEME_VERSION
, NULL
},
176 { "fault.cpu.intel.bus_interconnect", FM_FMRI_SCHEME_CPU
,
177 FM_CPU_SCHEME_VERSION
, NULL
},
178 { "fault.cpu.intel.nb.*", FM_FMRI_SCHEME_HC
, FM_HC_SCHEME_VERSION
,
180 { "fault.cpu.intel.nb.*", FM_FMRI_SCHEME_CPU
, FM_CPU_SCHEME_VERSION
,
182 { "fault.cpu.intel.dma", FM_FMRI_SCHEME_HC
, FM_HC_SCHEME_VERSION
,
184 { "fault.cpu.intel.dma", FM_FMRI_SCHEME_CPU
, FM_CPU_SCHEME_VERSION
,
186 { "fault.cpu.*", FM_FMRI_SCHEME_HC
, FM_HC_SCHEME_VERSION
,
188 { "fault.cpu.*", FM_FMRI_SCHEME_CPU
, FM_CPU_SCHEME_VERSION
,
192 * The following are PI sun4v faults
194 { "fault.memory.memlink", FM_FMRI_SCHEME_HC
,
195 FM_HC_SCHEME_VERSION
, NULL
},
196 { "fault.memory.memlink-uc", FM_FMRI_SCHEME_HC
,
197 FM_HC_SCHEME_VERSION
, NULL
},
198 { "fault.memory.memlink-failover", FM_FMRI_SCHEME_HC
,
199 FM_HC_SCHEME_VERSION
, NULL
},
200 { "fault.memory.dimm-ue-imminent", FM_FMRI_SCHEME_HC
,
201 FM_HC_SCHEME_VERSION
, NULL
},
202 { "fault.memory.dram-ue-imminent", FM_FMRI_SCHEME_HC
,
203 FM_HC_SCHEME_VERSION
, NULL
},
204 { "fault.memory.dimm-page-retires-excessive", FM_FMRI_SCHEME_HC
,
205 FM_HC_SCHEME_VERSION
, NULL
},
206 { "fault.memory.page", FM_FMRI_SCHEME_MEM
, FM_MEM_SCHEME_VERSION
,
208 { "fault.memory.dimm", FM_FMRI_SCHEME_MEM
, FM_MEM_SCHEME_VERSION
,
210 { "fault.memory.dimm_sb", FM_FMRI_SCHEME_MEM
, FM_MEM_SCHEME_VERSION
,
212 { "fault.memory.dimm_ck", FM_FMRI_SCHEME_MEM
, FM_MEM_SCHEME_VERSION
,
214 { "fault.memory.dimm_ue", FM_FMRI_SCHEME_MEM
, FM_MEM_SCHEME_VERSION
,
216 { "fault.memory.dimm-page-retires-excessive", FM_FMRI_SCHEME_MEM
,
217 FM_MEM_SCHEME_VERSION
, NULL
},
218 { "fault.memory.dimm-ue-imminent", FM_FMRI_SCHEME_MEM
,
219 FM_MEM_SCHEME_VERSION
, NULL
},
220 { "fault.memory.dram-ue-imminent", FM_FMRI_SCHEME_MEM
,
221 FM_MEM_SCHEME_VERSION
, NULL
},
222 { "fault.memory.bank", FM_FMRI_SCHEME_MEM
, FM_MEM_SCHEME_VERSION
,
224 { "fault.memory.datapath", FM_FMRI_SCHEME_MEM
, FM_MEM_SCHEME_VERSION
,
226 { "fault.memory.datapath", FM_FMRI_SCHEME_HC
, FM_HC_SCHEME_VERSION
,
228 { "fault.memory.link-c", FM_FMRI_SCHEME_MEM
, FM_MEM_SCHEME_VERSION
,
230 { "fault.memory.link-u", FM_FMRI_SCHEME_MEM
, FM_MEM_SCHEME_VERSION
,
232 { "fault.memory.link-f", FM_FMRI_SCHEME_MEM
, FM_MEM_SCHEME_VERSION
,
234 { "fault.memory.link-c", FM_FMRI_SCHEME_HC
, FM_HC_SCHEME_VERSION
,
236 { "fault.memory.link-u", FM_FMRI_SCHEME_HC
, FM_HC_SCHEME_VERSION
,
238 { "fault.memory.link-f", FM_FMRI_SCHEME_HC
, FM_HC_SCHEME_VERSION
,
242 * The following ultraSPARC-T1/T2 faults do NOT retire a cpu thread,
243 * and therefore must be intercepted before
244 * the default "fault.cpu.*" dispatch to cma_cpu_hc_retire.
246 { "fault.cpu.*.l2cachedata", FM_FMRI_SCHEME_CPU
,
247 FM_CPU_SCHEME_VERSION
, NULL
},
248 { "fault.cpu.*.l2cachetag", FM_FMRI_SCHEME_CPU
,
249 FM_CPU_SCHEME_VERSION
, NULL
},
250 { "fault.cpu.*.l2cachectl", FM_FMRI_SCHEME_CPU
,
251 FM_CPU_SCHEME_VERSION
, NULL
},
252 { "fault.cpu.*.l2data-c", FM_FMRI_SCHEME_CPU
,
253 FM_CPU_SCHEME_VERSION
, NULL
},
254 { "fault.cpu.*.l2data-u", FM_FMRI_SCHEME_CPU
,
255 FM_CPU_SCHEME_VERSION
, NULL
},
256 { "fault.cpu.*.mau", FM_FMRI_SCHEME_CPU
,
257 FM_CPU_SCHEME_VERSION
, NULL
},
258 { "fault.cpu.*.lfu-u", FM_FMRI_SCHEME_CPU
,
259 FM_CPU_SCHEME_VERSION
, NULL
},
260 { "fault.cpu.*.lfu-f", FM_FMRI_SCHEME_CPU
,
261 FM_CPU_SCHEME_VERSION
, NULL
},
262 { "fault.cpu.*.lfu-p", FM_FMRI_SCHEME_CPU
,
263 FM_CPU_SCHEME_VERSION
, NULL
},
264 { "fault.cpu.ultraSPARC-T1.freg", FM_FMRI_SCHEME_CPU
,
265 FM_CPU_SCHEME_VERSION
, NULL
},
266 { "fault.cpu.ultraSPARC-T1.l2cachedata", FM_FMRI_SCHEME_CPU
,
267 FM_CPU_SCHEME_VERSION
, NULL
},
268 { "fault.cpu.ultraSPARC-T1.l2cachetag", FM_FMRI_SCHEME_CPU
,
269 FM_CPU_SCHEME_VERSION
, NULL
},
270 { "fault.cpu.ultraSPARC-T1.l2cachectl", FM_FMRI_SCHEME_CPU
,
271 FM_CPU_SCHEME_VERSION
, NULL
},
272 { "fault.cpu.ultraSPARC-T1.mau", FM_FMRI_SCHEME_CPU
,
273 FM_CPU_SCHEME_VERSION
, NULL
},
274 { "fault.cpu.ultraSPARC-T2plus.chip", FM_FMRI_SCHEME_HC
,
275 FM_HC_SCHEME_VERSION
, NULL
},
276 { "fault.cpu.*", FM_FMRI_SCHEME_HC
, FM_HC_SCHEME_VERSION
,
278 { "fault.cpu.*", FM_FMRI_SCHEME_CPU
, FM_CPU_SCHEME_VERSION
,
281 { "fault.memory.page", FM_FMRI_SCHEME_MEM
, FM_MEM_SCHEME_VERSION
,
283 { "fault.memory.dimm", FM_FMRI_SCHEME_MEM
, FM_MEM_SCHEME_VERSION
,
285 { "fault.memory.dimm-page-retires-excessive", FM_FMRI_SCHEME_MEM
,
286 FM_MEM_SCHEME_VERSION
, NULL
},
287 { "fault.memory.dimm-ue-imminent", FM_FMRI_SCHEME_MEM
,
288 FM_MEM_SCHEME_VERSION
, NULL
},
289 { "fault.memory.dram-ue-imminent", FM_FMRI_SCHEME_MEM
,
290 FM_MEM_SCHEME_VERSION
, NULL
},
291 { "fault.memory.bank", FM_FMRI_SCHEME_MEM
, FM_MEM_SCHEME_VERSION
,
293 { "fault.cpu.SPARC64-VI.*", FM_FMRI_SCHEME_CPU
, FM_CPU_SCHEME_VERSION
,
294 cma_cpu_cpu_retire
},
295 { "fault.cpu.SPARC64-VII.*", FM_FMRI_SCHEME_CPU
, FM_CPU_SCHEME_VERSION
,
296 cma_cpu_cpu_retire
},
297 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VI.core.se",
298 FM_FMRI_SCHEME_HC
, FM_HC_SCHEME_VERSION
, cma_cpu_hc_retire
},
299 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VI.core.se-offlinereq",
300 FM_FMRI_SCHEME_HC
, FM_HC_SCHEME_VERSION
, cma_cpu_hc_retire
},
301 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VI.core.ce",
302 FM_FMRI_SCHEME_HC
, FM_HC_SCHEME_VERSION
, cma_cpu_hc_retire
},
303 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VI.core.ce-offlinereq",
304 FM_FMRI_SCHEME_HC
, FM_HC_SCHEME_VERSION
, cma_cpu_hc_retire
},
305 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VII.core.se",
306 FM_FMRI_SCHEME_HC
, FM_HC_SCHEME_VERSION
, cma_cpu_hc_retire
},
307 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VII.core.se-offlinereq",
308 FM_FMRI_SCHEME_HC
, FM_HC_SCHEME_VERSION
, cma_cpu_hc_retire
},
309 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VII.core.ce",
310 FM_FMRI_SCHEME_HC
, FM_HC_SCHEME_VERSION
, cma_cpu_hc_retire
},
311 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VII.core.ce-offlinereq",
312 FM_FMRI_SCHEME_HC
, FM_HC_SCHEME_VERSION
, cma_cpu_hc_retire
},
315 * For platforms excluding i386, sun4v and opl.
317 { "fault.memory.page", FM_FMRI_SCHEME_MEM
, FM_MEM_SCHEME_VERSION
,
319 { "fault.memory.page_sb", FM_FMRI_SCHEME_MEM
, FM_MEM_SCHEME_VERSION
,
321 { "fault.memory.page_ck", FM_FMRI_SCHEME_MEM
, FM_MEM_SCHEME_VERSION
,
323 { "fault.memory.page_ue", FM_FMRI_SCHEME_MEM
, FM_MEM_SCHEME_VERSION
,
325 { "fault.memory.dimm", FM_FMRI_SCHEME_MEM
, FM_MEM_SCHEME_VERSION
,
327 { "fault.memory.dimm_sb", FM_FMRI_SCHEME_MEM
, FM_MEM_SCHEME_VERSION
,
329 { "fault.memory.dimm_ck", FM_FMRI_SCHEME_MEM
, FM_MEM_SCHEME_VERSION
,
331 { "fault.memory.dimm_ue", FM_FMRI_SCHEME_MEM
, FM_MEM_SCHEME_VERSION
,
333 { "fault.memory.dimm-page-retires-excessive", FM_FMRI_SCHEME_MEM
,
334 FM_MEM_SCHEME_VERSION
, NULL
},
335 { "fault.memory.dimm-ue-imminent", FM_FMRI_SCHEME_MEM
,
336 FM_MEM_SCHEME_VERSION
, NULL
},
337 { "fault.memory.dram-ue-imminent", FM_FMRI_SCHEME_MEM
,
338 FM_MEM_SCHEME_VERSION
, NULL
},
339 { "fault.memory.dimm_testfail", FM_FMRI_SCHEME_MEM
,
340 FM_MEM_SCHEME_VERSION
, NULL
},
341 { "fault.memory.bank", FM_FMRI_SCHEME_MEM
, FM_MEM_SCHEME_VERSION
,
343 { "fault.memory.datapath", FM_FMRI_SCHEME_MEM
, FM_MEM_SCHEME_VERSION
,
345 { "fault.memory.datapath", FM_FMRI_SCHEME_HC
, FM_HC_SCHEME_VERSION
,
347 { "fault.memory.datapath", FM_FMRI_SCHEME_CPU
, FM_CPU_SCHEME_VERSION
,
351 * The following faults do NOT retire a cpu thread,
352 * and therefore must be intercepted before
353 * the default "fault.cpu.*" dispatch to cma_cpu_cpu_retire.
355 { "fault.cpu.ultraSPARC-IVplus.l2cachedata-line",
356 FM_FMRI_SCHEME_CPU
, FM_CPU_SCHEME_VERSION
,
357 cma_cache_way_retire
},
358 { "fault.cpu.ultraSPARC-IVplus.l3cachedata-line",
359 FM_FMRI_SCHEME_CPU
, FM_CPU_SCHEME_VERSION
,
360 cma_cache_way_retire
},
361 { "fault.cpu.ultraSPARC-IVplus.l2cachetag-line",
362 FM_FMRI_SCHEME_CPU
, FM_CPU_SCHEME_VERSION
,
363 cma_cache_way_retire
},
364 { "fault.cpu.ultraSPARC-IVplus.l3cachetag-line",
365 FM_FMRI_SCHEME_CPU
, FM_CPU_SCHEME_VERSION
,
366 cma_cache_way_retire
},
369 * Default "fault.cpu.*" for "cpu" scheme ASRU dispatch.
371 { "fault.cpu.*", FM_FMRI_SCHEME_CPU
, FM_CPU_SCHEME_VERSION
,
372 cma_cpu_cpu_retire
},
/* Sentinel row: nvl2subr() stops iterating when subr_class is NULL. */
374 { NULL
, NULL
, 0, NULL
}
/*
 * Find the cma_subrs row that applies to a single fault event.
 *
 * hdl   - module handle, used for debug output and class matching
 * nvl   - the fault event
 * asrup - out parameter of type nvlist_t **; callers pass the address
 *         of their ASRU pointer and use that pointer afterwards.
 *         NOTE(review): the store through asrup is not visible in this
 *         view.
 *
 * Returns a pointer into cma_subrs.
 * NOTE(review): the return statements are not visible here; callers
 * compare the result against NULL.
 */
377 static const cma_subscriber_t
*
378 nvl2subr(fmd_hdl_t
*hdl
, nvlist_t
*nvl
, nvlist_t
**asrup
)
380 const cma_subscriber_t
*sp
;
/*
 * Consult the fault's FM_SUSPECT_RETIRE boolean; the debug message below
 * is emitted on the "retire suppressed" path.
 */
386 if (nvlist_lookup_boolean_value(nvl
, FM_SUSPECT_RETIRE
, &retire
) == 0 &&
388 fmd_hdl_debug(hdl
, "cma_recv: retire suppressed");
/*
 * The fault must carry an ASRU nvlist that itself carries a scheme
 * string and a uint8 version; if any lookup fails the event is counted
 * in bad_flts.
 */
392 if (nvlist_lookup_nvlist(nvl
, FM_FAULT_ASRU
, &asru
) != 0 ||
393 nvlist_lookup_string(asru
, FM_FMRI_SCHEME
, &scheme
) != 0 ||
394 nvlist_lookup_uint8(asru
, FM_VERSION
, &version
) != 0) {
395 cma_stats
.bad_flts
.fmds_value
.ui64
++;
/*
 * Scan the table in order.  A row applies when the class pattern
 * matches, the scheme names are equal and the ASRU version does not
 * exceed the row's scheme version.
 */
399 for (sp
= cma_subrs
; sp
->subr_class
!= NULL
; sp
++) {
400 if (fmd_nvl_class_match(hdl
, nvl
, sp
->subr_class
) &&
401 strcmp(scheme
, sp
->subr_sname
) == 0 &&
402 version
<= sp
->subr_svers
) {
/* Placed after the scan: count the event in nop_flts. */
408 cma_stats
.nop_flts
.fmds_value
.ui64
++;
/*
 * Handle a list event (cma_recv() routes list.suspect, list.repaired and
 * list.updated here).
 *
 * hdl   - module handle
 * nvl   - the list event; it supplies the case uuid (FM_SUSPECT_UUID)
 *         and the array of faults (FM_SUSPECT_FAULT_LIST)
 * class - event class string, compared against the FM_LIST_*_CLASS
 *         constants to select behavior
 *
 * Each fault is mapped to a cma_subrs row with nvl2subr() and, when the
 * row has a handler, the handler is called.  Afterwards the case may be
 * closed (list.suspect) or marked resolved (list.repaired) when keepopen
 * is zero.
 *
 * NOTE(review): several statements of this function are not visible in
 * this view, including how keepopen is updated.
 */
413 cma_recv_list(fmd_hdl_t
*hdl
, nvlist_t
*nvl
, const char *class)
416 nvlist_t
**nva
, **save_nva
;
417 uint_t nvc
= 0, save_nvc
;
420 nvlist_t
*asru
= NULL
;
/* Accumulate lookup failures for the uuid and the fault array in err. */
423 err
|= nvlist_lookup_string(nvl
, FM_SUSPECT_UUID
, &uuid
);
424 err
|= nvlist_lookup_nvlist_array(nvl
, FM_SUSPECT_FAULT_LIST
,
427 cma_stats
.bad_flts
.fmds_value
.ui64
++;
/* Remember the fault count; keepopen starts out equal to it. */
431 save_nvc
= keepopen
= nvc
;
/*
 * First pass over the faults.  For list.suspect the loop additionally
 * ends as soon as fmd_case_uuclosed() reports true for this uuid.
 */
433 while (nvc
-- != 0 && (strcmp(class, FM_LIST_SUSPECT_CLASS
) != 0 ||
434 !fmd_case_uuclosed(hdl
, uuid
))) {
435 nvlist_t
*nvl
= *nva
++;
436 const cma_subscriber_t
*subr
;
439 if ((subr
= nvl2subr(hdl
, nvl
, &asru
)) == NULL
)
443 * A handler returns CMA_RA_SUCCESS to indicate that
444 * from this suspects point-of-view the case may be
445 * closed, CMA_RA_FAILURE otherwise.
446 * A handler must not close the case itself.
448 if (subr
->subr_func
!= NULL
) {
449 has_fault
= fmd_nvl_fmri_has_fault(hdl
, asru
,
450 FMD_HAS_FAULT_ASRU
, NULL
);
451 if (strcmp(class, FM_LIST_SUSPECT_CLASS
) == 0) {
453 err
= subr
->subr_func(hdl
, nvl
, asru
,
457 err
= subr
->subr_func(hdl
, nvl
, asru
,
460 if (err
== CMA_RA_SUCCESS
)
466 * Run though again to catch any new faults in list.updated.
/* Second pass, taken only for list.updated events. */
468 while (save_nvc
-- != 0 && (strcmp(class, FM_LIST_UPDATED_CLASS
) == 0)) {
469 nvlist_t
*nvl
= *save_nva
++;
470 const cma_subscriber_t
*subr
;
473 if ((subr
= nvl2subr(hdl
, nvl
, &asru
)) == NULL
)
475 if (subr
->subr_func
!= NULL
) {
476 has_fault
= fmd_nvl_fmri_has_fault(hdl
, asru
,
477 FMD_HAS_FAULT_ASRU
, NULL
);
479 err
= subr
->subr_func(hdl
, nvl
, asru
, uuid
, 0);
484 * Do not close the case if we are handling cache faults.
487 if (nvlist_lookup_uint32(asru
, FM_FMRI_CPU_CACHE_INDEX
,
/* list.suspect with keepopen == 0: close the case. */
489 if (!keepopen
&& strcmp(class,
490 FM_LIST_SUSPECT_CLASS
) == 0) {
491 fmd_case_uuclose(hdl
, uuid
);
/* list.repaired with keepopen == 0: mark the case resolved. */
496 if (!keepopen
&& strcmp(class, FM_LIST_REPAIRED_CLASS
) == 0)
497 fmd_case_uuresolved(hdl
, uuid
);
/*
 * Handle an event that is a single fault rather than a list.
 *
 * hdl - module handle
 * nvl - the fault event
 *
 * The fault is mapped to a cma_subrs row with nvl2subr().  If the row
 * has a handler and fmd_nvl_fmri_has_fault() returns 1 for the ASRU,
 * the handler is called with a NULL uuid and a final argument of 0; its
 * return value is discarded.
 */
501 cma_recv_one(fmd_hdl_t
*hdl
, nvlist_t
*nvl
)
503 const cma_subscriber_t
*subr
;
506 if ((subr
= nvl2subr(hdl
, nvl
, &asru
)) == NULL
)
509 if (subr
->subr_func
!= NULL
) {
510 if (fmd_nvl_fmri_has_fault(hdl
, asru
,
511 FMD_HAS_FAULT_ASRU
, NULL
) == 1)
512 (void) subr
->subr_func(hdl
, nvl
, asru
, NULL
, 0);
/*
 * Event receive entry point; installed as the fmdo_recv member of
 * fmd_ops.
 *
 * hdl   - module handle
 * ep    - fmd event object (not referenced in the visible lines)
 * nvl   - event payload
 * class - event class string
 *
 * Logs the class, tests for list.resolved, and sends list.suspect,
 * list.repaired and list.updated events to cma_recv_list().
 * cma_recv_one() is called for the remaining case.
 * NOTE(review): the statement guarded by the list.resolved test and the
 * keyword preceding the cma_recv_one() call are not visible in this view.
 */
518 cma_recv(fmd_hdl_t
*hdl
, fmd_event_t
*ep
, nvlist_t
*nvl
, const char *class)
520 fmd_hdl_debug(hdl
, "received %s\n", class);
522 if (strcmp(class, FM_LIST_RESOLVED_CLASS
) == 0)
525 if (strcmp(class, FM_LIST_SUSPECT_CLASS
) == 0 ||
526 strcmp(class, FM_LIST_REPAIRED_CLASS
) == 0 ||
527 strcmp(class, FM_LIST_UPDATED_CLASS
) == 0)
528 cma_recv_list(hdl
, nvl
, class);
530 cma_recv_one(hdl
, nvl
);
/*
 * Timer expiry entry point; installed as the fmdo_timeout member of
 * fmd_ops.
 *
 * hdl - module handle
 * id  - id of the timer that fired
 * arg - timer argument (not referenced in the visible lines)
 *
 * Dispatches on whether id equals cma.cma_page_timerid or
 * cma.cma_cpu_timerid.
 * NOTE(review): the statements run for each case are not visible in
 * this view.
 */
535 cma_timeout(fmd_hdl_t
*hdl
, id_t id
, void *arg
)
537 if (id
== cma
.cma_page_timerid
)
541 * cpu offline/online needs to be retried on sun4v because
542 * ldom request can be asynchronous.
544 else if (id
== cma
.cma_cpu_timerid
)
/*
 * Allocation callback passed to ldom_init() by _fmd_init().  Allocates
 * size bytes through fmd_hdl_alloc() on init_hdl with FMD_SLEEP and
 * returns the result.
 */
551 cma_init_alloc(size_t size
)
553 return (fmd_hdl_alloc(init_hdl
, size
, FMD_SLEEP
));
/*
 * Free callback passed to ldom_init() by _fmd_init().  Releases the
 * size-byte buffer at addr through fmd_hdl_free() on init_hdl.
 */
557 cma_init_free(void *addr
, size_t size
)
559 fmd_hdl_free(init_hdl
, addr
, size
);
/*
 * Module entry points, referenced from fmd_info.  Only the receive and
 * timeout callbacks are supplied; the close and stats slots are NULL.
 */
563 static const fmd_hdl_ops_t fmd_ops
= {
564 cma_recv
, /* fmdo_recv */
565 cma_timeout
, /* fmdo_timeout */
566 NULL
, /* fmdo_close */
567 NULL
, /* fmdo_stats */
/*
 * Module properties with their types and default values, referenced
 * from fmd_info.  _fmd_init() reads most of them back with
 * fmd_prop_get_int32() and fmd_prop_get_int64() to fill in the cma
 * structure.
 *
 * NOTE(review): cpu_blacklist_enable and cpu_unblacklist_enable each
 * appear twice with different defaults ("false" and "true"); presumably
 * the two pairs sit under a preprocessor conditional that is not
 * visible in this view -- confirm.
 */
571 static const fmd_prop_t fmd_props
[] = {
572 { "cpu_tries", FMD_TYPE_UINT32
, "10" },
573 { "cpu_delay", FMD_TYPE_TIME
, "1sec" },
575 { "cpu_ret_mindelay", FMD_TYPE_TIME
, "5sec" },
576 { "cpu_ret_maxdelay", FMD_TYPE_TIME
, "5min" },
578 { "cpu_offline_enable", FMD_TYPE_BOOL
, "true" },
579 { "cpu_online_enable", FMD_TYPE_BOOL
, "true" },
580 { "cpu_forced_offline", FMD_TYPE_BOOL
, "true" },
582 { "cpu_blacklist_enable", FMD_TYPE_BOOL
, "false" },
583 { "cpu_unblacklist_enable", FMD_TYPE_BOOL
, "false" },
585 { "cpu_blacklist_enable", FMD_TYPE_BOOL
, "true" },
586 { "cpu_unblacklist_enable", FMD_TYPE_BOOL
, "true" },
588 { "page_ret_mindelay", FMD_TYPE_TIME
, "1sec" },
589 { "page_ret_maxdelay", FMD_TYPE_TIME
, "5min" },
590 { "page_retire_enable", FMD_TYPE_BOOL
, "true" },
591 { "page_unretire_enable", FMD_TYPE_BOOL
, "true" },
/*
 * Module description handed to fmd_hdl_register() in _fmd_init(): the
 * module's descriptive string, CMA_VERSION, the entry-point vector and
 * the property table.
 */
595 static const fmd_hdl_info_t fmd_info
= {
596 "CPU/Memory Retire Agent", CMA_VERSION
, &fmd_ops
, fmd_props
/*
 * Module initialization.
 *
 * hdl - module handle supplied by fmd
 *
 * Inspects the platform string to set cma_is_native, registers the
 * module, subscribes to fault classes, publishes cma_stats, copies the
 * module properties into the cma structure, checks the page retirement
 * delays for consistency and initializes the ldom handle.
 *
 * NOTE(review): local declarations, several early-exit statements and
 * any preprocessor conditionals are not visible in this view.
 */
600 _fmd_init(fmd_hdl_t
*hdl
)
605 const char *dom0
= "control_d";
608 * Abort the cpumem-retire module if Solaris is running under DomU.
610 if (sysinfo(SI_PLATFORM
, buf
, sizeof (buf
)) == -1)
/* "i86pc" is treated as native. */
613 if (strncmp(buf
, "i86pc", sizeof (buf
)) == 0) {
614 cma_is_native
= B_TRUE
;
615 } else if (strncmp(buf
, "i86xpv", sizeof (buf
)) != 0) {
/*
 * Read /dev/xen/domcaps and compare its leading bytes with the dom0
 * string ("control_d").
 */
618 int fd
= open("/dev/xen/domcaps", O_RDONLY
);
621 if (read(fd
, buf
, sizeof (buf
)) <= 0 ||
622 strncmp(buf
, dom0
, strlen(dom0
)) != 0) {
628 cma_is_native
= B_FALSE
;
632 if (fmd_hdl_register(hdl
, FMD_API_VERSION
, &fmd_info
) != 0)
633 return; /* invalid data in configuration file */
/* Fault classes this module wants delivered to cma_recv(). */
635 fmd_hdl_subscribe(hdl
, "fault.cpu.*");
636 fmd_hdl_subscribe(hdl
, "fault.memory.*");
638 fmd_hdl_subscribe(hdl
, "fault.chassis.SPARC-Enterprise.cpu.*");
/*
 * Publish cma_stats in place (FMD_STAT_NOALLOC); the element count is
 * derived from the object size divided by sizeof (fmd_stat_t).
 */
641 (void) fmd_stat_create(hdl
, FMD_STAT_NOALLOC
, sizeof (cma_stats
) /
642 sizeof (fmd_stat_t
), (fmd_stat_t
*)&cma_stats
);
644 cma
.cma_cpu_tries
= fmd_prop_get_int32(hdl
, "cpu_tries");
/* Split the cpu_delay property value into seconds and nanoseconds. */
646 nsec
= fmd_prop_get_int64(hdl
, "cpu_delay");
647 cma
.cma_cpu_delay
.tv_sec
= nsec
/ NANOSEC
;
648 cma
.cma_cpu_delay
.tv_nsec
= nsec
% NANOSEC
;
650 cma
.cma_page_mindelay
= fmd_prop_get_int64(hdl
, "page_ret_mindelay");
651 cma
.cma_page_maxdelay
= fmd_prop_get_int64(hdl
, "page_ret_maxdelay");
654 cma
.cma_cpu_mindelay
= fmd_prop_get_int64(hdl
, "cpu_ret_mindelay");
655 cma
.cma_cpu_maxdelay
= fmd_prop_get_int64(hdl
, "cpu_ret_maxdelay");
/* Boolean enable/disable switches. */
658 cma
.cma_cpu_dooffline
= fmd_prop_get_int32(hdl
, "cpu_offline_enable");
659 cma
.cma_cpu_forcedoffline
= fmd_prop_get_int32(hdl
,
660 "cpu_forced_offline");
661 cma
.cma_cpu_doonline
= fmd_prop_get_int32(hdl
, "cpu_online_enable");
662 cma
.cma_cpu_doblacklist
= fmd_prop_get_int32(hdl
,
663 "cpu_blacklist_enable");
664 cma
.cma_cpu_dounblacklist
= fmd_prop_get_int32(hdl
,
665 "cpu_unblacklist_enable");
666 cma
.cma_page_doretire
= fmd_prop_get_int32(hdl
, "page_retire_enable");
667 cma
.cma_page_dounretire
= fmd_prop_get_int32(hdl
,
668 "page_unretire_enable");
/* A maximum page delay below the minimum is a fatal configuration error. */
670 if (cma
.cma_page_maxdelay
< cma
.cma_page_mindelay
)
671 fmd_hdl_abort(hdl
, "page retirement delays conflict\n");
/* Create the ldom handle using this module's allocator wrappers. */
675 cma_lhp
= ldom_init(cma_init_alloc
, cma_init_free
);
680 _fmd_fini(fmd_hdl_t
*hdl
)