/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <cma.h>

#include <string.h>
#include <time.h>
#include <fm/fmd_api.h>
#include <fm/fmd_agent.h>
#include <sys/fm/protocol.h>
#include <sys/processor.h>

static int cpu_statechange(fmd_hdl_t *, nvlist_t *, nvlist_t *, const char *,
    uint32_t, boolean_t);

#ifndef opl
/*
 * Perform retire/unretire by invoking the topo methods registered in the
 * hc-scheme resource.
 *
 * If the fault is found to have been diagnosed under the old topology, the
 * resource will not exist in the current topology, so we fall back to the
 * legacy retire path (using the "cpu" scheme ASRU).
 */

static boolean_t
old_topo_fault(nvlist_t *nvl)
{
    nvlist_t *rsrc;
    nvlist_t **hcl;
    uint_t nhcl = 0;
    char *name;

    if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) != 0)
        return (B_TRUE);

    /*
     * x86 has moved from the "motherboard/chip/cpu" topo to
     * "motherboard/chip/core/strand".
     */
    if (nvlist_lookup_nvlist_array(rsrc, FM_FMRI_HC_LIST, &hcl, &nhcl)
        == 0 && nhcl == 3 &&
        nvlist_lookup_string(hcl[0], FM_FMRI_HC_NAME, &name) == 0 &&
        strcmp(name, "motherboard") == 0 &&
        nvlist_lookup_string(hcl[1], FM_FMRI_HC_NAME, &name) == 0 &&
        strcmp(name, "chip") == 0 &&
        nvlist_lookup_string(hcl[2], FM_FMRI_HC_NAME, &name) == 0 &&
        strcmp(name, "cpu") == 0)
        return (B_TRUE);

    return (B_FALSE);
}

int
cma_cpu_hc_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
    const char *uuid, boolean_t repair)
{
    int i, err;
    int rc = CMA_RA_SUCCESS;
    nvlist_t *rsrc;

    /*
     * For the cached faults which were diagnosed under the old
     * topology, we fall back to retire by using cpu-scheme ASRUs.
     * Under xVM Dom0, the logical cpuid in a "cpu" scheme ASRU makes
     * no sense, so the fault should be ignored.
     */
    if (old_topo_fault(nvl)) {
#ifdef i386
        if (cma_is_native == B_FALSE)
            return (CMA_RA_FAILURE);
#endif
        return (cma_cpu_cpu_retire(hdl, nvl, asru, uuid, repair));
    }

    /*
     * Lookup the resource and call its topo methods to do retire/unretire.
     */
    if ((!repair && !cma.cma_cpu_dooffline) ||
        (repair && !cma.cma_cpu_doonline)) {
        fmd_hdl_debug(hdl, "suppressed %s of CPU\n",
            repair ? "unretire" : "retire");
        cma_stats.cpu_supp.fmds_value.ui64++;
    } else {
        err = FMD_AGENT_RETIRE_FAIL;
        if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) == 0) {
            if (repair) {
                err = fmd_nvl_fmri_unretire(hdl, rsrc);
            } else {
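                /*
                 * Retry the retire up to cma_cpu_tries times,
                 * sleeping cma_cpu_delay between failed attempts.
                 */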
                for (i = 0; i < cma.cma_cpu_tries; i++) {
                    err = fmd_nvl_fmri_retire(hdl, rsrc);
                    if (err == FMD_AGENT_RETIRE_DONE)
                        break;
                    (void) nanosleep(&cma.cma_cpu_delay,
                        NULL);
                }
            }
        }
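        /*
         * Account for the outcome: on success bump the repair or
         * fault counter, otherwise record a failed action.
         */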
        if (err == FMD_AGENT_RETIRE_DONE) {
            if (repair)
                cma_stats.cpu_repairs.fmds_value.ui64++;
            else
                cma_stats.cpu_flts.fmds_value.ui64++;
        } else {
            rc = CMA_RA_FAILURE;
            cma_stats.bad_flts.fmds_value.ui64++;
#ifdef sun4v
            /* libldom requests are processed asynchronously */
            cma_cpu_start_retry(hdl, nvl, uuid, repair);
#endif
        }
    }

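    /*
     * Blacklisting is attempted independently of the retire result
     * above; a failure here only bumps the cpu_blfails statistic.
     */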
    if ((!repair && !cma.cma_cpu_doblacklist) ||
        (repair && !cma.cma_cpu_dounblacklist)) {
        fmd_hdl_debug(hdl, "suppressed %s of CPU\n",
            repair ? "unblacklist" : "blacklist");
        cma_stats.cpu_blsupp.fmds_value.ui64++;
    } else {
        if (cma_cpu_blacklist(hdl, nvl, asru, repair) < 0)
            cma_stats.cpu_blfails.fmds_value.ui64++;
    }

    return (rc);
}

#else /* opl */

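/*
 * On OPL the ASRU carries the affected strands in its hc-specific
 * member, so retire walks the FM_FMRI_HC_CPUIDS list and changes the
 * state of each cpuid in turn.
 */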
int
cma_cpu_hc_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
    const char *uuid, boolean_t repair)
{
    uint32_t cpuid;
    uint_t i, nprs;
    nvlist_t **hc_prs = NULL, *hc_spec_nvl;

    /* OPL has ASRU in "hc" scheme */
    if (nvlist_lookup_nvlist(asru, FM_FMRI_HC_SPECIFIC,
        &hc_spec_nvl) != 0) {
        cma_stats.bad_flts.fmds_value.ui64++;
        fmd_hdl_debug(hdl,
            "cma_cpu_hc_retire lookup hc_spec_nvl failed\n");
        return (CMA_RA_FAILURE);
    }

    if (nvlist_lookup_nvlist_array(hc_spec_nvl, FM_FMRI_HC_CPUIDS,
        &hc_prs, &nprs) != 0) {
        cma_stats.bad_flts.fmds_value.ui64++;
        fmd_hdl_debug(hdl,
            "cma_cpu_hc_retire lookup cpuid array failed\n");
        return (CMA_RA_FAILURE);
    }

    for (i = 0; i < nprs; i++) {
        if (nvlist_lookup_uint32(hc_prs[i],
            FM_FMRI_CPU_ID, &cpuid) != 0) {
            cma_stats.bad_flts.fmds_value.ui64++;
            return (CMA_RA_FAILURE);
        }

        if (cpu_statechange(hdl, nvl, hc_prs[i], uuid, cpuid, repair)
            != CMA_RA_SUCCESS) {
            cma_stats.bad_flts.fmds_value.ui64++;
            return (CMA_RA_FAILURE);
        }
    }

    return (CMA_RA_SUCCESS);
}
#endif /* opl */

/*
 * The rest of this file uses ASRUs to do retire.  This is no longer the
 * preferred way, but it is still needed in some circumstances where
 * retire via the topo methods cannot work, i.e.
 *
 * 1) There are legacy platforms which don't have full topology.
 * 2) The resources in the FMD cached faults may not be set or may not
 *    exist in the up-to-date topology.
 */

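/*
 * Repair path for a cpu-scheme ASRU: bring the CPU back to P_ONLINE and,
 * unless the platform handles blacklisting elsewhere, remove it from the
 * blacklist.
 */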
static int
cpu_online(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
    uint32_t cpuid)
{
    int err = CMA_RA_SUCCESS;

    if (cma.cma_cpu_doonline) {
        err = cma_cpu_statechange(hdl, asru, uuid, P_ONLINE,
            B_TRUE);
    } else {
        fmd_hdl_debug(hdl, "suppressed online of CPU %u\n",
            cpuid);
        cma_stats.cpu_supp.fmds_value.ui64++;
    }

    /* OPL performs the blacklist in the service processor */
#ifndef opl
    if (cma.cma_cpu_dounblacklist) {
        if (cma_cpu_blacklist(hdl, nvl, asru, B_TRUE) < 0)
            cma_stats.cpu_blfails.fmds_value.ui64++;
    } else {
        fmd_hdl_debug(hdl, "suppressed unblacklist of CPU %u\n", cpuid);
        cma_stats.cpu_blsupp.fmds_value.ui64++;
    }
#endif /* opl */

    return (err);
}

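/*
 * Fault path for a cpu-scheme ASRU: move the CPU to P_FAULTED (adding
 * P_FORCED when forced offline is configured) and add it to the
 * blacklist unless the platform handles that elsewhere.
 */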
static int
cpu_offline(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
    uint32_t cpuid)
{
    int err = CMA_RA_FAILURE;

    if (cma.cma_cpu_dooffline) {
        int cpustate = P_FAULTED;

        if (cma.cma_cpu_forcedoffline)
            cpustate |= P_FORCED;
        err = cma_cpu_statechange(hdl, asru, uuid, cpustate,
            B_FALSE);
    } else {
        fmd_hdl_debug(hdl, "suppressed offline of CPU %u\n",
            cpuid);
        cma_stats.cpu_supp.fmds_value.ui64++;
    }

    /* OPL performs the blacklist in the service processor */
#ifndef opl
    if (cma.cma_cpu_doblacklist) {
        if (cma_cpu_blacklist(hdl, nvl, asru, B_FALSE) < 0)
            cma_stats.cpu_blfails.fmds_value.ui64++;
    } else {
        fmd_hdl_debug(hdl, "suppressed blacklist of CPU %u\n",
            cpuid);
        cma_stats.cpu_blsupp.fmds_value.ui64++;
    }
#endif /* opl */

    return (err);
}

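/*
 * Common dispatch for the cpu-scheme path: repairs bring the CPU back
 * online, faults take it offline.
 */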
static int
cpu_statechange(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
    uint32_t cpuid, boolean_t repair)
{
    if (repair)
        return (cpu_online(hdl, nvl, asru, uuid, cpuid));

    return (cpu_offline(hdl, nvl, asru, uuid, cpuid));
}

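/*
 * Translate a p_online(2) processor state into the matching PS_*
 * display string from <sys/processor.h>.
 */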
const char *
p_online_state_fmt(int state)
{
    state &= ~P_FORCED;

    switch (state) {
    case P_OFFLINE:
        return (PS_OFFLINE);
    case P_ONLINE:
        return (PS_ONLINE);
    case P_FAULTED:
        return (PS_FAULTED);
    case P_POWEROFF:
        return (PS_POWEROFF);
    case P_NOINTR:
        return (PS_NOINTR);
    case P_SPARE:
        return (PS_SPARE);
    default:
        return ("unknown");
    }
}

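/*
 * Legacy cpu-scheme retire entry point: the cpuid comes straight from
 * the ASRU and is handed to cpu_statechange().
 */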
int
cma_cpu_cpu_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
    const char *uuid, boolean_t repair)
{
    uint32_t cpuid;

    if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_ID, &cpuid) != 0) {
        fmd_hdl_debug(hdl, "cpu fault missing '%s'\n", FM_FMRI_CPU_ID);
        cma_stats.bad_flts.fmds_value.ui64++;
        return (CMA_RA_FAILURE);
    }

    return (cpu_statechange(hdl, nvl, asru, uuid, cpuid, repair));
}