4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
30 * Each CPU of interest has a cmd_cpu_t structure. CPUs become of interest when
31 * they are the focus of ereports, or when they detect UEs. CPUs may be the
32 * target of several different kinds of ereport, each of which is tracked
33 * differently. cpu_cases lists the types of cases that can be open against a
34 * given CPU. The life of a CPU is complicated by the fact that xxCs and xxUs
35 * received by the DE may in fact be side-effects of earlier UEs, xxCs, or xxUs.
36 * Causes of side-effects, and actions taken to resolve them, can be found below
37 * and in cmd_memerr.h.
40 * ________ CMD_PTR_CPU_ICACHE
41 * / \ ,--------. CMD_PTR_CPU_DCACHE
42 * |CPU | <---- |case_ptr| (one or more of CMD_PTR_CPU_PCACHE )
43 * | | `--------' CMD_PTR_CPU_ITLB
44 * |,-------| ,-------. CMD_PTR_CPU_DTLB
45 * ||asru | ----> |fmri_t | CMD_PTR_CPU_L2DATA
46 * |:-------| :-------: CMD_PTR_CPU_L2DATA_UERETRY
47 * ||fru | ----> |fmri_t | CMD_PTR_CPU_L2TAG
48 * |`-------| `-------' CMD_PTR_CPU_L3DATA
49 * | | ,---------. CMD_PTR_CPU_L3DATA_UERETRY
50 * | uec | ----> |UE cache | CMD_PTR_CPU_L3TAG
51 * \________/ `---------' CMD_PTR_CPU_FPU
59 * | xr | <---- |case_ptr| (CMD_PTR_XR_WAITER)
61 * |,-------| ,-------.
62 * ||rsrc | ----> |fmri_t |
63 * |`-------| `-------'
64 * | cpu | ----> detecting CPU
67 * Data structure P? Case- Notes
69 * ---------------- --- ----- --------------------------------------
70 * cmd_cpu_t Yes No Name is derived from CPU ID ("cpu_%d")
71 * cmd_case_ptr_t Yes Yes Name is case's UUID
72 * cpu_asru (fmri_t) Yes No Name is derived from CPU ID ("cpu_asru_%d")
73 * cpu_fru (fmri_t) Yes No Name is derived from CPU ID ("cpu_fru_%d")
74 * cpu_uec Yes No Name is derived from CPU ID ("cpu_uec_%d")
75 * cmd_xr_t Yes Yes Name is `redelivery'
76 * xr_rsrc (fmri_t) Yes No Name is derived from case's UUID ("%s_rsrc")
80 #include <cmd_state.h>
87 #define CPU_FRU_FMRI FM_FMRI_SCHEME_HC":///" \
/*
 * NOTE(review): presumably a certainty value used for LFU faults -- the
 * consumer is not visible in this header; confirm before changing.
 */
#define	BK_LFUFAULT_CERT	50
/*
 * Forward declaration of the per-CPU state structure, so that declarations
 * in this header can refer to it by pointer.
 */
typedef struct cmd_cpu cmd_cpu_t;
94 typedef enum cmd_cpu_type
{
95 CPU_ULTRASPARC_III
= 1,
96 CPU_ULTRASPARC_IIIplus
,
99 CPU_ULTRASPARC_IVplus
,
100 CPU_ULTRASPARC_IIIiplus
,
105 CPU_ULTRASPARC_T2plus
108 typedef struct cmd_cpu_cases
{
109 cmd_case_t cpuc_icache
; /* All I$ errors (IPE, IDSPE, etc) */
110 cmd_case_t cpuc_dcache
; /* All D$ errors (DPE, DDSPE, etc) */
111 cmd_case_t cpuc_pcache
; /* All P$ errors (PDSPE) */
112 cmd_case_t cpuc_itlb
; /* ITLB errors (ITLBPE) */
113 cmd_case_t cpuc_dtlb
; /* DTLB errors (DTLBPE) */
114 cmd_case_t cpuc_l2data
; /* All correctable L2$ data errors */
115 cmd_case_t cpuc_l2tag
; /* All correctable L2$ tag errors */
116 cmd_case_t cpuc_l3data
; /* All correctable L3$ data errors */
117 cmd_case_t cpuc_l3tag
; /* All correctable L3$ tag errors */
118 cmd_case_t cpuc_fpu
; /* FPU errors */
119 cmd_case_t cpuc_ireg
; /* Integer reg errors (IRC, IRU) */
120 cmd_case_t cpuc_freg
; /* Floatpnt reg errors (frc, fru) */
121 cmd_case_t cpuc_mau
; /* Modular arith errors (MAU) */
122 cmd_case_t cpuc_l2ctl
; /* L2$ directory, VUAD parity */
123 cmd_case_t cpuc_misc_regs
; /* Scratchpad array (SCA) */
124 /* Tick compare (TC) */
125 /* Store buffer (SBD) */
126 /* Trap stack array errors (TSA) */
127 cmd_case_t cpuc_lfu
; /* Coherency link error (LFU) */
129 cmd_case_t cpuc_opl_invsfsr
; /* Olympus-C cpu inv-sfsr errors */
130 cmd_case_t cpuc_oplue_detcpu
; /* Olympus-C cpu det. ue (eid=CPU) */
131 cmd_case_t cpuc_oplue_detio
; /* Olympus-C io det. ue (eid=CPU) */
132 cmd_case_t cpuc_opl_mtlb
; /* Olympus-C mtlb errors */
133 cmd_case_t cpuc_opl_tlbp
; /* Olympus-C tlbp errors */
134 cmd_case_t cpuc_opl_inv_urg
; /* Olympus-C inv-urg invalid urgent */
135 cmd_case_t cpuc_opl_cre
; /* Olympus-C cre urgent errors */
136 cmd_case_t cpuc_opl_tsb_ctx
; /* Olympus-C tsb_ctx urgent errors */
137 cmd_case_t cpuc_opl_tsbp
; /* Olympus-C tsbp urgent errors */
138 cmd_case_t cpuc_opl_pstate
; /* Olympus-C pstate urgent errors */
139 cmd_case_t cpuc_opl_tstate
; /* Olympus-C tstate urgent errors */
140 cmd_case_t cpuc_opl_iug_f
; /* Olympus-C iug_f urgent errors */
141 cmd_case_t cpuc_opl_iug_r
; /* Olympus-C iug_r urgent errors */
142 cmd_case_t cpuc_opl_sdc
; /* Olympus-C sdc urgent errors */
143 cmd_case_t cpuc_opl_wdt
; /* Olympus-C wdt urgent errors */
144 cmd_case_t cpuc_opl_dtlb
; /* Olympus-C dtlb urgent errors */
145 cmd_case_t cpuc_opl_itlb
; /* Olympus-C itlb urgent errors */
146 cmd_case_t cpuc_opl_core_err
; /* Olympus-C core-err urgent errors */
147 cmd_case_t cpuc_opl_dae
; /* Olympus-C dae urgent errors */
148 cmd_case_t cpuc_opl_iae
; /* Olympus-C iae urgent errors */
149 cmd_case_t cpuc_opl_uge
; /* Olympus-C uge urgent errors */
154 * The UE cache. We actually have two UE caches - the current one and the old
155 * one. When it's time to flush the UE cache, we move the current UE cache to
156 * the old position and flush the E$. Then, we schedule the removal of the old
157 * UE cache. This allows a) xxUs triggered by the flush to match against the
158 * old cache, while b) still allowing new UEs to be added to the current UE
159 * cache. UE matches will always search in both caches (if present), but
160 * additions will only end up in the current cache. We go to all of this
161 * effort because the cost of a missed ereport (discarding due to a false match
162 * in the cache) is much less than that of a missed match. In the latter case,
163 * the CPU will be erroneously offlined.
165 * A special case is triggered if we see a UE with a non-valid AFAR. Without
166 * the AFAR, we aren't able to properly match subsequent xxU's. As a result,
167 * we need to throw the cache into all-match mode, wherein all subsequent match
168 * attempts will succeed until the UE cache is flushed.
/* Flag bit for uec_flags: the UE cache is in all-match mode. */
#define	CPU_UEC_F_ALLMATCH	0x1
173 typedef struct cmd_cpu_uec
{
174 uint64_t *uec_cache
; /* The UE cache */
175 uint_t uec_nent
; /* Number of allocated slots in cache */
176 uint_t uec_flags
; /* CPU_UEC_F_* */
177 char uec_bufname
[CMD_BUFNMLEN
]; /* Name of buffer used for cache */
180 extern const char *cmd_cpu_type2name(fmd_hdl_t
*, cmd_cpu_type_t
);
181 extern void cmd_cpu_uec_add(fmd_hdl_t
*, cmd_cpu_t
*, uint64_t);
182 extern int cmd_cpu_uec_match(cmd_cpu_t
*, uint64_t);
183 extern void cmd_cpu_uec_clear(fmd_hdl_t
*, cmd_cpu_t
*);
184 extern void cmd_cpu_uec_set_allmatch(fmd_hdl_t
*, cmd_cpu_t
*);
187 * Certain types of xxC and xxU can trigger other types as side-effects. These
188 * secondary ereports need to be discarded, as treating them as legitimate
189 * ereports in their own right will cause erroneous diagnosis. As an example
190 * (see cmd_xxcu_trains for more), an L2$ UCC will usually trigger an L2$ WDC
191 * resulting from the trap handler's flushing of the L2$. If we treat both as
192 * legitimate, we'll end up adding two ereports to the SERD engine,
193 * significantly cutting the threshold for retiring the CPU.
195 * Our saving grace is the fact that the side-effect ereports will have the same
196 * ENA as the primary. As such, we can keep track of groups of ereports by ENA.
197 * These groups, which we'll call trains, can then be matched against a list of
198 * known trains. The list (an array of cmd_xxcu_train_t structures) has both a
199 * description of the composition of the train and an indication as to which of
200 * the received ereports is the primary.
202 * The cmd_xxcu_trw_t is used to gather the members of the train. When the
203 * first member comes in, we allocate a trw, recording the ENA of the ereport,
204 * as well as noting its class in trw_mask. We then reschedule the delivery of
205 * the ereport for some configurable time in the future, trusting that all
206 * members of the train will have arrived by that time. Subsequent ereports in
207 * the same train match the recorded ENA, and add themselves to the mask.
208 * When the first ereport is redelivered, trw_mask is used to determine whether
209 * or not a train has been seen. An exact match is required. If a match is
210 * made, the ereport indicated as the primary cause is used for diagnosis.
/* Flag bits stored in trw_flags. */
#define	CMD_TRW_F_DELETING	0x1	/* reclaiming events */
#define	CMD_TRW_F_CAUSESEEN	0x2	/* cause of train already processed */
#define	CMD_TRW_F_GCSEEN	0x4	/* seen by GC, erased next time */
217 typedef struct cmd_xxcu_trw
{
218 uint64_t trw_ena
; /* the ENA for this group of ereports */
219 uint64_t trw_afar
; /* the AFAR for this group of ereports */
220 cmd_errcl_t trw_mask
; /* ereports seen thus far with this ENA */
221 uint16_t trw_cpuid
; /* CPU to which this watcher belongs */
222 uint8_t trw_ref
; /* number of ereports with this ENA */
223 uint8_t trw_flags
; /* CMD_TRW_F_* */
227 extern cmd_xxcu_trw_t
*cmd_trw_lookup(uint64_t, uint8_t, uint64_t);
228 extern cmd_xxcu_trw_t
*cmd_trw_alloc(uint64_t, uint64_t);
229 extern void cmd_trw_restore(fmd_hdl_t
*);
230 extern void cmd_trw_write(fmd_hdl_t
*);
231 extern void cmd_trw_ref(fmd_hdl_t
*, cmd_xxcu_trw_t
*, cmd_errcl_t
);
232 extern void cmd_trw_deref(fmd_hdl_t
*, cmd_xxcu_trw_t
*);
234 extern cmd_errcl_t
cmd_xxcu_train_match(cmd_errcl_t
);
237 * We don't have access to ereport nvlists when they are redelivered via timer.
238 * As such, we have to retrieve everything we might need for diagnosis when we
239 * first receive the ereport. The retrieved information is stored in the
240 * cmd_xr_t, which is persisted.
/* Forward declaration of the persisted redelivery state structure. */
typedef struct cmd_xr cmd_xr_t;
246 * xr_hdlr can't be persisted, so we use these in xr_hdlrid to indicate the
247 * handler to be used. xr_hdlr is then updated so it can be used directly.
/* Values stored in xr_hdlrid to identify the redelivery handler. */
#define	CMD_XR_HDLR_XXC		1
#define	CMD_XR_HDLR_XXU		2
#define	CMD_XR_HDLR_NOP		3
253 typedef void cmd_xr_hdlr_f(fmd_hdl_t
*, cmd_xr_t
*, fmd_event_t
*);
256 * For sun4v, the size of xr_synd is expanded to 32 bits in order to
257 * accommodate the Niagara L2 syndrome (4x7 bits).
262 id_t xr_id
; /* ID of timer used for redelivery */
263 cmd_cpu_t
*xr_cpu
; /* Detecting CPU, recalc'd from cpuid */
264 uint32_t xr_cpuid
; /* ID of detecting CPU */
265 uint64_t xr_ena
; /* ENA from ereport */
266 uint64_t xr_afar
; /* AFAR from ereport nvlist */
268 uint16_t xr_synd
; /* syndrome from ereport nvlist */
270 uint32_t xr_synd
; /* for Niagara, enlarged to 32 bits */
272 uint8_t xr_afar_status
; /* AFAR status from ereport nvlist */
273 uint8_t xr_synd_status
; /* syndrome status from ereport nvlist */
274 cmd_fmri_t xr_rsrc
; /* resource from ereport nvlist */
275 cmd_errcl_t xr_clcode
; /* CMD_ERRCL_* for this ereport */
276 cmd_xr_hdlr_f
*xr_hdlr
; /* handler, recalc'd from hdlrid on restart */
277 uint_t xr_hdlrid
; /* CMD_XR_HDLR_*, used for recalc of hdlr */
278 fmd_case_t
*xr_case
; /* Throwaway case used to track redelivery */
279 uint_t xr_ref
; /* Number of references to this struct */
281 uint64_t xr_afsr
; /* AFSR from ereport nvlist */
282 uint8_t xr_num_ways
; /* Number of Cache ways reporting from nvlist */
283 uint32_t xr_error_way
; /* The way from the ereport nvlist payload */
284 uint64_t xr_error_tag
; /* The tag from the ereport nvlist payload */
285 uint32_t xr_error_index
; /* the index from the ereport payload */
286 uint64_t *xr_cache_data
; /* The cache data */
287 nvlist_t
*xr_detector_nvlist
; /* The detecting resource */
/* Shorthand for the nvlist held in a cmd_xr_t's resource FMRI. */
#define	xr_rsrc_nvl		xr_rsrc.fmri_nvl
293 extern cmd_xr_t
*cmd_xr_create(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
294 cmd_cpu_t
*, cmd_errcl_t
);
295 extern cmd_evdisp_t
cmd_xr_reschedule(fmd_hdl_t
*, cmd_xr_t
*, uint_t
);
296 extern void cmd_xr_deref(fmd_hdl_t
*, cmd_xr_t
*);
297 extern void cmd_xr_write(fmd_hdl_t
*, cmd_xr_t
*);
299 extern void cmd_xxc_resolve(fmd_hdl_t
*, cmd_xr_t
*, fmd_event_t
*);
300 extern void cmd_xxu_resolve(fmd_hdl_t
*, cmd_xr_t
*, fmd_event_t
*);
301 extern void cmd_nop_resolve(fmd_hdl_t
*, cmd_xr_t
*, fmd_event_t
*);
302 extern cmd_evdisp_t
cmd_xxcu_initial(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
303 const char *, cmd_errcl_t
, uint_t
);
306 * The master structure containing or referencing all of the state for a given
311 * We periodically flush the E$, thus allowing us to flush the UE cache (see
312 * above for a description of the UE cache). In particular, we flush it
313 * whenever we see a UE with a non-valid AFAR. To keep from overflushing the
314 * CPU, we cap the number of flushes that we'll do in response to UEs with
315 * non-valid AFARs. The cap is the number of permitted flushes per GC/restart
316 * cycle, and was determined arbitrarily.
/* Cap on UE-cache flushes per GC/restart cycle for non-valid-AFAR UEs. */
#define	CPU_UEC_FLUSH_MAX	3
321 * The CPU structure started life without a version number. Making things more
322 * complicated, the version number in the new struct occupies the space used for
323 * cpu_cpuid in the non-versioned struct. We therefore have to use somewhat
324 * unorthodox version numbers to distinguish between the two types of struct
325 * (pre- and post-versioning) -- version numbers that can't be mistaken for
326 * CPUIDs. Our version numbers, therefore, will be negative.
328 * For future expansion, the version member must always stay where it is. At
329 * some point in the future, when more structs get versions, the version member
330 * should move into the cmd_header_t.
/*
 * Encode a positive version number as its negation stored in a uint_t, so
 * that it cannot be mistaken for a CPU ID when read from an unversioned
 * structure.
 */
#define	CPU_MKVERSION(version)	((uint_t)(0 - (version)))

#define	CMD_CPU_VERSION_1	CPU_MKVERSION(1)	/* -1 */
#define	CMD_CPU_VERSION_2	CPU_MKVERSION(2)	/* -2 */
#define	CMD_CPU_VERSION_3	CPU_MKVERSION(3)	/* -3 */
#define	CMD_CPU_VERSION		CMD_CPU_VERSION_3	/* current version */

/* True when the structure carries a (negative) version number. */
#define	CMD_CPU_VERSIONED(cpu)	((int)(cpu)->cpu_version < 0)

#define	CMD_CPU_F_DELETING	0x1
343 typedef struct cmd_cpu_0
{
344 cmd_header_t cpu0_header
; /* Nodetype must be CMD_NT_CPU */
345 uint32_t cpu0_cpuid
; /* Logical ID for this CPU */
346 cmd_cpu_type_t cpu0_type
; /* CPU model */
347 fmd_case_t
*cpu0_cases
[4]; /* v0 had embedded case_t w/4 cases */
348 uint8_t cpu0_faulting
; /* Set if fault has been issued */
349 cmd_fmri_t cpu0_asru
; /* ASRU for this CPU */
350 cmd_fmri_t cpu0_fru
; /* FRU for this CPU */
351 cmd_cpu_uec_t cpu0_uec
; /* UE cache */
352 cmd_cpu_uec_t cpu0_olduec
; /* To-be-flushed UE cache */
353 id_t cpu0_uec_flush
; /* Timer ID for UE cache flush */
354 uint_t cpu0_uec_nflushes
; /* # of flushes since last restart/GC */
355 cmd_list_t cpu0_xxu_retries
; /* List of pending xxU retries */
358 typedef struct cmd_cpu_1
{
359 cmd_header_t cpu1_header
; /* Nodetype must be CMD_NT_CPU */
360 uint_t cpu1_version
; /* struct version - must follow hdr */
361 uint32_t cpu1_cpuid
; /* Logical ID for this CPU */
362 cmd_cpu_type_t cpu1_type
; /* CPU model */
363 uintptr_t *cpu1_cases
; /* v1 had a pointer to a case array */
364 uint8_t cpu1_faulting
; /* Set if fault has been issued */
365 cmd_fmri_t cpu1_asru
; /* ASRU for this CPU */
366 cmd_fmri_t cpu1_fru
; /* FRU for this CPU */
367 cmd_cpu_uec_t cpu1_uec
; /* UE cache */
368 cmd_cpu_uec_t cpu1_olduec
; /* To-be-flushed UE cache */
369 id_t cpu1_uec_flush
; /* Timer ID for UE cache flush */
370 uint_t cpu1_uec_nflushes
; /* # of flushes since last restart/GC */
371 cmd_list_t cpu1_xxu_retries
; /* List of pending xxU retries */
374 typedef struct cmd_cpu_2
{
375 cmd_header_t cpu2_header
; /* Nodetype must be CMD_NT_CPU */
376 uint_t cpu2_version
; /* struct version - must follow hdr */
377 uint32_t cpu2_cpuid
; /* Logical ID for this CPU */
378 cmd_cpu_type_t cpu2_type
; /* CPU model */
379 uint8_t cpu2_faulting
; /* Set if fault has been issued */
380 cmd_fmri_t cpu2_asru
; /* ASRU for this CPU */
381 cmd_fmri_t cpu2_fru
; /* FRU for this CPU */
382 cmd_cpu_uec_t cpu2_uec
; /* UE cache */
383 cmd_cpu_uec_t cpu2_olduec
; /* To-be-flushed UE cache */
386 /* Portion of the cpu structure which must be persisted */
387 typedef struct cmd_cpu_pers
{
388 cmd_header_t cpup_header
; /* Nodetype must be CMD_NT_CPU */
389 uint_t cpup_version
; /* struct version - must follow hdr */
390 uint32_t cpup_cpuid
; /* Logical ID for this CPU */
391 cmd_cpu_type_t cpup_type
; /* CPU model */
392 uint8_t cpup_faulting
; /* Set if fault has been issued */
393 uint8_t cpup_level
; /* cpu group level - 0 == thread */
394 cmd_fmri_t cpup_asru
; /* ASRU for this CPU */
395 cmd_fmri_t cpup_fru
; /* FRU for this CPU */
396 cmd_cpu_uec_t cpup_uec
; /* UE cache */
397 cmd_cpu_uec_t cpup_olduec
; /* To-be-flushed UE cache */
400 /* Persistent and dynamic CPU data */
402 cmd_cpu_pers_t cpu_pers
;
403 cmd_cpu_cases_t cpu_cases
;
404 id_t cpu_uec_flush
; /* Timer ID for UE cache flush */
405 uint_t cpu_uec_nflushes
; /* # of flushes since last restart/GC */
406 cmd_list_t cpu_xxu_retries
; /* List of pending xxU retries */
408 cmd_list_t cpu_Lxcaches
; /* List of Lxcache state structures */
409 fmd_stat_t Lxcache_creat
; /* num of Lxcache states created */
/*
 * Largest and smallest sizes among the four CPU structure layouts
 * (versions 0, 1, 2 and the current persistent layout).
 */
#define	CMD_CPU_MAXSIZE \
	MAX(MAX(sizeof (cmd_cpu_0_t), sizeof (cmd_cpu_1_t)), \
	    MAX(sizeof (cmd_cpu_2_t), sizeof (cmd_cpu_pers_t)))
#define	CMD_CPU_MINSIZE \
	MIN(MIN(sizeof (cmd_cpu_0_t), sizeof (cmd_cpu_1_t)), \
	    MIN(sizeof (cmd_cpu_2_t), sizeof (cmd_cpu_pers_t)))
/*
 * Shorthand member names for cmd_cpu_t.  Each expands to the corresponding
 * field of the embedded cpu_pers (persistent) or cpu_cases structure.
 */

/* Persistent fields */
#define	cpu_header		cpu_pers.cpup_header
#define	cpu_nodetype		cpu_pers.cpup_header.hdr_nodetype
#define	cpu_bufname		cpu_pers.cpup_header.hdr_bufname
#define	cpu_version		cpu_pers.cpup_version
#define	cpu_cpuid		cpu_pers.cpup_cpuid
#define	cpu_type		cpu_pers.cpup_type
#define	cpu_faulting		cpu_pers.cpup_faulting
#define	cpu_level		cpu_pers.cpup_level
#define	cpu_asru		cpu_pers.cpup_asru
#define	cpu_fru			cpu_pers.cpup_fru
#define	cpu_uec			cpu_pers.cpup_uec
#define	cpu_olduec		cpu_pers.cpup_olduec

/* Cases */
#define	cpu_icache		cpu_cases.cpuc_icache
#define	cpu_dcache		cpu_cases.cpuc_dcache
#define	cpu_pcache		cpu_cases.cpuc_pcache
#define	cpu_itlb		cpu_cases.cpuc_itlb
#define	cpu_dtlb		cpu_cases.cpuc_dtlb
#define	cpu_l2data		cpu_cases.cpuc_l2data
#define	cpu_l2tag		cpu_cases.cpuc_l2tag
#define	cpu_l3data		cpu_cases.cpuc_l3data
#define	cpu_l3tag		cpu_cases.cpuc_l3tag
#define	cpu_fpu			cpu_cases.cpuc_fpu
#define	cpu_ireg		cpu_cases.cpuc_ireg
#define	cpu_freg		cpu_cases.cpuc_freg
#define	cpu_mau			cpu_cases.cpuc_mau
#define	cpu_l2ctl		cpu_cases.cpuc_l2ctl
#define	cpu_misc_regs		cpu_cases.cpuc_misc_regs
#define	cpu_lfu			cpu_cases.cpuc_lfu

/* Olympus-C cases */
#define	cpu_opl_invsfsr		cpu_cases.cpuc_opl_invsfsr
#define	cpu_oplue_detcpu	cpu_cases.cpuc_oplue_detcpu
#define	cpu_oplue_detio		cpu_cases.cpuc_oplue_detio
#define	cpu_opl_mtlb		cpu_cases.cpuc_opl_mtlb
#define	cpu_opl_tlbp		cpu_cases.cpuc_opl_tlbp
#define	cpu_opl_inv_urg		cpu_cases.cpuc_opl_inv_urg
#define	cpu_opl_cre		cpu_cases.cpuc_opl_cre
#define	cpu_opl_tsb_ctx		cpu_cases.cpuc_opl_tsb_ctx
#define	cpu_opl_tsbp		cpu_cases.cpuc_opl_tsbp
#define	cpu_opl_pstate		cpu_cases.cpuc_opl_pstate
#define	cpu_opl_tstate		cpu_cases.cpuc_opl_tstate
#define	cpu_opl_iug_f		cpu_cases.cpuc_opl_iug_f
#define	cpu_opl_iug_r		cpu_cases.cpuc_opl_iug_r
#define	cpu_opl_sdc		cpu_cases.cpuc_opl_sdc
#define	cpu_opl_wdt		cpu_cases.cpuc_opl_wdt
#define	cpu_opl_dtlb		cpu_cases.cpuc_opl_dtlb
#define	cpu_opl_itlb		cpu_cases.cpuc_opl_itlb
#define	cpu_opl_core_err	cpu_cases.cpuc_opl_core_err
#define	cpu_opl_dae		cpu_cases.cpuc_opl_dae
#define	cpu_opl_iae		cpu_cases.cpuc_opl_iae
#define	cpu_opl_uge		cpu_cases.cpuc_opl_uge

/* nvlists held in the ASRU and FRU FMRIs */
#define	cpu_asru_nvl		cpu_asru.fmri_nvl
#define	cpu_fru_nvl		cpu_fru.fmri_nvl
475 * L2$ and L3$ Data errors
478 * Type (if any) Fault
479 * ------ ----------- -------------------------------
480 * xxC l2cachedata fault.cpu.<cputype>.l2cachedata
481 * xxU - fault.cpu.<cputype>.l2cachedata
482 * L3_xxC l3cachedata fault.cpu.<cputype>.l3cachedata
483 * L3_xxU - fault.cpu.<cputype>.l3cachedata
485 * NOTE: For the purposes of the discussion below, xxC and xxU refer to both
486 * L2$ and L3$ data errors.
488 * These ereports will be dropped if (among other things) they are side-effects
489 * of UEs (xxUs only) or other xxCs or xxUs. Whenever UEs are detected, they
490 * are added to a per-CPU cache. xxUs are then compared to this cache. If a
491 * xxU's AFAR refers to an address which recently saw a UE, the xxU is dropped,
492 * as it was most likely caused by the UE. When multiple xxCs and xxUs are seen
493 * with the same ENA, all save one are generally side-effects. We track these
494 * groups (referred to as trains), matching them against a premade list. If one
495 * of the trains matches, we drop all but the primary, which is indicated in the
498 * The expected resolution of l2cachedata and l3cachedata faults is the
499 * disabling of the indicated CPU.
501 extern cmd_evdisp_t
cmd_xxc(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
502 const char *, cmd_errcl_t
);
503 extern cmd_evdisp_t
cmd_xxu(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
504 const char *, cmd_errcl_t
);
507 * As of Niagara-2, we ignore writeback (ldwc, ldwu) errors. Since these were
508 * the only defined follow-on errors for sun4v trains, sun4v L2 cache data
509 * errors no longer need to use the train mechanism.
512 extern cmd_evdisp_t
cmd_l2c(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
513 const char *, cmd_errcl_t
);
514 extern cmd_evdisp_t
cmd_l2u(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
515 const char *, cmd_errcl_t
);
518 * Common Errdata structure for SERD engines
520 typedef struct errdata
{
522 const char *ed_fltnm
;
523 const cmd_ptrsubtype_t ed_pst
;
527 * L2$ and L3$ Tag errors
530 * Type (if any) Fault
531 * ------- ----------- -------------------------------
532 * TxCE l2cachetag fault.cpu.<cputype>.l2cachetag
533 * L3_THCE l3cachetag fault.cpu.<cputype>.l3cachetag
534 * LTC l2cachetag fault.cpu.<cputype>.l2cachetag
536 * We'll never see the uncorrectable Tag errors - they'll cause the machine to
537 * reset, and we'll be ne'er the wiser.
539 * The expected resolution of l2cachetag and l3cachetag faults is the disabling
540 * of the indicated CPU.
542 extern cmd_evdisp_t
cmd_txce(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
543 const char *, cmd_errcl_t
);
545 extern cmd_evdisp_t
cmd_l3_thce(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
546 const char *, cmd_errcl_t
);
552 * Type (if any) Fault
553 * ------- --------- -------------------------------
554 * IPE icache fault.cpu.<cputype>.icache
555 * IxSPE icache fault.cpu.<cputype>.icache
556 * DPE dcache fault.cpu.<cputype>.dcache
557 * DxSPE dcache fault.cpu.<cputype>.dcache
558 * PDSPE pcache fault.cpu.<cputype>.pcache
560 * The I$, D$, and P$ are clean, and thus have no uncorrectable errors.
562 * The expected resolution of icache, dcache, and pcache faults is the disabling
563 * of the indicated CPU.
565 extern cmd_evdisp_t
cmd_icache(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
566 const char *, cmd_errcl_t
);
567 extern cmd_evdisp_t
cmd_dcache(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
568 const char *, cmd_errcl_t
);
569 extern cmd_evdisp_t
cmd_pcache(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
570 const char *, cmd_errcl_t
);
576 * Type (if any) Fault
577 * ------ --------- -------------------------------
578 * ITLBPE itlb fault.cpu.<cputype>.itlb
579 * DTLBPE dtlb fault.cpu.<cputype>.dtlb
581 * The expected resolution of itlb and dtlb faults is the disabling of the
584 extern cmd_evdisp_t
cmd_itlb(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
585 const char *, cmd_errcl_t
);
586 extern cmd_evdisp_t
cmd_dtlb(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
587 const char *, cmd_errcl_t
);
589 extern void cmd_cpuerr_close(fmd_hdl_t
*, void *);
595 * Type (if any) Fault
596 * ------ --------- -------------------------------
597 * FPU - fault.cpu.<cputype>.fpu
599 * The expected resolution of FPU faults is the disabling of the indicated CPU.
601 extern cmd_evdisp_t
cmd_fpu(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
602 const char *, cmd_errcl_t
);
609 * Type (if any) Fault
610 * ------ --------- -------------------------------
611 * IRC ireg fault.cpu.<cputype>.ireg
614 * The expected resolution of ireg faults is the disabling of the indicated CPU.
616 extern cmd_evdisp_t
cmd_irc(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
617 const char *, cmd_errcl_t
);
618 extern cmd_evdisp_t
cmd_iru(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
619 const char *, cmd_errcl_t
);
625 * Type (if any) Fault
626 * ------ --------- -------------------------------
627 * FRC freg fault.cpu.ultraSPARC-T1.frc
630 * The expected resolution of freg faults is the repair of the indicated CPU.
632 extern cmd_evdisp_t
cmd_frc(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
633 const char *, cmd_errcl_t
);
634 extern cmd_evdisp_t
cmd_fru(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
635 const char *, cmd_errcl_t
);
641 * Type (if any) Fault
642 * ------ --------- -------------------------------
643 * MAU mau fault.cpu.<cputype>.mau
645 * The expected resolution of mau faults is the repair of the indicated CPU.
647 extern cmd_evdisp_t
cmd_mau(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
648 const char *, cmd_errcl_t
);
654 * Type (if any) Fault
655 * ------ --------- -------------------------------
656 * L2CTL - fault.cpu.<cputype>.l2ctl
658 * The expected resolution of l2ctl faults is the repair of the indicated CPU.
660 extern cmd_evdisp_t
cmd_l2ctl(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
661 const char *, cmd_errcl_t
);
664 * SBD (Store Buffer Data) errors
665 * SCA (Scratchpad Array) errors
666 * TC (Tick compare) errors
667 * TSA (Trap stack Array) errors
670 * Type (if any) Fault
671 * ------ --------- -------------------------------
672 * SBDC misc_regs fault.cpu.<cputype>.misc_regs
678 * The expected resolution of misc_regs faults is the repair of
681 extern cmd_evdisp_t
cmd_miscregs_ce(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
682 const char *, cmd_errcl_t
);
683 extern cmd_evdisp_t
cmd_miscregs_ue(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
684 const char *, cmd_errcl_t
);
686 extern cmd_evdisp_t
cmd_miscregs_train(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
687 const char *, cmd_errcl_t
);
691 * ---------------------------------------------------------------------
692 * LFU-RTF uncorrectable link retrain fail error fault.cpu.T2plus.lfu-u
693 * LFU-TTO uncorrectable training timeout error
694 * LFU-CTO uncorrectable config timeout error
695 * LFU-MLF uncorrectable multi lanes link fail error
696 * LFU-SLF correctable single lane failover fault.cpu.T2plus.lfu-f
698 * The expected resolution of lfu faults is the repair of the indicated CPU.
700 extern cmd_evdisp_t
cmd_lfu_ue(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
701 const char *, cmd_errcl_t
);
702 extern cmd_evdisp_t
cmd_lfu_ce(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
703 const char *, cmd_errcl_t
);
706 * ---------------------------------------------------------------------
707 * Coherency link protocol errors
708 * to Transaction timed out fault.cpu.T2plus.lfu-p
709 * frack Invalid or redundant request ack
710 * fsr Invalid or redundant snoop response
711 * fdr Invalid or redundant data return
712 * snptyp Invalid snoop type received from
715 * The expected resolution of lfu faults is the repair of the indicated CPU.
717 extern cmd_evdisp_t
cmd_lfu_pe(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
718 const char *, cmd_errcl_t
);
721 * CPUs are described by FMRIs. This routine will retrieve the CPU state
722 * structure (creating a new one if necessary) described by the detector
723 * FMRI in the passed ereport.
725 extern cmd_cpu_t
*cmd_cpu_lookup_from_detector(fmd_hdl_t
*, nvlist_t
*,
726 const char *, uint8_t);
728 extern char *cmd_cpu_getfrustr(fmd_hdl_t
*, cmd_cpu_t
*);
729 extern char *cmd_cpu_getpartstr(fmd_hdl_t
*, cmd_cpu_t
*);
731 extern char *cmd_cpu_getserialstr(fmd_hdl_t
*, cmd_cpu_t
*);
732 extern nvlist_t
*cmd_cpu_mkfru(fmd_hdl_t
*, char *, char *, char *);
734 extern cmd_cpu_t
*cmd_cpu_lookup(fmd_hdl_t
*, nvlist_t
*, const char *,
737 extern void cmd_cpu_create_faultlist(fmd_hdl_t
*, fmd_case_t
*, cmd_cpu_t
*,
738 const char *, nvlist_t
*, uint_t
);
740 extern cmd_cpu_t
*cmd_restore_cpu_only(fmd_hdl_t
*, fmd_case_t
*, char *);
741 extern void cmd_cpu_destroy(fmd_hdl_t
*, cmd_cpu_t
*);
742 extern void *cmd_cpu_restore(fmd_hdl_t
*, fmd_case_t
*, cmd_case_ptr_t
*);
743 extern void cmd_cpu_validate(fmd_hdl_t
*);
744 extern void cmd_cpu_timeout(fmd_hdl_t
*, id_t
, void *);
745 extern void cmd_cpu_gc(fmd_hdl_t
*);
746 extern void cmd_cpu_fini(fmd_hdl_t
*hdl
);
747 extern char *cmd_cpu_serdnm_create(fmd_hdl_t
*, cmd_cpu_t
*, const char *);
748 extern nvlist_t
*cmd_cpu_fmri_create(uint32_t, uint8_t);
750 extern uint32_t cmd_cpu2core(uint32_t, cmd_cpu_type_t
, uint8_t);
/* CPU group levels, smallest first (0 == thread). */
#define	CMD_CPU_LEVEL_THREAD	0
#define	CMD_CPU_LEVEL_CORE	1
#define	CMD_CPU_LEVEL_CHIP	2
/*
 * Increment the 64-bit statistic `name' embedded in the structure that `cpu'
 * points to.  `cpu' is parenthesized so that any pointer expression (for
 * example `base + i') can be passed, and the whole expansion is parenthesized
 * so that it behaves as a single expression.  `cpu' is evaluated once.
 */
#define	CMD_CPU_STAT_BUMP(cpu, name)	((cpu)->name.fmds_value.ui64++)
758 CMD_CPU_FAM_UNSUPPORTED
,
764 typedef struct faminfo
{
765 cpu_family_t fam_value
;
766 boolean_t ecache_flush_needed
;
769 extern cpu_family_t
cmd_cpu_check_support(void);
770 extern boolean_t
cmd_cpu_ecache_support(void);
772 extern int cmd_xr_fill(fmd_hdl_t
*, nvlist_t
*, cmd_xr_t
*, cmd_errcl_t
);
773 extern void cmd_fill_errdata(cmd_errcl_t
, cmd_cpu_t
*, cmd_case_t
**,
775 extern cmd_xxcu_trw_t
*cmd_trw_lookup(uint64_t, uint8_t, uint64_t);
776 extern cmd_evdisp_t
cmd_nop_train(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
777 const char *, cmd_errcl_t
);
778 extern cmd_errcl_t
cmd_train_match(cmd_errcl_t
, cmd_errcl_t
);
779 extern int cmd_afar_status_check(uint8_t, cmd_errcl_t
);
782 extern int cmd_cpu_synd_check(uint16_t, cmd_errcl_t clcode
);
784 extern int cmd_cpu_synd_check(uint32_t, cmd_errcl_t clcode
);
787 extern int cmd_afar_valid(fmd_hdl_t
*hdl
, nvlist_t
*nvl
, cmd_errcl_t
,
794 #endif /* _CMD_CPU_H */