4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
29 * Support routines for managing state related to memory modules.
31 * Correctable errors generally cause changes to the DIMM-related state (see
32 * cmd_dimm.c), whereas uncorrectable errors tend to use the bank-related
33 * routines (see cmd_bank.c). The primary exception to this division (though
34 * it eventually devolves to one of the two) is the RxE/FRx pair emitted by
35 * UltraSPARC-IIIi processors. With these errors, a complete pair must be
36 * received and matched before we know whether we're dealing with a CE or a UE.
40 #include <cmd_state.h>
42 #include <sys/errclassify.h>
/*
 * Flag bit with value 0x1.  NOTE(review): the flags word that carries this
 * bit is not declared in this part of the header; the name suggests it marks
 * a memory module that is being faulted -- confirm against the flag's users.
 */
49 #define CMD_MEM_F_FAULTING 0x1
52 * Used to store as-yet unmatched IOxEs, RxEs, and FRxs. When a new IOxE,
53 * RxE or FRx arrives, we traverse the cmd.cmd_iorxefrx list, looking for
54 * matching entries. Matching has a cpuid-based component, as well as a
55 * temporal one. We can compare the cpuids directly, using the cmd_iorxefrx_t
56 * and the newly-received event. Temporal comparison isn't performed directly.
57 * Instead, we ensure that entries in the iorxefrx list are removed when they
58 * expire by means of timers. This frees the matching code from the need to
 * perform the temporal comparison itself.
/*
 * One queued, as-yet unmatched IOxE, RxE or FRx ereport.  Entries are linked
 * through rf_list and record the ID of the timer that expires them in
 * rf_expid.  The remaining members hold values taken from the ereport; each
 * is only meaningful for the ereport type named in its own comment.
 * NOTE(review): the line closing this definition and naming the typedef
 * (cmd_iorxefrx_t, as used by the prototypes in this header) is not visible
 * in this copy of the file.
 */
61 typedef struct cmd_iorxefrx
{
62 cmd_list_t rf_list
; /* List of cmd_iorxefrx_t's */
63 cmd_errcl_t rf_errcl
; /* Error type (CMD_ERRCL_*) */
64 uint_t rf_afsr_agentid
; /* Remote Agent ID (from AFSR) */
65 uint_t rf_det_agentid
; /* Local Agent ID (from detector) */
66 id_t rf_expid
; /* Timer ID for entry expiration */
67 uint64_t rf_afar
; /* Valid for RxE only */
68 uint8_t rf_afar_status
; /* Valid for RxE only */
69 ce_dispact_t rf_type
; /* Valid for RxE only */
70 uint16_t rf_synd
; /* Valid for FRx only */
71 uint8_t rf_synd_status
; /* Valid for FRx only */
72 uint64_t rf_afsr
; /* Valid for FRx only */
73 uint64_t rf_disp
; /* Valid for RCE only */
/*
 * Forward declarations.  Only the struct tags are named here, so the
 * prototypes in this header can take pointers to these types without the
 * structure definitions being visible.
 */
76 typedef struct cmd_dimm cmd_dimm_t
;
77 typedef struct cmd_bank cmd_bank_t
;
79 typedef struct cmd_branch cmd_branch_t
;
83 * Correctable and Uncorrectable memory errors
85 * CEs of "Unknown" or "Intermittent" classification are not used in diagnosis.
87 * "Persistent" CEs are added to per-DIMM SERD engines. When the
88 * engine for a given DIMM fires, the page corresponding to the CE that
89 * caused the engine to fire is retired, and the SERD engine for that
92 * "Possibly Persistent" CEs are at least Persistent and so are treated
93 * as "Persistent" errors above, being added to the same SERD engines.
95 * "Leaky" CEs and "Sticky" CEs trigger immediate page retirement.
97 * "Possibly Sticky" CEs to which no valid partner test has been applied
98 * are not used in diagnosis. Where a valid partner test has been applied
99 * but did not confirm "Sticky" status there is a _suggestion_ that the
100 * original cpu may be a bad reader or writer or suffering from other
101 * datapath issues. To avoid retiring pages for such non-DIMM problems
102 * these classifications are also not used in diagnosis.
104 * UEs immediately trigger page retirements, but do not affect the CE SERD
105 * engines. In addition, UEs are recorded in the UE caches of the detecting
106 * CPUs. When a page is to be retired, a fault.memory.page fault is
/*
 * Function type returning cmd_evdisp_t.  Its return type and the parameter
 * types visible here line up with the leading parameters of cmd_ce_common()
 * and cmd_ue_common() declared in this header.
 * NOTE(review): the parameter list is cut off after the final uint64_t in
 * this copy of the file; the remaining parameter(s) and the closing
 * parenthesis must be restored from the original header.
 */
111 typedef cmd_evdisp_t
cmd_xe_handler_f(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
112 const char *, uint64_t, uint8_t, uint16_t, uint8_t, ce_dispact_t
, uint64_t,
/*
 * cmd_mem_name2type() takes a string and an int and returns a ce_dispact_t.
 * cmd_synd2upos() takes a 16-bit value and returns an int.
 * NOTE(review): the names suggest a classification-name lookup and a
 * syndrome-to-unit-position mapping respectively; the meaning of the int
 * argument and of the return values is not shown here -- confirm in the
 * implementation.
 */
115 extern ce_dispact_t
cmd_mem_name2type(const char *, int);
116 extern int cmd_synd2upos(uint16_t);
/*
 * CE and UE entry points, all returning a cmd_evdisp_t.
 *
 * cmd_ce() and cmd_ue() take the same five parameter types that
 * cmd_rxefrx_common() (declared in this header) names hdl, ep, nvl, class
 * and clcode.  NOTE(review): presumably these are the handlers for
 * correctable and uncorrectable memory error ereports -- confirm in the
 * implementation.
 */
117 extern cmd_evdisp_t
cmd_ce(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
118 const char *, cmd_errcl_t
);
119 extern cmd_evdisp_t
cmd_ue(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
120 const char *, cmd_errcl_t
);
/*
 * cmd_ce_common() and cmd_ue_common() have identical parameter lists; their
 * leading parameter types match the cmd_xe_handler_f function type declared
 * in this header.  The parameters are unnamed here, so their meaning has to
 * be taken from the implementation.
 */
121 extern cmd_evdisp_t
cmd_ce_common(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
122 const char *, uint64_t, uint8_t, uint16_t, uint8_t,
123 ce_dispact_t
, uint64_t, nvlist_t
*);
124 extern cmd_evdisp_t
cmd_ue_common(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
125 const char *, uint64_t, uint8_t, uint16_t, uint8_t,
126 ce_dispact_t
, uint64_t, nvlist_t
*);
/*
 * NOTE(review): the four integer arguments have the same types, in the same
 * order, as the AFAR / AFAR status / syndrome / syndrome status members of
 * cmd_iorxefrx; presumably that is what they carry -- confirm.
 */
127 extern cmd_evdisp_t
cmd_mem_synd_check(fmd_hdl_t
*, uint64_t, uint8_t,
128 uint16_t, uint8_t, cmd_cpu_t
*);
/*
 * DIMM/bank close routines and datapath helpers.
 *
 * cmd_dimm_close() and cmd_bank_close() take the module handle and an
 * untyped pointer.  NOTE(review): presumably the pointer is the DIMM or bank
 * state being closed -- confirm against the callers.
 */
129 extern void cmd_dimm_close(fmd_hdl_t
*, void *);
130 extern void cmd_bank_close(fmd_hdl_t
*, void *);
/*
 * Both datapath routines operate on a pair of DIMMs.  NOTE(review): the
 * meaning of the int result of cmd_same_datapath_dimms() (boolean or
 * otherwise) is not shown here.
 */
131 extern int cmd_same_datapath_dimms(cmd_dimm_t
*, cmd_dimm_t
*);
132 extern void cmd_gen_datapath_fault(fmd_hdl_t
*, cmd_dimm_t
*, cmd_dimm_t
*,
133 uint16_t, nvlist_t
*);
/*
 * NOTE(review): the non-const uint64_t pointer is presumably the output of
 * cmd_to_hashed_addr() -- confirm in the implementation.
 */
134 extern void cmd_to_hashed_addr(uint64_t *, uint64_t, const char *);
/*
 * Returns a non-const string for a 32-bit id.  NOTE(review): ownership of
 * the returned string (who frees it, and how) is not shown here.
 */
137 extern char *cmd_cpu_getfrustr_by_id(fmd_hdl_t
*, uint32_t);
/*
 * Branch close routine; same parameter shape as cmd_dimm_close() and
 * cmd_bank_close() declared in this header.
 */
141 extern void cmd_branch_close(fmd_hdl_t
*, void *);
/*
 * Four further handlers with the same five-parameter shape as cmd_ce() and
 * cmd_ue(), each returning a cmd_evdisp_t.  NOTE(review): which ereport
 * classes each one serves is not stated in this header -- confirm against
 * the subscription/dispatch table in the implementation.
 */
142 extern cmd_evdisp_t
cmd_fb(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
143 const char *, cmd_errcl_t
);
144 extern cmd_evdisp_t
cmd_fw_defect(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
145 const char *, cmd_errcl_t
);
146 extern cmd_evdisp_t
cmd_fb_train(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
147 const char *, cmd_errcl_t
);
148 extern cmd_evdisp_t
cmd_ue_train(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
149 const char *, cmd_errcl_t
);
153 * US-IIIi I/O, Remote and Foreign Read memory errors
155 * When one processor or I/O bridge attempts to read memory local to
156 * another processor, one each of IOCE/IOUE/RCE/RUE and FRC/FRU will be
157 * generated, depending on the type of error. Both the IOxE/RxE and the FRx
158 * are needed, as each contains data necessary to the diagnosis of the error.
159 * Upon receipt of one of the errors, we wait until we receive the other.
160 * When the pair has been successfully received and matched, a CE or UE,
161 * as appropriate, is synthesized from the data in the matched ereports.
162 * The synthesized ereports are handled by the normal CE and UE mechanisms.
/*
 * Handlers for the US-IIIi foreign-read (FRx), remote (RxE) and I/O (IOxE)
 * ereports, each returning a cmd_evdisp_t.  The first four share the
 * five-parameter shape whose parameters cmd_rxefrx_common() names
 * hdl, ep, nvl, class and clcode.
 */
164 extern cmd_evdisp_t
cmd_frx(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
165 const char *, cmd_errcl_t
);
166 extern cmd_evdisp_t
cmd_rxe(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
167 const char *, cmd_errcl_t
);
168 extern cmd_evdisp_t
cmd_ioxe(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
169 const char *, cmd_errcl_t
);
170 extern cmd_evdisp_t
cmd_ioxe_sec(fmd_hdl_t
*, fmd_event_t
*, nvlist_t
*,
171 const char *, cmd_errcl_t
);
/*
 * Takes the same five parameters plus matchmask, a second cmd_errcl_t.
 * NOTE(review): presumably matchmask selects the error classes that may pair
 * with the incoming ereport -- confirm in the implementation.
 */
172 extern cmd_evdisp_t
cmd_rxefrx_common(fmd_hdl_t
*hdl
, fmd_event_t
*ep
,
173 nvlist_t
*nvl
, const char *class, cmd_errcl_t clcode
,
174 cmd_errcl_t matchmask
);
177 * A list of received IOxE/RxE/FRx ereports is maintained for correlation
178 * purposes (see above). These two routines manage the addition of new
179 * ereports, and the retrieval of existing ones. Pruning of the list is
180 * handled automatically.
/*
 * Both routines take the module handle and a single cmd_iorxefrx_t entry and
 * return nothing.  NOTE(review): presumably cmd_iorxefrx_queue() adds the
 * entry to the correlation list and cmd_iorxefrx_free() releases one --
 * confirm in the implementation.
 */
182 extern void cmd_iorxefrx_queue(fmd_hdl_t
*, cmd_iorxefrx_t
*);
183 extern void cmd_iorxefrx_free(fmd_hdl_t
*, cmd_iorxefrx_t
*);
/*
 * FMRI helpers.  cmd_fmri_get_unum() returns a const string for an nvlist;
 * the other two return an nvlist.
 * NOTE(review): ownership of the returned nvlists and the meaning of the
 * unnamed integer arguments are not shown in this header -- confirm in the
 * implementation before relying on them.
 */
185 extern const char *cmd_fmri_get_unum(nvlist_t
*);
186 extern nvlist_t
*cmd_mem_fmri_create(const char *, char **, size_t);
187 extern nvlist_t
*cmd_mem_fmri_derive(fmd_hdl_t
*, uint64_t, uint64_t, uint16_t);
/*
 * Case restoration: takes the module handle, a cmd_case_t, an fmd_case_t and
 * two strings.  NOTE(review): the role of the two strings is not shown here.
 */
189 extern void cmd_mem_case_restore(fmd_hdl_t
*, cmd_case_t
*, fmd_case_t
*,
190 const char *, const char *);
/*
 * Each routine returns a non-const string built from its arguments.
 * NOTE(review): going by the names these are SERD engine names; how the
 * returned string must be released is not shown here.
 */
191 extern char *cmd_mem_serdnm_create(fmd_hdl_t
*, const char *, const char *);
192 extern char *cmd_page_serdnm_create(fmd_hdl_t
*, const char *, uint64_t);
/*
 * NOTE(review): this declaration is cut off after the uint64_t parameter in
 * this copy of the file; the remaining parameter(s) and the closing
 * parenthesis must be restored from the original header.
 */
193 extern char *cmd_mq_serdnm_create(fmd_hdl_t
*, const char *, uint64_t,
/*
 * Takes the module handle, an fmd_stat_t, a string, a 64-bit value and a
 * second string.  NOTE(review): presumably it fills in the fmd_stat_t for a
 * retirement statistic -- confirm in the implementation.
 */
195 extern void cmd_mem_retirestat_create(fmd_hdl_t
*, fmd_stat_t
*, const char *,
196 uint64_t, const char *);
/*
 * NOTE(review): the meaning of the uint_t argument and of the int result of
 * cmd_mem_thresh_check(), and the unit of the ulong_t returned by
 * cmd_mem_get_phys_pages(), are not shown here.
 */
197 extern int cmd_mem_thresh_check(fmd_hdl_t
*, uint_t
);
198 extern ulong_t
cmd_mem_get_phys_pages(fmd_hdl_t
*);
/*
 * Module-level hooks.  cmd_mem_timeout() takes an id_t, the same type as the
 * rf_expid timer ID stored in cmd_iorxefrx.  NOTE(review): cmd_mem_gc() and
 * cmd_mem_fini() are presumably the garbage-collection and teardown entry
 * points for the memory state -- confirm against the module's fmd ops.
 */
200 extern void cmd_mem_timeout(fmd_hdl_t
*, id_t
);
201 extern void cmd_mem_gc(fmd_hdl_t
*);
202 extern void cmd_mem_fini(fmd_hdl_t
*);
208 #endif /* _CMD_MEM_H */