4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
26 * Ereport-handling routines for memory errors
36 #include <cmd_dp_page.h>
44 #include <fm/fmd_api.h>
45 #include <sys/fm/protocol.h>
46 #include <sys/async.h>
47 #include <sys/errclassify.h>
51 #include <cmd_hc_sun4v.h>
60 cmd_mem_name2type(const char *name
, int minorvers
)
62 static const struct ce_name2type old
[] = {
63 { ERR_TYPE_DESC_INTERMITTENT
, CE_DISP_INTERMITTENT
},
64 { ERR_TYPE_DESC_PERSISTENT
, CE_DISP_PERS
},
65 { ERR_TYPE_DESC_STICKY
, CE_DISP_STICKY
},
66 { ERR_TYPE_DESC_UNKNOWN
, CE_DISP_UNKNOWN
},
69 static const struct ce_name2type
new[] = {
70 { CE_DISP_DESC_U
, CE_DISP_UNKNOWN
},
71 { CE_DISP_DESC_I
, CE_DISP_INTERMITTENT
},
72 { CE_DISP_DESC_PP
, CE_DISP_POSS_PERS
},
73 { CE_DISP_DESC_P
, CE_DISP_PERS
},
74 { CE_DISP_DESC_L
, CE_DISP_LEAKY
},
75 { CE_DISP_DESC_PS
, CE_DISP_POSS_STICKY
},
76 { CE_DISP_DESC_S
, CE_DISP_STICKY
},
79 const struct ce_name2type
*names
= (minorvers
== 0) ? &old
[0] : &new[0];
80 const struct ce_name2type
*tp
;
82 for (tp
= names
; tp
->name
!= NULL
; tp
++)
83 if (strcasecmp(name
, tp
->name
) == 0)
86 return (CE_DISP_UNKNOWN
);
90 * check if a dimm has n CEs with the same symbol-in-error
93 upos_thresh_check(cmd_dimm_t
*dimm
, uint16_t upos
, uint32_t threshold
)
99 for (i
= 0; i
< CMD_MAX_CKWDS
; i
++) {
100 for (ip
= cmd_list_next(&dimm
->mq_root
[i
]); ip
!= NULL
;
102 next
= cmd_list_next(ip
);
103 if (ip
->mq_unit_position
== upos
) {
105 if (count
>= threshold
)
114 * check if smaller number of retired pages > 1/16 of larger
115 * number of retired pages
118 check_bad_rw_retired_pages(fmd_hdl_t
*hdl
, cmd_dimm_t
*d1
, cmd_dimm_t
*d2
)
122 uint_t d1_nretired
, d2_nretired
;
126 d1_nretired
= d1
->dimm_nretired
;
127 d2_nretired
= d2
->dimm_nretired
;
129 if (d1
->dimm_bank
!= NULL
)
130 d1_nretired
+= d1
->dimm_bank
->bank_nretired
;
132 if (d2
->dimm_bank
!= NULL
)
133 d2_nretired
+= d2
->dimm_bank
->bank_nretired
;
135 if (d2_nretired
< d1_nretired
) {
138 } else if (d2_nretired
> d1_nretired
) {
144 ratio
= lret
* CMD_PAGE_RATIO
;
147 fmd_hdl_debug(hdl
, "sret=%d lret=%d ratio=%.3f\n",
155 * check bad rw between two DIMMs
156 * the check succeeds if
157 * - each DIMM has 4 CEs with the same symbol-in-error.
158 * - the smaller number of retired pages > 1/16 larger number of retired pages
161 check_bad_rw_between_dimms(fmd_hdl_t
*hdl
, cmd_dimm_t
*d1
, cmd_dimm_t
*d2
,
168 for (i
= 0; i
< CMD_MAX_CKWDS
; i
++) {
169 for (ip
= cmd_list_next(&d1
->mq_root
[i
]); ip
!= NULL
;
171 next
= cmd_list_next(ip
);
172 upos
= ip
->mq_unit_position
;
173 if (upos_thresh_check(d1
, upos
, cmd
.cmd_nupos
)) {
174 if (upos_thresh_check(d2
, upos
,
176 if (check_bad_rw_retired_pages(hdl
,
190 bad_reader_writer_check(fmd_hdl_t
*hdl
, cmd_dimm_t
*ce_dimm
, nvlist_t
*det
)
192 cmd_dimm_t
*d
, *next
;
195 for (d
= cmd_list_next(&cmd
.cmd_dimms
); d
!= NULL
; d
= next
) {
196 next
= cmd_list_next(d
);
199 if (!cmd_same_datapath_dimms(ce_dimm
, d
))
201 if (check_bad_rw_between_dimms(hdl
, ce_dimm
, d
, &upos
)) {
202 cmd_gen_datapath_fault(hdl
, ce_dimm
, d
, upos
, det
);
203 cmd_dimm_save_symbol_error(ce_dimm
, upos
);
205 "check_bad_rw_dimms succeeded: %s %s",
206 ce_dimm
->dimm_unum
, d
->dimm_unum
);
213 * rule 5a checking. The check succeeds if
215 * - nretired >= 128 and (addr_hi - addr_low) / (nretired - 1) > 512KB
218 ce_thresh_check(fmd_hdl_t
*hdl
, cmd_dimm_t
*dimm
)
223 uint64_t delta_addr
= 0;
225 if (dimm
->dimm_flags
& CMD_MEM_F_FAULTING
)
226 /* We've already complained about this DIMM */
229 nret
= dimm
->dimm_nretired
;
230 if (dimm
->dimm_bank
!= NULL
)
231 nret
+= dimm
->dimm_bank
->bank_nretired
;
233 if (nret
< cmd
.cmd_low_ce_thresh
)
236 if (dimm
->dimm_phys_addr_hi
>= dimm
->dimm_phys_addr_low
)
238 (dimm
->dimm_phys_addr_hi
- dimm
->dimm_phys_addr_low
) /
241 if (nret
>= cmd
.cmd_hi_ce_thresh
|| delta_addr
> CMD_MQ_512KB
) {
243 dimm
->dimm_flags
|= CMD_MEM_F_FAULTING
;
244 cmd_dimm_dirty(hdl
, dimm
);
246 cp
= fmd_case_open(hdl
, NULL
);
247 flt
= cmd_dimm_create_fault(hdl
, dimm
,
248 "fault.memory.dimm-page-retires-excessive", CMD_FLTMAXCONF
);
249 fmd_case_add_suspect(hdl
, cp
, flt
);
250 fmd_case_solve(hdl
, cp
);
251 fmd_hdl_debug(hdl
, "ce_thresh_check succeeded nretired %d\n",
258 * rule 5b checking. The check succeeds if
259 * more than 120 non-intermittent CEs are reported against one symbol
260 * position of one afar in 72 hours.
263 mq_5b_check(fmd_hdl_t
*hdl
, cmd_dimm_t
*dimm
)
270 for (cw
= 0; cw
< CMD_MAX_CKWDS
; cw
++) {
271 for (ip
= cmd_list_next(&dimm
->mq_root
[cw
]);
272 ip
!= NULL
; ip
= next
) {
273 next
= cmd_list_next(ip
);
274 if (ip
->mq_dupce_count
>= cmd
.cmd_dupce
) {
275 cp
= fmd_case_open(hdl
, NULL
);
276 flt
= cmd_dimm_create_fault(hdl
, dimm
,
277 "fault.memory.dimm-page-retires-excessive",
279 dimm
->dimm_flags
|= CMD_MEM_F_FAULTING
;
280 cmd_dimm_dirty(hdl
, dimm
);
281 fmd_case_add_suspect(hdl
, cp
, flt
);
282 fmd_case_solve(hdl
, cp
);
284 "mq_5b_check succeeded: duplicate CE=%d",
293 * delete the expired duplicate CE time stamps
296 mq_prune_dup(fmd_hdl_t
*hdl
, cmd_mq_t
*ip
, uint64_t now
)
298 tstamp_t
*tsp
, *next
;
300 for (tsp
= cmd_list_next(&ip
->mq_dupce_tstamp
); tsp
!= NULL
;
302 next
= cmd_list_next(tsp
);
303 if (tsp
->tstamp
< now
- CMD_MQ_TIMELIM
) {
304 cmd_list_delete(&ip
->mq_dupce_tstamp
, &tsp
->ts_l
);
305 fmd_hdl_free(hdl
, tsp
, sizeof (tstamp_t
));
306 ip
->mq_dupce_count
--;
312 mq_update(fmd_hdl_t
*hdl
, fmd_event_t
*ep
, cmd_mq_t
*ip
, uint64_t now
,
318 ip
->mq_cpuid
= cpuid
;
321 if (fmd_serd_exists(hdl
, ip
->mq_serdnm
))
322 fmd_serd_destroy(hdl
, ip
->mq_serdnm
);
323 fmd_serd_create(hdl
, ip
->mq_serdnm
, CMD_MQ_SERDN
, CMD_MQ_SERDT
);
324 (void) fmd_serd_record(hdl
, ip
->mq_serdnm
, ep
);
326 tsp
= fmd_hdl_zalloc(hdl
, sizeof (tstamp_t
), FMD_SLEEP
);
328 cmd_list_append(&ip
->mq_dupce_tstamp
, tsp
);
329 ip
->mq_dupce_count
++;
332 /* Create a fresh index block for MQSC CE correlation. */
334 mq_create(fmd_hdl_t
*hdl
, fmd_event_t
*ep
,
335 uint64_t afar
, uint16_t upos
, uint64_t now
, uint32_t cpuid
)
339 uint16_t ckwd
= (afar
& 0x30) >> 4;
341 cp
= fmd_hdl_zalloc(hdl
, sizeof (cmd_mq_t
), FMD_SLEEP
);
344 cp
->mq_phys_addr
= afar
;
345 cp
->mq_unit_position
= upos
;
348 cmd_mq_serdnm_create(hdl
, "mq", afar
, ckwd
, upos
);
350 tsp
= fmd_hdl_zalloc(hdl
, sizeof (tstamp_t
), FMD_SLEEP
);
352 cmd_list_append(&cp
->mq_dupce_tstamp
, tsp
);
353 cp
->mq_dupce_count
= 1;
354 cp
->mq_cpuid
= cpuid
;
357 * Create SERD to keep this event from being removed
358 * by fmd which may not know there is an event pointer
359 * saved here. This SERD is *never* meant to fire.
360 * NOTE: wouldn't need to do this if there were an fmd
361 * api to 'hold' an event.
363 if (fmd_serd_exists(hdl
, cp
->mq_serdnm
)) {
365 fmd_serd_destroy(hdl
, cp
->mq_serdnm
);
367 fmd_serd_create(hdl
, cp
->mq_serdnm
, CMD_MQ_SERDN
, CMD_MQ_SERDT
);
368 (void) fmd_serd_record(hdl
, cp
->mq_serdnm
, ep
);
373 /* Destroy MQSC tracking block as well as event tracking SERD. */
376 mq_destroy(fmd_hdl_t
*hdl
, cmd_list_t
*lp
, cmd_mq_t
*ip
)
378 cmd_mq_t
*jp
= cmd_list_next(ip
);
379 tstamp_t
*tsp
, *next
;
381 if (ip
->mq_serdnm
!= NULL
) {
382 if (fmd_serd_exists(hdl
, ip
->mq_serdnm
))
383 fmd_serd_destroy(hdl
, ip
->mq_serdnm
);
384 fmd_hdl_strfree(hdl
, ip
->mq_serdnm
);
385 ip
->mq_serdnm
= NULL
;
388 for (tsp
= cmd_list_next(&ip
->mq_dupce_tstamp
); tsp
!= NULL
;
390 next
= cmd_list_next(tsp
);
391 cmd_list_delete(&ip
->mq_dupce_tstamp
, &tsp
->ts_l
);
392 fmd_hdl_free(hdl
, tsp
, sizeof (tstamp_t
));
395 cmd_list_delete(lp
, &ip
->mq_l
);
396 fmd_hdl_free(hdl
, ip
, sizeof (cmd_mq_t
));
402 * Add an index block for a new CE, sorted
403 * a) by ascending unit position
404 * b) order of arrival (~= time order)
408 mq_add(fmd_hdl_t
*hdl
, cmd_dimm_t
*dimm
, fmd_event_t
*ep
,
409 uint64_t afar
, uint16_t synd
, uint64_t now
, uint32_t cpuid
)
412 int cw
, unit_position
;
414 cw
= (afar
& 0x30) >> 4; /* 0:3 */
415 if ((unit_position
= cmd_synd2upos(synd
)) < 0)
416 return; /* not a CE */
418 for (ip
= cmd_list_next(&dimm
->mq_root
[cw
]); ip
!= NULL
; ) {
419 if (ip
->mq_unit_position
> unit_position
) {
420 /* list is in unit position order */
422 } else if (ip
->mq_unit_position
== unit_position
&&
423 ip
->mq_phys_addr
== afar
) {
425 * Found a duplicate cw, unit_position, and afar.
426 * update the mq_t with the new information
428 mq_update(hdl
, ep
, ip
, now
, cpuid
);
431 ip
= cmd_list_next(ip
);
435 jp
= mq_create(hdl
, ep
, afar
, unit_position
, now
, cpuid
);
437 cmd_list_append(&dimm
->mq_root
[cw
], jp
);
439 cmd_list_insert_before(&dimm
->mq_root
[cw
], ip
, jp
);
443 * Prune the MQSC index lists (one for each checkword), by deleting
444 * outdated index blocks from each list.
448 mq_prune(fmd_hdl_t
*hdl
, cmd_dimm_t
*dimm
, uint64_t now
)
453 for (cw
= 0; cw
< CMD_MAX_CKWDS
; cw
++) {
454 for (ip
= cmd_list_next(&dimm
->mq_root
[cw
]); ip
!= NULL
; ) {
455 if (ip
->mq_tstamp
< now
- CMD_MQ_TIMELIM
) {
457 * This event has timed out - delete the
458 * mq block as well as serd for the event.
460 ip
= mq_destroy(hdl
, &dimm
->mq_root
[cw
], ip
);
462 /* tstamp < now - ce_t */
463 mq_prune_dup(hdl
, ip
, now
);
464 ip
= cmd_list_next(ip
);
466 } /* per checkword */
471 * Check the MQSC index lists (one for each checkword) by making a
472 * complete pass through each list, checking if the criteria for
473 * Rule 4A has been met. Rule 4A checking is done for each checkword.
475 * Rule 4A: fault a DIMM "whenever Solaris reports two or more CEs from
476 * two or more different physical addresses on each of two or more different
477 * bit positions from the same DIMM within 72 hours of each other, and all
478 * the addresses are in the same relative checkword (that is, the AFARs
479 * are all the same modulo 64). [Note: This means at least 4 CEs; two
480 * from one bit position, with unique addresses, and two from another,
481 * also with unique addresses, and the lower 6 bits of all the addresses
486 mq_check(fmd_hdl_t
*hdl
, cmd_dimm_t
*dimm
)
488 int upos_pairs
, curr_upos
, cw
, i
, j
;
490 typedef struct upos_pair
{
495 upos_pair_t upos_array
[8]; /* max per cw = 2, * 4 cw's */
499 * Each upos_array[] member represents a pair of CEs for the same
500 * unit position (symbol) which on a sun4u is a bit, and on sun4v
501 * is a (4 bit) nibble.
502 * MQSC rule 4 requires pairs of CEs from the same symbol (same DIMM
503 * for rule 4A, and same DRAM for rule 4B) for a violation - this
504 * is why CE pairs are tracked.
507 upos_array
[0].mq1
= NULL
;
509 /* Loop through all checkwords */
510 for (cw
= 0; cw
< CMD_MAX_CKWDS
; cw
++) {
515 * mq_root[] is an array of cumulative lists of CEs
516 * indexed by checkword where the list is in unit position
517 * order. Loop through checking for duplicate unit position
518 * entries (filled in at mq_create()).
519 * The upos_array[] is filled in each time a duplicate
520 * unit position is found; the first time through the loop
521 * of a unit position sets curr_upos but does not fill in
522 * upos_array[] until the second symbol is found.
524 for (ip
= cmd_list_next(&dimm
->mq_root
[cw
]); ip
!= NULL
;
525 ip
= cmd_list_next(ip
)) {
526 if (curr_upos
!= ip
->mq_unit_position
) {
527 /* Set initial current position */
528 curr_upos
= ip
->mq_unit_position
;
529 } else if (i
> upos_pairs
&&
530 curr_upos
== upos_array
[i
-1].upos
) {
532 * Only keep track of CE pairs; skip
533 * triples, quads, etc...
536 } else if (upos_array
[i
].mq1
== NULL
) {
538 * Have a pair, add to upos_array[].
540 upos_array
[i
].upos
= curr_upos
;
541 upos_array
[i
].mq1
= cmd_list_prev(ip
);
542 upos_array
[i
].mq2
= ip
;
543 upos_array
[++i
].mq1
= NULL
;
547 if (i
- upos_pairs
>= 2) {
548 /* Rule 4A Violation. */
549 flt
= cmd_dimm_create_fault(hdl
,
550 dimm
, "fault.memory.dimm-ue-imminent",
552 for (j
= upos_pairs
; j
< i
; j
++) {
553 fmd_case_add_ereport(hdl
,
554 dimm
->dimm_case
.cc_cp
,
555 upos_array
[j
].mq1
->mq_ep
);
556 fmd_case_add_ereport(hdl
,
557 dimm
->dimm_case
.cc_cp
,
558 upos_array
[j
].mq2
->mq_ep
);
560 dimm
->dimm_flags
|= CMD_MEM_F_FAULTING
;
561 cmd_dimm_dirty(hdl
, dimm
);
562 fmd_case_add_suspect(hdl
, dimm
->dimm_case
.cc_cp
, flt
);
563 fmd_case_solve(hdl
, dimm
->dimm_case
.cc_cp
);
567 assert(upos_pairs
< 8);
573 cmd_ce_common(fmd_hdl_t
*hdl
, fmd_event_t
*ep
, nvlist_t
*nvl
,
574 const char *class, uint64_t afar
, uint8_t afar_status
, uint16_t synd
,
575 uint8_t synd_status
, ce_dispact_t type
, uint64_t disp
, nvlist_t
*asru
)
587 if (afar_status
!= AFLT_STAT_VALID
||
588 synd_status
!= AFLT_STAT_VALID
)
589 return (CMD_EVD_UNUSED
);
591 if ((page
= cmd_page_lookup(afar
)) != NULL
&&
592 page
->page_case
.cc_cp
!= NULL
&&
593 fmd_case_solved(hdl
, page
->page_case
.cc_cp
))
594 return (CMD_EVD_REDUND
);
597 if (cmd_dp_error(hdl
) || cmd_dp_fault(hdl
, afar
)) {
598 CMD_STAT_BUMP(dp_ignored_ce
);
599 return (CMD_EVD_UNUSED
);
603 if (fmd_nvl_fmri_expand(hdl
, asru
) < 0) {
604 CMD_STAT_BUMP(bad_mem_asru
);
605 return (CMD_EVD_BAD
);
608 if ((dimm
= cmd_dimm_lookup(hdl
, asru
)) == NULL
&&
609 (dimm
= cmd_dimm_create(hdl
, asru
)) == NULL
)
610 return (CMD_EVD_UNUSED
);
612 if (dimm
->dimm_case
.cc_cp
== NULL
) {
613 dimm
->dimm_case
.cc_cp
= cmd_case_create(hdl
,
614 &dimm
->dimm_header
, CMD_PTR_DIMM_CASE
, &uuid
);
617 if (nvlist_lookup_nvlist(nvl
, FM_EREPORT_DETECTOR
, &det
) != 0)
618 return (CMD_EVD_BAD
);
621 * Add to MQSC correlation lists all CEs which pass validity
623 * Add mq_t when there is no bad r/w or dimm fault.
624 * Always prune the expired mq_t.
626 skip_error
= cmd_dimm_check_symbol_error(dimm
, synd
);
628 if (nvlist_lookup_uint64_array(nvl
,
629 "__tod", &now
, &nelem
) == 0) {
631 if (!skip_error
|| !(dimm
->dimm_flags
& CMD_MEM_F_FAULTING
)) {
632 if (nvlist_lookup_uint32(det
, FM_FMRI_CPU_ID
, &cpuid
)
636 mq_add(hdl
, dimm
, ep
, afar
, synd
, *now
, cpuid
);
639 mq_prune(hdl
, dimm
, *now
);
642 bad_reader_writer_check(hdl
, dimm
, det
);
644 if (!(dimm
->dimm_flags
& CMD_MEM_F_FAULTING
)) {
646 mq_5b_check(hdl
, dimm
);
651 case CE_DISP_UNKNOWN
:
652 CMD_STAT_BUMP(ce_unknown
);
653 return (CMD_EVD_UNUSED
);
654 case CE_DISP_INTERMITTENT
:
655 CMD_STAT_BUMP(ce_interm
);
656 return (CMD_EVD_UNUSED
);
657 case CE_DISP_POSS_PERS
:
658 CMD_STAT_BUMP(ce_ppersis
);
661 CMD_STAT_BUMP(ce_persis
);
664 CMD_STAT_BUMP(ce_leaky
);
666 case CE_DISP_POSS_STICKY
:
668 uchar_t ptnrinfo
= CE_XDIAG_PTNRINFO(disp
);
670 if (CE_XDIAG_TESTVALID(ptnrinfo
)) {
671 int ce1
= CE_XDIAG_CE1SEEN(ptnrinfo
);
672 int ce2
= CE_XDIAG_CE2SEEN(ptnrinfo
);
675 /* Should have been CE_DISP_STICKY */
676 return (CMD_EVD_BAD
);
678 /* Partner could see and could fix CE */
679 CMD_STAT_BUMP(ce_psticky_ptnrclrd
);
681 /* Partner could not see ce1 (ignore ce2) */
682 CMD_STAT_BUMP(ce_psticky_ptnrnoerr
);
685 CMD_STAT_BUMP(ce_psticky_noptnr
);
687 return (CMD_EVD_UNUSED
);
690 CMD_STAT_BUMP(ce_sticky
);
693 return (CMD_EVD_BAD
);
696 if (cmd_dimm_check_symbol_error(dimm
, synd
))
697 return (CMD_EVD_REDUND
);
700 page
= cmd_page_create(hdl
, asru
, afar
);
702 if (page
->page_case
.cc_cp
== NULL
) {
703 page
->page_case
.cc_cp
= cmd_case_create(hdl
,
704 &page
->page_header
, CMD_PTR_PAGE_CASE
, &uuid
);
708 case CE_DISP_POSS_PERS
:
710 fmd_hdl_debug(hdl
, "adding %sPersistent event to CE serd "
711 "engine\n", type
== CE_DISP_POSS_PERS
? "Possible-" : "");
713 if (page
->page_case
.cc_serdnm
== NULL
) {
714 page
->page_case
.cc_serdnm
= cmd_page_serdnm_create(hdl
,
715 "page", page
->page_physbase
);
717 fmd_serd_create(hdl
, page
->page_case
.cc_serdnm
,
718 fmd_prop_get_int32(hdl
, "ce_n"),
719 fmd_prop_get_int64(hdl
, "ce_t"));
722 if (fmd_serd_record(hdl
, page
->page_case
.cc_serdnm
, ep
) ==
724 return (CMD_EVD_OK
); /* engine hasn't fired */
726 fmd_hdl_debug(hdl
, "ce page serd fired\n");
727 fmd_case_add_serd(hdl
, page
->page_case
.cc_cp
,
728 page
->page_case
.cc_serdnm
);
729 fmd_serd_reset(hdl
, page
->page_case
.cc_serdnm
);
730 break; /* to retire */
734 fmd_case_add_ereport(hdl
, page
->page_case
.cc_cp
, ep
);
735 break; /* to retire */
738 if (page
->page_flags
& CMD_MEM_F_FAULTING
||
739 fmd_nvl_fmri_unusable(hdl
, page
->page_asru_nvl
))
743 * convert a unhashed address to hashed address
745 cmd_to_hashed_addr(&addr
, afar
, class);
747 if (afar
> dimm
->dimm_phys_addr_hi
)
748 dimm
->dimm_phys_addr_hi
= addr
;
750 if (afar
< dimm
->dimm_phys_addr_low
)
751 dimm
->dimm_phys_addr_low
= addr
;
753 dimm
->dimm_nretired
++;
754 dimm
->dimm_retstat
.fmds_value
.ui64
++;
755 cmd_dimm_dirty(hdl
, dimm
);
757 cmd_page_fault(hdl
, asru
, cmd_dimm_fru(dimm
), ep
, afar
);
758 ce_thresh_check(hdl
, dimm
);
764 * Solve a bank case with suspect "fault.memory.bank". The caller must
765 * have populated bank->bank_case.cc_cp and is also responsible for adding
766 * associated ereport(s) to that case.
769 cmd_bank_fault(fmd_hdl_t
*hdl
, cmd_bank_t
*bank
)
771 fmd_case_t
*cp
= bank
->bank_case
.cc_cp
;
774 if (bank
->bank_flags
& CMD_MEM_F_FAULTING
)
775 return; /* Only complain once per bank */
777 bank
->bank_flags
|= CMD_MEM_F_FAULTING
;
778 cmd_bank_dirty(hdl
, bank
);
781 flt
= cmd_bank_create_fault(hdl
, bank
, "fault.memory.bank",
783 fmd_case_add_suspect(hdl
, cp
, flt
);
788 /* create separate fault for each dimm in bank */
790 for (d
= cmd_list_next(&bank
->bank_dimms
);
791 d
!= NULL
; d
= cmd_list_next(d
)) {
792 flt
= cmd_dimm_create_fault(hdl
, d
->bm_dimm
,
793 "fault.memory.bank", CMD_FLTMAXCONF
);
794 fmd_case_add_suspect(hdl
, cp
, flt
);
798 fmd_case_solve(hdl
, cp
);
803 cmd_ue_common(fmd_hdl_t
*hdl
, fmd_event_t
*ep
, nvlist_t
*nvl
,
804 const char *class, uint64_t afar
, uint8_t afar_status
, uint16_t synd
,
805 uint8_t synd_status
, ce_dispact_t type
, uint64_t disp
, nvlist_t
*asru
)
813 * Note: Currently all sun4u processors using this code share
814 * L2 and L3 cache at CMD_CPU_LEVEL_CORE.
816 cpu
= cmd_cpu_lookup_from_detector(hdl
, nvl
, class,
819 cpu
= cmd_cpu_lookup_from_detector(hdl
, nvl
, class,
820 CMD_CPU_LEVEL_THREAD
);
824 fmd_hdl_debug(hdl
, "cmd_ue_common: cpu not found\n");
825 return (CMD_EVD_UNUSED
);
829 * The following code applies only to sun4u, because sun4u does
830 * not poison data in L2 cache resulting from the fetch of a
835 if (afar_status
!= AFLT_STAT_VALID
) {
837 * Had this report's AFAR been valid, it would have
838 * contributed an address to the UE cache. We don't
839 * know what the AFAR would have been, and thus we can't
840 * add anything to the cache. If a xxU is caused by
841 * this UE, we won't be able to detect it, and will thus
842 * erroneously offline the CPU. To prevent this
843 * situation, we need to assume that all xxUs generated
844 * through the next E$ flush are attributable to the UE.
846 cmd_cpu_uec_set_allmatch(hdl
, cpu
);
848 cmd_cpu_uec_add(hdl
, cpu
, afar
);
852 if (synd_status
!= AFLT_STAT_VALID
) {
853 fmd_hdl_debug(hdl
, "cmd_ue_common: syndrome not valid\n");
854 return (CMD_EVD_UNUSED
);
857 if (cmd_mem_synd_check(hdl
, afar
, afar_status
, synd
, synd_status
,
858 cpu
) == CMD_EVD_UNUSED
)
859 return (CMD_EVD_UNUSED
);
861 if (afar_status
!= AFLT_STAT_VALID
)
862 return (CMD_EVD_UNUSED
);
864 if ((page
= cmd_page_lookup(afar
)) != NULL
&&
865 page
->page_case
.cc_cp
!= NULL
&&
866 fmd_case_solved(hdl
, page
->page_case
.cc_cp
))
867 return (CMD_EVD_REDUND
);
869 if (fmd_nvl_fmri_expand(hdl
, asru
) < 0) {
870 CMD_STAT_BUMP(bad_mem_asru
);
874 if ((bank
= cmd_bank_lookup(hdl
, asru
)) == NULL
&&
875 (bank
= cmd_bank_create(hdl
, asru
)) == NULL
)
876 return (CMD_EVD_UNUSED
);
885 * 1: locate the array of serial numbers inside the bank asru.
886 * 2: for each serial #, lookup its mem: FMRI in libtopo
887 * 3: ensure that each DIMM's FMRI is on bank's dimmlist
890 if (nvlist_lookup_string_array(asru
,
891 FM_FMRI_MEM_SERIAL_ID
, &snarray
, &n
) != 0)
892 fmd_hdl_abort(hdl
, "Cannot locate serial #s for bank");
894 for (i
= 0; i
< n
; i
++) {
895 fmri
= cmd_find_dimm_by_sn(hdl
, FM_FMRI_SCHEME_MEM
,
898 * If dimm structure doesn't already exist for
899 * each dimm, create and link to bank.
901 if (cmd_dimm_lookup(hdl
, fmri
) == NULL
)
902 (void) cmd_dimm_create(hdl
, fmri
);
908 if (bank
->bank_case
.cc_cp
== NULL
) {
910 bank
->bank_case
.cc_cp
= cmd_case_create(hdl
, &bank
->bank_header
,
911 CMD_PTR_BANK_CASE
, &uuid
);
915 if (cmd_dp_error(hdl
)) {
916 CMD_STAT_BUMP(dp_deferred_ue
);
917 cmd_dp_page_defer(hdl
, asru
, ep
, afar
);
919 } else if (cmd_dp_fault(hdl
, afar
)) {
920 CMD_STAT_BUMP(dp_ignored_ue
);
921 return (CMD_EVD_UNUSED
);
925 fmd_case_add_ereport(hdl
, bank
->bank_case
.cc_cp
, ep
);
927 bank
->bank_nretired
++;
928 bank
->bank_retstat
.fmds_value
.ui64
++;
929 cmd_bank_dirty(hdl
, bank
);
931 cmd_page_fault(hdl
, bank
->bank_asru_nvl
, cmd_bank_fru(bank
), ep
, afar
);
932 cmd_bank_fault(hdl
, bank
);
938 cmd_dimm_close(fmd_hdl_t
*hdl
, void *arg
)
940 cmd_dimm_destroy(hdl
, arg
);
944 cmd_bank_close(fmd_hdl_t
*hdl
, void *arg
)
946 cmd_bank_destroy(hdl
, arg
);