/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * Ereport-handling routines for memory errors
 */
#include <gmem_dimm.h>
#include <gmem_page.h>

#include <fm/fmd_api.h>
#include <fm/libtopo.h>
#include <sys/fm/protocol.h>
#include <sys/async.h>
#include <sys/errclassify.h>

/* for strcmp()/strstr(), strcasecmp(), and assert() used below */
#include <string.h>
#include <strings.h>
#include <assert.h>
#define	OFFBIT		0xFFFFFFFFFFFC07FFULL
#define	BIT28_32	0x00000001F0000000ULL
#define	BIT13_17	0x000000000003E000ULL
#define	BIT18_19	0x00000000000C0000ULL
#define	BIT11_12	0x0000000000001800ULL
ce_dispact_t
gmem_mem_name2type(const char *name)
{
	static const struct ce_name2type {
		const char *name;
		ce_dispact_t type;
	} new[] = {
		{ "mem-unk",	CE_DISP_UNKNOWN },
		{ "mem-is",	CE_DISP_INTERMITTENT },
		{ "mem-cs",	CE_DISP_PERS },
		{ "mem-ss",	CE_DISP_STICKY },
		{ NULL,		CE_DISP_UNKNOWN }
	};
	const struct ce_name2type *names = &new[0];
	const struct ce_name2type *tp;

	for (tp = names; tp->name != NULL; tp++) {
		if (strcasecmp(name, tp->name) == 0)
			return (tp->type);
	}

	return (CE_DISP_UNKNOWN);
}
/*
 * FRU found by the topology walk callback below; filled in by
 * find_fault_fru() and returned by gmem_find_fault_fru().
 */
static nvlist_t *fru_nvl;

/*ARGSUSED*/
static int
find_fault_fru(topo_hdl_t *thp, tnode_t *node, void *arg)
{
	nvlist_t *nvl = (nvlist_t *)arg;
	nvlist_t *rsc = NULL, *fru = NULL;
	nvlist_t **hcl, **topo_hcl;
	uint_t n1, n2;
	char *name, *name1, *name2;
	char *id1, *id2;
	int err, i;

	if (topo_node_resource(node, &rsc, &err) < 0)
		return (TOPO_WALK_NEXT);

	err = nvlist_lookup_nvlist_array(rsc, FM_FMRI_HC_LIST, &topo_hcl, &n1);
	if (err != 0)
		return (TOPO_WALK_NEXT);

	/* only interested in nodes whose last hc-list entry is a chip */
	(void) nvlist_lookup_string(topo_hcl[n1 - 1], FM_FMRI_HC_NAME, &name);
	if (strcmp(name, "chip") != 0)
		return (TOPO_WALK_NEXT);

	(void) nvlist_lookup_nvlist_array(nvl, FM_FMRI_HC_LIST, &hcl, &n2);
	if (n1 != n2)
		return (TOPO_WALK_NEXT);

	/* every hc-list element must match the resource we are looking for */
	for (i = 0; i < n1; i++) {
		(void) nvlist_lookup_string(topo_hcl[i], FM_FMRI_HC_NAME,
		    &name1);
		(void) nvlist_lookup_string(topo_hcl[i], FM_FMRI_HC_ID, &id1);
		(void) nvlist_lookup_string(hcl[i], FM_FMRI_HC_NAME, &name2);
		(void) nvlist_lookup_string(hcl[i], FM_FMRI_HC_ID, &id2);
		if (strcmp(name1, name2) != 0 || strcmp(id1, id2) != 0)
			return (TOPO_WALK_NEXT);
	}

	(void) topo_node_fru(node, &fru, NULL, &err);
	if (fru != NULL)
		(void) nvlist_dup(fru, &fru_nvl, NV_UNIQUE_NAME);

	return (TOPO_WALK_TERMINATE);
}
nvlist_t *
gmem_find_fault_fru(fmd_hdl_t *hdl, nvlist_t *nvl) {
	topo_hdl_t *thp;
	topo_walk_t *twp;
	int err;

	fru_nvl = NULL;

	if ((thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION)) == NULL)
		return (NULL);

	if ((twp = topo_walk_init(thp, FM_FMRI_SCHEME_HC,
	    find_fault_fru, nvl, &err)) == NULL) {
		fmd_hdl_topo_rele(hdl, thp);
		return (NULL);
	}

	(void) topo_walk_step(twp, TOPO_WALK_CHILD);

	topo_walk_fini(twp);
	fmd_hdl_topo_rele(hdl, thp);
	return (fru_nvl);
}
/*
 * fault the FRU of the common detector between two DIMMs
 */
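/*
 * The detector FMRI's hc-list is copied only up to and including the
 * "chip" node, wrapped in a new hc-scheme resource, and the FRU of the
 * matching topology node is then looked up and faulted.
 */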
static void
gmem_gen_datapath_fault(fmd_hdl_t *hdl, nvlist_t *det)
{
	char *name, *id;
	nvlist_t **hcl1, **hcl;
	uint_t n;
	int i, j;
	fmd_case_t *cp;
	nvlist_t *fltlist, *rsrc;
	nvlist_t *fru = NULL;

	if (nvlist_lookup_nvlist_array(det, FM_FMRI_HC_LIST, &hcl1, &n) < 0)
		return;

	/* find the "chip" entry in the detector's hc-list */
	for (i = 0; i < n; i++) {
		(void) nvlist_lookup_string(hcl1[i], FM_FMRI_HC_NAME, &name);
		if (strcmp(name, "chip") == 0)
			break;
	}

	if (i == n)
		return;

	/* copy the hc-list up to and including the chip node */
	n = i + 1;
	hcl = fmd_hdl_zalloc(hdl, sizeof (nvlist_t *) * n, FMD_SLEEP);

	for (i = 0; i < n; i++) {
		(void) nvlist_alloc(&hcl[i],
		    NV_UNIQUE_NAME|NV_UNIQUE_NAME_TYPE, 0);
	}

	for (i = 0, j = 0; i < n; i++, j++) {
		(void) nvlist_lookup_string(hcl1[i], FM_FMRI_HC_NAME, &name);
		(void) nvlist_lookup_string(hcl1[i], FM_FMRI_HC_ID, &id);
		(void) nvlist_add_string(hcl[j], FM_FMRI_HC_NAME, name);
		(void) nvlist_add_string(hcl[j], FM_FMRI_HC_ID, id);
		if (strcmp(name, "chip") == 0)
			break;
	}

	if (nvlist_alloc(&rsrc, NV_UNIQUE_NAME|NV_UNIQUE_NAME_TYPE, 0) != 0) {
		for (i = 0; i < n; i++)
			nvlist_free(hcl[i]);
		fmd_hdl_free(hdl, hcl, sizeof (nvlist_t *) * n);
		return;
	}

	if (nvlist_add_uint8(rsrc, FM_VERSION, FM_HC_SCHEME_VERSION) != 0 ||
	    nvlist_add_string(rsrc, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC) != 0 ||
	    nvlist_add_string(rsrc, FM_FMRI_HC_ROOT, "") != 0 ||
	    nvlist_add_uint32(rsrc, FM_FMRI_HC_LIST_SZ, n) != 0 ||
	    nvlist_add_nvlist_array(rsrc, FM_FMRI_HC_LIST, hcl, n) != 0) {
		for (i = 0; i < n; i++)
			nvlist_free(hcl[i]);
		fmd_hdl_free(hdl, hcl, sizeof (nvlist_t *) * n);
		nvlist_free(rsrc);
		return;
	}

	fru = gmem_find_fault_fru(hdl, rsrc);

	if (fru != NULL) {
		cp = fmd_case_open(hdl, NULL);
		fltlist = fmd_nvl_create_fault(hdl, "fault.memory.datapath",
		    GMEM_FLTMAXCONF, fru, fru, fru);
		fmd_case_add_suspect(hdl, cp, fltlist);
		fmd_case_solve(hdl, cp);
		nvlist_free(fru);
	}

	for (i = 0; i < n; i++)
		nvlist_free(hcl[i]);
	fmd_hdl_free(hdl, hcl, sizeof (nvlist_t *) * n);
	nvlist_free(rsrc);
}
/*
 * formula to convert an unhashed address to a hashed address
 * PA[17:11] = (PA[32:28] xor PA[17:13]) :: (PA[19:18] xor PA[12:11])
 */
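/*
 * In the code below, OFFBIT clears PA[17:11]; BIT28_32 >> 15 aligns
 * PA[32:28] with PA[17:13] and BIT18_19 >> 7 aligns PA[19:18] with
 * PA[12:11], so the two XOR terms regenerate the hashed PA[17:11].
 */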
static void
gmem_to_hashed_addr(uint64_t *addr, uint64_t afar)
{
	*addr = (afar & OFFBIT) | ((afar & BIT28_32) >> 15) ^ (afar & BIT13_17)
	    | ((afar & BIT18_19) >> 7) ^ (afar & BIT11_12);
}
/*
 * check if a dimm has n CEs that have the same symbol-in-error
 */
static int
upos_thresh_check(gmem_dimm_t *dimm, uint16_t upos, uint32_t threshold)
{
	int i;
	gmem_mq_t *ip, *next;
	uint32_t count = 0;

	for (i = 0; i < GMEM_MAX_CKWDS; i++) {
		for (ip = gmem_list_next(&dimm->mq_root[i]); ip != NULL;
		    ip = next) {
			next = gmem_list_next(ip);
			if (ip->mq_unit_position == upos) {
				count++;
				if (count >= threshold)
					return (1);
			}
		}
	}
	return (0);
}
/*
 * check if the smaller number of retired pages is > 1/16 of the
 * larger number of retired pages on two DIMMs
 */
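/*
 * For example, with the 1/16 ratio: 100 retired pages on one DIMM and
 * 8 on the other passes (8 > 100/16 = 6.25), while 100 and 5 does not.
 */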
static int
check_bad_rw_retired_pages(fmd_hdl_t *hdl, gmem_dimm_t *d1, gmem_dimm_t *d2)
{
	uint_t sret, lret;
	double ratio;

	if (d2->dimm_nretired < d1->dimm_nretired) {
		sret = d2->dimm_nretired;
		lret = d1->dimm_nretired;
	} else if (d2->dimm_nretired > d1->dimm_nretired) {
		sret = d1->dimm_nretired;
		lret = d2->dimm_nretired;
	} else {
		return (0);
	}

	ratio = lret * GMEM_MQ_RATIO;

	if (sret > ratio) {
		fmd_hdl_debug(hdl, "sret=%d lret=%d ratio=%.3f",
		    sret, lret, ratio);
		return (1);
	}
	return (0);
}
/*
 * check bad rw on any two DIMMs. The check succeeds if
 * - each DIMM has n CEs which have the same symbol-in-error, and
 * - the smaller number of retired pages is > 1/16 of the larger
 *   number of retired pages
 */
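/*
 * For every unit position recorded against d1, both DIMMs must have
 * accumulated at least gmem.gm_nupos CEs at that position before the
 * retired-page ratio comparison above is applied.
 */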
static int
check_bad_rw_between_dimms(fmd_hdl_t *hdl, gmem_dimm_t *d1, gmem_dimm_t *d2,
    uint16_t *rupos)
{
	int i;
	gmem_mq_t *ip, *next;
	uint16_t upos;

	for (i = 0; i < GMEM_MAX_CKWDS; i++) {
		for (ip = gmem_list_next(&d1->mq_root[i]); ip != NULL;
		    ip = next) {
			next = gmem_list_next(ip);
			upos = ip->mq_unit_position;
			if (upos_thresh_check(d1, upos, gmem.gm_nupos)) {
				if (upos_thresh_check(d2, upos,
				    gmem.gm_nupos)) {
					if (check_bad_rw_retired_pages(hdl,
					    d1, d2)) {
						*rupos = upos;
						return (1);
					}
				}
			}
		}
	}

	return (0);
}
static void
bad_reader_writer_check(fmd_hdl_t *hdl, nvlist_t *det, gmem_dimm_t *ce_dimm)
{
	gmem_dimm_t *d, *next;
	uint16_t upos;

	for (d = gmem_list_next(&gmem.gm_dimms); d != NULL; d = next) {
		next = gmem_list_next(d);
		if (d == ce_dimm)
			continue;
		if (!gmem_same_datapath_dimms(hdl, ce_dimm, d))
			continue;

		if (check_bad_rw_between_dimms(hdl, ce_dimm, d, &upos)) {
			gmem_gen_datapath_fault(hdl, det);
			gmem_save_symbol_error(hdl, ce_dimm, upos);
			fmd_hdl_debug(hdl,
			    "check_bad_rw_dimms succeeded: %s %s\n",
			    ce_dimm->dimm_serial, d->dimm_serial);
			return;
		}
	}
}
/*
 * rule 5a checking. The check succeeds if
 * - the number of retired pages reaches gmem.gm_max_retired_pages, or
 * - nretired >= 128 and (addr_hi - addr_low) / (nretired - 1) > 512KB
 */
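/*
 * For example, 128 retired pages whose hashed addresses span 0x0 through
 * 0x8000000 give a spread of 0x8000000 / 127, roughly 1MB per page,
 * which exceeds 512KB and fires the check.
 */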
static void
ce_thresh_check(fmd_hdl_t *hdl, gmem_dimm_t *dimm)
{
	nvlist_t *flt, *rsrc;
	fmd_case_t *cp;
	uint_t nret;
	uint64_t delta_addr = 0;

	if (dimm->dimm_flags & GMEM_F_FAULTING)
		return;

	nret = dimm->dimm_nretired;

	if (nret < gmem.gm_low_ce_thresh)
		return;

	if (dimm->dimm_phys_addr_hi >= dimm->dimm_phys_addr_low)
		delta_addr =
		    (dimm->dimm_phys_addr_hi - dimm->dimm_phys_addr_low) /
		    (nret - 1);

	if (nret >= gmem.gm_max_retired_pages || delta_addr > GMEM_MQ_512KB) {
		fmd_hdl_debug(hdl, "ce_thresh_check succeeded nret=%d", nret);
		dimm->dimm_flags |= GMEM_F_FAULTING;
		gmem_dimm_dirty(hdl, dimm);

		cp = fmd_case_open(hdl, NULL);
		rsrc = gmem_find_dimm_rsc(hdl, dimm->dimm_serial);
		flt = fmd_nvl_create_fault(hdl, GMEM_FAULT_DIMM_PAGES,
		    GMEM_FLTMAXCONF, NULL, gmem_dimm_fru(dimm), rsrc);
		fmd_case_add_suspect(hdl, cp, flt);
		fmd_case_solve(hdl, cp);
		nvlist_free(rsrc);
	}
}
/*
 * rule 5b checking. The check succeeds if more than 120
 * non-intermittent CEs are reported against one symbol
 * position of one afar in 72 hours
 */
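/*
 * The duplicate-CE count tested below comes from gmem.gm_dupce; entries
 * older than GMEM_MQ_TIMELIM are dropped by mq_prune_dup(), which is
 * presumably what bounds the count to the 72-hour window noted above.
 */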
static void
mq_5b_check(fmd_hdl_t *hdl, gmem_dimm_t *dimm)
{
	nvlist_t *flt, *rsrc;
	fmd_case_t *cp;
	gmem_mq_t *ip, *next;
	int cw;

	for (cw = 0; cw < GMEM_MAX_CKWDS; cw++) {
		for (ip = gmem_list_next(&dimm->mq_root[cw]);
		    ip != NULL; ip = next) {
			next = gmem_list_next(ip);
			if (ip->mq_dupce_count >= gmem.gm_dupce) {
				fmd_hdl_debug(hdl,
				    "mq_5b_check succeeded: duplicate CE=%d",
				    ip->mq_dupce_count);
				cp = fmd_case_open(hdl, NULL);
				rsrc = gmem_find_dimm_rsc(hdl,
				    dimm->dimm_serial);
				flt = fmd_nvl_create_fault(hdl,
				    GMEM_FAULT_DIMM_PAGES, GMEM_FLTMAXCONF,
				    NULL, gmem_dimm_fru(dimm), rsrc);
				dimm->dimm_flags |= GMEM_F_FAULTING;
				gmem_dimm_dirty(hdl, dimm);
				fmd_case_add_suspect(hdl, cp, flt);
				fmd_case_solve(hdl, cp);
				nvlist_free(rsrc);
				return;
			}
		}
	}
}
/*
 * delete the expired duplicate CE time stamps
 */
static void
mq_prune_dup(fmd_hdl_t *hdl, gmem_mq_t *ip, uint64_t now)
{
	tstamp_t *tsp, *next;

	for (tsp = gmem_list_next(&ip->mq_dupce_tstamp); tsp != NULL;
	    tsp = next) {
		next = gmem_list_next(tsp);
		if (tsp->tstamp < now - GMEM_MQ_TIMELIM) {
			gmem_list_delete(&ip->mq_dupce_tstamp, &tsp->ts_l);
			fmd_hdl_free(hdl, tsp, sizeof (tstamp_t));
			ip->mq_dupce_count--;
		}
	}
}
/*
 * update an existing mq_t entry for a duplicate CE
 */
static void
mq_update(fmd_hdl_t *hdl, fmd_event_t *ep, gmem_mq_t *ip, uint64_t now)
{
	tstamp_t *tsp;

	ip->mq_tstamp = now;
	ip->mq_ep = ep;

	if (fmd_serd_exists(hdl, ip->mq_serdnm))
		fmd_serd_destroy(hdl, ip->mq_serdnm);

	fmd_serd_create(hdl, ip->mq_serdnm, GMEM_MQ_SERDN, GMEM_MQ_SERDT);
	(void) fmd_serd_record(hdl, ip->mq_serdnm, ep);

	tsp = fmd_hdl_zalloc(hdl, sizeof (tstamp_t), FMD_SLEEP);
	tsp->tstamp = now;
	gmem_list_append(&ip->mq_dupce_tstamp, tsp);
	ip->mq_dupce_count++;
}
/*
 * Create a fresh index block for MQSC CE correlation.
 */
static gmem_mq_t *
mq_create(fmd_hdl_t *hdl, fmd_event_t *ep,
    uint64_t afar, uint16_t upos, uint16_t ckwd, uint64_t now)
{
	gmem_mq_t *cp;
	tstamp_t *tsp;

	cp = fmd_hdl_zalloc(hdl, sizeof (gmem_mq_t), FMD_SLEEP);
	cp->mq_tstamp = now;
	cp->mq_phys_addr = afar;
	cp->mq_unit_position = upos;
	cp->mq_ep = ep;
	cp->mq_serdnm =
	    gmem_mq_serdnm_create(hdl, "mq", afar, ckwd, upos);

	tsp = fmd_hdl_zalloc(hdl, sizeof (tstamp_t), FMD_SLEEP);
	tsp->tstamp = now;
	gmem_list_append(&cp->mq_dupce_tstamp, tsp);
	cp->mq_dupce_count = 1;

	/*
	 * Create SERD to keep this event from being removed
	 * by fmd which may not know there is an event pointer
	 * saved here. This SERD is *never* meant to fire.
	 */
	if (fmd_serd_exists(hdl, cp->mq_serdnm))
		fmd_serd_destroy(hdl, cp->mq_serdnm);

	fmd_serd_create(hdl, cp->mq_serdnm, GMEM_MQ_SERDN, GMEM_MQ_SERDT);
	(void) fmd_serd_record(hdl, cp->mq_serdnm, ep);

	return (cp);
}
static gmem_mq_t *
mq_destroy(fmd_hdl_t *hdl, gmem_list_t *lp, gmem_mq_t *ip)
{
	gmem_mq_t *jp = gmem_list_next(ip);
	tstamp_t *tsp, *next;

	if (ip->mq_serdnm != NULL) {
		if (fmd_serd_exists(hdl, ip->mq_serdnm))
			fmd_serd_destroy(hdl, ip->mq_serdnm);
		fmd_hdl_strfree(hdl, ip->mq_serdnm);
		ip->mq_serdnm = NULL;
	}

	for (tsp = gmem_list_next(&ip->mq_dupce_tstamp); tsp != NULL;
	    tsp = next) {
		next = gmem_list_next(tsp);
		gmem_list_delete(&ip->mq_dupce_tstamp, &tsp->ts_l);
		fmd_hdl_free(hdl, tsp, sizeof (tstamp_t));
	}

	gmem_list_delete(lp, &ip->mq_l);
	fmd_hdl_free(hdl, ip, sizeof (gmem_mq_t));

	return (jp);
}
/*
 * Add an index block for a new CE, sorted
 * a) by ascending unit position
 * b) order of arrival (~= time order)
 */
static void
mq_add(fmd_hdl_t *hdl, gmem_dimm_t *dimm, fmd_event_t *ep,
    uint64_t afar, uint16_t unit_position, uint16_t ckwd,
    uint64_t now)
{
	gmem_mq_t *ip, *jp;
	int cw = (int)ckwd;

	for (ip = gmem_list_next(&dimm->mq_root[cw]); ip != NULL; ) {
		if (ip->mq_unit_position > unit_position) {
			/* list is in unit position order */
			break;
		} else if (ip->mq_unit_position == unit_position &&
		    ip->mq_phys_addr == afar) {
			/*
			 * Found a duplicate cw, unit_position, and afar:
			 * update the existing mq_t structure (SERD engine
			 * and duplicate-CE timestamps) rather than
			 * superseding it with a new node.
			 */
			mq_update(hdl, ep, ip, now);
			return;
		} else {
			ip = gmem_list_next(ip);
		}
	}

	jp = mq_create(hdl, ep, afar, unit_position, cw, now);
	if (ip == NULL)
		gmem_list_append(&dimm->mq_root[cw], jp);
	else
		gmem_list_insert_before(&dimm->mq_root[cw], ip, jp);
}
/*
 * Prune the MQSC index lists (one for each checkword), by deleting
 * outdated index blocks from each list.
 */
static void
mq_prune(fmd_hdl_t *hdl, gmem_dimm_t *dimm, uint64_t now)
{
	gmem_mq_t *ip;
	int cw;

	for (cw = 0; cw < GMEM_MAX_CKWDS; cw++) {
		for (ip = gmem_list_next(&dimm->mq_root[cw]); ip != NULL; ) {
			if (ip->mq_tstamp < now - GMEM_MQ_TIMELIM) {
				/*
				 * This event has timed out - delete the
				 * mq block as well as serd for the event.
				 */
				ip = mq_destroy(hdl, &dimm->mq_root[cw], ip);
			} else {
				/* tstamp is still within the time limit */
				mq_prune_dup(hdl, ip, now);
				ip = gmem_list_next(ip);
			}
		} /* per checkword */
	}
}
/*
 * Check the MQSC index lists (one for each checkword) by making a
 * complete pass through each list, checking if the criteria for
 * Rule 4A have been met. Rule 4A checking is done for each checkword.
 *
 * Rule 4A: fault a DIMM "whenever Solaris reports two or more CEs from
 * two or more different physical addresses on each of two or more different
 * bit positions from the same DIMM within 72 hours of each other, and all
 * the addresses are in the same relative checkword (that is, the AFARs
 * are all the same modulo 64). [Note: This means at least 4 CEs; two
 * from one bit position, with unique addresses, and two from another,
 * also with unique addresses, and the lower 6 bits of all the addresses
 * are the same.]"
 */
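/*
 * A minimal violation within one checkword: CEs at two different
 * addresses on symbol 5 plus CEs at two different addresses on symbol 9,
 * all within 72 hours and all with the same AFAR modulo 64.
 */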
static void
mq_check(fmd_hdl_t *hdl, gmem_dimm_t *dimm)
{
	int upos_pairs, curr_upos, cw, i, j;
	nvlist_t *flt, *rsc;
	typedef struct upos_pair {
		int upos;
		gmem_mq_t *mq1;
		gmem_mq_t *mq2;
	} upos_pair_t;
	upos_pair_t upos_array[16]; /* max per cw = 2, * 8 cw's */
	gmem_mq_t *ip;

	/*
	 * Each upos_array[] member represents a pair of CEs for the same
	 * unit position (symbol) which is a 4 bit nibble.
	 * MQSC rule 4 requires pairs of CEs from the same symbol (same DIMM
	 * for rule 4A, and same DRAM for rule 4B) for a violation - this
	 * is why CE pairs are tracked.
	 */
	upos_pairs = 0;
	upos_array[0].mq1 = NULL;

	for (cw = 0; cw < GMEM_MAX_CKWDS; cw++) {
		i = upos_pairs;
		curr_upos = -1;

		/*
		 * mq_root[] is an array of cumulative lists of CEs
		 * indexed by checkword where the list is in unit position
		 * order. Loop through checking for duplicate unit position
		 * entries (filled in at mq_create()).
		 * The upos_array[] is filled in each time a duplicate
		 * unit position is found; the first time through the loop
		 * of a unit position sets curr_upos but does not fill in
		 * upos_array[] until the second symbol is found.
		 */
		for (ip = gmem_list_next(&dimm->mq_root[cw]); ip != NULL;
		    ip = gmem_list_next(ip)) {
			if (curr_upos != ip->mq_unit_position) {
				/* Set initial current position */
				curr_upos = ip->mq_unit_position;
			} else if (i > upos_pairs &&
			    curr_upos == upos_array[i-1].upos) {
				/*
				 * Only keep track of CE pairs; skip
				 * triples, quads, etc...
				 */
				continue;
			} else if (upos_array[i].mq1 == NULL) {
				/* Have a pair. Add to upos_array[] */
				fmd_hdl_debug(hdl, "pair:upos=%d",
				    curr_upos);
				upos_array[i].upos = curr_upos;
				upos_array[i].mq1 = gmem_list_prev(ip);
				upos_array[i].mq2 = ip;
				upos_array[++i].mq1 = NULL;
			}
		}

		if (i - upos_pairs >= 2) {
			/* Rule 4A violation */
			rsc = gmem_find_dimm_rsc(hdl, dimm->dimm_serial);
			flt = fmd_nvl_create_fault(hdl, GMEM_FAULT_DIMM_4A,
			    GMEM_FLTMAXCONF, NULL, gmem_dimm_fru(dimm), rsc);
			for (j = upos_pairs; j < i; j++) {
				fmd_case_add_ereport(hdl,
				    dimm->dimm_case.cc_cp,
				    upos_array[j].mq1->mq_ep);
				fmd_case_add_ereport(hdl,
				    dimm->dimm_case.cc_cp,
				    upos_array[j].mq2->mq_ep);
			}
			dimm->dimm_flags |= GMEM_F_FAULTING;
			gmem_dimm_dirty(hdl, dimm);
			fmd_case_add_suspect(hdl, dimm->dimm_case.cc_cp, flt);
			fmd_case_solve(hdl, dimm->dimm_case.cc_cp);
			nvlist_free(rsc);
			return;
		}
		upos_pairs = i;
		assert(upos_pairs < 16);
	}
}
gmem_evdisp_t
gmem_ce(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
{
	uint16_t symbol_pos, cw;
	uint64_t phyaddr, offset, addr;
	uint32_t filter_ratio = 0;
	gmem_dimm_t *dimm;
	gmem_page_t *page;
	nvlist_t *fru = NULL;
	nvlist_t *topo_rsc = NULL;
	nvlist_t *rsrc, *det;
	const char *uuid;
	ce_dispact_t type;
	boolean_t diagnose;
	char *sn;
	int err, rc;
	uint64_t *now;
	uint_t nelem;
	int skip_error = 0;

	err = nvlist_lookup_boolean_value(nvl, GMEM_ERPT_PAYLOAD_DIAGNOSE,
	    &diagnose);
	if (err != 0 || diagnose == 0)
		return (GMEM_EVD_UNUSED);

	if ((nvlist_lookup_uint64(nvl, GMEM_ERPT_PAYLOAD_PHYSADDR,
	    &phyaddr) != 0) ||
	    (nvlist_lookup_uint64(nvl, GMEM_ERPT_PAYLOAD_OFFSET,
	    &offset) != 0)) {
		fmd_hdl_debug(hdl, "Can't get page phyaddr or offset");
		return (GMEM_EVD_BAD);
	}

	fmd_hdl_debug(hdl, "phyaddr %llx offset %llx", phyaddr, offset);

	if ((page = gmem_page_lookup(phyaddr)) != NULL &&
	    page->page_case.cc_cp != NULL &&
	    fmd_case_solved(hdl, page->page_case.cc_cp))
		return (GMEM_EVD_REDUND);

	if (nvlist_lookup_nvlist(nvl, GMEM_ERPT_PAYLOAD_RESOURCE,
	    &rsrc) != 0 ||
	    nvlist_lookup_string(rsrc, FM_FMRI_HC_SERIAL_ID, &sn) != 0) {
		fmd_hdl_debug(hdl, "Can't get dimm serial\n");
		return (GMEM_EVD_BAD);
	}

	fmd_hdl_debug(hdl, "serial %s", sn);

	if (nvlist_lookup_nvlist(nvl, GMEM_ERPT_PAYLOAD_DETECTOR, &det) != 0)
		return (GMEM_EVD_BAD);

	/*
	 * Find dimm fru by serial number.
	 */
	fru = gmem_find_dimm_fru(hdl, sn);
	if (fru == NULL) {
		fmd_hdl_debug(hdl, "Dimm is not present\n");
		return (GMEM_EVD_UNUSED);
	}

	if ((dimm = gmem_dimm_lookup(hdl, fru)) == NULL &&
	    (dimm = gmem_dimm_create(hdl, fru, det)) == NULL) {
		nvlist_free(fru);
		return (GMEM_EVD_UNUSED);
	}

	if (dimm->dimm_case.cc_cp == NULL) {
		dimm->dimm_case.cc_cp = gmem_case_create(hdl,
		    &dimm->dimm_header, GMEM_PTR_DIMM_CASE, &uuid);
	}

	/*
	 * Add to MQSC correlation lists all CEs which pass validity
	 * checks above. If there is no symbol_pos & relative ckword
	 * in the ereport, skip rule 4A checking.
	 */
	err = nvlist_lookup_uint16(nvl, GMEM_ERPT_PAYLOAD_SYMBOLPOS,
	    &symbol_pos);
	err |= nvlist_lookup_uint16(nvl, GMEM_ERPT_PAYLOAD_CKW, &cw);

	if (err == 0) {
		fmd_hdl_debug(hdl, "symbol_pos=%d cw=%d", symbol_pos, cw);

		if (nvlist_lookup_uint64_array(nvl,
		    "__tod", &now, &nelem) == 0) {
			skip_error = gmem_check_symbol_error(hdl, dimm,
			    symbol_pos);

			if (!skip_error ||
			    !(dimm->dimm_flags & GMEM_F_FAULTING))
				mq_add(hdl, dimm, ep, phyaddr, symbol_pos,
				    cw, *now);

			mq_prune(hdl, dimm, *now);

			if (!skip_error)
				bad_reader_writer_check(hdl, det, dimm);
			if (!(dimm->dimm_flags & GMEM_F_FAULTING)) {
				mq_check(hdl, dimm);
				mq_5b_check(hdl, dimm);
			}
		}
	}

	type = gmem_mem_name2type(strstr(class, "mem"));

	switch (type) {
	case CE_DISP_UNKNOWN:
		GMEM_STAT_BUMP(ce_unknown);
		nvlist_free(fru);
		return (GMEM_EVD_UNUSED);
	case CE_DISP_INTERMITTENT:
		GMEM_STAT_BUMP(ce_interm);
		nvlist_free(fru);
		return (GMEM_EVD_UNUSED);
	case CE_DISP_PERS:
		GMEM_STAT_BUMP(ce_clearable_persis);
		break;
	case CE_DISP_STICKY:
		GMEM_STAT_BUMP(ce_sticky);
		break;
	default:
		nvlist_free(fru);
		return (GMEM_EVD_BAD);
	}

	if (gmem_check_symbol_error(hdl, dimm, symbol_pos)) {
		nvlist_free(fru);
		return (GMEM_EVD_REDUND);
	}

	if (page == NULL) {
		page = gmem_page_create(hdl, fru, phyaddr, offset);
		if (page == NULL) {
			nvlist_free(fru);
			return (GMEM_EVD_UNUSED);
		}
	}

	nvlist_free(fru);

	if (page->page_case.cc_cp == NULL) {
		page->page_case.cc_cp = gmem_case_create(hdl,
		    &page->page_header, GMEM_PTR_PAGE_CASE, &uuid);
	}

	switch (type) {
	case CE_DISP_PERS:
		fmd_hdl_debug(hdl, "adding persistent event to CE serd");
		if (page->page_case.cc_serdnm == NULL)
			gmem_page_serd_create(hdl, page, nvl);

		filter_ratio = gmem_get_serd_filter_ratio(nvl);

		fmd_hdl_debug(hdl, "filter_ratio %d\n", filter_ratio);

		if (gmem_serd_record(hdl, page->page_case.cc_serdnm,
		    filter_ratio, ep) == FMD_B_FALSE) {
			return (GMEM_EVD_OK); /* engine hasn't fired */
		}

		fmd_hdl_debug(hdl, "ce page serd fired\n");
		fmd_case_add_serd(hdl, page->page_case.cc_cp,
		    page->page_case.cc_serdnm);
		fmd_serd_reset(hdl, page->page_case.cc_serdnm);
		break;	/* to retire */

	case CE_DISP_STICKY:
		fmd_case_add_ereport(hdl, page->page_case.cc_cp, ep);
		break;	/* to retire */
	}

	topo_rsc = gmem_find_dimm_rsc(hdl, dimm->dimm_serial);
	rc = gmem_page_fault(hdl, gmem_dimm_fru(dimm), topo_rsc,
	    ep, phyaddr, offset);

	if (rc) {
		gmem_to_hashed_addr(&addr, phyaddr);

		if (addr > dimm->dimm_phys_addr_hi)
			dimm->dimm_phys_addr_hi = addr;
		if (addr < dimm->dimm_phys_addr_low)
			dimm->dimm_phys_addr_low = addr;

		dimm->dimm_nretired++;
		dimm->dimm_retstat.fmds_value.ui64++;
		gmem_dimm_dirty(hdl, dimm);
		ce_thresh_check(hdl, dimm);
	}

	return (GMEM_EVD_OK);
}
void
gmem_dimm_close(fmd_hdl_t *hdl, void *arg)
{
	gmem_dimm_destroy(hdl, arg);
}