4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
#pragma dictionary "AMD"

/*
 * Eversholt rules for the AMD Opteron CPU/Memory
 */

/* Clamp helpers, used below when bounding per-rank page-retire thresholds */
#define MAX(x, y) ((x) >= (y) ? (x) : (y))
#define MIN(x, y) ((x) <= (y) ? (x) : (y))
/*
 * SET_ADDR and SET_OFFSET are used to set a payload value in the fault that
 * we diagnose for page faults, to record the physical address of the faulting
 * page.
 */
#define SET_ADDR (setpayloadprop("asru-physaddr", payloadprop("IA32_MCi_ADDR")))

#define SET_OFFSET (setpayloadprop("asru-offset", \
    payloadprop("resource[0].hc-specific.offset")))

/*
 * RESOURCE_EXISTS is true if a member with name "resource" exists in the
 * payload - regardless of type (e.g., nvlist or nvlist array) or value.
 */
#define RESOURCE_EXISTS (payloadprop_defined("resource"))
/*
 * CONTAINS_RANK is true if the "resource" nvlist array (as used in memory
 * ereports) exists and one of its members matches the path for the
 * rank node. Our memory propagations are of the form
 *
 * "prop foo@chip/memory-controller/dimm/rank -> blah@chip/core/strand"
 *
 * since cpus detect memory errors; in eversholt such a propagation, where
 * the lhs path and rhs path do not match, expands to the cross-product of
 * all dimms, ranks and cpus on the same chip (since chip appears in the
 * path on both sides). We use CONTAINS_RANK to constrain the propagation
 * such that it only happens if the payload resource matches the rank.
 * We also accept a dimm-level resource match as a fallback.
 */
#define CONTAINS_RANK (payloadprop_contains("resource", \
    asru(chip/memory-controller/dimm/rank)) \
    || payloadprop_contains("resource", \
    asru(chip/memory-controller/dimm)))
/*
 * The following will tell us whether a syndrome that is known to be
 * correctable (from a mem_ce ereport) is single-bit or multi-bit. For a
 * correctable ChipKill syndrome the number of bits set in the lowest
 * nibble indicates how many bits were in error.
 */

/* Low nibble of a correctable ChipKill syndrome counts the bits in error */
#define CBITMASK(synd) ((synd) & 0xf)

/*
 * True when a correctable ChipKill syndrome represents a single-bit error:
 * either no error at all (syndrome 0) or exactly one bit set in the low
 * nibble. (The original text was missing the "((synd) == 0 ||" line,
 * leaving the parentheses unbalanced.)
 */
#define CKSINGLE(synd) \
    ((synd) == 0 || \
    (CBITMASK(synd) == 0x1 || CBITMASK(synd) == 0x2 || \
    CBITMASK(synd) == 0x4 || CBITMASK(synd) == 0x8))

/* "E" is the normal 64/8 ECC syndrome type; "C4" is the ChipKill type */
#define SINGLE_BIT_CE \
    (payloadprop("syndrome-type") == "E" || \
    (payloadprop("syndrome-type") == "C4" && \
    CKSINGLE(payloadprop("syndrome"))))

#define MULTI_BIT_CE \
    (payloadprop("syndrome-type") == "C4" && \
    !CKSINGLE(payloadprop("syndrome")))
/*
 * A single bit fault in a memory rank can cause:
 *
 *  - mem_ce : reported by nb
 *  - inf_sys_ecc1: reported by ic or dc; inf_sys_ecc1 errors detected at the
 *    ic do not record a syndrome; these errors will not be triggered in
 *    ChipKill ECC mode (the NB corrects all ECC errors in that mode)
 *  - s_ecc1: reported by bu; this error will not be triggered in ChipKill
 *    ECC mode (the NB corrects all ECC in that mode)
 *
 * Single-bit errors are fed into a per-rank SERD engine; if a SERD engine
 * trips we diagnose a fault.memory.page so that the response agent can
 * retire the page that caused the trip. If the total number of pages
 * faulted in this way on a single rank exceeds a threshold we will
 * diagnose a fault.memory.dimm_sb against the containing dimm.
 *
 * Multibit ChipKill-correctable errors are treated identically to
 * single-bit errors, but via separate serd engines to allow distinct
 * parameters if desired.
 *
 * Uncorrectable errors produce an immediate page fault and corresponding
 * fault.memory.dimm_ue.
 *
 * Page faults are essentially internal - action is only required when
 * they are accompanied by a dimm fault. As such we include message=0
 * on the page fault events.
 */
event ereport.cpu.amd.ic.inf_sys_ecc1@chip/core/strand{within(5s)};
event ereport.cpu.amd.dc.inf_sys_ecc1@chip/core/strand{within(5s)};
event ereport.cpu.amd.bu.s_ecc1@chip/core/strand{within(5s)};
event ereport.cpu.amd.nb.mem_ce@chip/core/strand{within(5s)};
/*
 * Single-bit correctable errors feed into per-rank
 * SERD engines which diagnose fault.memory.page_sb if they trip.
 *
 * Multi-bit correctable (via ChipKill) errors feed
 * into additional per-rank SERD engines which diagnose fault.memory.page_ck
 * if they trip.
 *
 * The number of fault.memory.page and fault.memory.page_ck diagnosed is
 * counted in stat engines for each type. These are used in deciding
 * whether to declare a dimm faulty after repeated page faults.
 */
#define PAGE_SB_COUNT 2
#define PAGE_SB_TIME 72h
#define PAGE_CK_COUNT 2
#define PAGE_CK_TIME 72h

/* Stat engines: running count of pages faulted on each rank, per CE type */
engine stat.sbpgflt@chip/memory-controller/dimm/rank;
engine stat.ckpgflt@chip/memory-controller/dimm/rank;
/* Per-rank SERD engines driving the page-level diagnoses */
engine serd.memory.page_sb@chip/memory-controller/dimm/rank,
    N=PAGE_SB_COUNT, T=PAGE_SB_TIME;
engine serd.memory.page_ck@chip/memory-controller/dimm/rank,
    N=PAGE_CK_COUNT, T=PAGE_CK_TIME;
/* Parallel engines (same N/T) used for the dimm-level diagnoses */
engine serd.memory.dimm_sb@chip/memory-controller/dimm/rank,
    N=PAGE_SB_COUNT, T=PAGE_SB_TIME;
engine serd.memory.dimm_ck@chip/memory-controller/dimm/rank,
    N=PAGE_CK_COUNT, T=PAGE_CK_TIME;
/* Page faults are internal: message=0, response=0; counted in the stats */
event fault.memory.page_sb@chip/memory-controller/dimm/rank, message=0,
    count=stat.sbpgflt@chip/memory-controller/dimm/rank, response=0,
    engine=serd.memory.page_sb@chip/memory-controller/dimm/rank;
event fault.memory.page_ck@chip/memory-controller/dimm/rank, message=0,
    count=stat.ckpgflt@chip/memory-controller/dimm/rank, response=0,
    engine=serd.memory.page_ck@chip/memory-controller/dimm/rank;
event fault.memory.dimm_sb@chip/memory-controller/dimm/rank,
    engine=serd.memory.dimm_sb@chip/memory-controller/dimm/rank;
event fault.memory.dimm_ck@chip/memory-controller/dimm/rank,
    engine=serd.memory.dimm_ck@chip/memory-controller/dimm/rank;
167 * The fraction of pages on a single rank that must be diagnosed as faulty
168 * with single correctable unit faults before we will fault the rank.
169 * Once we have faulted the rank we will continue to diagnose any further page
170 * faults on the rank up to some maximum multiple of the threshold at which
171 * we faulted the dimm. This allows us to potentially contain some fairly
172 * far-reaching but still limited-extent fault (such as a partial column
173 * failure) without getting carried away and allowing a single faulty rank to
 * use up the entire system-imposed page retirement limit (which, once
175 * reached, causes retirement request to have no effect other than to fill
176 * the fault manager cache and logs).
178 * This fraction is specified in basis points, where 100 basis points are
179 * equivalent to 1 percent. It is applied on a per-rank basis.
181 * The system imposes an absolute maximum on the number of pages it will
182 * retire; the current value is 10 basis points, or 0.1% of 'physmem'. Note
183 * that 'physmem' is reduced from installed memory pages by an amount
184 * reflecting permanent kernel memory allocations. This system page retire
185 * limit bounds the maximum real response to page faults across all ranks
186 * that fault manager response agents can effect, but it should not be confused
187 * with any diagnosis threshold (i.e., the number of faulty pages we are
188 * prepared to tolerate from a single rank before faulting the rank is
189 * distinct from the total number of pages we are prepared to retire from use
190 * in response to that and other faults). It is, however, desirable to
191 * arrange that the maximum number of pages we are prepared to fault from
192 * any one rank is less than the system-wide quota.
194 #define PAGE_RETIRE_LIMIT_BPS 5 /* or 0.05%; ~ 131 pages/GB %/
/*
 * A macro to manipulate the above fraction. Given a size in bytes convert
 * this to pages (4K pagesize) and calculate the number of those pages
 * indicated by PAGE_RETIRE_LIMIT_BPS basis points.
 */
#define _BPS_PGCNT(totalbytes) \
    ((((totalbytes) / 4096 ) * PAGE_RETIRE_LIMIT_BPS) / 10000)
/*
 * The single-correctable-unit threshold at which number of faulted pages
 * on a rank we fault the rank. We insist that this be at least 128 and
 * never more than 512.
 */
#define RANK_THRESH MIN(512, MAX(128, \
    _BPS_PGCNT(confprop(chip/memory-controller/dimm/rank, "size"))))
/*
 * The maximum number of single-correctable-unit page faults we will diagnose
 * on a single rank (must be greater than RANK_THRESH). We set
 * this at twice the rank fault threshold.
 */
#define RANK_PGFLT_MAX (2 * RANK_THRESH)

/* Current per-rank page-fault totals, read back from the stat engines */
#define SB_PGFLTS (count(stat.sbpgflt@chip/memory-controller/dimm/rank))
#define CK_PGFLTS (count(stat.ckpgflt@chip/memory-controller/dimm/rank))
/*
 * "Single-correctable-unit" DIMM faults are diagnosed when the total number of
 * page faults (diagnosed from repeated single-bit or multibit-chipkills)
 * from any one rank on that DIMM reaches a threshold. A "correctable unit"
 * is a single bit in normal 64/8 ECC mode, or a single symbol in ChipKill
 * 128/16 mode (i.e., nibble-aligned nibble for the code used on Opteron).
 *
 * We do not stop diagnosing further single-bit page faults once we have
 * declared a single-bit DIMM fault - we continue diagnosing them and
 * response agents can continue to retire those pages up to the system-imposed
 * retirement limit.
 *
 * Two distinct fault types may be diagnosed - fault.memory.dimm_sb and
 * fault.memory.dimm_ck. Which one is diagnosed depends on whether we
 * have reached the threshold for a majority of single-bit page faults or
 * multibit page faults.
 *
 * Implementation: we maintain parallel SERD engines to the page_sb and
 * page_ck engines, which trip in unison. On trip it generates a distinct
 * ereport which we diagnose to a fault if the threshold has been reached.
 */

/*
 * Diagnose a single-bit page fault when the payload resource matches this
 * rank, the CE is single-bit, and the rank is still under its page-fault
 * cap; SET_ADDR/SET_OFFSET record the faulting page in the fault payload.
 */
prop fault.memory.page_sb@chip/memory-controller/dimm/rank
    { CONTAINS_RANK && SINGLE_BIT_CE &&
    SB_PGFLTS + CK_PGFLTS < RANK_PGFLT_MAX && SET_ADDR && SET_OFFSET } (1)->
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/core<>/strand<>,
    ereport.cpu.amd.bu.s_ecc1@chip/core<>/strand<>,
    ereport.cpu.amd.nb.mem_ce@chip/core<>/strand<>;
/*
 * As for page_sb above, but for ChipKill multi-bit correctables
 * (!SINGLE_BIT_CE) feeding the page_ck SERD engine.
 */
prop fault.memory.page_ck@chip/memory-controller/dimm/rank
    { CONTAINS_RANK && !SINGLE_BIT_CE &&
    SB_PGFLTS + CK_PGFLTS < RANK_PGFLT_MAX && SET_ADDR && SET_OFFSET } (1)->
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/core<>/strand<>,
    ereport.cpu.amd.bu.s_ecc1@chip/core<>/strand<>,
    ereport.cpu.amd.nb.mem_ce@chip/core<>/strand<>;
/*
 * Fault the dimm (single-bit flavor) once total page faults on the rank
 * exceed RANK_THRESH and single-bit faults form the majority.
 */
prop fault.memory.dimm_sb@chip/memory-controller/dimm/rank
    { CONTAINS_RANK && SB_PGFLTS + CK_PGFLTS > RANK_THRESH &&
    SB_PGFLTS > RANK_THRESH / 2 } (1)->
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/core<>/strand<>,
    ereport.cpu.amd.bu.s_ecc1@chip/core<>/strand<>,
    ereport.cpu.amd.nb.mem_ce@chip/core<>/strand<>;

/*
 * Fault the dimm (ChipKill flavor) once the same total threshold is passed
 * with a majority of multibit page faults. Only nb.mem_ce appears here:
 * per the comment above, in ChipKill mode the NB corrects (and thus
 * reports) all ECC errors.
 */
prop fault.memory.dimm_ck@chip/memory-controller/dimm/rank
    { CONTAINS_RANK && SB_PGFLTS + CK_PGFLTS > RANK_THRESH &&
    CK_PGFLTS > RANK_THRESH / 2 } (1)->
    ereport.cpu.amd.nb.mem_ce@chip/core<>/strand<>;
/*
 * If the address is not valid then no resource member will be included
 * in a nb.mem_ce or nb.mem_ue ereport. These cases should be rare.
 * We will also discard all inf_sys_ecc1 events detected at the ic since they
 * have no syndrome and therefore no resource information.
 * We will discard such ereports. An alternative may be to SERD them
 * on a per MC basis and trip if we see too many such events.
 */
event upset.memory.discard1@chip/core/strand;
/* Null-diagnose correctable ereports that carry no resource payload */
prop upset.memory.discard1@chip/core/strand
    { !RESOURCE_EXISTS } (1)->
    ereport.cpu.amd.ic.inf_sys_ecc1@chip/core/strand,
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/core/strand,
    ereport.cpu.amd.bu.s_ecc1@chip/core/strand,
    ereport.cpu.amd.nb.mem_ce@chip/core/strand;
/*
 * An uncorrectable multi-bit fault in a memory dimm can cause:
 *
 *  - mem_ue : reported by nb for an access from a remote cpu
 *  - inf_sys_eccm : reported by ic or dc; the ic does not report a syndrome
 *  - s_eccm : reported by bu
 *
 * Since on production systems we force HT Sync Flood on uncorrectable
 * memory errors (if not already set as such by the BIOS, as it should be)
 * we won't actually receive these ereports since the system will be reset.
 */
event ereport.cpu.amd.ic.inf_sys_eccm@chip/core/strand{within(5s)};
event ereport.cpu.amd.dc.inf_sys_eccm@chip/core/strand{within(5s)};
event ereport.cpu.amd.bu.s_eccm@chip/core/strand{within(5s)};
event ereport.cpu.amd.nb.mem_ue@chip/core/strand{within(5s)};

event fault.memory.dimm_ue@chip/memory-controller/dimm/rank;
/*
 * The page_ue declaration ended in a dangling comma with no terminating
 * attribute; restore the response=0 attribute and terminator. Like the
 * other page faults it is internal (message=0).
 */
event fault.memory.page_ue@chip/memory-controller/dimm/rank, message=0,
    response=0;
/* Fault the dimm immediately for any UE whose resource matches this rank */
prop fault.memory.dimm_ue@chip/memory-controller/dimm/rank
    { CONTAINS_RANK } (1)->
    ereport.cpu.amd.ic.inf_sys_eccm@chip/core<>/strand<>,
    ereport.cpu.amd.dc.inf_sys_eccm@chip/core<>/strand<>,
    ereport.cpu.amd.bu.s_eccm@chip/core<>/strand<>,
    ereport.cpu.amd.nb.mem_ue@chip/core<>/strand<>;

/* Also fault the page, recording its address/offset in the fault payload */
prop fault.memory.page_ue@chip/memory-controller/dimm/rank
    { CONTAINS_RANK && SET_ADDR && SET_OFFSET } (1)->
    ereport.cpu.amd.ic.inf_sys_eccm@chip/core<>/strand<>,
    ereport.cpu.amd.dc.inf_sys_eccm@chip/core<>/strand<>,
    ereport.cpu.amd.bu.s_eccm@chip/core<>/strand<>,
    ereport.cpu.amd.nb.mem_ue@chip/core<>/strand<>;
/* As discard1 above: null-diagnose UE ereports lacking a resource payload */
event upset.memory.discard3@chip/core/strand;
prop upset.memory.discard3@chip/core/strand
    { !RESOURCE_EXISTS } (1)->
    ereport.cpu.amd.ic.inf_sys_eccm@chip/core/strand,
    ereport.cpu.amd.dc.inf_sys_eccm@chip/core/strand,
    ereport.cpu.amd.bu.s_eccm@chip/core/strand,
    ereport.cpu.amd.nb.mem_ue@chip/core/strand;
/*
 * If the BIOS fails a chip-select during POST, or perhaps after a
 * sync flood from an uncorrectable error, then on revision F and G it
 * should mark that chip-select as TestFail in the CS Base register.
 * When the memory-controller driver discovers all the MC configuration
 * it notes such failed chip-selects and creates topology nodes for the
 * chip-select and associated dimms and ranks, and produces an ereport for each
 * failed chip-select with detector set to the memory-controller node
 * and resource indicating the failed chip-select.
 */
event ereport.cpu.amd.mc.cs_testfail@chip/memory-controller{within(5s)};
event fault.memory.dimm_testfail@chip/memory-controller/dimm/rank;
event error.memory.cs_testfail@chip/memory-controller/chip-select;

/* True if the ereport payload resource matches this chip-select node */
#define CONTAINS_CS (payloadprop_contains("resource", \
    asru(chip/memory-controller/chip-select)))
/*
 * Propagate the MC-detected ereport to the chip-select error when the
 * payload resource matches. The original text was missing the
 * "{ CONTAINS_CS };" constraint/terminator, leaving the prop unterminated.
 */
prop error.memory.cs_testfail@chip/memory-controller/chip-select (1)->
    ereport.cpu.amd.mc.cs_testfail@chip/memory-controller
    { CONTAINS_CS };

/*
 * True if confprop s of the chip-select names the same logical chip-select
 * as this rank's "csname". The "#define CSMATCH(s)" header line was
 * missing before this macro body (CSMATCH is used just below); restored.
 */
#define CSMATCH(s) \
    (confprop_defined(chip/memory-controller/chip-select, s) && \
    confprop(chip/memory-controller/chip-select, s) == \
    confprop(chip/memory-controller/dimm/rank, "csname"))

/* Fault both dimms attached to a failed chip-select */
prop fault.memory.dimm_testfail@chip/memory-controller/dimm/rank (0)->
    error.memory.cs_testfail@chip/memory-controller/chip-select
    { CSMATCH("dimm1-csname") || CSMATCH("dimm2-csname")};
/*
 * DRAM Command/Address Parity Errors.
 *
 *  - dramaddr_par : reported by the nb; the NB status register includes
 *    a bit indicating which dram controller channel (A or B) experienced
 *    the error.
 */
event ereport.cpu.amd.nb.dramaddr_par@chip/core/strand{within(5s)};
event fault.cpu.amd.dramchannel@chip/memory-controller/dram-channel, response=0;

/*
 * The channel bit lives in the upper half of IA32_MCi_STATUS (0x200 after
 * the >> 32 shift, i.e., bit 41); it selects dram-channel instance 1 when
 * set, 0 when clear, and we constrain the propagation to the matching
 * channel node y.
 */
prop fault.cpu.amd.dramchannel@chip/memory-controller/dram-channel[y] (0)->
    ereport.cpu.amd.nb.dramaddr_par@chip/core/strand {
    ((payloadprop("IA32_MCi_STATUS") >> 32 & 0x200) ? 1 : 0) == y };
/*
 * A single bit data array fault in an l2 cache can cause:
 *
 *  - inf_l2_ecc1 : reported by ic on this cpu
 *  - inf_l2_ecc1 : reported by dc on this cpu
 *  - l2d_ecc1 : reported by bu on copyback or on snoop from another cpu
 */
#define L2CACHEDATA_SB_COUNT 3
#define L2CACHEDATA_SB_TIME 12h

event ereport.cpu.amd.ic.inf_l2_ecc1@chip/core/strand{within(5s)};
event ereport.cpu.amd.dc.inf_l2_ecc1@chip/core/strand{within(5s)};
event ereport.cpu.amd.bu.l2d_ecc1@chip/core/strand{within(5s)};
engine serd.cpu.amd.l2d_sb@chip/core/strand,
    N=L2CACHEDATA_SB_COUNT, T=L2CACHEDATA_SB_TIME;
event fault.cpu.amd.l2cachedata@chip/core/strand, engine=serd.cpu.amd.l2d_sb@chip/core/strand;

/* Single-bit events feed the SERD engine; fault only when it trips */
prop fault.cpu.amd.l2cachedata@chip/core/strand (0)->
    ereport.cpu.amd.ic.inf_l2_ecc1@chip/core/strand,
    ereport.cpu.amd.dc.inf_l2_ecc1@chip/core/strand,
    ereport.cpu.amd.bu.l2d_ecc1@chip/core/strand;
/*
 * A multi-bit data array fault in an l2 cache can cause:
 *
 *  - inf_l2_eccm : reported by ic on this cpu
 *  - inf_l2_eccm : reported by dc on this cpu
 *  - l2d_eccm : reported by bu on copyback or on snoop from another cpu
 */
event ereport.cpu.amd.ic.inf_l2_eccm@chip/core/strand{within(5s)};
event ereport.cpu.amd.dc.inf_l2_eccm@chip/core/strand{within(5s)};
event ereport.cpu.amd.bu.l2d_eccm@chip/core/strand{within(5s)};

/*
 * Multi-bit events bump the same l2d_sb SERD engine past its threshold
 * (N + 1) so a single occurrence trips the l2cachedata diagnosis at once.
 */
prop fault.cpu.amd.l2cachedata@chip/core/strand
    { setserdincrement(L2CACHEDATA_SB_COUNT + 1) } (0)->
    ereport.cpu.amd.ic.inf_l2_eccm@chip/core/strand,
    ereport.cpu.amd.dc.inf_l2_eccm@chip/core/strand,
    ereport.cpu.amd.bu.l2d_eccm@chip/core/strand;
/*
 * A single bit tag array fault in an l2 cache can cause:
 *
 *  - l2t_ecc1 : reported by bu on this cpu when detected during snoop
 *  - l2t_par : reported by bu on this cpu when detected other than during snoop
 */
#define L2CACHETAG_SB_COUNT 3
#define L2CACHETAG_SB_TIME 12h

event ereport.cpu.amd.bu.l2t_ecc1@chip/core/strand{within(5s)};
event ereport.cpu.amd.bu.l2t_par@chip/core/strand{within(5s)};
engine serd.cpu.amd.l2t_sb@chip/core/strand,
    N=L2CACHETAG_SB_COUNT, T=L2CACHETAG_SB_TIME;
event fault.cpu.amd.l2cachetag@chip/core/strand, engine=serd.cpu.amd.l2t_sb@chip/core/strand;

/*
 * Only l2t_ecc1 feeds the SERD engine here; l2t_par (parity, so single- vs
 * multi-bit indistinguishable) is handled by the immediate-trip prop below.
 */
prop fault.cpu.amd.l2cachetag@chip/core/strand (0)->
    ereport.cpu.amd.bu.l2t_ecc1@chip/core/strand;
/*
 * A multi-bit tag array fault in an l2 cache can cause:
 *
 *  - l2t_eccm : reported by bu on this cpu when detected during snoop
 *  - l2t_par : reported by bu on this cpu when detected other than during snoop
 */
event ereport.cpu.amd.bu.l2t_eccm@chip/core/strand{within(5s)};

/* Increment past the SERD threshold so one event trips the diagnosis */
prop fault.cpu.amd.l2cachetag@chip/core/strand
    { setserdincrement(L2CACHETAG_SB_COUNT + 1) } (0)->
    ereport.cpu.amd.bu.l2t_eccm@chip/core/strand,
    ereport.cpu.amd.bu.l2t_par@chip/core/strand;
/*
 * A data array parity fault in an I cache can cause:
 *
 *  - data_par : reported by ic on this cpu
 */
#define ICACHEDATA_SB_COUNT 2
#define ICACHEDATA_SB_TIME 168h

event ereport.cpu.amd.ic.data_par@chip/core/strand{within(5s)};
engine serd.cpu.amd.icachedata@chip/core/strand,
    N=ICACHEDATA_SB_COUNT, T=ICACHEDATA_SB_TIME;
event fault.cpu.amd.icachedata@chip/core/strand,
    engine=serd.cpu.amd.icachedata@chip/core/strand;

/* SERD-gated: fault only after repeated parity ereports within the window */
prop fault.cpu.amd.icachedata@chip/core/strand (0)->
    ereport.cpu.amd.ic.data_par@chip/core/strand;
/*
 * A tag array parity fault in an I cache can cause:
 *
 *  - tag_par : reported by ic on this cpu
 */
#define ICACHETAG_SB_COUNT 2
#define ICACHETAG_SB_TIME 168h

event ereport.cpu.amd.ic.tag_par@chip/core/strand{within(5s)};
engine serd.cpu.amd.icachetag@chip/core/strand,
    N=ICACHETAG_SB_COUNT, T=ICACHETAG_SB_TIME;
event fault.cpu.amd.icachetag@chip/core/strand, engine=serd.cpu.amd.icachetag@chip/core/strand;

/* SERD-gated: fault only after repeated parity ereports within the window */
prop fault.cpu.amd.icachetag@chip/core/strand (0)->
    ereport.cpu.amd.ic.tag_par@chip/core/strand;
/*
 * A snoop tag array parity fault in an I cache can cause:
 *
 *  - stag_par : reported by ic on this cpu
 */
event ereport.cpu.amd.ic.stag_par@chip/core/strand{within(5s)};
event fault.cpu.amd.icachestag@chip/core/strand;

/* No SERD engine: a single snoop-tag parity error faults immediately */
prop fault.cpu.amd.icachestag@chip/core/strand (1)->
    ereport.cpu.amd.ic.stag_par@chip/core/strand;
/*
 * An l1tlb parity fault in an I cache can cause:
 *
 *  - l1tlb_par : reported by ic on this cpu
 */
#define ICACHEL1TLB_SB_COUNT 2
#define ICACHEL1TLB_SB_TIME 168h

event ereport.cpu.amd.ic.l1tlb_par@chip/core/strand{within(5s)};
engine serd.cpu.amd.l1itlb@chip/core/strand,
    N=ICACHEL1TLB_SB_COUNT, T=ICACHEL1TLB_SB_TIME;
event fault.cpu.amd.l1itlb@chip/core/strand, engine=serd.cpu.amd.l1itlb@chip/core/strand;

/* SERD-gated: fault only after repeated parity ereports within the window */
prop fault.cpu.amd.l1itlb@chip/core/strand (0)->
    ereport.cpu.amd.ic.l1tlb_par@chip/core/strand;
/*
 * An l2tlb parity fault in an I cache can cause:
 *
 *  - l2tlb_par : reported by ic on this cpu
 */
#define ICACHEL2TLB_SB_COUNT 2
#define ICACHEL2TLB_SB_TIME 168h

event ereport.cpu.amd.ic.l2tlb_par@chip/core/strand{within(5s)};
engine serd.cpu.amd.l2itlb@chip/core/strand,
    N=ICACHEL2TLB_SB_COUNT, T=ICACHEL2TLB_SB_TIME;
event fault.cpu.amd.l2itlb@chip/core/strand, engine=serd.cpu.amd.l2itlb@chip/core/strand;

/* SERD-gated: fault only after repeated parity ereports within the window */
prop fault.cpu.amd.l2itlb@chip/core/strand (0)->
    ereport.cpu.amd.ic.l2tlb_par@chip/core/strand;
/*
 * A single bit data array fault in an D cache can cause:
 *
 *  - data_ecc1 : reported by dc on this cpu by scrubber
 *  - data_ecc1_uc : reported by dc on this cpu other than by scrubber
 *
 * Make data_ecc1_uc fault immediately as it may have caused a panic, so
 * it is handled by the multi-bit case in the following section.
 */
#define DCACHEDATA_SB_COUNT 2
#define DCACHEDATA_SB_TIME 168h

event ereport.cpu.amd.dc.data_ecc1@chip/core/strand{within(5s)};
event ereport.cpu.amd.dc.data_ecc1_uc@chip/core/strand{within(5s)};
engine serd.cpu.amd.dc_sb@chip/core/strand,
    N=DCACHEDATA_SB_COUNT, T=DCACHEDATA_SB_TIME;
event fault.cpu.amd.dcachedata@chip/core/strand, engine=serd.cpu.amd.dc_sb@chip/core/strand;

/* Scrubber-detected single-bit errors feed the SERD engine */
prop fault.cpu.amd.dcachedata@chip/core/strand (0)->
    ereport.cpu.amd.dc.data_ecc1@chip/core/strand;
/*
 * A multi-bit data array fault in an D cache can cause:
 *
 *  - data_eccm : reported by dc on this cpu
 */
event ereport.cpu.amd.dc.data_eccm@chip/core/strand{within(5s)};

/*
 * Bump the dcachedata SERD engine past its own threshold so a single
 * multi-bit (or non-scrubber-detected) event trips the fault immediately.
 * Fixed a copy-paste bug: the increment used L2CACHETAG_SB_COUNT, but the
 * dc_sb engine above is sized with N=DCACHEDATA_SB_COUNT.
 */
prop fault.cpu.amd.dcachedata@chip/core/strand
    { setserdincrement(DCACHEDATA_SB_COUNT + 1) } (0)->
    ereport.cpu.amd.dc.data_eccm@chip/core/strand,
    ereport.cpu.amd.dc.data_ecc1_uc@chip/core/strand;
/*
 * A tag array parity fault in an D cache can cause:
 *
 *  - tag_par : reported by dc on this cpu
 */
event ereport.cpu.amd.dc.tag_par@chip/core/strand{within(5s)};
event fault.cpu.amd.dcachetag@chip/core/strand;

/* No SERD engine: a single tag parity error faults immediately */
prop fault.cpu.amd.dcachetag@chip/core/strand (1)->
    ereport.cpu.amd.dc.tag_par@chip/core/strand;
/*
 * A snoop tag array parity fault in an D cache can cause:
 *
 *  - stag_par : reported by dc on this cpu
 */
event ereport.cpu.amd.dc.stag_par@chip/core/strand{within(5s)};
event fault.cpu.amd.dcachestag@chip/core/strand;

/* No SERD engine: a single snoop-tag parity error faults immediately */
prop fault.cpu.amd.dcachestag@chip/core/strand (1)->
    ereport.cpu.amd.dc.stag_par@chip/core/strand;
/*
 * An l1tlb parity fault in an D cache can cause:
 *
 *  - l1tlb_par : reported by dc on this cpu
 */
event ereport.cpu.amd.dc.l1tlb_par@chip/core/strand{within(5s)};
event fault.cpu.amd.l1dtlb@chip/core/strand;

/* No SERD engine (unlike the I-side l1itlb): fault on first occurrence */
prop fault.cpu.amd.l1dtlb@chip/core/strand (1)->
    ereport.cpu.amd.dc.l1tlb_par@chip/core/strand;

/*
 * An l2tlb parity fault in an D cache can cause:
 *
 *  - l2tlb_par : reported by dc on this cpu
 */
event ereport.cpu.amd.dc.l2tlb_par@chip/core/strand{within(5s)};
event fault.cpu.amd.l2dtlb@chip/core/strand;

/* No SERD engine: fault on first occurrence */
prop fault.cpu.amd.l2dtlb@chip/core/strand (1)->
    ereport.cpu.amd.dc.l2tlb_par@chip/core/strand;
/*
 * Ereports that should not normally happen and which we will discard
 * without diagnosis if they do. These fall into a few categories:
 *
 *  - the corresponding detector is not enabled, typically because
 *    detection/handling of the event is taking place elsewhere
 *    (nb.ma, nb.ta, ls.rde, ic.rdde, bu.s_rde, nb.gart_walk)
 *  - the event is associated with a sync flood so even if the detector is
 *    enabled we will never handle the event and generate an ereport *and*
 *    even if the ereport did arrive we could perform no useful diagnosis
 *    e.g., the NB can be configured for sync flood on nb.mem_eccm
 *    but we don't choose to discard that ereport here since we could have
 *    made a useful diagnosis from it had it been delivered
 *    (nb.ht_sync, nb.ht_crc)
 *  - events that will be accompanied by an immediate panic and
 *    delivery of the ereport during subsequent reboot but from
 *    which no useful diagnosis can be made. (nb.rmw, nb.wdog)
 *
 * Ereports for all of these can be generated by error simulation and
 * injection. We will perform a null diagnosis of all these ereports in order
 * to avoid "no subscription" complaints during test harness runs.
 */
event ereport.cpu.amd.nb.ma@strand{within(5s)};
event ereport.cpu.amd.nb.ta@strand{within(5s)};
event ereport.cpu.amd.ls.s_rde@strand{within(5s)};
event ereport.cpu.amd.ic.rdde@strand{within(5s)};
event ereport.cpu.amd.bu.s_rde@strand{within(5s)};
event ereport.cpu.amd.nb.gart_walk@strand{within(5s)};
event ereport.cpu.amd.nb.ht_sync@strand{within(5s)};
event ereport.cpu.amd.nb.ht_crc@strand{within(5s)};
event ereport.cpu.amd.nb.rmw@strand{within(5s)};
event ereport.cpu.amd.nb.wdog@strand{within(5s)};
event ereport.cpu.amd.unknown@strand{within(5s)};

event upset.null_diag@strand;

/* Subscribe to each of the above so fmd sees a consumer; diagnose nothing */
prop upset.null_diag@strand (1)->
    ereport.cpu.amd.nb.ma@strand,
    ereport.cpu.amd.nb.ta@strand,
    ereport.cpu.amd.ls.s_rde@strand,
    ereport.cpu.amd.ic.rdde@strand,
    ereport.cpu.amd.bu.s_rde@strand,
    ereport.cpu.amd.nb.gart_walk@strand,
    ereport.cpu.amd.nb.ht_sync@strand,
    ereport.cpu.amd.nb.ht_crc@strand,
    ereport.cpu.amd.nb.rmw@strand,
    ereport.cpu.amd.nb.wdog@strand,
    ereport.cpu.amd.unknown@strand;