/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * sun4u Memory Scrubbing
 *
 * On detection of a correctable memory ECC error, the sun4u kernel
 * returns the corrected data to the requester and re-writes it
 * to memory (DRAM).  So if the correctable error was transient,
 * the error has effectively been cleaned (scrubbed) from memory.
 *
 * Scrubbing thus reduces the likelihood that multiple transient errors
 * will occur in the same memory word, making uncorrectable errors due
 * to transients less likely.
 *
 * Thus is born the desire that every memory location be periodically
 * accessed (scrubbed).
 *
 * This file implements a memory scrubbing thread.  This scrubber
 * guarantees that all of physical memory is accessed periodically
 * (memscrub_period_sec -- 12 hours).
 *
 * It attempts to do this as unobtrusively as possible.  The thread
 * schedules itself to wake up at an interval such that if it reads
 * memscrub_span_pages (32MB) on each wakeup, it will read all of physical
 * memory in memscrub_period_sec (12 hours).
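 *
 * As a rough worked example (default values, 8K PAGESIZE assumed): a
 * 4GB system holds 4GB / 32MB = 128 spans, so the thread wakes about
 * every 43200s / 128 ~= 337 seconds to read one 32MB span.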
 *
 * The scrubber uses the block load and prefetch hardware to read memory
 * @ 1300MB/s, so it reads spans of 32MB in 0.025 seconds.  Unlike the
 * original sun4d scrubber the sun4u scrubber does not read ahead if the
 * system is idle because we can read memory very efficiently.
 *
 * The scrubber maintains a private copy of the phys_install memory list
 * to keep track of what memory should be scrubbed.
 *
 * The global routines memscrub_add_span() and memscrub_delete_span() are
 * used to add and delete from this list.  If hotplug memory is later
 * supported these two routines can be used to notify the scrubber of
 * memory configuration changes.
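 *
 * As a sketch only (hypothetical caller, not part of this file), a
 * hotplug handler would describe the new memory by pfn and page count:
 *
 *	(void) memscrub_add_span(btop(new_base_pa), btop(new_len));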
 *
 * The following parameters can be set via /etc/system:
 *
 *	memscrub_span_pages = MEMSCRUB_DFL_SPAN_PAGES (32MB)
 *	memscrub_period_sec = MEMSCRUB_DFL_PERIOD_SEC (12 hours)
 *	memscrub_thread_pri = MEMSCRUB_DFL_THREAD_PRI (MINCLSYSPRI)
 *	memscrub_delay_start_sec = (5 minutes)
 *	memscrub_verbose = (0)
 *	memscrub_override_ticks = (1 tick)
 *	disable_memscrub = (0)
 *	pause_memscrub = (0)
 *	read_all_memscrub = (0)
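 *
 * For example (illustrative values only, assuming the default 8K
 * PAGESIZE), the following /etc/system lines scrub 64MB per wakeup
 * and enable progress messages:
 *
 *	set memscrub_span_pages = 8192
 *	set memscrub_verbose = 1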
 *
 * The scrubber will print NOTICE messages of what it is doing if
 * "memscrub_verbose" is set.
 *
 * If the scrubber's sleep time calculation drops to zero ticks,
 * memscrub_override_ticks will be used as the sleep time instead.  The
 * sleep time should only drop to zero on a system with over 131.84
 * terabytes of memory, or where the default scrubber parameters have
 * been adjusted.  For example, reducing memscrub_span_pages or
 * memscrub_period_sec causes the sleep time to drop to zero with less
 * memory.  Note that since the sleep time is calculated in clock ticks,
 * using hires clock ticks allows for more memory before the sleep time
 * becomes zero.
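 *
 * A sketch of where 131.84 terabytes comes from (defaults above,
 * hz = 100 assumed): the 12 hour period is 43200 * 100 = 4,320,000
 * ticks, so the per-wakeup sleep reaches zero once memory requires
 * 4,320,000 or more 32MB spans, i.e. about 4,320,000 * 32MB ~= 131.84TB.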
 *
 * The scrubber will exit (or never be started) if it finds the variable
 * "disable_memscrub" set.
 *
 * The scrubber will pause (not read memory) when "pause_memscrub"
 * is set.  It will check the state of pause_memscrub at each wakeup
 * period.  The scrubber will not make up for lost time.  If you
 * pause the scrubber for a prolonged period of time you can use
 * the "read_all_memscrub" switch (see below) to catch up.  In addition,
 * pause_memscrub is used internally by the post memory DR callbacks.
 * It is set for the small period of time during which the callbacks
 * are executing.  This ensures "memscrub_lock" will be released,
 * allowing the callbacks to finish.
 *
 * The scrubber will read all memory if "read_all_memscrub" is set.
 * The normal span read will also occur during the wakeup.
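 *
 * As an illustration only, these switches can also be flipped on a live
 * system with mdb(1) in kernel read/write mode, for example:
 *
 *	echo 'read_all_memscrub/W 1' | mdb -kw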
 *
 * MEMSCRUB_MIN_PAGES (32MB) is the minimum amount of memory a system
 * must have before we'll start the scrubber.
 *
 * MEMSCRUB_DFL_SPAN_PAGES (32MB) is based on the guess that 0.025 sec
 * is a "good" amount of minimum time for the thread to run at a time.
 *
 * MEMSCRUB_DFL_PERIOD_SEC (12 hours) is nearly a total guess --
 * twice the frequency the hardware folk estimated would be necessary.
 *
 * MEMSCRUB_DFL_THREAD_PRI (MINCLSYSPRI) is based on the assumption
 * that the scrubber should get its fair share of time (since it
 * is short).  At a priority of 0 the scrubber will be starved.
 */

#include <sys/systm.h>		/* timeout, types, t_lock */
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>	/* MIN */
#include <sys/memlist.h>	/* memlist */
#include <sys/mem_config.h>	/* memory add/delete */
#include <sys/kmem.h>		/* KMEM_NOSLEEP */
#include <sys/cpuvar.h>		/* ncpus_online */
#include <sys/debug.h>		/* ASSERTs */
#include <sys/machsystm.h>	/* lddphys */
#include <sys/cpu_module.h>	/* vtag_flushpage */
#include <sys/kstat.h>
#include <sys/atomic.h>		/* atomic_add_32 */

#include <vm/seg_kmem.h>
#include <vm/hat_sfmmu.h>	/* XXX FIXME - delete */

#include <sys/time.h>
#include <sys/callb.h>		/* CPR callback */
#include <sys/ontrap.h>

/*
 * Should really have paddr_t defined, but it is broken.  Use
 * ms_paddr_t in the meantime to make the code cleaner.
 */
typedef uint64_t ms_paddr_t;

int memscrub_add_span(pfn_t pfn, pgcnt_t pages);
int memscrub_delete_span(pfn_t pfn, pgcnt_t pages);
int memscrub_init(void);
void memscrub_induced_error(void);

/*
 * scrub if we have at least this many pages
 */
#define	MEMSCRUB_MIN_PAGES	(32 * 1024 * 1024 / PAGESIZE)

/*
 * scan all of physical memory at least once every MEMSCRUB_PERIOD_SEC
 */
#define	MEMSCRUB_DFL_PERIOD_SEC	(12 * 60 * 60)	/* 12 hours */

/*
 * scan at least MEMSCRUB_DFL_SPAN_PAGES each iteration
 */
#define	MEMSCRUB_DFL_SPAN_PAGES	((32 * 1024 * 1024) / PAGESIZE)

/*
 * almost anything is higher priority than scrubbing
 */
#define	MEMSCRUB_DFL_THREAD_PRI	MINCLSYSPRI

/*
 * size used when scanning memory
 */
#define	MEMSCRUB_BLOCK_SIZE		256
#define	MEMSCRUB_BLOCK_SIZE_SHIFT	8	/* log2(MEMSCRUB_BLOCK_SIZE) */
#define	MEMSCRUB_BLOCKS_PER_PAGE	(PAGESIZE >> MEMSCRUB_BLOCK_SIZE_SHIFT)

#define	MEMSCRUB_BPP4M		(MMU_PAGESIZE4M >> MEMSCRUB_BLOCK_SIZE_SHIFT)
#define	MEMSCRUB_BPP512K	(MMU_PAGESIZE512K >> MEMSCRUB_BLOCK_SIZE_SHIFT)
#define	MEMSCRUB_BPP64K		(MMU_PAGESIZE64K >> MEMSCRUB_BLOCK_SIZE_SHIFT)
#define	MEMSCRUB_BPP		(MMU_PAGESIZE >> MEMSCRUB_BLOCK_SIZE_SHIFT)
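
/*
 * For reference only (assuming an 8K MMU_PAGESIZE): the block counts
 * above work out to 8192/256 = 32 blocks per 8K page and
 * 4194304/256 = 16384 blocks per 4M page.
 */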

/*
 * This message indicates that we have exceeded the limitations of
 * the memscrubber.  See the comments above regarding what would
 * cause the sleep time to become zero.  In DEBUG mode, this message
 * is logged on the console and in the messages file.  In non-DEBUG
 * mode, it is only logged in the messages file.
 */
#ifdef DEBUG
#define	MEMSCRUB_OVERRIDE_MSG	"Memory scrubber sleep time is zero " \
	"seconds, consuming entire CPU."
#else
#define	MEMSCRUB_OVERRIDE_MSG	"!Memory scrubber sleep time is zero " \
	"seconds, consuming entire CPU."
#endif /* DEBUG */

/*
 * we can patch these defaults in /etc/system if necessary
 */
uint_t disable_memscrub = 0;
uint_t pause_memscrub = 0;
uint_t read_all_memscrub = 0;
uint_t memscrub_verbose = 0;
uint_t memscrub_all_idle = 0;
uint_t memscrub_span_pages = MEMSCRUB_DFL_SPAN_PAGES;
uint_t memscrub_period_sec = MEMSCRUB_DFL_PERIOD_SEC;
uint_t memscrub_thread_pri = MEMSCRUB_DFL_THREAD_PRI;
uint_t memscrub_delay_start_sec = 5 * 60;
uint_t memscrub_override_ticks = 1;

static void memscrubber(void);
static void memscrub_cleanup(void);
static int memscrub_add_span_gen(pfn_t, pgcnt_t, struct memlist **, uint_t *);
static int memscrub_verify_span(ms_paddr_t *addrp, pgcnt_t *pagesp);
static void memscrub_scan(uint_t blks, ms_paddr_t src);

static struct memlist *memscrub_memlist;
static uint_t memscrub_phys_pages;

static kcondvar_t memscrub_cv;
static kmutex_t memscrub_lock;
/*
 * memscrub_lock protects memscrub_memlist, interval_ticks, cprinfo, ...
 */
static void memscrub_init_mem_config(void);
static void memscrub_uninit_mem_config(void);

/*
 * Linked list of memscrub aware spans having retired pages.
 * Currently enabled only on sun4u USIII-based platforms.
 */
typedef struct memscrub_page_retire_span {
	ms_paddr_t				address;
	struct memscrub_page_retire_span	*next;
} memscrub_page_retire_span_t;

static memscrub_page_retire_span_t *memscrub_page_retire_span_list = NULL;

static void memscrub_page_retire_span_add(ms_paddr_t);
static void memscrub_page_retire_span_delete(ms_paddr_t);
static int memscrub_page_retire_span_search(ms_paddr_t);
static void memscrub_page_retire_span_list_update(void);

/*
 * add_to_page_retire_list: Set by cpu_async_log_err() routine
 * by calling memscrub_induced_error() when a CE/UE occurs on a retired
 * page due to memscrub reading.  Cleared by memscrub after updating
 * the global page retire span list.  Piggybacking on the protection of
 * memscrub_lock, which is held during set and clear.
 * Note: When cpu_async_log_err() calls memscrub_induced_error(), it runs
 * in softint context, fired on the cpu on which the memscrub thread is
 * currently running.  The memscrub thread has affinity set during
 * memscrub_read(), hence migration to a new cpu is not expected.
 */
static int add_to_page_retire_list = 0;

/*
 * Keep track of some interesting statistics
 */
static struct memscrub_kstats {
	kstat_named_t	done_early;	/* ahead of schedule */
	kstat_named_t	early_sec;	/* by cumulative num secs */
	kstat_named_t	done_late;	/* behind schedule */
	kstat_named_t	late_sec;	/* by cumulative num secs */
	kstat_named_t	interval_ticks;	/* num ticks between intervals */
	kstat_named_t	force_run;	/* forced to run, non-timeout */
	kstat_named_t	errors_found;	/* num errors found by memscrub */
} memscrub_counts = {
	{ "done_early",		KSTAT_DATA_UINT32 },
	{ "early_sec",		KSTAT_DATA_UINT32 },
	{ "done_late",		KSTAT_DATA_UINT32 },
	{ "late_sec",		KSTAT_DATA_UINT32 },
	{ "interval_ticks",	KSTAT_DATA_UINT32 },
	{ "force_run",		KSTAT_DATA_UINT32 },
	{ "errors_found",	KSTAT_DATA_UINT32 },
};

#define	MEMSCRUB_STAT_INC(stat)		memscrub_counts.stat.value.ui32++
#define	MEMSCRUB_STAT_SET(stat, val)	memscrub_counts.stat.value.ui32 = (val)
#define	MEMSCRUB_STAT_NINC(stat, val)	memscrub_counts.stat.value.ui32 += (val)
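
/*
 * These counters are exported by memscrub_init() below as the named
 * kstat unix:0:memscrub_kstat, so they can be examined from userland
 * with, for example:
 *
 *	kstat -m unix -n memscrub_kstat
 */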

static struct kstat *memscrub_ksp = (struct kstat *)NULL;

static timeout_id_t memscrub_tid = 0;	/* keep track of timeout id */

/*
 * create memscrub_memlist from phys_install list
 * initialize locks, set memscrub_phys_pages.
 */
int
memscrub_init(void)
{
	struct memlist *src;

	/*
	 * only startup the scrubber if we have a minimum
	 * amount of memory
	 */
	if (physinstalled >= MEMSCRUB_MIN_PAGES) {

		/*
		 * initialize locks
		 */
		mutex_init(&memscrub_lock, NULL, MUTEX_DRIVER, NULL);
		cv_init(&memscrub_cv, NULL, CV_DRIVER, NULL);

		/*
		 * copy phys_install to memscrub_memlist
		 */
		for (src = phys_install; src; src = src->ml_next) {
			if (memscrub_add_span(
			    (pfn_t)(src->ml_address >> PAGESHIFT),
			    (pgcnt_t)(src->ml_size >> PAGESHIFT))) {
				memscrub_cleanup();
				return (-1);
			}
		}

		memscrub_ksp = kstat_create("unix", 0, "memscrub_kstat",
		    "misc", KSTAT_TYPE_NAMED,
		    sizeof (memscrub_counts) / sizeof (kstat_named_t),
		    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
		if (memscrub_ksp) {
			memscrub_ksp->ks_data = (void *)&memscrub_counts;
			kstat_install(memscrub_ksp);
		} else {
			cmn_err(CE_NOTE, "Memscrubber cannot create kstats\n");
		}

		/*
		 * create memscrubber thread
		 */
		(void) thread_create(NULL, 0, (void (*)())memscrubber,
		    NULL, 0, &p0, TS_RUN, memscrub_thread_pri);

		/*
		 * We don't want call backs changing the list
		 * if there is no thread running.  We do not
		 * attempt to deal with stopping/starting scrubbing
		 * on memory size changes.
		 */
		memscrub_init_mem_config();
	}

	return (0);
}

static void
memscrub_cleanup(void)
{
	memscrub_uninit_mem_config();
	while (memscrub_memlist) {
		(void) memscrub_delete_span(
		    (pfn_t)(memscrub_memlist->ml_address >> PAGESHIFT),
		    (pgcnt_t)(memscrub_memlist->ml_size >> PAGESHIFT));
	}
	if (memscrub_ksp)
		kstat_delete(memscrub_ksp);
	cv_destroy(&memscrub_cv);
	mutex_destroy(&memscrub_lock);
}

#ifdef MEMSCRUB_DEBUG
static void
memscrub_printmemlist(char *title, struct memlist *listp)
{
	struct memlist *list;

	cmn_err(CE_CONT, "%s:\n", title);

	for (list = listp; list; list = list->ml_next) {
		cmn_err(CE_CONT, "addr = 0x%llx, size = 0x%llx\n",
		    list->ml_address, list->ml_size);
	}
}
#endif /* MEMSCRUB_DEBUG */

static void
memscrub_wakeup(void *c)
{
	/*
	 * grab mutex to guarantee that our wakeup call
	 * arrives after we go to sleep -- so we can't sleep forever.
	 */
	mutex_enter(&memscrub_lock);
	cv_signal(&memscrub_cv);
	mutex_exit(&memscrub_lock);
}

/*
 * provide an interface external to the memscrubber
 * which will force the memscrub thread to run vs.
 * waiting for the timeout, if one is set
 */
void
memscrub_run(void)
{
	MEMSCRUB_STAT_INC(force_run);
	if (memscrub_tid) {
		(void) untimeout(memscrub_tid);
		memscrub_wakeup((void *)NULL);
	}
}

/*
 * this calculation doesn't account for the time
 * that the actual scan consumes -- so we'd fall
 * slightly behind schedule with this interval.
 */
static uint_t
compute_interval_ticks(void)
{
	/*
	 * We use msp_safe and mpp_safe below to ensure somebody
	 * doesn't set memscrub_span_pages or memscrub_phys_pages
	 * to zero out from under us while we do the calculation.
	 */
	static uint_t msp_safe, mpp_safe;
	static uint_t interval_ticks, period_ticks;
	msp_safe = memscrub_span_pages;
	mpp_safe = memscrub_phys_pages;

	period_ticks = memscrub_period_sec * hz;
	interval_ticks = period_ticks;

	ASSERT(mutex_owned(&memscrub_lock));

	if ((msp_safe != 0) && (mpp_safe != 0)) {
		if (memscrub_phys_pages <= msp_safe) {
			interval_ticks = period_ticks;
		} else {
			interval_ticks = (period_ticks /
			    (mpp_safe / msp_safe));
		}
	}
	return (interval_ticks);
}
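
/*
 * To put rough numbers on the calculation above (illustrative only,
 * assuming hz = 100 and 8K pages): a 16GB system has 2,097,152 pages,
 * so mpp_safe / msp_safe = 512 and interval_ticks = 4,320,000 / 512 =
 * 8437 ticks, i.e. the scrubber wakes roughly every 84 seconds.
 */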

static void
memscrubber(void)
{
	ms_paddr_t address, addr;
	time_t deadline;
	pgcnt_t pages;
	uint_t reached_end = 1;
	uint_t paused_message = 0;
	uint_t interval_ticks = 0;
	uint_t sleep_warn_printed = 0;
	callb_cpr_t cprinfo;

	/*
	 * notify CPR of our existence
	 */
	CALLB_CPR_INIT(&cprinfo, &memscrub_lock, callb_generic_cpr, "memscrub");

	mutex_enter(&memscrub_lock);

	if (memscrub_memlist == NULL) {
		cmn_err(CE_WARN, "memscrub_memlist not initialized.");
		goto memscrub_exit;
	}

	address = memscrub_memlist->ml_address;

	deadline = gethrestime_sec() + memscrub_delay_start_sec;

	for (;;) {
		if (disable_memscrub)
			break;

		/*
		 * compute interval_ticks
		 */
		interval_ticks = compute_interval_ticks();

		/*
		 * If the calculated sleep time is zero, and pause_memscrub
		 * has been set, make sure we sleep so that another thread
		 * can acquire memscrub_lock.
		 */
		if (interval_ticks == 0 && pause_memscrub) {
			interval_ticks = hz;
		}

		/*
		 * And as a fail safe, under normal non-paused operation, do
		 * not allow the sleep time to be zero.
		 */
		if (interval_ticks == 0) {
			interval_ticks = memscrub_override_ticks;
			if (!sleep_warn_printed) {
				cmn_err(CE_NOTE, MEMSCRUB_OVERRIDE_MSG);
				sleep_warn_printed = 1;
			}
		}

		MEMSCRUB_STAT_SET(interval_ticks, interval_ticks);

		/*
		 * Did we just reach the end of memory?  If we are at the
		 * end of memory, delay end of memory processing until
		 * pause_memscrub is not set.
		 */
		if (reached_end && !pause_memscrub) {
			time_t now = gethrestime_sec();

			if (now >= deadline) {
				MEMSCRUB_STAT_INC(done_late);
				MEMSCRUB_STAT_NINC(late_sec, now - deadline);
				/*
				 * past deadline, start right away
				 */
				interval_ticks = 0;

				deadline = now + memscrub_period_sec;
			} else {
				/*
				 * we finished ahead of schedule.
				 * wait till previous deadline before re-start.
				 */
				interval_ticks = (deadline - now) * hz;
				MEMSCRUB_STAT_INC(done_early);
				MEMSCRUB_STAT_NINC(early_sec, deadline - now);
				deadline += memscrub_period_sec;
			}
			reached_end = 0;
			sleep_warn_printed = 0;
		}

		if (interval_ticks != 0) {
			/*
			 * it is safe from our standpoint for CPR to
			 * suspend the system while we sleep
			 */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);

			memscrub_tid = timeout(memscrub_wakeup, NULL,
			    interval_ticks);

			cv_wait(&memscrub_cv, &memscrub_lock);

			/*
			 * at this point, no timeout should be set
			 */

			/*
			 * we need to go to work and will be modifying
			 * our internal state and mapping/unmapping
			 * kernel pages, so we are no longer CPR safe
			 */
			CALLB_CPR_SAFE_END(&cprinfo, &memscrub_lock);
		}

		if (memscrub_phys_pages == 0) {
			cmn_err(CE_WARN, "Memory scrubber has 0 pages to read");
			goto memscrub_exit;
		}

		if (!pause_memscrub) {
			if (paused_message) {
				paused_message = 0;
				if (memscrub_verbose)
					cmn_err(CE_NOTE, "Memory scrubber "
					    "resuming");
			}

			if (read_all_memscrub) {
				if (memscrub_verbose)
					cmn_err(CE_NOTE, "Memory scrubber "
					    "reading all memory per request");

				addr = memscrub_memlist->ml_address;
				reached_end = 0;
				while (!reached_end) {
					if (disable_memscrub)
						break;
					pages = memscrub_phys_pages;
					reached_end = memscrub_verify_span(
					    &addr, &pages);
					memscrub_scan(pages *
					    MEMSCRUB_BLOCKS_PER_PAGE, addr);
					addr += ((uint64_t)pages * PAGESIZE);
				}
				read_all_memscrub = 0;
			}

			pages = memscrub_span_pages;

			if (disable_memscrub)
				break;

			/*
			 * determine physical address range
			 */
			reached_end = memscrub_verify_span(&address,
			    &pages);

			memscrub_scan(pages * MEMSCRUB_BLOCKS_PER_PAGE,
			    address);

			address += ((uint64_t)pages * PAGESIZE);
		}

		if (pause_memscrub && !paused_message) {
			paused_message = 1;
			if (memscrub_verbose)
				cmn_err(CE_NOTE, "Memory scrubber paused");
		}
	}

memscrub_exit:
	cmn_err(CE_NOTE, "Memory scrubber exiting");
	CALLB_CPR_EXIT(&cprinfo);
	memscrub_cleanup();
	thread_exit();
	/* NOTREACHED */
}

/*
 * condition address and size
 * such that they span legal physical addresses.
 *
 * when appropriate, address will be rounded up to start of next
 * struct memlist, and pages will be rounded down to the end of the
 * struct memlist.
 *
 * returns 1 if reached end of list, else returns 0.
 */
static int
memscrub_verify_span(ms_paddr_t *addrp, pgcnt_t *pagesp)
{
	struct memlist *mlp;
	ms_paddr_t address = *addrp;
	uint64_t bytes = (uint64_t)*pagesp * PAGESIZE;
	uint64_t bytes_remaining;
	int reached_end = 0;

	ASSERT(mutex_owned(&memscrub_lock));

	/*
	 * find memlist struct that contains addrp
	 * assumes memlist is sorted by ascending address.
	 */
	for (mlp = memscrub_memlist; mlp != NULL; mlp = mlp->ml_next) {
		/*
		 * if before this chunk, round up to beginning
		 */
		if (address < mlp->ml_address) {
			address = mlp->ml_address;
			break;
		}
		/*
		 * if before end of chunk, then we found it
		 */
		if (address < (mlp->ml_address + mlp->ml_size))
			break;

		/* else go to next struct memlist */
	}

	/*
	 * if we hit end of list, start at beginning
	 */
	if (mlp == NULL) {
		mlp = memscrub_memlist;
		address = mlp->ml_address;
	}

	/*
	 * now we have legal address, and its mlp, condition bytes
	 */
	bytes_remaining = (mlp->ml_address + mlp->ml_size) - address;

	if (bytes > bytes_remaining)
		bytes = bytes_remaining;

	/*
	 * will this span take us to end of list?
	 */
	if ((mlp->ml_next == NULL) &&
	    ((mlp->ml_address + mlp->ml_size) == (address + bytes)))
		reached_end = 1;

	/* return values */
	*addrp = address;
	*pagesp = bytes / PAGESIZE;

	return (reached_end);
}

/*
 * add a span to the memscrub list
 * add to memscrub_phys_pages
 */
int
memscrub_add_span(pfn_t pfn, pgcnt_t pages)
{
#ifdef MEMSCRUB_DEBUG
	ms_paddr_t address = (ms_paddr_t)pfn << PAGESHIFT;
	uint64_t bytes = (uint64_t)pages << PAGESHIFT;
#endif /* MEMSCRUB_DEBUG */

	int retval;

	mutex_enter(&memscrub_lock);

#ifdef MEMSCRUB_DEBUG
	memscrub_printmemlist("memscrub_memlist before", memscrub_memlist);
	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
	cmn_err(CE_CONT, "memscrub_add_span: address: 0x%llx"
	    " size: 0x%llx\n", address, bytes);
#endif /* MEMSCRUB_DEBUG */

	retval = memscrub_add_span_gen(pfn, pages, &memscrub_memlist,
	    &memscrub_phys_pages);

#ifdef MEMSCRUB_DEBUG
	memscrub_printmemlist("memscrub_memlist after", memscrub_memlist);
	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
#endif /* MEMSCRUB_DEBUG */

	mutex_exit(&memscrub_lock);

	return (retval);
}

static int
memscrub_add_span_gen(
	pfn_t pfn,
	pgcnt_t pages,
	struct memlist **list,
	uint_t *npgs)
{
	ms_paddr_t address = (ms_paddr_t)pfn << PAGESHIFT;
	uint64_t bytes = (uint64_t)pages << PAGESHIFT;
	struct memlist *dst;
	struct memlist *prev, *next;
	int retval = 0;

	/*
	 * allocate a new struct memlist
	 */
	dst = (struct memlist *)
	    kmem_alloc(sizeof (struct memlist), KM_NOSLEEP);

	if (dst == NULL) {
		retval = -1;
		goto add_done;
	}

	dst->ml_address = address;
	dst->ml_size = bytes;

	/*
	 * first insert
	 */
	if (*list == NULL) {
		dst->ml_prev = NULL;
		dst->ml_next = NULL;
		*list = dst;
		goto add_done;
	}

	/*
	 * insert into sorted list
	 */
	for (prev = NULL, next = *list;
	    next != NULL;
	    prev = next, next = next->ml_next) {
		if (address > (next->ml_address + next->ml_size))
			continue;

		/*
		 * else insert here: prepend to next
		 */
		if ((address + bytes) == next->ml_address) {
			kmem_free(dst, sizeof (struct memlist));

			next->ml_address = address;
			next->ml_size += bytes;

			goto add_done;
		}

		/*
		 * append to next
		 */
		if (address == (next->ml_address + next->ml_size)) {
			kmem_free(dst, sizeof (struct memlist));

			if (next->ml_next) {
				/*
				 * don't overlap with next->ml_next
				 */
				if ((address + bytes) >
				    next->ml_next->ml_address) {
					retval = -1;
					goto add_done;
				}

				/*
				 * concatenate next and next->ml_next
				 */
				if ((address + bytes) ==
				    next->ml_next->ml_address) {
					struct memlist *mlp = next->ml_next;

					if (next == *list)
						*list = next->ml_next;

					mlp->ml_address = next->ml_address;
					mlp->ml_size += next->ml_size;
					mlp->ml_size += bytes;

					if (next->ml_prev)
						next->ml_prev->ml_next = mlp;
					mlp->ml_prev = next->ml_prev;

					kmem_free(next,
					    sizeof (struct memlist));
					goto add_done;
				}
			}

			next->ml_size += bytes;

			goto add_done;
		}

		/* don't overlap with next */
		if ((address + bytes) > next->ml_address) {
			retval = -1;
			kmem_free(dst, sizeof (struct memlist));
			goto add_done;
		}

		/*
		 * insert before next
		 */
		dst->ml_prev = prev;
		dst->ml_next = next;
		next->ml_prev = dst;
		if (prev == NULL) {
			*list = dst;
		} else {
			prev->ml_next = dst;
		}
		goto add_done;
	}

	/*
	 * end of list, prev is valid and next is NULL
	 */
	prev->ml_next = dst;
	dst->ml_prev = prev;
	dst->ml_next = NULL;

add_done:

	if (retval == 0)
		*npgs += pages;

	return (retval);
}

/*
 * delete a span from the memscrub list
 * subtract from memscrub_phys_pages
 */
int
memscrub_delete_span(pfn_t pfn, pgcnt_t pages)
{
	ms_paddr_t address = (ms_paddr_t)pfn << PAGESHIFT;
	uint64_t bytes = (uint64_t)pages << PAGESHIFT;
	struct memlist *dst, *next;
	int retval = 0;

	mutex_enter(&memscrub_lock);

#ifdef MEMSCRUB_DEBUG
	memscrub_printmemlist("memscrub_memlist Before", memscrub_memlist);
	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
	cmn_err(CE_CONT, "memscrub_delete_span: 0x%llx 0x%llx\n",
	    address, bytes);
#endif /* MEMSCRUB_DEBUG */

	/*
	 * find struct memlist containing page
	 */
	for (next = memscrub_memlist; next != NULL; next = next->ml_next) {
		if ((address >= next->ml_address) &&
		    (address < next->ml_address + next->ml_size))
			break;
	}

	/*
	 * if start address not in list
	 */
	if (next == NULL) {
		retval = -1;
		goto delete_done;
	}

	/*
	 * error if size goes off end of this struct memlist
	 */
	if (address + bytes > next->ml_address + next->ml_size) {
		retval = -1;
		goto delete_done;
	}

	/*
	 * pages at beginning of struct memlist
	 */
	if (address == next->ml_address) {
		/*
		 * if start & size match, delete from list
		 */
		if (bytes == next->ml_size) {
			if (next == memscrub_memlist)
				memscrub_memlist = next->ml_next;
			if (next->ml_prev != NULL)
				next->ml_prev->ml_next = next->ml_next;
			if (next->ml_next != NULL)
				next->ml_next->ml_prev = next->ml_prev;

			kmem_free(next, sizeof (struct memlist));
		} else {
			/*
			 * increment start address by bytes
			 */
			next->ml_address += bytes;
			next->ml_size -= bytes;
		}
		goto delete_done;
	}

	/*
	 * pages at end of struct memlist
	 */
	if (address + bytes == next->ml_address + next->ml_size) {
		/*
		 * decrement size by bytes
		 */
		next->ml_size -= bytes;
		goto delete_done;
	}

	/*
	 * delete a span in the middle of the struct memlist
	 */

	/*
	 * create a new struct memlist
	 */
	dst = (struct memlist *)
	    kmem_alloc(sizeof (struct memlist), KM_NOSLEEP);

	if (dst == NULL) {
		retval = -1;
		goto delete_done;
	}

	/*
	 * existing struct memlist gets address
	 * and size up to pfn
	 */
	dst->ml_address = address + bytes;
	dst->ml_size =
	    (next->ml_address + next->ml_size) - dst->ml_address;
	next->ml_size = address - next->ml_address;

	/*
	 * new struct memlist gets address starting
	 * after pfn, until end
	 */

	/*
	 * link in new memlist after old
	 */
	dst->ml_next = next->ml_next;
	dst->ml_prev = next;

	if (next->ml_next != NULL)
		next->ml_next->ml_prev = dst;
	next->ml_next = dst;

delete_done:
	if (retval != -1) {
		memscrub_phys_pages -= pages;
		if (memscrub_phys_pages == 0)
			disable_memscrub = 1;
	}

#ifdef MEMSCRUB_DEBUG
	memscrub_printmemlist("memscrub_memlist After", memscrub_memlist);
	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
#endif /* MEMSCRUB_DEBUG */

	mutex_exit(&memscrub_lock);

	return (retval);
}

static void
memscrub_scan(uint_t blks, ms_paddr_t src)
{
	uint_t psz, bpp, pgsread;
	pfn_t pfn;
	ms_paddr_t pa;
	caddr_t va;
	on_trap_data_t otd;
	int scan_mmu_pagesize = 0;
	int retired_pages = 0;

	extern void memscrub_read(caddr_t src, uint_t blks);

	ASSERT(mutex_owned(&memscrub_lock));

	pgsread = 0;
	pa = src;

	if (memscrub_page_retire_span_list != NULL) {
		if (memscrub_page_retire_span_search(src)) {
			/* retired pages in current span */
			scan_mmu_pagesize = 1;
		}
	}

#ifdef MEMSCRUB_DEBUG
	cmn_err(CE_NOTE, "scan_mmu_pagesize = %d\n", scan_mmu_pagesize);
#endif /* MEMSCRUB_DEBUG */

	while (blks != 0) {
		/* Ensure the PA is properly aligned */
		if (((pa & MMU_PAGEMASK4M) == pa) &&
		    (blks >= MEMSCRUB_BPP4M)) {
			psz = MMU_PAGESIZE4M;
			bpp = MEMSCRUB_BPP4M;
		} else if (((pa & MMU_PAGEMASK512K) == pa) &&
		    (blks >= MEMSCRUB_BPP512K)) {
			psz = MMU_PAGESIZE512K;
			bpp = MEMSCRUB_BPP512K;
		} else if (((pa & MMU_PAGEMASK64K) == pa) &&
		    (blks >= MEMSCRUB_BPP64K)) {
			psz = MMU_PAGESIZE64K;
			bpp = MEMSCRUB_BPP64K;
		} else if ((pa & MMU_PAGEMASK) == pa) {
			psz = MMU_PAGESIZE;
			bpp = MEMSCRUB_BPP;
		} else {
			if (memscrub_verbose) {
				cmn_err(CE_NOTE, "Memory scrubber ignoring "
				    "non-page aligned block starting at 0x%"
				    PRIx64, src);
			}
			return;
		}
		if (blks < bpp) bpp = blks;

#ifdef MEMSCRUB_DEBUG
		cmn_err(CE_NOTE, "Going to run psz=%x, "
		    "bpp=%x pa=%llx\n", psz, bpp, pa);
#endif /* MEMSCRUB_DEBUG */

		/*
		 * MEMSCRUBBASE is a 4MB aligned page in the
		 * kernel so that we can quickly map the PA
		 * to a VA for the block loads performed in
		 * memscrub_read.
		 */
		pfn = mmu_btop(pa);
		va = (caddr_t)MEMSCRUBBASE;
		hat_devload(kas.a_hat, va, psz, pfn, PROT_READ,
		    HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK);

		/*
		 * Can't allow the memscrubber to migrate across CPUs as
		 * we need to know whether CEEN is enabled for the current
		 * CPU to enable us to scrub the memory.  Don't use
		 * kpreempt_disable as the time we take to scan a span (even
		 * without cpu_check_ce having to manually cpu_check_block)
		 * is too long to hold a higher priority thread (eg, RT)
		 * off cpu.
		 */
		thread_affinity_set(curthread, CPU_CURRENT);

		/*
		 * Protect read scrub from async faults.  For now, we simply
		 * maintain a count of such faults caught.
		 */
		if (!on_trap(&otd, OT_DATA_EC) && !scan_mmu_pagesize) {
			memscrub_read(va, bpp);
			/*
			 * Check if CEs require logging
			 */
			cpu_check_ce(SCRUBBER_CEEN_CHECK,
			    (uint64_t)pa, va, psz);
			no_trap();
			thread_affinity_clear(curthread);
		} else {
			no_trap();
			thread_affinity_clear(curthread);

			/*
			 * Got an async error..
			 * Try rescanning it at MMU_PAGESIZE
			 * granularity if we were trying to
			 * read at a larger page size.
			 * This is to ensure we continue to
			 * scan the rest of the span.
			 * OR scan at MMU_PAGESIZE granularity to avoid
			 * reading retired pages' memory when
			 * scan_mmu_pagesize is set.
			 */
			if (psz > MMU_PAGESIZE || scan_mmu_pagesize) {
				caddr_t vaddr = va;
				ms_paddr_t paddr = pa;
				uint_t tmp = 0;
				for (; tmp < bpp; tmp += MEMSCRUB_BPP) {
					/* Don't scrub retired pages */
					if (page_retire_check(paddr, NULL)
					    == 0) {
						vaddr += MMU_PAGESIZE;
						paddr += MMU_PAGESIZE;
						retired_pages++;
						continue;
					}
					thread_affinity_set(curthread,
					    CPU_CURRENT);
					if (!on_trap(&otd, OT_DATA_EC)) {
						memscrub_read(vaddr,
						    MEMSCRUB_BPP);
						cpu_check_ce(
						    SCRUBBER_CEEN_CHECK,
						    (uint64_t)paddr, vaddr,
						    MMU_PAGESIZE);
						no_trap();
					} else {
						no_trap();
						MEMSCRUB_STAT_INC(errors_found);
					}
					thread_affinity_clear(curthread);
					vaddr += MMU_PAGESIZE;
					paddr += MMU_PAGESIZE;
				}
			}
		}
		hat_unload(kas.a_hat, va, psz, HAT_UNLOAD_UNLOCK);

		blks -= bpp;
		pa += psz;
		pgsread += psz / PAGESIZE;
	}

	/*
	 * If we just finished scrubbing MMU_PAGESIZE at a time, but found no
	 * retired pages, delete the span from the global list.
	 */
	if (scan_mmu_pagesize && retired_pages == 0)
		memscrub_page_retire_span_delete(src);

	/*
	 * Encountered CE/UE on a retired page during memscrub read of current
	 * span.  Add the span to the global list to avoid reading it further.
	 */
	if (add_to_page_retire_list) {
		if (!memscrub_page_retire_span_search(src))
			memscrub_page_retire_span_add(src);
		add_to_page_retire_list = 0;
	}

	if (memscrub_verbose) {
		cmn_err(CE_NOTE, "Memory scrubber read 0x%x pages starting "
		    "at 0x%" PRIx64, pgsread, src);
	}
}

/*
 * Called by cpu_async_log_err() when a memscrub read causes
 * a CE/UE on a retired page.
 */
void
memscrub_induced_error(void)
{
	add_to_page_retire_list = 1;
}

/*
 * Called by page_retire() when toxic pages cannot be retired
 * immediately and are scheduled for retire.  Memscrubber stops
 * scrubbing them to avoid further CE/UEs.
 */
void
memscrub_notify(ms_paddr_t pa)
{
	mutex_enter(&memscrub_lock);
	if (!memscrub_page_retire_span_search(pa))
		memscrub_page_retire_span_add(pa);
	mutex_exit(&memscrub_lock);
}

/*
 * Called by memscrub_scan() and memscrub_notify().
 * pa: physical address of span with CE/UE, add to global list.
 */
static void
memscrub_page_retire_span_add(ms_paddr_t pa)
{
	memscrub_page_retire_span_t *new_span;

	new_span = (memscrub_page_retire_span_t *)
	    kmem_zalloc(sizeof (memscrub_page_retire_span_t), KM_NOSLEEP);

	if (new_span == NULL) {
#ifdef MEMSCRUB_DEBUG
		cmn_err(CE_NOTE, "failed to allocate new span - span with"
		    " retired page/s not tracked.\n");
#endif /* MEMSCRUB_DEBUG */
		return;
	}

	new_span->address = pa;
	new_span->next = memscrub_page_retire_span_list;
	memscrub_page_retire_span_list = new_span;
}

/*
 * Called by memscrub_scan().
 * pa: physical address of span to be removed from global list.
 */
static void
memscrub_page_retire_span_delete(ms_paddr_t pa)
{
	memscrub_page_retire_span_t *prev_span, *next_span;

	prev_span = memscrub_page_retire_span_list;
	next_span = memscrub_page_retire_span_list->next;

	if (pa == prev_span->address) {
		memscrub_page_retire_span_list = next_span;
		kmem_free(prev_span, sizeof (memscrub_page_retire_span_t));
		return;
	}

	while (next_span) {
		if (pa == next_span->address) {
			prev_span->next = next_span->next;
			kmem_free(next_span,
			    sizeof (memscrub_page_retire_span_t));
			return;
		}
		prev_span = next_span;
		next_span = next_span->next;
	}
}

/*
 * Called by memscrub_scan() and memscrub_notify().
 * pa: physical address of span to be searched in global list.
 */
static int
memscrub_page_retire_span_search(ms_paddr_t pa)
{
	memscrub_page_retire_span_t *next_span = memscrub_page_retire_span_list;

	while (next_span) {
		if (pa == next_span->address)
			return (1);
		next_span = next_span->next;
	}
	return (0);
}

/*
 * Called from new_memscrub() as a result of memory delete.
 * Using page_numtopp_nolock() to determine if we have a valid PA.
 */
static void
memscrub_page_retire_span_list_update(void)
{
	memscrub_page_retire_span_t *prev, *cur, *next;

	if (memscrub_page_retire_span_list == NULL)
		return;

	prev = cur = memscrub_page_retire_span_list;

	while (cur) {
		next = cur->next;

		if (page_numtopp_nolock(mmu_btop(cur->address)) == NULL) {
			if (cur == memscrub_page_retire_span_list) {
				memscrub_page_retire_span_list = next;
				kmem_free(cur,
				    sizeof (memscrub_page_retire_span_t));
				prev = cur = memscrub_page_retire_span_list;
			} else {
				prev->next = cur->next;
				kmem_free(cur,
				    sizeof (memscrub_page_retire_span_t));
				cur = next;
			}
		} else {
			prev = cur;
			cur = next;
		}
	}
}

/*
 * The memory add/delete callback mechanism does not pass in the
 * page ranges.  The phys_install list has been updated though, so
 * create a new scrub list from it.
 */
static int
new_memscrub(int update_page_retire_list)
{
	struct memlist *src, *list, *old_list;
	uint_t npgs;

	/*
	 * copy phys_install to memscrub_memlist
	 */
	list = NULL;
	npgs = 0;
	memlist_read_lock();
	for (src = phys_install; src; src = src->ml_next) {
		if (memscrub_add_span_gen((pfn_t)(src->ml_address >> PAGESHIFT),
		    (pgcnt_t)(src->ml_size >> PAGESHIFT), &list, &npgs)) {
			memlist_read_unlock();
			while (list) {
				struct memlist *el;

				el = list;
				list = list->ml_next;
				kmem_free(el, sizeof (struct memlist));
			}
			return (-1);
		}
	}
	memlist_read_unlock();

	mutex_enter(&memscrub_lock);
	memscrub_phys_pages = npgs;
	old_list = memscrub_memlist;
	memscrub_memlist = list;

	if (update_page_retire_list)
		memscrub_page_retire_span_list_update();

	mutex_exit(&memscrub_lock);

	while (old_list) {
		struct memlist *el;

		el = old_list;
		old_list = old_list->ml_next;
		kmem_free(el, sizeof (struct memlist));
	}

	return (0);
}

/*ARGSUSED*/
static void
memscrub_mem_config_post_add(
	void *arg,
	pgcnt_t delta_pages)
{
	/*
	 * We increment pause_memscrub before entering new_memscrub().  This
	 * will force the memscrubber to sleep, allowing the DR callback
	 * thread to acquire memscrub_lock in new_memscrub().  The use of
	 * atomic_add_32() allows concurrent memory DR operations to use the
	 * callbacks safely.
	 */
	atomic_inc_32(&pause_memscrub);
	ASSERT(pause_memscrub != 0);

	/*
	 * "Don't care" if we are not scrubbing new memory.
	 */
	(void) new_memscrub(0);		/* retain page retire list */

	/* Restore the pause setting. */
	atomic_dec_32(&pause_memscrub);
}

/*ARGSUSED*/
static int
memscrub_mem_config_pre_del(
	void *arg,
	pgcnt_t delta_pages)
{
	/* Nothing to do. */
	return (0);
}

/*ARGSUSED*/
static void
memscrub_mem_config_post_del(
	void *arg,
	pgcnt_t delta_pages,
	int cancelled)
{
	/*
	 * We increment pause_memscrub before entering new_memscrub().  This
	 * will force the memscrubber to sleep, allowing the DR callback
	 * thread to acquire memscrub_lock in new_memscrub().  The use of
	 * atomic_add_32() allows concurrent memory DR operations to use the
	 * callbacks safely.
	 */
	atomic_inc_32(&pause_memscrub);
	ASSERT(pause_memscrub != 0);

	/*
	 * Must stop scrubbing deleted memory as it may be disconnected.
	 */
	if (new_memscrub(1)) {	/* update page retire list */
		disable_memscrub = 1;
	}

	/* Restore the pause setting. */
	atomic_dec_32(&pause_memscrub);
}

static kphysm_setup_vector_t memscrub_mem_config_vec = {
	KPHYSM_SETUP_VECTOR_VERSION,
	memscrub_mem_config_post_add,
	memscrub_mem_config_pre_del,
	memscrub_mem_config_post_del,
};

static void
memscrub_init_mem_config(void)
{
	int ret;

	ret = kphysm_setup_func_register(&memscrub_mem_config_vec,
	    (void *)NULL);
	ASSERT(ret == 0);
}

static void
memscrub_uninit_mem_config(void)
{
	/* This call is OK if the register call was not done. */
	kphysm_setup_func_unregister(&memscrub_mem_config_vec, (void *)NULL);
}