8354 sync regcomp(3C) with upstream (fix make catalog)
[unleashed/tickless.git] / usr / src / uts / sun4u / os / memscrub.c
blob2fda07db9e5b649b883bf93ff25636ac066e3543
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
27 * sun4u Memory Scrubbing
29 * On detection of a correctable memory ECC error, the sun4u kernel
30 * returns the corrected data to the requester and re-writes it
31 * to memory (DRAM). So if the correctable error was transient,
32 * the error has effectively been cleaned (scrubbed) from memory.
34 * Scrubbing thus reduces the likelihood that multiple transient errors
35 * will occur in the same memory word, making uncorrectable errors due
36 * to transients less likely.
38 * Thus is born the desire that every memory location be periodically
39 * accessed.
41 * This file implements a memory scrubbing thread. This scrubber
42 * guarantees that all of physical memory is accessed periodically
43 * (memscrub_period_sec -- 12 hours).
45 * It attempts to do this as unobtrusively as possible. The thread
46 * schedules itself to wake up at an interval such that if it reads
47 * memscrub_span_pages (32MB) on each wakeup, it will read all of physical
48 * memory in memscrub_period_sec (12 hours).
50 * The scrubber uses the block load and prefetch hardware to read memory
51 * @ 1300MB/s, so it reads spans of 32MB in 0.025 seconds. Unlike the
52 * original sun4d scrubber the sun4u scrubber does not read ahead if the
53 * system is idle because we can read memory very efficiently.
55 * The scrubber maintains a private copy of the phys_install memory list
56 * to keep track of what memory should be scrubbed.
58 * The global routines memscrub_add_span() and memscrub_delete_span() are
59 * used to add and delete from this list. If hotplug memory is later
60 * supported these two routines can be used to notify the scrubber of
61 * memory configuration changes.
63 * The following parameters can be set via /etc/system
65 * memscrub_span_pages = MEMSCRUB_DFL_SPAN_PAGES (32MB)
66 * memscrub_period_sec = MEMSCRUB_DFL_PERIOD_SEC (12 hours)
67 * memscrub_thread_pri = MEMSCRUB_DFL_THREAD_PRI (MINCLSYSPRI)
68 * memscrub_delay_start_sec = (5 minutes)
69 * memscrub_verbose = (0)
70 * memscrub_override_ticks = (1 tick)
71 * disable_memscrub = (0)
72 * pause_memscrub = (0)
73 * read_all_memscrub = (0)
75 * The scrubber will print NOTICE messages of what it is doing if
76 * "memscrub_verbose" is set.
78 * If the scrubber's sleep time calculation drops to zero ticks,
79 * memscrub_override_ticks will be used as the sleep time instead. The
80 * sleep time should only drop to zero on a system with over 131.84
81 * terabytes of memory, or where the default scrubber parameters have
82 * been adjusted. For example, reducing memscrub_span_pages or
83 * memscrub_period_sec causes the sleep time to drop to zero with less
84 * memory. Note that since the sleep time is calculated in clock ticks,
85 * using hires clock ticks allows for more memory before the sleep time
86 * becomes zero.
88 * The scrubber will exit (or never be started) if it finds the variable
89 * "disable_memscrub" set.
91 * The scrubber will pause (not read memory) when "pause_memscrub"
92 * is set. It will check the state of pause_memscrub at each wakeup
93 * period. The scrubber will not make up for lost time. If you
94 * pause the scrubber for a prolonged period of time you can use
95 * the "read_all_memscrub" switch (see below) to catch up. In addition,
96 * pause_memscrub is used internally by the post memory DR callbacks.
97 * It is set for the small period of time during which the callbacks
98 * are executing. This ensures "memscrub_lock" will be released,
99 * allowing the callbacks to finish.
101 * The scrubber will read all memory if "read_all_memscrub" is set.
102 * The normal span read will also occur during the wakeup.
104 * MEMSCRUB_MIN_PAGES (32MB) is the minimum amount of memory a system
105 * must have before we'll start the scrubber.
107 * MEMSCRUB_DFL_SPAN_PAGES (32MB) is based on the guess that 0.025 sec
108 * is a "good" amount of minimum time for the thread to run at a time.
110 * MEMSCRUB_DFL_PERIOD_SEC (12 hours) is nearly a total guess --
111 * twice the frequency the hardware folk estimated would be necessary.
113 * MEMSCRUB_DFL_THREAD_PRI (MINCLSYSPRI) is based on the assumption
114 * that the scrubber should get its fair share of time (since it
115 * is short). At a priority of 0 the scrubber will be starved.
118 #include <sys/systm.h> /* timeout, types, t_lock */
119 #include <sys/cmn_err.h>
120 #include <sys/sysmacros.h> /* MIN */
121 #include <sys/memlist.h> /* memlist */
122 #include <sys/mem_config.h> /* memory add/delete */
123 #include <sys/kmem.h> /* KMEM_NOSLEEP */
124 #include <sys/cpuvar.h> /* ncpus_online */
125 #include <sys/debug.h> /* ASSERTs */
126 #include <sys/machsystm.h> /* lddphys */
127 #include <sys/cpu_module.h> /* vtag_flushpage */
128 #include <sys/kstat.h>
129 #include <sys/atomic.h> /* atomic_add_32 */
131 #include <vm/hat.h>
132 #include <vm/seg_kmem.h>
133 #include <vm/hat_sfmmu.h> /* XXX FIXME - delete */
135 #include <sys/time.h>
136 #include <sys/callb.h> /* CPR callback */
137 #include <sys/ontrap.h>
140 * Should really have paddr_t defined, but it is broken. Use
141 * ms_paddr_t in the meantime to make the code cleaner
143 typedef uint64_t ms_paddr_t;
146 * Global Routines:
148 int memscrub_add_span(pfn_t pfn, pgcnt_t pages);
149 int memscrub_delete_span(pfn_t pfn, pgcnt_t pages);
150 int memscrub_init(void);
151 void memscrub_induced_error(void);
154 * Global Data:
158 * scrub if we have at least this many pages
160 #define MEMSCRUB_MIN_PAGES (32 * 1024 * 1024 / PAGESIZE)
163 * scan all of physical memory at least once every MEMSCRUB_PERIOD_SEC
165 #define MEMSCRUB_DFL_PERIOD_SEC (12 * 60 * 60) /* 12 hours */
168 * scan at least MEMSCRUB_DFL_SPAN_PAGES each iteration
170 #define MEMSCRUB_DFL_SPAN_PAGES ((32 * 1024 * 1024) / PAGESIZE)
173 * almost anything is higher priority than scrubbing
175 #define MEMSCRUB_DFL_THREAD_PRI MINCLSYSPRI
178 * size used when scanning memory
180 #define MEMSCRUB_BLOCK_SIZE 256
181 #define MEMSCRUB_BLOCK_SIZE_SHIFT 8 /* log2(MEMSCRUB_BLOCK_SIZE) */
182 #define MEMSCRUB_BLOCKS_PER_PAGE (PAGESIZE >> MEMSCRUB_BLOCK_SIZE_SHIFT)
/*
 * Number of MEMSCRUB_BLOCK_SIZE blocks in one MMU page of each supported
 * size.  Every expansion is wrapped in parentheses so that the shift
 * cannot be re-associated with a neighbouring operator at the point of
 * use (e.g. "MEMSCRUB_BPP * 2" or "x - MEMSCRUB_BPP4M").
 */
#define	MEMSCRUB_BPP4M		(MMU_PAGESIZE4M >> MEMSCRUB_BLOCK_SIZE_SHIFT)
#define	MEMSCRUB_BPP512K	(MMU_PAGESIZE512K >> MEMSCRUB_BLOCK_SIZE_SHIFT)
#define	MEMSCRUB_BPP64K		(MMU_PAGESIZE64K >> MEMSCRUB_BLOCK_SIZE_SHIFT)
#define	MEMSCRUB_BPP		(MMU_PAGESIZE >> MEMSCRUB_BLOCK_SIZE_SHIFT)
190 * This message indicates that we have exceeded the limitations of
191 * the memscrubber. See the comments above regarding what would
192 * cause the sleep time to become zero. In DEBUG mode, this message
193 * is logged on the console and in the messages file. In non-DEBUG
194 * mode, it is only logged in the messages file.
196 #ifdef DEBUG
197 #define MEMSCRUB_OVERRIDE_MSG "Memory scrubber sleep time is zero " \
198 "seconds, consuming entire CPU."
199 #else
200 #define MEMSCRUB_OVERRIDE_MSG "!Memory scrubber sleep time is zero " \
201 "seconds, consuming entire CPU."
202 #endif /* DEBUG */
205 * we can patch these defaults in /etc/system if necessary
/* nonzero: scrubber thread exits (tested at the top of its loop) */
207 uint_t disable_memscrub = 0;
/* nonzero: scrubber keeps waking up but reads no memory */
208 uint_t pause_memscrub = 0;
/* nonzero: next wakeup reads every span once, then the flag is cleared */
209 uint_t read_all_memscrub = 0;
/* nonzero: emit CE_NOTE progress messages */
210 uint_t memscrub_verbose = 0;
/* NOTE(review): not referenced in the visible part of this file */
211 uint_t memscrub_all_idle = 0;
/* pages read per wakeup */
212 uint_t memscrub_span_pages = MEMSCRUB_DFL_SPAN_PAGES;
/* target time, in seconds, for one pass over all scrubbed memory */
213 uint_t memscrub_period_sec = MEMSCRUB_DFL_PERIOD_SEC;
/* priority handed to thread_create() for the scrubber thread */
214 uint_t memscrub_thread_pri = MEMSCRUB_DFL_THREAD_PRI;
/* seconds after thread start before the first pass deadline */
215 uint_t memscrub_delay_start_sec = 5 * 60;
/* sleep time used when the computed interval comes out as zero ticks */
216 uint_t memscrub_override_ticks = 1;
219 * Static Routines
221 static void memscrubber(void);
222 static void memscrub_cleanup(void);
223 static int memscrub_add_span_gen(pfn_t, pgcnt_t, struct memlist **, uint_t *);
224 static int memscrub_verify_span(ms_paddr_t *addrp, pgcnt_t *pagesp);
225 static void memscrub_scan(uint_t blks, ms_paddr_t src);
228 * Static Data
231 static struct memlist *memscrub_memlist;
232 static uint_t memscrub_phys_pages;
234 static kcondvar_t memscrub_cv;
235 static kmutex_t memscrub_lock;
237 * memscrub_lock protects memscrub_memlist, interval_ticks, cprinfo, ...
239 static void memscrub_init_mem_config(void);
240 static void memscrub_uninit_mem_config(void);
243 * Linked list of memscrub aware spans having retired pages.
244 * Currently enabled only on sun4u USIII-based platforms.
246 typedef struct memscrub_page_retire_span {
247 ms_paddr_t address;
248 struct memscrub_page_retire_span *next;
249 } memscrub_page_retire_span_t;
251 static memscrub_page_retire_span_t *memscrub_page_retire_span_list = NULL;
253 static void memscrub_page_retire_span_add(ms_paddr_t);
254 static void memscrub_page_retire_span_delete(ms_paddr_t);
255 static int memscrub_page_retire_span_search(ms_paddr_t);
256 static void memscrub_page_retire_span_list_update(void);
259 * add_to_page_retire_list: Set by cpu_async_log_err() routine
260 * by calling memscrub_induced_error() when CE/UE occurs on a retired
261 * page due to memscrub reading. Cleared by memscrub after updating
262 * global page retire span list. Piggybacking on protection of
263 * memscrub_lock, which is held during set and clear.
264 * Note: When cpu_async_log_err() calls memscrub_induced_error(), it is running
265 * on softint context, which gets fired on a cpu memscrub thread currently
266 * running. Memscrub thread has affinity set during memscrub_read(), hence
267 * migration to new cpu not expected.
269 static int add_to_page_retire_list = 0;
272 * Keep track of some interesting statistics
274 static struct memscrub_kstats {
275 kstat_named_t done_early; /* ahead of schedule */
276 kstat_named_t early_sec; /* by cumulative num secs */
277 kstat_named_t done_late; /* behind schedule */
278 kstat_named_t late_sec; /* by cumulative num secs */
279 kstat_named_t interval_ticks; /* num ticks between intervals */
280 kstat_named_t force_run; /* forced to run, non-timeout */
281 kstat_named_t errors_found; /* num errors found by memscrub */
282 } memscrub_counts = {
283 { "done_early", KSTAT_DATA_UINT32 },
284 { "early_sec", KSTAT_DATA_UINT32 },
285 { "done_late", KSTAT_DATA_UINT32 },
286 { "late_sec", KSTAT_DATA_UINT32 },
287 { "interval_ticks", KSTAT_DATA_UINT32 },
288 { "force_run", KSTAT_DATA_UINT32 },
289 { "errors_found", KSTAT_DATA_UINT32 },
/*
 * Helpers that update one 32-bit counter in memscrub_counts.  "stat" is
 * the member name, "val" the value to store or add.  Each expansion is a
 * single parenthesized expression so it behaves as one operand wherever
 * it is used.
 */
#define	MEMSCRUB_STAT_INC(stat)	\
	(memscrub_counts.stat.value.ui32++)
#define	MEMSCRUB_STAT_SET(stat, val)	\
	(memscrub_counts.stat.value.ui32 = (val))
#define	MEMSCRUB_STAT_NINC(stat, val)	\
	(memscrub_counts.stat.value.ui32 += (val))
296 static struct kstat *memscrub_ksp = (struct kstat *)NULL;
298 static timeout_id_t memscrub_tid = 0; /* keep track of timeout id */
301 * create memscrub_memlist from phys_install list
302 * initialize locks, set memscrub_phys_pages.
305 memscrub_init(void)
307 struct memlist *src;
310 * only startup the scrubber if we have a minimum
311 * number of pages
313 if (physinstalled >= MEMSCRUB_MIN_PAGES) {
316 * initialize locks
318 mutex_init(&memscrub_lock, NULL, MUTEX_DRIVER, NULL);
319 cv_init(&memscrub_cv, NULL, CV_DRIVER, NULL);
322 * copy phys_install to memscrub_memlist
324 for (src = phys_install; src; src = src->ml_next) {
325 if (memscrub_add_span(
326 (pfn_t)(src->ml_address >> PAGESHIFT),
327 (pgcnt_t)(src->ml_size >> PAGESHIFT))) {
328 memscrub_cleanup();
329 return (-1);
334 * initialize kstats
336 memscrub_ksp = kstat_create("unix", 0, "memscrub_kstat",
337 "misc", KSTAT_TYPE_NAMED,
338 sizeof (memscrub_counts) / sizeof (kstat_named_t),
339 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
341 if (memscrub_ksp) {
342 memscrub_ksp->ks_data = (void *)&memscrub_counts;
343 kstat_install(memscrub_ksp);
344 } else {
345 cmn_err(CE_NOTE, "Memscrubber cannot create kstats\n");
349 * create memscrubber thread
351 (void) thread_create(NULL, 0, (void (*)())memscrubber,
352 NULL, 0, &p0, TS_RUN, memscrub_thread_pri);
355 * We don't want call backs changing the list
356 * if there is no thread running. We do not
357 * attempt to deal with stopping/starting scrubbing
358 * on memory size changes.
360 memscrub_init_mem_config();
363 return (0);
366 static void
367 memscrub_cleanup(void)
369 memscrub_uninit_mem_config();
370 while (memscrub_memlist) {
371 (void) memscrub_delete_span(
372 (pfn_t)(memscrub_memlist->ml_address >> PAGESHIFT),
373 (pgcnt_t)(memscrub_memlist->ml_size >> PAGESHIFT));
375 if (memscrub_ksp)
376 kstat_delete(memscrub_ksp);
377 cv_destroy(&memscrub_cv);
378 mutex_destroy(&memscrub_lock);
#ifdef MEMSCRUB_DEBUG
/*
 * Debug aid: print "title" followed by the address and size of every
 * entry on listp.
 */
static void
memscrub_printmemlist(char *title, struct memlist *listp)
{
	struct memlist *ml = listp;

	cmn_err(CE_CONT, "%s:\n", title);

	while (ml != NULL) {
		cmn_err(CE_CONT, "addr = 0x%llx, size = 0x%llx\n",
		    ml->ml_address, ml->ml_size);
		ml = ml->ml_next;
	}
}
#endif /* MEMSCRUB_DEBUG */
396 /* ARGSUSED */
397 static void
398 memscrub_wakeup(void *c)
401 * grab mutex to guarantee that our wakeup call
402 * arrives after we go to sleep -- so we can't sleep forever.
404 mutex_enter(&memscrub_lock);
405 cv_signal(&memscrub_cv);
406 mutex_exit(&memscrub_lock);
410 * provide an interface external to the memscrubber
411 * which will force the memscrub thread to run vs.
412 * waiting for the timeout, if one is set
414 void
415 memscrub_run(void)
417 MEMSCRUB_STAT_INC(force_run);
418 if (memscrub_tid) {
419 (void) untimeout(memscrub_tid);
420 memscrub_wakeup((void *)NULL);
425 * this calculation doesn't account for the time
426 * that the actual scan consumes -- so we'd fall
427 * slightly behind schedule with this interval.
428 * It's very small.
431 static uint_t
432 compute_interval_ticks(void)
435 * We use msp_safe mpp_safe below to insure somebody
436 * doesn't set memscrub_span_pages or memscrub_phys_pages
437 * to 0 on us.
439 static uint_t msp_safe, mpp_safe;
440 static uint_t interval_ticks, period_ticks;
441 msp_safe = memscrub_span_pages;
442 mpp_safe = memscrub_phys_pages;
444 period_ticks = memscrub_period_sec * hz;
445 interval_ticks = period_ticks;
447 ASSERT(mutex_owned(&memscrub_lock));
449 if ((msp_safe != 0) && (mpp_safe != 0)) {
450 if (memscrub_phys_pages <= msp_safe) {
451 interval_ticks = period_ticks;
452 } else {
453 interval_ticks = (period_ticks /
454 (mpp_safe / msp_safe));
457 return (interval_ticks);
/*
 * Body of the memory scrubber thread.
 *
 * Registers with CPR, takes memscrub_lock and then loops until
 * disable_memscrub is set: it computes the sleep interval, arms a
 * timeout that calls memscrub_wakeup(), sleeps on memscrub_cv, and on
 * wakeup (unless pause_memscrub is set) reads the next span of
 * memscrub_span_pages.  If read_all_memscrub is set it first reads all
 * of memory once and clears the flag.  When a pass reaches the end of
 * the list, the next sleep is stretched to the pass deadline (finished
 * early) or skipped (finished late), and the done_early/done_late
 * kstats are updated.
 *
 * The thread exits if memscrub_memlist is NULL at startup, if
 * memscrub_phys_pages is 0 after a wakeup, or when disable_memscrub is
 * seen; on exit it calls memscrub_cleanup() and thread_exit().
 */
460 void
461 memscrubber(void)
463 ms_paddr_t address, addr;
464 time_t deadline;
465 pgcnt_t pages;
466 uint_t reached_end = 1;
467 uint_t paused_message = 0;
468 uint_t interval_ticks = 0;
469 uint_t sleep_warn_printed = 0;
470 callb_cpr_t cprinfo;
473 * notify CPR of our existence
475 CALLB_CPR_INIT(&cprinfo, &memscrub_lock, callb_generic_cpr, "memscrub");
477 mutex_enter(&memscrub_lock);
479 if (memscrub_memlist == NULL) {
480 cmn_err(CE_WARN, "memscrub_memlist not initialized.");
481 goto memscrub_exit;
484 address = memscrub_memlist->ml_address;
486 deadline = gethrestime_sec() + memscrub_delay_start_sec;
488 for (;;) {
489 if (disable_memscrub)
490 break;
493 * compute interval_ticks
495 interval_ticks = compute_interval_ticks();
498 * If the calculated sleep time is zero, and pause_memscrub
499 * has been set, make sure we sleep so that another thread
500 * can acquire memscrub_lock.
502 if (interval_ticks == 0 && pause_memscrub) {
503 interval_ticks = hz;
507 * And as a fail safe, under normal non-paused operation, do
508 * not allow the sleep time to be zero.
510 if (interval_ticks == 0) {
511 interval_ticks = memscrub_override_ticks;
512 if (!sleep_warn_printed) {
513 cmn_err(CE_NOTE, MEMSCRUB_OVERRIDE_MSG);
514 sleep_warn_printed = 1;
518 MEMSCRUB_STAT_SET(interval_ticks, interval_ticks);
521 * Did we just reach the end of memory? If we are at the
522 * end of memory, delay end of memory processing until
523 * pause_memscrub is not set.
525 if (reached_end && !pause_memscrub) {
526 time_t now = gethrestime_sec();
528 if (now >= deadline) {
529 MEMSCRUB_STAT_INC(done_late);
530 MEMSCRUB_STAT_NINC(late_sec, now - deadline);
532 * past deadline, start right away
534 interval_ticks = 0;
536 deadline = now + memscrub_period_sec;
537 } else {
539 * we finished ahead of schedule.
540 * wait till previous deadline before re-start.
542 interval_ticks = (deadline - now) * hz;
543 MEMSCRUB_STAT_INC(done_early);
544 MEMSCRUB_STAT_NINC(early_sec, deadline - now);
545 deadline += memscrub_period_sec;
547 reached_end = 0;
548 sleep_warn_printed = 0;
551 if (interval_ticks != 0) {
553 * it is safe from our standpoint for CPR to
554 * suspend the system
556 CALLB_CPR_SAFE_BEGIN(&cprinfo);
559 * hit the snooze bar
561 memscrub_tid = timeout(memscrub_wakeup, NULL,
562 interval_ticks);
565 * go to sleep
567 cv_wait(&memscrub_cv, &memscrub_lock);
570 * at this point, no timeout should be set
572 memscrub_tid = 0;
575 * we need to goto work and will be modifying
576 * our internal state and mapping/unmapping
577 * TTEs
579 CALLB_CPR_SAFE_END(&cprinfo, &memscrub_lock);
583 if (memscrub_phys_pages == 0) {
584 cmn_err(CE_WARN, "Memory scrubber has 0 pages to read");
585 goto memscrub_exit;
588 if (!pause_memscrub) {
589 if (paused_message) {
590 paused_message = 0;
591 if (memscrub_verbose)
592 cmn_err(CE_NOTE, "Memory scrubber "
593 "resuming");
596 if (read_all_memscrub) {
597 if (memscrub_verbose)
598 cmn_err(CE_NOTE, "Memory scrubber "
599 "reading all memory per request");
601 addr = memscrub_memlist->ml_address;
602 reached_end = 0;
603 while (!reached_end) {
604 if (disable_memscrub)
605 break;
606 pages = memscrub_phys_pages;
607 reached_end = memscrub_verify_span(
608 &addr, &pages);
609 memscrub_scan(pages *
610 MEMSCRUB_BLOCKS_PER_PAGE, addr);
611 addr += ((uint64_t)pages * PAGESIZE);
613 read_all_memscrub = 0;
617 * read 1 span
619 pages = memscrub_span_pages;
621 if (disable_memscrub)
622 break;
625 * determine physical address range
627 reached_end = memscrub_verify_span(&address,
628 &pages);
630 memscrub_scan(pages * MEMSCRUB_BLOCKS_PER_PAGE,
631 address);
633 address += ((uint64_t)pages * PAGESIZE);
636 if (pause_memscrub && !paused_message) {
637 paused_message = 1;
638 if (memscrub_verbose)
639 cmn_err(CE_NOTE, "Memory scrubber paused");
643 memscrub_exit:
644 cmn_err(CE_NOTE, "Memory scrubber exiting");
/*
 * NOTE(review): every path to this label still holds memscrub_lock.
 * Presumably CALLB_CPR_EXIT() releases it before memscrub_cleanup()
 * destroys the mutex -- confirm against <sys/callb.h>.
 */
645 CALLB_CPR_EXIT(&cprinfo);
646 memscrub_cleanup();
647 thread_exit();
648 /* NOTREACHED */
652 * condition address and size
653 * such that they span legal physical addresses.
655 * when appropriate, address will be rounded up to start of next
656 * struct memlist, and pages will be rounded down to the end of the
657 * memlist size.
659 * returns 1 if reached end of list, else returns 0.
661 static int
662 memscrub_verify_span(ms_paddr_t *addrp, pgcnt_t *pagesp)
664 struct memlist *mlp;
665 ms_paddr_t address = *addrp;
666 uint64_t bytes = (uint64_t)*pagesp * PAGESIZE;
667 uint64_t bytes_remaining;
668 int reached_end = 0;
670 ASSERT(mutex_owned(&memscrub_lock));
673 * find memlist struct that contains addrp
674 * assumes memlist is sorted by ascending address.
676 for (mlp = memscrub_memlist; mlp != NULL; mlp = mlp->ml_next) {
678 * if before this chunk, round up to beginning
680 if (address < mlp->ml_address) {
681 address = mlp->ml_address;
682 break;
685 * if before end of chunk, then we found it
687 if (address < (mlp->ml_address + mlp->ml_size))
688 break;
690 /* else go to next struct memlist */
693 * if we hit end of list, start at beginning
695 if (mlp == NULL) {
696 mlp = memscrub_memlist;
697 address = mlp->ml_address;
701 * now we have legal address, and its mlp, condition bytes
703 bytes_remaining = (mlp->ml_address + mlp->ml_size) - address;
705 if (bytes > bytes_remaining)
706 bytes = bytes_remaining;
709 * will this span take us to end of list?
711 if ((mlp->ml_next == NULL) &&
712 ((mlp->ml_address + mlp->ml_size) == (address + bytes)))
713 reached_end = 1;
715 /* return values */
716 *addrp = address;
717 *pagesp = bytes / PAGESIZE;
719 return (reached_end);
723 * add a span to the memscrub list
724 * add to memscrub_phys_pages
727 memscrub_add_span(pfn_t pfn, pgcnt_t pages)
729 #ifdef MEMSCRUB_DEBUG
730 ms_paddr_t address = (ms_paddr_t)pfn << PAGESHIFT;
731 uint64_t bytes = (uint64_t)pages << PAGESHIFT;
732 #endif /* MEMSCRUB_DEBUG */
734 int retval;
736 mutex_enter(&memscrub_lock);
738 #ifdef MEMSCRUB_DEBUG
739 memscrub_printmemlist("memscrub_memlist before", memscrub_memlist);
740 cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
741 cmn_err(CE_CONT, "memscrub_add_span: address: 0x%llx"
742 " size: 0x%llx\n", address, bytes);
743 #endif /* MEMSCRUB_DEBUG */
745 retval = memscrub_add_span_gen(pfn, pages, &memscrub_memlist,
746 &memscrub_phys_pages);
748 #ifdef MEMSCRUB_DEBUG
749 memscrub_printmemlist("memscrub_memlist after", memscrub_memlist);
750 cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
751 #endif /* MEMSCRUB_DEBUG */
753 mutex_exit(&memscrub_lock);
755 return (retval);
/*
 * Insert the span of "pages" pages starting at page frame "pfn" into
 * the address-sorted list *list, merging it with an entry it directly
 * abuts (before, after, or bridging two entries).
 *
 * Returns 0 on success, after adding "pages" to *npgs.  Returns -1,
 * leaving *npgs untouched, when the kmem_alloc() of a new entry fails
 * or when the span would overlap an existing entry.
 *
 * This routine takes no lock itself; memscrub_add_span() calls it with
 * memscrub_lock held.
 */
758 static int
759 memscrub_add_span_gen(
760 pfn_t pfn,
761 pgcnt_t pages,
762 struct memlist **list,
763 uint_t *npgs)
765 ms_paddr_t address = (ms_paddr_t)pfn << PAGESHIFT;
766 uint64_t bytes = (uint64_t)pages << PAGESHIFT;
767 struct memlist *dst;
768 struct memlist *prev, *next;
769 int retval = 0;
772 * allocate a new struct memlist
775 dst = (struct memlist *)
776 kmem_alloc(sizeof (struct memlist), KM_NOSLEEP);
778 if (dst == NULL) {
779 retval = -1;
780 goto add_done;
783 dst->ml_address = address;
784 dst->ml_size = bytes;
787 * first insert
789 if (*list == NULL) {
790 dst->ml_prev = NULL;
791 dst->ml_next = NULL;
792 *list = dst;
794 goto add_done;
798 * insert into sorted list
800 for (prev = NULL, next = *list;
801 next != NULL;
802 prev = next, next = next->ml_next) {
/* span starts beyond the end of this entry: keep walking */
803 if (address > (next->ml_address + next->ml_size))
804 continue;
807 * else insert here
811 * prepend to next
813 if ((address + bytes) == next->ml_address) {
/* the pre-allocated entry is not needed when merging */
814 kmem_free(dst, sizeof (struct memlist));
816 next->ml_address = address;
817 next->ml_size += bytes;
819 goto add_done;
823 * append to next
825 if (address == (next->ml_address + next->ml_size)) {
826 kmem_free(dst, sizeof (struct memlist));
828 if (next->ml_next) {
830 * don't overlap with next->ml_next
832 if ((address + bytes) >
833 next->ml_next->ml_address) {
834 retval = -1;
835 goto add_done;
838 * concatenate next and next->ml_next
840 if ((address + bytes) ==
841 next->ml_next->ml_address) {
842 struct memlist *mlp = next->ml_next;
844 if (next == *list)
845 *list = next->ml_next;
847 mlp->ml_address = next->ml_address;
848 mlp->ml_size += next->ml_size;
849 mlp->ml_size += bytes;
851 if (next->ml_prev)
852 next->ml_prev->ml_next = mlp;
853 mlp->ml_prev = next->ml_prev;
855 kmem_free(next,
856 sizeof (struct memlist));
857 goto add_done;
861 next->ml_size += bytes;
863 goto add_done;
866 /* don't overlap with next */
867 if ((address + bytes) > next->ml_address) {
868 retval = -1;
869 kmem_free(dst, sizeof (struct memlist));
870 goto add_done;
874 * insert before next
876 dst->ml_prev = prev;
877 dst->ml_next = next;
878 next->ml_prev = dst;
879 if (prev == NULL) {
880 *list = dst;
881 } else {
882 prev->ml_next = dst;
884 goto add_done;
885 } /* end for */
888 * end of list, prev is valid and next is NULL
890 prev->ml_next = dst;
891 dst->ml_prev = prev;
892 dst->ml_next = NULL;
894 add_done:
/* the page count is credited only when the span was really added */
896 if (retval != -1)
897 *npgs += pages;
899 return (retval);
903 * delete a span from the memscrub list
904 * subtract from memscrub_phys_pages
/*
 * Remove the span of "pages" pages starting at page frame "pfn" from
 * memscrub_memlist.  The span must lie entirely inside one existing
 * entry; it may be the whole entry, its head, its tail, or a piece out
 * of the middle (which splits the entry in two and needs a kmem_alloc).
 *
 * Returns 0 on success and -1 if the start address is not on the list,
 * the span runs past the end of its entry, or the allocation for a
 * split fails.  On success memscrub_phys_pages is reduced by "pages",
 * and disable_memscrub is set once it reaches zero.
 *
 * Takes and releases memscrub_lock.
 */
907 memscrub_delete_span(pfn_t pfn, pgcnt_t pages)
909 ms_paddr_t address = (ms_paddr_t)pfn << PAGESHIFT;
910 uint64_t bytes = (uint64_t)pages << PAGESHIFT;
911 struct memlist *dst, *next;
912 int retval = 0;
914 mutex_enter(&memscrub_lock);
916 #ifdef MEMSCRUB_DEBUG
917 memscrub_printmemlist("memscrub_memlist Before", memscrub_memlist);
918 cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
919 cmn_err(CE_CONT, "memscrub_delete_span: 0x%llx 0x%llx\n",
920 address, bytes);
921 #endif /* MEMSCRUB_DEBUG */
924 * find struct memlist containing page
926 for (next = memscrub_memlist; next != NULL; next = next->ml_next) {
927 if ((address >= next->ml_address) &&
928 (address < next->ml_address + next->ml_size))
929 break;
933 * if start address not in list
935 if (next == NULL) {
936 retval = -1;
937 goto delete_done;
941 * error if size goes off end of this struct memlist
943 if (address + bytes > next->ml_address + next->ml_size) {
944 retval = -1;
945 goto delete_done;
949 * pages at beginning of struct memlist
951 if (address == next->ml_address) {
953 * if start & size match, delete from list
955 if (bytes == next->ml_size) {
956 if (next == memscrub_memlist)
957 memscrub_memlist = next->ml_next;
958 if (next->ml_prev != NULL)
959 next->ml_prev->ml_next = next->ml_next;
960 if (next->ml_next != NULL)
961 next->ml_next->ml_prev = next->ml_prev;
963 kmem_free(next, sizeof (struct memlist));
964 } else {
966 * increment start address by bytes
968 next->ml_address += bytes;
969 next->ml_size -= bytes;
971 goto delete_done;
975 * pages at end of struct memlist
977 if (address + bytes == next->ml_address + next->ml_size) {
979 * decrement size by bytes
981 next->ml_size -= bytes;
982 goto delete_done;
986 * delete a span in the middle of the struct memlist
990 * create a new struct memlist
992 dst = (struct memlist *)
993 kmem_alloc(sizeof (struct memlist), KM_NOSLEEP);
995 if (dst == NULL) {
996 retval = -1;
997 goto delete_done;
1001 * existing struct memlist gets address
1002 * and size up to pfn
/*
 * dst (the new entry) takes the part after the deleted span; its size
 * must be computed before next->ml_size is shrunk on the following
 * line.
 */
1004 dst->ml_address = address + bytes;
1005 dst->ml_size =
1006 (next->ml_address + next->ml_size) - dst->ml_address;
1007 next->ml_size = address - next->ml_address;
1010 * new struct memlist gets address starting
1011 * after pfn, until end
1015 * link in new memlist after old
1017 dst->ml_next = next->ml_next;
1018 dst->ml_prev = next;
1020 if (next->ml_next != NULL)
1021 next->ml_next->ml_prev = dst;
1022 next->ml_next = dst;
1025 delete_done:
1026 if (retval != -1) {
1027 memscrub_phys_pages -= pages;
/* nothing left to scrub: tell the scrubber thread to exit */
1028 if (memscrub_phys_pages == 0)
1029 disable_memscrub = 1;
1032 #ifdef MEMSCRUB_DEBUG
1033 memscrub_printmemlist("memscrub_memlist After", memscrub_memlist);
1034 cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
1035 #endif /* MEMSCRUB_DEBUG */
1037 mutex_exit(&memscrub_lock);
1038 return (retval);
1041 static void
1042 memscrub_scan(uint_t blks, ms_paddr_t src)
1044 uint_t psz, bpp, pgsread;
1045 pfn_t pfn;
1046 ms_paddr_t pa;
1047 caddr_t va;
1048 on_trap_data_t otd;
1049 int scan_mmu_pagesize = 0;
1050 int retired_pages = 0;
1052 extern void memscrub_read(caddr_t src, uint_t blks);
1054 ASSERT(mutex_owned(&memscrub_lock));
1056 pgsread = 0;
1057 pa = src;
1059 if (memscrub_page_retire_span_list != NULL) {
1060 if (memscrub_page_retire_span_search(src)) {
1061 /* retired pages in current span */
1062 scan_mmu_pagesize = 1;
1066 #ifdef MEMSCRUB_DEBUG
1067 cmn_err(CE_NOTE, "scan_mmu_pagesize = %d\n" scan_mmu_pagesize);
1068 #endif /* MEMSCRUB_DEBUG */
1070 while (blks != 0) {
1071 /* Ensure the PA is properly aligned */
1072 if (((pa & MMU_PAGEMASK4M) == pa) &&
1073 (blks >= MEMSCRUB_BPP4M)) {
1074 psz = MMU_PAGESIZE4M;
1075 bpp = MEMSCRUB_BPP4M;
1076 } else if (((pa & MMU_PAGEMASK512K) == pa) &&
1077 (blks >= MEMSCRUB_BPP512K)) {
1078 psz = MMU_PAGESIZE512K;
1079 bpp = MEMSCRUB_BPP512K;
1080 } else if (((pa & MMU_PAGEMASK64K) == pa) &&
1081 (blks >= MEMSCRUB_BPP64K)) {
1082 psz = MMU_PAGESIZE64K;
1083 bpp = MEMSCRUB_BPP64K;
1084 } else if ((pa & MMU_PAGEMASK) == pa) {
1085 psz = MMU_PAGESIZE;
1086 bpp = MEMSCRUB_BPP;
1087 } else {
1088 if (memscrub_verbose) {
1089 cmn_err(CE_NOTE, "Memory scrubber ignoring "
1090 "non-page aligned block starting at 0x%"
1091 PRIx64, src);
1093 return;
1095 if (blks < bpp) bpp = blks;
1097 #ifdef MEMSCRUB_DEBUG
1098 cmn_err(CE_NOTE, "Going to run psz=%x, "
1099 "bpp=%x pa=%llx\n", psz, bpp, pa);
1100 #endif /* MEMSCRUB_DEBUG */
1103 * MEMSCRUBBASE is a 4MB aligned page in the
1104 * kernel so that we can quickly map the PA
1105 * to a VA for the block loads performed in
1106 * memscrub_read.
1108 pfn = mmu_btop(pa);
1109 va = (caddr_t)MEMSCRUBBASE;
1110 hat_devload(kas.a_hat, va, psz, pfn, PROT_READ,
1111 HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK);
1114 * Can't allow the memscrubber to migrate across CPUs as
1115 * we need to know whether CEEN is enabled for the current
1116 * CPU to enable us to scrub the memory. Don't use
1117 * kpreempt_disable as the time we take to scan a span (even
1118 * without cpu_check_ce having to manually cpu_check_block)
1119 * is too long to hold a higher priority thread (eg, RT)
1120 * off cpu.
1122 thread_affinity_set(curthread, CPU_CURRENT);
1125 * Protect read scrub from async faults. For now, we simply
1126 * maintain a count of such faults caught.
1129 if (!on_trap(&otd, OT_DATA_EC) && !scan_mmu_pagesize) {
1130 memscrub_read(va, bpp);
1132 * Check if CEs require logging
1134 cpu_check_ce(SCRUBBER_CEEN_CHECK,
1135 (uint64_t)pa, va, psz);
1136 no_trap();
1137 thread_affinity_clear(curthread);
1138 } else {
1139 no_trap();
1140 thread_affinity_clear(curthread);
1143 * Got an async error..
1144 * Try rescanning it at MMU_PAGESIZE
1145 * granularity if we were trying to
1146 * read at a larger page size.
1147 * This is to ensure we continue to
1148 * scan the rest of the span.
1149 * OR scanning MMU_PAGESIZE granularity to avoid
1150 * reading retired pages memory when scan_mmu_pagesize
1151 * is set.
1153 if (psz > MMU_PAGESIZE || scan_mmu_pagesize) {
1154 caddr_t vaddr = va;
1155 ms_paddr_t paddr = pa;
1156 int tmp = 0;
1157 for (; tmp < bpp; tmp += MEMSCRUB_BPP) {
1158 /* Don't scrub retired pages */
1159 if (page_retire_check(paddr, NULL)
1160 == 0) {
1161 vaddr += MMU_PAGESIZE;
1162 paddr += MMU_PAGESIZE;
1163 retired_pages++;
1164 continue;
1166 thread_affinity_set(curthread,
1167 CPU_CURRENT);
1168 if (!on_trap(&otd, OT_DATA_EC)) {
1169 memscrub_read(vaddr,
1170 MEMSCRUB_BPP);
1171 cpu_check_ce(
1172 SCRUBBER_CEEN_CHECK,
1173 (uint64_t)paddr, vaddr,
1174 MMU_PAGESIZE);
1175 no_trap();
1176 } else {
1177 no_trap();
1178 MEMSCRUB_STAT_INC(errors_found);
1180 thread_affinity_clear(curthread);
1181 vaddr += MMU_PAGESIZE;
1182 paddr += MMU_PAGESIZE;
1186 hat_unload(kas.a_hat, va, psz, HAT_UNLOAD_UNLOCK);
1188 blks -= bpp;
1189 pa += psz;
1190 pgsread++;
1194 * If just finished scrubbing MMU_PAGESIZE at a time, but no retired
1195 * pages found so delete span from global list.
1197 if (scan_mmu_pagesize && retired_pages == 0)
1198 memscrub_page_retire_span_delete(src);
1201 * Encountered CE/UE on a retired page during memscrub read of current
1202 * span. Adding span to global list to enable avoid reading further.
1204 if (add_to_page_retire_list) {
1205 if (!memscrub_page_retire_span_search(src))
1206 memscrub_page_retire_span_add(src);
1207 add_to_page_retire_list = 0;
1210 if (memscrub_verbose) {
1211 cmn_err(CE_NOTE, "Memory scrubber read 0x%x pages starting "
1212 "at 0x%" PRIx64, pgsread, src);
1217 * Called by cpu_async_log_err() when memscrub read causes
1218 * CE/UE on a retired page.
/*
 * Only raises add_to_page_retire_list; memscrub_scan() tests the flag
 * after its read loop, adds the span it just scanned to the retired-page
 * span list if needed, and clears the flag.  No lock is taken here.
 */
1220 void
1221 memscrub_induced_error(void)
1223 add_to_page_retire_list = 1;
1227 * Called by page_retire() when toxic pages cannot be retired
1228 * immediately and are scheduled for retire. Memscrubber stops
1229 * scrubbing them to avoid further CE/UEs.
1231 void
1232 memscrub_notify(ms_paddr_t pa)
1234 mutex_enter(&memscrub_lock);
1235 if (!memscrub_page_retire_span_search(pa))
1236 memscrub_page_retire_span_add(pa);
1237 mutex_exit(&memscrub_lock);
1241 * Called by memscrub_scan() and memscrub_notify().
1242 * pa: physical address of span with CE/UE, add to global list.
1244 static void
1245 memscrub_page_retire_span_add(ms_paddr_t pa)
1247 memscrub_page_retire_span_t *new_span;
1249 new_span = (memscrub_page_retire_span_t *)
1250 kmem_zalloc(sizeof (memscrub_page_retire_span_t), KM_NOSLEEP);
1252 if (new_span == NULL) {
1253 #ifdef MEMSCRUB_DEBUG
1254 cmn_err(CE_NOTE, "failed to allocate new span - span with"
1255 " retired page/s not tracked.\n");
1256 #endif /* MEMSCRUB_DEBUG */
1257 return;
1260 new_span->address = pa;
1261 new_span->next = memscrub_page_retire_span_list;
1262 memscrub_page_retire_span_list = new_span;
1266 * Called by memscrub_scan().
1267 * pa: physical address of span to be removed from global list.
1269 static void
1270 memscrub_page_retire_span_delete(ms_paddr_t pa)
1272 memscrub_page_retire_span_t *prev_span, *next_span;
1274 prev_span = memscrub_page_retire_span_list;
1275 next_span = memscrub_page_retire_span_list->next;
1277 if (pa == prev_span->address) {
1278 memscrub_page_retire_span_list = next_span;
1279 kmem_free(prev_span, sizeof (memscrub_page_retire_span_t));
1280 return;
1283 while (next_span) {
1284 if (pa == next_span->address) {
1285 prev_span->next = next_span->next;
1286 kmem_free(next_span,
1287 sizeof (memscrub_page_retire_span_t));
1288 return;
1290 prev_span = next_span;
1291 next_span = next_span->next;
1296 * Called by memscrub_scan() and memscrub_notify().
1297 * pa: physical address of span to be searched in global list.
1299 static int
1300 memscrub_page_retire_span_search(ms_paddr_t pa)
1302 memscrub_page_retire_span_t *next_span = memscrub_page_retire_span_list;
1304 while (next_span) {
1305 if (pa == next_span->address)
1306 return (1);
1307 next_span = next_span->next;
1309 return (0);
1313 * Called from new_memscrub() as a result of memory delete.
1314 * Using page_numtopp_nolock() to determine if we have valid PA.
1316 static void
1317 memscrub_page_retire_span_list_update(void)
1319 memscrub_page_retire_span_t *prev, *cur, *next;
1321 if (memscrub_page_retire_span_list == NULL)
1322 return;
1324 prev = cur = memscrub_page_retire_span_list;
1325 next = cur->next;
1327 while (cur) {
1328 if (page_numtopp_nolock(mmu_btop(cur->address)) == NULL) {
1329 if (cur == memscrub_page_retire_span_list) {
1330 memscrub_page_retire_span_list = next;
1331 kmem_free(cur,
1332 sizeof (memscrub_page_retire_span_t));
1333 prev = cur = memscrub_page_retire_span_list;
1334 } else {
1335 prev->next = cur->next;
1336 kmem_free(cur,
1337 sizeof (memscrub_page_retire_span_t));
1338 cur = next;
1340 } else {
1341 prev = cur;
1342 cur = next;
1344 if (cur != NULL)
1345 next = cur->next;
1350 * The memory add/delete callback mechanism does not pass in the
1351 * page ranges. The phys_install list has been updated though, so
1352 * create a new scrub list from it.
1355 static int
1356 new_memscrub(int update_page_retire_list)
1358 struct memlist *src, *list, *old_list;
1359 uint_t npgs;
1362 * copy phys_install to memscrub_memlist
1364 list = NULL;
1365 npgs = 0;
1366 memlist_read_lock();
1367 for (src = phys_install; src; src = src->ml_next) {
1368 if (memscrub_add_span_gen((pfn_t)(src->ml_address >> PAGESHIFT),
1369 (pgcnt_t)(src->ml_size >> PAGESHIFT), &list, &npgs)) {
1370 memlist_read_unlock();
1371 while (list) {
1372 struct memlist *el;
1374 el = list;
1375 list = list->ml_next;
1376 kmem_free(el, sizeof (struct memlist));
1378 return (-1);
1381 memlist_read_unlock();
1383 mutex_enter(&memscrub_lock);
1384 memscrub_phys_pages = npgs;
1385 old_list = memscrub_memlist;
1386 memscrub_memlist = list;
1388 if (update_page_retire_list)
1389 memscrub_page_retire_span_list_update();
1391 mutex_exit(&memscrub_lock);
1393 while (old_list) {
1394 struct memlist *el;
1396 el = old_list;
1397 old_list = old_list->ml_next;
1398 kmem_free(el, sizeof (struct memlist));
1401 return (0);
1404 /*ARGSUSED*/
1405 static void
1406 memscrub_mem_config_post_add(
1407 void *arg,
1408 pgcnt_t delta_pages)
1411 * We increment pause_memscrub before entering new_memscrub(). This
1412 * will force the memscrubber to sleep, allowing the DR callback
1413 * thread to acquire memscrub_lock in new_memscrub(). The use of
1414 * atomic_add_32() allows concurrent memory DR operations to use the
1415 * callbacks safely.
1417 atomic_inc_32(&pause_memscrub);
1418 ASSERT(pause_memscrub != 0);
1421 * "Don't care" if we are not scrubbing new memory.
1423 (void) new_memscrub(0); /* retain page retire list */
1425 /* Restore the pause setting. */
1426 atomic_dec_32(&pause_memscrub);
1429 /*ARGSUSED*/
1430 static int
1431 memscrub_mem_config_pre_del(
1432 void *arg,
1433 pgcnt_t delta_pages)
1435 /* Nothing to do. */
1436 return (0);
1439 /*ARGSUSED*/
1440 static void
1441 memscrub_mem_config_post_del(
1442 void *arg,
1443 pgcnt_t delta_pages,
1444 int cancelled)
1447 * We increment pause_memscrub before entering new_memscrub(). This
1448 * will force the memscrubber to sleep, allowing the DR callback
1449 * thread to acquire memscrub_lock in new_memscrub(). The use of
1450 * atomic_add_32() allows concurrent memory DR operations to use the
1451 * callbacks safely.
1453 atomic_inc_32(&pause_memscrub);
1454 ASSERT(pause_memscrub != 0);
1457 * Must stop scrubbing deleted memory as it may be disconnected.
1459 if (new_memscrub(1)) { /* update page retire list */
1460 disable_memscrub = 1;
1463 /* Restore the pause setting. */
1464 atomic_dec_32(&pause_memscrub);
1467 static kphysm_setup_vector_t memscrub_mem_config_vec = {
1468 KPHYSM_SETUP_VECTOR_VERSION,
1469 memscrub_mem_config_post_add,
1470 memscrub_mem_config_pre_del,
1471 memscrub_mem_config_post_del,
1474 static void
1475 memscrub_init_mem_config()
1477 int ret;
1479 ret = kphysm_setup_func_register(&memscrub_mem_config_vec,
1480 (void *)NULL);
1481 ASSERT(ret == 0);
1484 static void
1485 memscrub_uninit_mem_config()
1487 /* This call is OK if the register call was not done. */
1488 kphysm_setup_func_unregister(&memscrub_mem_config_vec, (void *)NULL);