 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 * Copyright 2017 Joyent, Inc.
 * Copyright 2017 James S Blachly, MD <james.blachly@gmail.com>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/memlist.h>
#include <sys/bootconf.h>

#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kp.h>
#include <vm/seg_kpm.h>

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/modctl.h>
#include <sys/memlist.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/fm/protocol.h>
extern int cpu_get_mem_name(uint64_t, uint64_t *, uint64_t, char *, int, int *);
extern int cpu_get_mem_info(uint64_t, uint64_t, uint64_t *, uint64_t *,
    uint64_t *, int *, int *, int *);
extern size_t cpu_get_name_bufsize(void);
extern int cpu_get_mem_sid(char *, char *, int, int *);
extern int cpu_get_mem_addr(char *, char *, uint64_t, uint64_t *);
#include <sys/cpu_module.h>
 * Turn a byte length into a pagecount.  The DDI btop takes a
 * 32-bit size on 32-bit machines; this handles 64-bit sizes for
 * large physical-memory 32-bit machines.
#define BTOP(x) ((pgcnt_t)((x) >> _pageshift))
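/*
 * Worked example (assuming the common 4 KB page size, i.e. _pageshift == 12):
 * BTOP(0x100001000ULL) yields 0x100001 pages.  A length slightly over 4 GB
 * like this cannot be represented in the 32-bit size that the plain DDI
 * btop() takes on a 32-bit machine, which is why this wider macro exists.
 */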
static kmutex_t mm_lock;
static caddr_t mm_map;

static dev_info_t *mm_dip;      /* private copy of devinfo pointer */

static int mm_kmem_io_access;

static int mm_kstat_update(kstat_t *ksp, int rw);
static int mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw);

static int mm_read_mem_name(intptr_t data, mem_name_t *mem_name);

#define MM_KMEMLOG_NENTRIES 64

static int mm_kmemlogent;
static mm_logentry_t mm_kmemlog[MM_KMEMLOG_NENTRIES];
 * On kmem/allmem writes, we log information that might be useful in the event
 * that a write is errant (that is, due to operator error) and induces a later
 * problem.  Note that (in particular) in the event of such operator-induced
 * corruption, a search over the kernel address space for the corrupted
 * address will yield the ring buffer entry that recorded the write.  And
 * should it seem baroque or otherwise unnecessary, yes, we need this kind of
 * auditing facility and yes, we learned that the hard way:  disturbingly,
 * there exist recommendations for "tuning" the system that involve writing to
 * kernel memory addresses via the kernel debugger, and -- as we discovered --
 * these can easily be applied incorrectly or unsafely, yielding an entirely
 * undebuggable "can't happen" kind of panic.
mm_logkmem(struct uio *uio)
        proc_t *p = curthread->t_procp;

        mutex_enter(&mm_lock);

        ent = &mm_kmemlog[mm_kmemlogent++];

        if (mm_kmemlogent == MM_KMEMLOG_NENTRIES)

        ent->mle_vaddr = (uintptr_t)uio->uio_loffset;
        ent->mle_len = uio->uio_resid;
        gethrestime(&ent->mle_hrestime);
        ent->mle_hrtime = gethrtime();
        ent->mle_pid = p->p_pidp->pid_id;

        (void) strncpy(ent->mle_psargs,
            p->p_user.u_psargs, sizeof (ent->mle_psargs));

        mutex_exit(&mm_lock);
mm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
160 { "mem", M_MEM
, 0, NULL
, "all", 0640 },
161 { "kmem", M_KMEM
, 0, NULL
, "all", 0640 },
162 { "allkmem", M_ALLKMEM
, 0, "all", "all", 0600 },
163 { "null", M_NULL
, PRIVONLY_DEV
, NULL
, NULL
, 0666 },
164 { "zero", M_ZERO
, PRIVONLY_DEV
, NULL
, NULL
, 0666 },
165 { "full", M_FULL
, PRIVONLY_DEV
, NULL
, NULL
, 0666 },
        mutex_init(&mm_lock, NULL, MUTEX_DEFAULT, NULL);
        mm_map = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);

        for (i = 0; i < (sizeof (mm) / sizeof (mm[0])); i++) {
                if (ddi_create_priv_minor_node(devi, mm[i].name, S_IFCHR,
                    mm[i].minor, DDI_PSEUDO, mm[i].privonly,
                    mm[i].rdpriv, mm[i].wrpriv, mm[i].priv_mode) ==
                        ddi_remove_minor_node(devi, NULL);
                        return (DDI_FAILURE);

        ksp = kstat_create("mm", 0, "phys_installed", "misc",
            KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
                ksp->ks_update = mm_kstat_update;
                ksp->ks_snapshot = mm_kstat_snapshot;
                ksp->ks_lock = &mm_lock;        /* XXX - not really needed */

        mm_kmem_io_access = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
            "kmem_io_access", 0);

        return (DDI_SUCCESS);
mm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
        case DDI_INFO_DEVT2DEVINFO:
                *result = (void *)mm_dip;
        case DDI_INFO_DEVT2INSTANCE:
mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
        switch (getminor(*devp)) {
        /* standard devices */
        /* Unsupported or unknown type */
        /* must be character device */
struct pollhead mm_pollhd;

mmchpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
        switch (getminor(dev)) {
                *reventsp = events & (POLLIN | POLLOUT | POLLPRI | POLLRDNORM |
                    POLLWRNORM | POLLRDBAND | POLLWRBAND);

                 * A non-NULL pollhead pointer should be returned in case
                 * user polls for 0 events or is doing an edge-triggered poll.
                if ((!*reventsp && !anyyet) || (events & POLLET))

        /* no other devices currently support polling */
mmpropop(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
    char *name, caddr_t valuep, int *lengthp)
         * implement zero size to reduce overhead (avoid two failing
         * property lookups per stat).
        return (ddi_prop_op_size(dev, dip, prop_op,
            flags, name, valuep, lengthp, 0));
mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio,
        int is_memory = pf_is_memory(pfn);
        size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
            (size_t)uio->uio_iov->iov_len);

        mutex_enter(&mm_lock);

        if (is_memory && kpm_enable) {
                        va = hat_kpm_mapin(pp, NULL);
                        va = hat_kpm_mapin_pfn(pfn);

                hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
                    (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ|PROT_WRITE),
                    HAT_LOAD_NOCONSIST|HAT_LOAD_LOCK);

                        size_t c = uio->uio_iov->iov_len;

                        if (ddi_peekpokeio(NULL, uio, rw,
                            (caddr_t)(uintptr_t)uio->uio_loffset, c,
                            sizeof (int32_t)) != DDI_SUCCESS)

                error = uiomove(va + pageoff, nbytes, rw, uio);

                hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
                        hat_kpm_mapout(pp, NULL, va);
                        hat_kpm_mapout_pfn(pfn);

        mutex_exit(&mm_lock);
mmpagelock(struct as *as, caddr_t va)
        AS_LOCK_ENTER(as, RW_READER);
        seg = as_segat(as, va);
        i = (seg != NULL)? segop_capable(seg, S_CAPABILITY_NOMINFLT) : 0;
#define NEED_LOCK_KVADDR(va) 0
mmrw(dev_t dev, struct uio *uio, enum uio_rw rw, cred_t *cred)
        ssize_t oresid = uio->uio_resid;
        minor_t minor = getminor(dev);

        while (uio->uio_resid > 0 && error == 0) {
                if (iov->iov_len == 0) {
                        if (uio->uio_iovcnt < 0)

                        if (!address_in_memlist(phys_install,
                            (uint64_t)uio->uio_loffset, 1)) {
                                memlist_read_unlock();
                        memlist_read_unlock();

                        v = BTOP((uoff_t)uio->uio_loffset);
                        error = mmio(uio, rw, v,
                            uio->uio_loffset & PAGEOFFSET, 0, NULL);

                        caddr_t vaddr = (caddr_t)uio->uio_offset;
                        int try_lock = NEED_LOCK_KVADDR(vaddr);

                        if ((error = plat_mem_do_mmio(uio, rw)) != ENOTSUP)

                         * If vaddr does not map a valid page, as_pagelock()
                         * will return failure. Hence we can't check the
                         * return value and return EFAULT here as we'd like.
                         * seg_kp and seg_kpm do not properly support
                         * as_pagelock() for this context so we avoid it
                         * using the try_lock set check above. Some day when
                         * the kernel page locking gets redesigned all this
                         * muck can be cleaned up.
                        locked = (as_pagelock(&kas, &ppp, vaddr,
                            PAGESIZE, S_WRITE) == 0);

                        v = hat_getpfnum(kas.a_hat,
                            (caddr_t)(uintptr_t)uio->uio_loffset);
                        if (v == PFN_INVALID) {
                                        as_pageunlock(&kas, ppp, vaddr,

                        error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET,
                            minor == M_ALLKMEM || mm_kmem_io_access,
                            (locked && ppp) ? *ppp : NULL);

                                as_pageunlock(&kas, ppp, vaddr, PAGESIZE,

                if (rw == UIO_WRITE) {
                        /* else it's a read, fall through to zero case */

                if (rw == UIO_READ) {
                        if (on_fault(&ljb)) {
                        uzero(iov->iov_base, iov->iov_len);

                        uio->uio_resid -= iov->iov_len;
                        uio->uio_loffset += iov->iov_len;
                        /* else it's a write, fall through to NULL case */

                uio->uio_loffset += c;

        return (uio->uio_resid == oresid ? error : 0);
mmread(dev_t dev, struct uio *uio, cred_t *cred)
        return (mmrw(dev, uio, UIO_READ, cred));

mmwrite(dev_t dev, struct uio *uio, cred_t *cred)
        return (mmrw(dev, uio, UIO_WRITE, cred));
 * Private ioctl for libkvm to support kvm_physaddr().
 * Given an address space and a VA, compute the PA.  (An illustrative
 * user-level sketch of invoking this ioctl follows the function below.)
mmioctl_vtop(intptr_t data)
        pfn_t pfn = (pfn_t)PFN_INVALID;

        if (get_udatamodel() == DATAMODEL_NATIVE) {
                if (copyin((void *)data, &mem_vtop, sizeof (mem_vtop_t)))

                if (copyin((void *)data, &vtop32, sizeof (mem_vtop32_t)))

                mem_vtop.m_as = (struct as *)(uintptr_t)vtop32.m_as;
                mem_vtop.m_va = (void *)(uintptr_t)vtop32.m_va;

        if (mem_vtop.m_as != NULL)

        if (mem_vtop.m_as == &kas) {
                pfn = hat_getpfnum(kas.a_hat, mem_vtop.m_va);
                if (mem_vtop.m_as == NULL) {
                         * Assume the calling process's address space if the
                         * caller didn't specify one.
                        p = curthread->t_procp;
                        mem_vtop.m_as = p->p_as;

                mutex_enter(&pidlock);
                for (p = practive; p != NULL; p = p->p_next) {
                        if (p->p_as == mem_vtop.m_as) {
                mutex_exit(&pidlock);

                if (as == mem_vtop.m_as) {
                        mutex_exit(&p->p_lock);
                        AS_LOCK_ENTER(as, RW_READER);
                        for (seg = AS_SEGFIRST(as); seg != NULL;
                            seg = AS_SEGNEXT(as, seg))
                                if ((uintptr_t)mem_vtop.m_va -
                                    (uintptr_t)seg->s_base < seg->s_size)
                                pfn = hat_getpfnum(as->a_hat, mem_vtop.m_va);
                        mutex_enter(&p->p_lock);

        mem_vtop.m_pfn = pfn;
        if (pfn == PFN_INVALID)

        if (get_udatamodel() == DATAMODEL_NATIVE) {
                if (copyout(&mem_vtop, (void *)data, sizeof (mem_vtop_t)))

                vtop32.m_pfn = mem_vtop.m_pfn;
                if (copyout(&vtop32, (void *)data, sizeof (mem_vtop32_t)))
 * Given a PA, execute the given page retire command on it.

mmioctl_page_retire(int cmd, intptr_t data)
        extern int page_retire_test(void);

        if (copyin((void *)data, &pa, sizeof (uint64_t))) {

        case MEM_PAGE_ISRETIRED:
                return (page_retire_check(pa, NULL));

        case MEM_PAGE_UNRETIRE:
                return (page_unretire(pa));

        case MEM_PAGE_RETIRE:
                return (page_retire(pa, PR_FMA));

        case MEM_PAGE_RETIRE_MCE:
                return (page_retire(pa, PR_MCE));

        case MEM_PAGE_RETIRE_UE:
                return (page_retire(pa, PR_UE));

        case MEM_PAGE_GETERRORS:
                uint64_t page_errors;
                int rc = page_retire_check(pa, &page_errors);
                if (copyout(&page_errors, (void *)data,
                    sizeof (uint64_t))) {

        case MEM_PAGE_RETIRE_TEST:
                return (page_retire_test());
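/*
 * The retire commands above map onto the page_retire() reason codes: PR_FMA
 * for fault-management-initiated retirement, PR_MCE for pages accumulating
 * correctable errors, and PR_UE for pages that took an uncorrectable error.
 * MEM_PAGE_ISRETIRED and MEM_PAGE_GETERRORS only query state through
 * page_retire_check() and retire nothing.
 */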
 * libkvm to support kvm_physaddr().
 * FMA support for page_retire() and memory attribute information.

mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp)
        if ((cmd == MEM_VTOP && getminor(dev) != M_KMEM) ||
            (cmd != MEM_VTOP && getminor(dev) != M_MEM))

                return (mmioctl_vtop(data));

        case MEM_PAGE_RETIRE:
        case MEM_PAGE_ISRETIRED:
        case MEM_PAGE_UNRETIRE:
        case MEM_PAGE_RETIRE_MCE:
        case MEM_PAGE_RETIRE_UE:
        case MEM_PAGE_GETERRORS:
        case MEM_PAGE_RETIRE_TEST:
                return (mmioctl_page_retire(cmd, data));
mmmmap(dev_t dev, off_t off, int prot)
        struct memlist *pmem;
        minor_t minor = getminor(dev);

        for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
                if (pf >= BTOP(pmem->ml_address) &&
                    pf < BTOP(pmem->ml_address + pmem->ml_size)) {
                        memlist_read_unlock();
                        return (impl_obmem_pfnum(pf));
        memlist_read_unlock();

        /* no longer supported with KPR */

                 * We shouldn't be mmap'ing to /dev/zero here as
                 * mmsegmap() should have already converted
                 * a mapping request for this device to a mapping
                 * using seg_vn for anonymous memory.
 * This function is called when a memory device is mmap'ed.
 * Set up the mapping to the correct device driver.

mmsegmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
    uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
        struct segvn_crargs vn_a;
        struct segdev_crargs dev_a;

        minor = getminor(dev);

                 * No need to worry about vac alignment on /dev/zero
                 * since this is a "clone" object that doesn't yet exist.
                error = choose_addr(as, addrp, len, off,
                    (minor == M_MEM) || (minor == M_KMEM), flags);

                /* /dev/mem cannot be mmap'ed with MAP_PRIVATE */
                if ((flags & MAP_TYPE) != MAP_SHARED) {

                 * Check to ensure that the entire range is
                 * legal and we are not trying to map in
                 * more than the device will let us.
                for (i = 0; i < len; i += PAGESIZE) {
                        if (mmmmap(dev, off + i, maxprot) == -1) {

                 * Use seg_dev segment driver for /dev/mem mapping.
                dev_a.mapfunc = mmmmap;
                dev_a.type = (flags & MAP_TYPE);
                dev_a.prot = (uchar_t)prot;
                dev_a.maxprot = (uchar_t)maxprot;

                 * Make /dev/mem mappings non-consistent since we can't
                 * alias pages that don't have page structs behind them,
                 * such as kernel stack pages. If someone mmap()s a kernel
                 * stack page and if we give them a tte with cv, a line from
                 * that page can get into both pages of the spitfire d$.
                 * But snoop from another processor will only invalidate
                 * the first page. This later caused kernel (xc_attention)
                 * to go into an infinite loop at pil 13 and no interrupts
                 * could come in. See 1203630.
                dev_a.hat_flags = HAT_LOAD_NOCONSIST;
                dev_a.devmap_data = NULL;

                error = as_map(as, *addrp, len, segdev_create, &dev_a);

                 * Use seg_vn segment driver for /dev/zero mapping.
                 * Passing in a NULL amp gives us the "cloning" effect.
                vn_a.type = (flags & MAP_TYPE);
                vn_a.maxprot = maxprot;
                vn_a.flags = flags & ~MAP_TYPE;
                vn_a.lgrp_mem_policy_flags = 0;
                error = as_map(as, *addrp, len, segvn_create, &vn_a);

        /* No longer supported with KPR. */

                 * Use seg_dev segment driver for /dev/null mapping.
                dev_a.mapfunc = mmmmap;
                dev_a.type = 0;         /* neither PRIVATE nor SHARED */
                dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE;
                error = as_map(as, *addrp, len, segdev_create, &dev_a);
static struct cb_ops mm_cb_ops = {
        nodev,                  /* strategy */
        mmsegmap,               /* segmap */
        mmpropop,               /* prop_op */
        D_NEW | D_MP | D_64BIT | D_U64BIT

static struct dev_ops mm_ops = {
        DEVO_REV,               /* devo_rev, */
        mm_info,                /* get_dev_info */
        nulldev,                /* identify */
        mm_attach,              /* attach */
        &mm_cb_ops,             /* driver operations */
        NULL,                   /* bus operations */
        ddi_quiesce_not_needed, /* quiesce */

static struct modldrv modldrv = {
        &mod_driverops, "memory driver", &mm_ops,

static struct modlinkage modlinkage = {
        MODREV_1, &modldrv, NULL

        return (mod_install(&modlinkage));

_info(struct modinfo *modinfop)
        return (mod_info(&modlinkage, modinfop));

        return (mod_remove(&modlinkage));
mm_kstat_update(kstat_t *ksp, int rw)
        struct memlist *pmem;

        if (rw == KSTAT_WRITE)

        for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
        memlist_read_unlock();

        ksp->ks_ndata = count;
        ksp->ks_data_size = count * 2 * sizeof (uint64_t);

mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
        struct memlist *pmem;

        if (rw == KSTAT_WRITE)

        ksp->ks_snaptime = gethrtime();
        kspmem = (struct memunit *)buf;
        for (pmem = phys_install; pmem != NULL;
            pmem = pmem->ml_next, kspmem++) {
                if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
                kspmem->address = pmem->ml_address;
                kspmem->size = pmem->ml_size;
        memlist_read_unlock();
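/*
 * Together, mm_kstat_update() and mm_kstat_snapshot() above export the
 * phys_install memlist as the raw "mm:0:phys_installed" kstat: ks_data_size
 * is sized at two uint64_t values per installed segment, and each snapshot
 * record carries that segment's base address and size.
 */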
 * Read a mem_name_t from user-space and store it in the mem_name_t
 * pointed to by the mem_name argument.

mm_read_mem_name(intptr_t data, mem_name_t *mem_name)
        if (get_udatamodel() == DATAMODEL_NATIVE) {
                if (copyin((void *)data, mem_name, sizeof (mem_name_t)))

                mem_name32_t mem_name32;

                if (copyin((void *)data, &mem_name32, sizeof (mem_name32_t)))
                mem_name->m_addr = mem_name32.m_addr;
                mem_name->m_synd = mem_name32.m_synd;
                mem_name->m_type[0] = mem_name32.m_type[0];
                mem_name->m_type[1] = mem_name32.m_type[1];
                mem_name->m_name = (caddr_t)(uintptr_t)mem_name32.m_name;
                mem_name->m_namelen = (size_t)mem_name32.m_namelen;
                mem_name->m_sid = (caddr_t)(uintptr_t)mem_name32.m_sid;
                mem_name->m_sidlen = (size_t)mem_name32.m_sidlen;
#endif  /* _SYSCALL32 */
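/*
 * The second branch above handles 32-bit (ILP32) callers on a 64-bit kernel
 * (_SYSCALL32): a mem_name32_t is copied in and each 32-bit field is widened
 * into the native mem_name_t, with the user pointers recovered via the
 * (caddr_t)(uintptr_t) casts.
 */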