 * Base memory management functions.
/* XXX: this file's a mess */
#define PHYSMEM_BITMAP_BASE (STACKTOP)
#define E820_ACPI_DATA 3
#define E820_ACPI_NVS 4
} __attribute__((packed));
extern struct e820 e820_map[MAX_E820];
extern uint64 e820_entries;
#define PAGE_ADDR_MASK 0x000ffffffffff000

#define PML4E_ADDR_MASK 0x000ffffffffff000
#define PML4E_NX (1ULL << 63)
#define PML4E_A (1 << 5)
#define PML4E_PCD (1 << 4)
#define PML4E_PWT (1 << 3)
#define PML4E_US (1 << 2)
#define PML4E_RW (1 << 1)
#define PML4E_P (1 << 0)

#define PDPE_ADDR_MASK 0x000ffffffffff000
#define PDPE_NX (1ULL << 63)
#define PDPE_A (1 << 5)
#define PDPE_PCD (1 << 4)
#define PDPE_PWT (1 << 3)
#define PDPE_US (1 << 2)
#define PDPE_RW (1 << 1)
#define PDPE_P (1 << 0)

#define PDE_ADDR_MASK 0x000ffffffffff000
#define PDE_NX (1ULL << 63)
#define PDE_A (1 << 5)
#define PDE_PCD (1 << 4)
#define PDE_PWT (1 << 3)
#define PDE_US (1 << 2)
#define PDE_RW (1 << 1)
#define PDE_P (1 << 0)

#define PTE_ADDR_MASK 0x000ffffffffff000
#define PTE_NX (1ULL << 63)
#define PTE_G (1 << 8)
#define PTE_PAT (1 << 7)
#define PTE_D (1 << 6)
#define PTE_A (1 << 5)
#define PTE_PCD (1 << 4)
#define PTE_PWT (1 << 3)
#define PTE_US (1 << 2)
#define PTE_RW (1 << 1)
#define PTE_P (1 << 0)
#define PTE_ENTRIES (PAGE_SIZE / sizeof(PTE))
#define PTOFFSET(va) (((va) >> 12) & 0x1ff)
#define MKPML4E(pdp, flags) (((pdp) & PML4E_ADDR_MASK) | (flags))
#define MKPDPE(pd, flags) (((pd) & PDPE_ADDR_MASK) | (flags))
#define MKPDE(pt, flags) (((pt) & PDE_ADDR_MASK) | (flags))
#define MKPTE(pa, flags) (((pa) & PTE_ADDR_MASK) | (flags))
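/*
 * For reference, the 4-level walk splits a VA with the same shifts used in
 * WalkPT() below. The upper-level index macros here are only an
 * illustrative sketch (they are not used elsewhere in this file):
 *
 *   47     39 38     30 29     21 20     12 11          0
 *   [ PML4  ] [ PDPT  ] [  PD   ] [  PT   ] [ page offset ]
 *
 * e.g. PTOFFSET(0x7fffffffe000) == 0x1fe.
 */
#define PML4OFFSET(va) (((va) >> 39) & 0x1ff)
#define PDPOFFSET(va) (((va) >> 30) & 0x1ff)
#define PDOFFSET(va) (((va) >> 21) & 0x1ff)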
 * Use the AVL bits (11-9) in each PTE to mark a page as used or free.
#define MM_STATUS_MASK 0x0000000000000e00
#define MM_PAGE_USED 0x200
#define MM_PAGE_INVALID 0x400
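/*
 * How these bits are read elsewhere in this file (sketch): a PTE whose
 * MM_STATUS_MASK bits are all clear is free for the VA allocators,
 * MM_PAGE_USED marks it allocated, and MM_PAGE_INVALID marks mappings
 * (BIOS data, ROM, MMIO) that must never be handed out:
 *
 *    if ((*pte & MM_STATUS_MASK) == 0) { ... the VA is available ... }
 */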
#define CANONICAL_MASK 0x00007fffffffffff
#define BITMAP_SIZE(count) ((count) >> 3)
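/*
 * Worked example (assuming the 4KB pages used throughout this file): one
 * bitmap byte tracks 8 pages (32KB), so an 8MB pool of 2048 pages needs
 * BITMAP_SIZE(2048) == 256 bytes of bitmap.
 */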
typedef uint8 MemoryPool;
#define MEMORY_REGIONS_MAX 64
static MemRegion memoryRegions[MEMORY_REGIONS_MAX];
static uint64 memoryRegionCount;
static char *poolNames[] = {
   for (r = 0; r < memoryRegionCount; r++) {
      if (addr >= memoryRegions[r].base && addr < memoryRegions[r].limit) {
         return memoryRegions[r].type;
AddMemoryRegion(PA base, PA limit, MemRegionType type)
   if (memoryRegionCount == MEMORY_REGIONS_MAX) {
   memoryRegions[memoryRegionCount].base = base;
   memoryRegions[memoryRegionCount].limit = limit;
   memoryRegions[memoryRegionCount].type = type;
 * Sort through the e820 entries, ensuring they're in order, and
 * joining adjacent identical entries. Then add the regions to
   if (e820_entries < 2) {
   /* sort the entries */
   while (!doneSorting) {
      for (k = 0; k < e820_entries - 1; k++) {
         if (e820_map[k].base > e820_map[k + 1].base) {
            tmp.base = e820_map[k].base;
            tmp.length = e820_map[k].length;
            tmp.type = e820_map[k].type;
            e820_map[k].base = e820_map[k + 1].base;
            e820_map[k].length = e820_map[k + 1].length;
            e820_map[k].type = e820_map[k + 1].type;
            e820_map[k + 1].base = tmp.base;
            e820_map[k + 1].length = tmp.length;
            e820_map[k + 1].type = tmp.type;
   /* merge adjacent entries */
   while (k < e820_entries - 1) {
      if (((e820_map[k].base + e820_map[k].length) >= e820_map[j].base) &&
          (e820_map[k].type == e820_map[j].type)) {
         if (e820_map[k].base + e820_map[k].length <
             e820_map[j].base + e820_map[j].length) {
            /* second entry has higher limit than first */
            e820_map[k].length = ((e820_map[j].base + e820_map[j].length) -
            /* first entry entirely overlaps second - do nothing */
         /* move rest of entries down */
         for (d = k + 1; j < e820_entries; j++) {
            e820_map[d].base = e820_map[j].base;
            e820_map[d].length = e820_map[j].length;
            e820_map[d].type = e820_map[j].type;
   /* adjust to page boundaries */
   for (k = 0; k < e820_entries; k++) {
      if (e820_map[k].base & 0xfff) {
         if (e820_map[k].type == E820_RAM) {
            /* RAM - adjust base up */
            e820_map[k].length -= 4096 - (e820_map[k].base & 0xfff);
            e820_map[k].base = (e820_map[k].base & (~0xfffULL)) + 4096;
            /* otherwise adjust down */
            e820_map[k].length += e820_map[k].base & 0xfff;
            e820_map[k].base = e820_map[k].base & (~0xfffULL);
      if (((e820_map[k].base + e820_map[k].length) & 0xfff) &&
          e820_map[k].type == E820_RAM) {
         /* adjust limit down */
         e820_map[k].length -= (e820_map[k].base + e820_map[k].length) & 0xfff;
   for (k = 0; k < e820_entries; k++) {
      MemRegionType t = MEMTYPE_HOLE;
      switch (e820_map[k].type) {
         t = MEMTYPE_ACPI_DATA;
         t = MEMTYPE_ACPI_NVS;
      AddMemoryRegion((PA)e820_map[k].base,
                      (PA)(e820_map[k].base + e820_map[k].length),
extern uint64 GetCR3(void);
asm(".global GetCR3\n"
    "\tmovq %cr3, %rax\n"
extern uint64 GetCR2(void);
asm(".global GetCR2\n"
    "\tmovq %cr2, %rax\n"
extern void SetCR3(uint64);
asm(".global SetCR3\n"
    "\tmovq %rdi, %cr3\n"
extern void SetCR2(uint64);
asm(".global SetCR2\n"
    "\tmovq %rdi, %cr2\n"
extern void FlushCR3(void);
asm(".global FlushCR3\n"
    "\tmovq %cr3, %rax\n"
    "\tmovq %rax, %cr3\n"
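/*
 * A single-page TLB flush could follow the same thunk pattern via INVLPG
 * (sketch only; FlushPage is an assumed name, not referenced elsewhere):
 */
extern void FlushPage(uint64);
asm(".global FlushPage\n"
    "FlushPage:\n"
    "\tinvlpg (%rdi)\n"
    "\tretq\n");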
/* global flag that is TRUE when swapping is enabled */
volatile Bool swapping = FALSE;
#define PAGED(e) FALSE
 * Walk a page table for a given VA and return a pointer to the
 * PTE. If no entry exists for the virtual address in any of the
 * tables, the entry is created.
   pml4 = (PML4E *)(GetCR3() & ~0xfffULL);
   pml4e = &pml4[((addr >> 39) & 0x1ff)];
   if (*pml4e & PML4E_P) {
      pdp = (PDPE *)(*pml4e & PML4E_ADDR_MASK);
      /* PDP is swapped out or not yet allocated. */
      if (swapping && PAGED(*pml4e)) {
      /* Allocate new PDP */
      pdp = (PDPE *)PageAlloc(MM_VA_IDENT, 0);
      bzero(pdp, PAGE_SIZE);
      *pml4e = MKPDPE((PA)pdp, PDPE_P);
   pdpe = &pdp[((addr >> 30) & 0x1ff)];
   if (*pdpe & PDPE_P) {
      pd = (PDE *)(*pdpe & PDPE_ADDR_MASK);
      /* PD is either paged out or not yet allocated. */
      if (swapping && PAGED(*pdpe)) {
      /* Allocate new PD */
      pd = (PDE *)PageAlloc(MM_VA_IDENT, 0);
      bzero(pd, PAGE_SIZE);
      *pdpe = MKPDPE((PA)pd, PDPE_P);
   pde = &pd[((addr >> 21) & 0x1ff)];
      pt = (PTE *)(*pde & PDE_ADDR_MASK);
      if (swapping && PAGED(*pde)) {
      /* Allocate new PT */
      pt = (PTE *)PageAlloc(MM_VA_IDENT, 0);
      bzero(pt, PAGE_SIZE);
      *pde = MKPDE((PA)pt, PDE_P);
   pte = &pt[((addr >> 12) & 0x1ff)];
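   /*
    * Usage sketch (assuming WalkPT() returns the PTE pointer computed
    * above): mapping one physical page read/write at a chosen VA reduces to
    *
    *    PTE *entry = WalkPT(va);
    *    *entry = MKPTE(pa, PTE_P | PTE_RW | MM_PAGE_USED);
    *    FlushCR3();
    */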
extern uint64 GetCPL(void);
asm(".global GetCPL\n"
    "\tmovzwq %ax, %rax\n" /* may not be necessary */
 * Find a free physical page from the pool and return its
 * address. Once swapping is turned on, this function will
 * always return a free page.
GetFreePA(MemoryPool pool)
   PA freePage = MM_PA_INVALID;
   uint64 pageOffset = -1ULL;
    * Find a 0 bit in the bitmask and convert it to a page offset.
   for (byte = 0; byte < BITMAP_SIZE(pools[pool].count); byte++) {
      if (pools[pool].map[byte] != 0xff) {
         for (bit = 0; bit < 8; bit++) {
            if (pools[pool].map[byte] & (1 << bit)) {
            pageOffset = byte * 8 + bit;
            pools[pool].map[byte] |= (1 << bit);
   if (pageOffset == -1ULL) {
      // XXX: do some paging instead of just shutting down
    * Now scan through the regions, finding where this page is in memory.
   for (r = 0; r < memoryRegionCount; r++) {
      if (memoryRegions[r].type == MEMTYPE_RAM &&
          pools[pool].base >= memoryRegions[r].base &&
          pools[pool].base < memoryRegions[r].limit) {
         if (memoryRegions[r].base + (pageOffset << 12) <
             memoryRegions[r].limit) {
            freePage = memoryRegions[r].base + (pageOffset << 12);
            pageOffset -= (memoryRegions[r].limit -
                           memoryRegions[r].base) >> 12;
   /* tval = pageOffset; tval2 = freePage; asm("\t.global test\ntest:\n"); */
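   /*
    * Worked example of the bitmap-to-address conversion above: byte 3,
    * bit 5 gives pageOffset = 3 * 8 + 5 = 29, i.e. the 4KB page starting
    * 29 * 4096 = 0x1d000 bytes past the base of the region that backs the
    * pool (as computed above).
    */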
AdjustVPF(VA *pDesired, MemoryPool *pPool, uint64 *pFlags)
      if (desired == MM_VA_DONT_CARE) {
         desired = MM_VA_KERNEL_START;
      } else if (desired == MM_VA_HEAP) {
         desired = MM_VA_KERNEL_HEAP;
      } else if (desired == MM_VA_IDENT) {
         desired = MM_VA_LOADER_START;
      if (desired == MM_VA_DONT_CARE) {
         desired = MM_VA_PRIV_START;
      } else if (desired == MM_VA_HEAP) {
         desired = MM_VA_PRIV_HEAP;
      pool = POOL_PRIVILEGED;
      if (desired == MM_VA_DONT_CARE) {
         desired = MM_VA_USER_START;
      } else if (desired == MM_VA_HEAP) {
         desired = MM_VA_USER_HEAP;
         desired = MM_VA_USER_START;
   desired &= PTE_ADDR_MASK; // align to page boundary and ensure canonicality
 * Allocate a single page of memory and a physical page to back
 * it. If a virtual address is requested, the allocator attempts
 * to map there. If MM_VA_DONT_CARE is passed in instead, the
 * allocator will map at the first available address.
PageAlloc(VA desired, uint64 flags)
   VA va = MM_VA_INVALID;
   MemoryPool pool = POOL_USER;
   AdjustVPF(&desired, &pool, &flags);
   freePage = GetFreePA(pool);
   if (pool == POOL_IDENT) {
      pte = WalkPT((VA)freePage);
      ASSERT((*pte & MM_STATUS_MASK) == 0);
       * Scan for an unused VA. If MM_VA_DONT_CARE was passed in,
       * this may be slow...
      pte = WalkPT(desired);
      while (*pte & MM_STATUS_MASK) {
         pte = WalkPT(search);
   /* Update PTE to point to freePage */
   *pte = MKPTE(freePage, flags | MM_PAGE_USED | PTE_P);
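   /*
    * Usage sketch (hypothetical callers; the request values are the ones
    * handled by AdjustVPF() above): an identity-mapped page for a new page
    * table, and a writable kernel-heap page for ordinary data.
    *
    *    PTE *newPT = (PTE *)PageAlloc(MM_VA_IDENT, 0);
    *    void *buf  = (void *)PageAlloc(MM_VA_HEAP, PTE_RW);
    */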
 * Allocate a contiguous region of virtual memory. Pages cannot
 * be allocated from POOL_IDENT here.
 * Returns NULL if a contiguous region cannot be found.
 * This is intended to be the general-purpose memory allocator.
RegionAlloc(VA desired, uint64 nPages, uint64 flags)
   MemoryPool pool = POOL_USER;
   VA found = MM_VA_INVALID, start, scan, limit;
   AdjustVPF(&desired, &pool, &flags);
   ASSERT(pool != POOL_IDENT);
   if (desired < MM_VA_PRIV_START) {
      limit = MM_VA_PRIV_START;
   } else if (desired < MM_VA_USER_START) {
      limit = MM_VA_USER_START;
      limit = MM_VA_CANONICAL_TOP;
   /* Need to find an nPage region in virtual space that is available. */
   for (start = desired; start < limit; start += PAGE_SIZE) {
      if (*pte & MM_STATUS_MASK) {
      for (scan = start + PAGE_SIZE, n = 0;
           n < nPages && scan < limit;
           n++, scan += PAGE_SIZE) {
         if (*pte & MM_STATUS_MASK) {
   if (found == MM_VA_INVALID) {
   for (scan = found, n = 0; n < nPages; n++, scan += PAGE_SIZE) {
      *pte = MKPTE(0, flags | MM_PAGE_USED); /* Physmem allocation is lazy. */
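   /*
    * Usage sketch (hypothetical caller): a 16KB kernel-heap buffer. No
    * physical pages are consumed up front; HandlePF() below allocates them
    * one at a time on first touch.
    *
    *    uint8 *buf = (uint8 *)RegionAlloc(MM_VA_HEAP, 4, PTE_RW);
    */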
 * Release a page of memory and its virtual mapping.
   physPage = *pte & PTE_ADDR_MASK;
 * Remap a page's virtual address. Return TRUE if successful,
 * and FALSE if the target VA is already in use or is outside the
PageRemap(VA current, VA new)
 * Page fault handler (int 14). Called from stub in interrupts.S.
HandlePF(ExcFrame *f)
   MemoryPool pool = POOL_USER;
   if (f->errorCode & PF_NP) {
      /* #PF caused by permissions will be handled once tasks are
      UNIMPLEMENTED("#PF");
    * #PF caused by mapped but not allocated page - allocate it here.
   switch (f->cs & 0x3) {
      pool = POOL_PRIVILEGED;
   /* XXX: Once swapping is implemented, will need to switch to kernel
    * stack and make this a deferred function call as getting a free
    * PA may take time and require interrupts to be enabled. */
   freePage = GetFreePA(pool);
   *pte = MKPTE(freePage, (*pte & 0xfff) | PTE_P);
 * Identity map the first page table. This is called very early
 * in startup and is needed by the memory mapper.
   PTE *pt = (PTE *)PTBASE;
    * PML4/PDPT/PD are already initialized.
   /* First page is BIOS data area - mark ro */
   pt[PTOFFSET(0)] = MKPTE(0, PTE_P | MM_PAGE_INVALID);
    * Below STACKTOP (0x18000), all pages are used by the loader.
   for (current = PAGE_SIZE;
        current < PTE_ENTRIES * PAGE_SIZE;
        current += PAGE_SIZE) {
      type = GetMemType(current);
      if (type == MEMTYPE_RAM) {
         pte = MKPTE(current, PTE_P | PTE_RW);
         if (current < STACKTOP) {
      } else if (type == MEMTYPE_ROM) {
         pte = MKPTE(current, PTE_P | MM_PAGE_INVALID);
      } else if (type == MEMTYPE_IO ||
                 type == MEMTYPE_ACPI_DATA ||
                 type == MEMTYPE_ACPI_NVS) {
         pte = MKPTE(current, PTE_P | PTE_RW | PTE_PCD | MM_PAGE_INVALID);
         pte = MKPTE(current, 0 | MM_PAGE_INVALID); /* mark page NP */
      pt[PTOFFSET(current)] = pte;
 * Identity map an IO region.
MapIORegion(PA start, PA end, char *name)
   start &= PAGE_ADDR_MASK;
   end = (PAGE_SIZE - 1 + end) & PAGE_ADDR_MASK;
   for (va = start; va < end; va += PAGE_SIZE) {
      *pte = MKPTE(va, MM_PAGE_INVALID | PTE_PCD | PTE_RW | PTE_P);
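   /*
    * Usage sketch (hypothetical device - the local APIC's conventional MMIO
    * base is used purely as an illustration):
    *
    *    MapIORegion(0xfee00000, 0xfee01000, "lapic");
    */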
 * AddRegionsToPools --
 * Go through memory regions and memory pools and assign base and
 * limit addresses to each pool.
AddRegionsToPools(void)
   uint64 r, p, countLeft;
   countLeft = pools[p].count * PAGE_SIZE;
   pools[p].base = startAddr = memoryRegions[0].base;
   while (r < memoryRegionCount && p < 4) {
      if (memoryRegions[r].type != MEMTYPE_RAM) {
      if (startAddr < memoryRegions[r].base) {
         /* Update startAddr to the current region. */
         startAddr = memoryRegions[r].base;
      if (countLeft == 0) {
         countLeft = pools[p].count * PAGE_SIZE;
         pools[p].base = startAddr;
      if (startAddr + countLeft <= memoryRegions[r].limit) {
         startAddr += countLeft;
         pools[p].limit = startAddr; /* actually end address here */
      } else if (startAddr + countLeft > memoryRegions[r].limit) {
         countLeft -= memoryRegions[r].limit - startAddr;
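   /*
    * Worked example of the carving above (hypothetical region layout): with
    * RAM regions [0x100000, 0x900000) and [0x1000000, 0x5000000), the 8MB
    * ident pool gets base 0x100000 and limit 0x900000 exactly, and the next
    * pool starts at 0x1000000 once startAddr is advanced to the second
    * region.
    */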
CalculateTotalMem(void)
   for (mem = 0, r = 0; r < memoryRegionCount; r++) {
      if (memoryRegions[r].type == MEMTYPE_RAM) {
         mem += memoryRegions[r].limit - memoryRegions[r].base;
 * Divide available physical memory into pools.
 * Initial allocations are:
 * ident - 8MB - used for page tables and other basic data structures
 * kernel - 8MB - kernel text/data
 * priv - 16MB - privileged processes (drivers)
 * user - rest - user physmem allocation
#define MB (1024ULL * 1024ULL)
#define GB (1024ULL * 1024ULL * 1024ULL)
   uint64 totalMem, totalPages, bitmapPages, p;
   /* the number of pages in each pool must be divisible by 8 */
   pools[0].name = poolNames[0];
   pools[0].count = 8 * MB / PAGE_SIZE;
   pools[1].name = poolNames[1];
   pools[1].count = 8 * MB / PAGE_SIZE;
   pools[2].name = poolNames[2];
   pools[2].count = 16 * MB / PAGE_SIZE;
   pools[3].name = poolNames[3];
    * Each page of bitmask can represent 32768 pages (128MB). As the
    * range 0x18000 - 0x98000 is used for bitmasks, this allows a
    * maximum of 4194304 pages or 16GB physical memory. Should this
    * limit become onerous, this should be pretty easy to revisit.
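   /*
    * Worked check (restating the limit above): 0x98000 - 0x18000 = 0x80000
    * bytes of bitmap, i.e. 128 bitmap pages; 128 * 32768 pages per bitmap
    * page = 4194304 pages, and 4194304 * 4KB = 16GB.
    */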
   totalMem = CalculateTotalMem();
   if (totalMem > 16 * GB) {
   if (totalMem < 32 * MB) {
      // XXX: PANIC("Not enough memory");
   } else if (totalMem < 64 * MB) {
      /* Small mem - halve pool allocations. */
      pools[0].count = 4 * MB / PAGE_SIZE;
      pools[1].count = 4 * MB / PAGE_SIZE;
      pools[2].count = 8 * MB / PAGE_SIZE;
   pools[3].count = (totalMem / PAGE_SIZE - pools[0].count -
                     pools[1].count - pools[2].count);
   totalPages = (pools[0].count + pools[1].count +
                 pools[2].count + pools[3].count);
   /* round up to next full bitmap page */
   if (totalPages & 0x7fff) {
      totalPages = (totalPages & 0xffffffffffff8000) + 0x8000;
   bitmapPages = totalPages >> 15; /* div 32768 */
   //tval = bitmapPages; asm("\t.global test\ntest:\n");
   pools[0].map = (uint8 *)PHYSMEM_BITMAP_BASE;
   pools[1].map = (uint8 *)((uint64)pools[0].map + BITMAP_SIZE(pools[0].count));
   pools[2].map = (uint8 *)((uint64)pools[1].map + BITMAP_SIZE(pools[1].count));
   pools[3].map = (uint8 *)((uint64)pools[2].map + BITMAP_SIZE(pools[2].count));
    * Finally mark known pages as used in the ident bitmap.
   /* zero out bitmaps */
   bzero(pools[0].map, BITMAP_SIZE(pools[0].count));
   bzero(pools[1].map, BITMAP_SIZE(pools[1].count));
   bzero(pools[2].map, BITMAP_SIZE(pools[2].count));
   bzero(pools[3].map, BITMAP_SIZE(pools[3].count));
    * 0x1000  - 0x6000  : GDT/IDT/PML4/PDPT/PD/PT
    * 0x6000  - 0x8000  : loader bss (can be freed later)
    * 0x8000  - 0x10000 : loader text/data/rodata
    * 0x10000 - 0x18000 : stack
    * 0x18000 - ?       : bitmaps
   for (addr = 0; addr < STACKTOP; addr += PAGE_SIZE) {
      uint64 ppn, byte, bit;
      pools[0].map[byte] |= (uint8)(1 << bit);
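      /*
       * Worked example (assuming ppn = addr >> 12, byte = ppn >> 3 and
       * bit = ppn & 7, which is what the bitmap layout implies): the last
       * loader page at 0x17000 is ppn 0x17 = 23, so it sets bit 7 of
       * map byte 2.
       */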
   for (p = 0; p < bitmapPages; p++) {
      uint64 addr, ppn, byte, bit;
      addr = (PA)PHYSMEM_BITMAP_BASE + p * PAGE_SIZE;
      pools[0].map[byte] |= (uint8)(1 << bit);
    * Add known memory regions to list.
   memoryRegionCount = 0;
    * Fill in the first page table - that will provide some breathing
    * room to set up all the various data structures. As part of
    * this, clobber the original page tables (though since this region
    * will be identity mapped, it won't make a difference).
   asm("\t.global mapped\nmapped:\n");
   /* Now there's room for the rest of the page tables. */