/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
const struct dma_mapping_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
/*
 * NOTE: pagetable_init() allocates all the fixmap pagetables contiguously
 * in physical space, so we can cache the location of the first one and move
 * around without checking the pgd every time.
 */
void show_mem(void)
{
        long i, total = 0, reserved = 0;
        long shared = 0, cached = 0;
        pg_data_t *pgdat;
        struct page *page;

        printk(KERN_INFO "Mem-info:\n");
        show_free_areas();
        printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));

        for_each_online_pgdat(pgdat) {
                for (i = 0; i < pgdat->node_spanned_pages; ++i) {
                        /* this loop can take a while with 256 GB and 4k pages
                           so update the NMI watchdog */
                        if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
                                touch_nmi_watchdog();
                        }
                        if (!pfn_valid(pgdat->node_start_pfn + i))
                                continue;
                        page = pfn_to_page(pgdat->node_start_pfn + i);
                        total++;
                        if (PageReserved(page))
                                reserved++;
                        else if (PageSwapCache(page))
                                cached++;
                        else if (page_count(page))
                                shared += page_count(page) - 1;
                }
        }
        printk(KERN_INFO "%lu pages of RAM\n", total);
        printk(KERN_INFO "%lu reserved pages\n", reserved);
        printk(KERN_INFO "%lu pages shared\n", shared);
        printk(KERN_INFO "%lu pages swap cached\n", cached);
}
static __init void *spp_getpage(void)
{
        void *ptr;

        if (after_bootmem)
                ptr = (void *) get_zeroed_page(GFP_ATOMIC);
        else
                ptr = alloc_bootmem_pages(PAGE_SIZE);
        if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
                panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");

        Dprintk("spp_getpage %p\n", ptr);
        return ptr;
}
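/*
 * set_pte_phys() installs a single kernel mapping of vaddr to phys with the
 * given protection, allocating any missing pud/pmd/pte levels from
 * spp_getpage().  It is only used via __set_fixmap() at boot time.
 */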
static __init void set_pte_phys(unsigned long vaddr,
                         unsigned long phys, pgprot_t prot)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte, new_pte;

        Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

        pgd = pgd_offset_k(vaddr);
        if (pgd_none(*pgd)) {
                printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
                return;
        }
        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {
                pmd = (pmd_t *) spp_getpage();
                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
                if (pmd != pmd_offset(pud, 0)) {
                        printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
                        return;
                }
        }
        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                pte = (pte_t *) spp_getpage();
                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
                if (pte != pte_offset_kernel(pmd, 0)) {
                        printk("PAGETABLE BUG #02!\n");
                        return;
                }
        }
        new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

        pte = pte_offset_kernel(pmd, vaddr);
        if (!pte_none(*pte) &&
            pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
                pte_ERROR(*pte);
        set_pte(pte, new_pte);

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}
/* NOTE: this is meant to be run only at boot */
void __init
__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
        unsigned long address = __fix_to_virt(idx);

        if (idx >= __end_of_fixed_addresses) {
                printk("Invalid __set_fixmap\n");
                return;
        }
        set_pte_phys(address, phys, prot);
}
unsigned long __meminitdata table_start, table_end;

static __meminit void *alloc_low_page(unsigned long *phys)
{
        unsigned long pfn = table_end++;
        void *adr;

        if (after_bootmem) {
                adr = (void *)get_zeroed_page(GFP_ATOMIC);
                *phys = __pa(adr);
                return adr;
        }

        if (pfn >= end_pfn_map)
                panic("alloc_low_page: ran out of memory");

        adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
        memset(adr, 0, PAGE_SIZE);
        *phys = pfn * PAGE_SIZE;
        return adr;
}

static __meminit void unmap_low_page(void *adr)
{
        if (after_bootmem)
                return;

        early_iounmap(adr, PAGE_SIZE);
}
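/*
 * alloc_low_page() is the bootstrap page-table allocator: before bootmem is
 * up it hands out pages from the [table_start, table_end) range reserved by
 * find_early_table_space() below, temporarily mapped with early_ioremap();
 * once after_bootmem is set it simply uses get_zeroed_page().
 */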
/* Must run before zap_low_mappings */
__meminit void *early_ioremap(unsigned long addr, unsigned long size)
{
        unsigned long vaddr;
        pmd_t *pmd, *last_pmd;
        int i, pmds;

        pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
        vaddr = __START_KERNEL_map;
        pmd = level2_kernel_pgt;
        last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
        for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
                for (i = 0; i < pmds; i++) {
                        if (pmd_present(pmd[i]))
                                goto next;
                }
                vaddr += addr & ~PMD_MASK;
                addr &= PMD_MASK;
                for (i = 0; i < pmds; i++, addr += PMD_SIZE)
                        set_pmd(pmd + i, __pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
                __flush_tlb();
                return (void *)vaddr;
        next:
                ;
        }
        printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
        return NULL;
}
/* To avoid virtual aliases later */
__meminit void early_iounmap(void *addr, unsigned long size)
{
        unsigned long vaddr;
        pmd_t *pmd;
        int i, pmds;

        vaddr = (unsigned long)addr;
        pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
        pmd = level2_kernel_pgt + pmd_index(vaddr);
        for (i = 0; i < pmds; i++)
                pmd_clear(pmd + i);
        __flush_tlb();
}
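/*
 * Typical usage (illustrative sketch only; 'tbl_phys' is a made-up name):
 * temporarily map one page of firmware or table data during early boot,
 * read it, then drop the mapping again before zap_low_mappings() runs:
 *
 *	void *p = early_ioremap(tbl_phys, PAGE_SIZE);
 *	if (p) {
 *		... read the table through p ...
 *		early_iounmap(p, PAGE_SIZE);
 *	}
 */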
static void __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
{
        int i = pmd_index(address);

        for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
                unsigned long entry;
                pmd_t *pmd = pmd_page + pmd_index(address);

                if (address >= end) {
                        if (!after_bootmem)
                                for (; i < PTRS_PER_PMD; i++, pmd++)
                                        set_pmd(pmd, __pmd(0));
                        break;
                }

                if (pmd_val(*pmd))
                        continue;

                entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address;
                entry &= __supported_pte_mask;
                set_pmd(pmd, __pmd(entry));
        }
}
static void __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
        pmd_t *pmd = pmd_offset(pud, 0);
        spin_lock(&init_mm.page_table_lock);
        phys_pmd_init(pmd, address, end);
        spin_unlock(&init_mm.page_table_lock);
        __flush_tlb_all();
}
static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
{
        int i = pud_index(addr);

        for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
                unsigned long pmd_phys;
                pud_t *pud = pud_page + pud_index(addr);
                pmd_t *pmd;

                if (addr >= end)
                        break;

                if (!after_bootmem && !e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
                        set_pud(pud, __pud(0));
                        continue;
                }

                if (pud_val(*pud)) {
                        phys_pmd_update(pud, addr, end);
                        continue;
                }

                pmd = alloc_low_page(&pmd_phys);
                spin_lock(&init_mm.page_table_lock);
                set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
                phys_pmd_init(pmd, addr, end);
                spin_unlock(&init_mm.page_table_lock);
                unmap_low_page(pmd);
        }
        __flush_tlb();
}
static void __init find_early_table_space(unsigned long end)
{
        unsigned long puds, pmds, tables, start;

        puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
        pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
        tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
                 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);

        /* RED-PEN putting page tables only on node 0 could
           cause a hotspot and fill up ZONE_DMA. The page tables
           need roughly 0.5KB per GB. */
        start = 0x8000;
        table_start = find_e820_area(start, end, tables);
        if (table_start == -1UL)
                panic("Cannot find space for the kernel page tables");

        table_start >>= PAGE_SHIFT;
        table_end = table_start;

        early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
                end, table_start << PAGE_SHIFT,
                (table_start << PAGE_SHIFT) + tables);
}
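/*
 * Worked example (illustrative arithmetic): for end = 4 GB, puds = 4 and
 * pmds = 2048, so tables comes to
 * round_up(4 * 8, 4096) + round_up(2048 * 8, 4096) = 4 KB + 16 KB = 20 KB
 * of early page-table space.
 */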
/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
   This runs before bootmem is initialized and gets pages directly from the
   physical memory. To access them they are temporarily mapped. */
void __meminit init_memory_mapping(unsigned long start, unsigned long end)
{
        unsigned long next;

        Dprintk("init_memory_mapping\n");

        /*
         * Find space for the kernel direct mapping tables.
         * Later we should allocate these tables in the local node of the memory
         * mapped. Unfortunately this is done currently before the nodes are
         * discovered.
         */
        if (!after_bootmem)
                find_early_table_space(end);

        start = (unsigned long)__va(start);
        end = (unsigned long)__va(end);

        for (; start < end; start = next) {
                unsigned long pud_phys;
                pgd_t *pgd = pgd_offset_k(start);
                pud_t *pud;

                if (after_bootmem)
                        pud = pud_offset(pgd, start & PGDIR_MASK);
                else
                        pud = alloc_low_page(&pud_phys);

                next = start + PGDIR_SIZE;
                if (next > end)
                        next = end;
                phys_pud_init(pud, __pa(start), __pa(next));
                if (!after_bootmem)
                        set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
                unmap_low_page(pud);
        }

        if (!after_bootmem)
                mmu_cr4_features = read_cr4();
        __flush_tlb_all();
}
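/*
 * paging_init() (the non-NUMA variant) records the highest pfn of each zone
 * (DMA, DMA32, NORMAL) and hands the array to free_area_init_nodes(), which
 * builds the zone and node data structures for the page allocator.
 */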
#ifndef CONFIG_NUMA
void __init paging_init(void)
{
        unsigned long max_zone_pfns[MAX_NR_ZONES];
        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
        max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
        max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
        max_zone_pfns[ZONE_NORMAL] = end_pfn;

        memory_present(0, 0, end_pfn);
        sparse_init();
        free_area_init_nodes(max_zone_pfns);
}
#endif
/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
   from the CPU leading to inconsistent cache lines. address and size
   must be aligned to 2MB boundaries.
   Does nothing when the mapping doesn't exist. */
void __init clear_kernel_mapping(unsigned long address, unsigned long size)
{
        unsigned long end = address + size;

        BUG_ON(address & ~LARGE_PAGE_MASK);
        BUG_ON(size & ~LARGE_PAGE_MASK);

        for (; address < end; address += LARGE_PAGE_SIZE) {
                pgd_t *pgd = pgd_offset_k(address);
                pud_t *pud;
                pmd_t *pmd;
                if (pgd_none(*pgd))
                        continue;
                pud = pud_offset(pgd, address);
                if (pud_none(*pud))
                        continue;
                pmd = pmd_offset(pud, address);
                if (!pmd || pmd_none(*pmd))
                        continue;
                if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
                        /* Could handle this, but it should not happen currently. */
                        printk(KERN_ERR
        "clear_kernel_mapping: mapping has been split. will leak memory\n");
                        pmd_ERROR(*pmd);
                }
                set_pmd(pmd, __pmd(0));
        }
        __flush_tlb_all();
}
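/*
 * Usage sketch (illustrative only; 'aper_base' and 'aper_size' are made-up
 * names): a caller that must keep the CPU from speculatively prefetching a
 * device aperture can drop its 2MB-aligned direct mapping like this:
 *
 *	clear_kernel_mapping((unsigned long)__va(aper_base), aper_size);
 */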
/*
 * Memory hotplug specific functions
 */
void online_page(struct page *page)
{
        ClearPageReserved(page);
        init_page_count(page);
        __free_page(page);
        totalram_pages++;
        num_physpages++;
}
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Memory is always added to the NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
        struct pglist_data *pgdat = NODE_DATA(nid);
        struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;
        int ret;

        init_memory_mapping(start, (start + size - 1));

        ret = __add_pages(zone, start_pfn, nr_pages);
        if (ret)
                goto error;

        return ret;
error:
        printk("%s: Problem encountered in __add_pages!\n", __func__);
        return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
int memory_add_physaddr_to_nid(u64 start)
{
        return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif

#endif /* CONFIG_MEMORY_HOTPLUG */
#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
/*
 * Memory hotadd without sparsemem. The mem_maps have been allocated in advance,
 * so just online the pages.
 */
int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
{
        int err = -EIO;
        unsigned long pfn;
        unsigned long total = 0, mem = 0;
        for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
                if (pfn_valid(pfn)) {
                        online_page(pfn_to_page(pfn));
                        err = 0;
                        mem++;
                }
                total++;
        }
        if (!err) {
                z->spanned_pages += total;
                z->present_pages += mem;
                z->zone_pgdat->node_spanned_pages += total;
                z->zone_pgdat->node_present_pages += mem;
        }
        return err;
}
#endif
static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
                         kcore_vsyscall;

void __init mem_init(void)
{
        long codesize, reservedpages, datasize, initsize;

        /* clear the zero-page */
        memset(empty_zero_page, 0, PAGE_SIZE);

        /* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
        totalram_pages = numa_free_all_bootmem();
#else
        totalram_pages = free_all_bootmem();
#endif
        reservedpages = end_pfn - totalram_pages -
                                        absent_pages_in_range(0, end_pfn);

        after_bootmem = 1;

        codesize = (unsigned long) &_etext - (unsigned long) &_text;
        datasize = (unsigned long) &_edata - (unsigned long) &_etext;
        initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

        /* Register memory areas for /proc/kcore */
        kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
        kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
                   VMALLOC_END - VMALLOC_START);
        kclist_add(&kcore_kernel, &_stext, _end - _stext);
        kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
        kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
                   VSYSCALL_END - VSYSCALL_START);

        printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
                (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
                end_pfn << (PAGE_SHIFT-10),
                codesize >> 10,
                reservedpages << (PAGE_SHIFT-10),
                datasize >> 10,
                initsize >> 10);
}
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
        unsigned long addr;

        if (begin >= end)
                return;

        printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
        for (addr = begin; addr < end; addr += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(addr));
                init_page_count(virt_to_page(addr));
                memset((void *)(addr & ~(PAGE_SIZE-1)),
                        POISON_FREE_INITMEM, PAGE_SIZE);
                if (addr >= __START_KERNEL_map)
                        change_page_attr_addr(addr, 1, __pgprot(0));
                free_page(addr);
                totalram_pages++;
        }
        if (addr > __START_KERNEL_map)
                global_flush_tlb();
}
void free_initmem(void)
{
        free_init_pages("unused kernel memory",
                        (unsigned long)(&__init_begin),
                        (unsigned long)(&__init_end));
}
#ifdef CONFIG_DEBUG_RODATA

void mark_rodata_ro(void)
{
        unsigned long start = (unsigned long)_stext, end;

#ifdef CONFIG_HOTPLUG_CPU
        /* It must still be possible to apply SMP alternatives. */
        if (num_possible_cpus() > 1)
                start = (unsigned long)_etext;
#endif

#ifdef CONFIG_KPROBES
        start = (unsigned long)__start_rodata;
#endif

        end = (unsigned long)__end_rodata;
        start = (start + PAGE_SIZE - 1) & PAGE_MASK;
        end &= PAGE_MASK;
        if (end <= start)
                return;

        change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);

        printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
               (end - start) >> 10);

        /*
         * change_page_attr_addr() requires a global_flush_tlb() call after it.
         * We do this after the printk so that if something went wrong in the
         * change, the printk gets out at least to give a better debug hint
         * of who is the culprit.
         */
        global_flush_tlb();
}
#endif
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
        free_init_pages("initrd memory", start, end);
}
#endif
void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
#ifdef CONFIG_NUMA
        int nid = phys_to_nid(phys);
#endif
        unsigned long pfn = phys >> PAGE_SHIFT;
        if (pfn >= end_pfn) {
                /* This can happen with kdump kernels when accessing firmware
                   tables. */
                if (pfn < end_pfn_map)
                        return;
                printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
                                phys, len);
                return;
        }

        /* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
        reserve_bootmem_node(NODE_DATA(nid), phys, len);
#else
        reserve_bootmem(phys, len);
#endif
        if (phys + len <= MAX_DMA_PFN*PAGE_SIZE) {
                dma_reserve += len / PAGE_SIZE;
                set_dma_reserve(dma_reserve);
        }
}
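/*
 * kern_addr_valid() reports whether a kernel virtual address is backed by a
 * present mapping. Non-canonical addresses are rejected first: shifting the
 * sign-extended address right by __VIRTUAL_MASK_SHIFT must leave either all
 * zeroes or all ones. The page tables are then walked down to the pte (or a
 * large pmd) and the resulting pfn is checked with pfn_valid().
 */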
int kern_addr_valid(unsigned long addr)
{
        unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        if (above != 0 && above != -1UL)
                return 0;

        pgd = pgd_offset_k(addr);
        if (pgd_none(*pgd))
                return 0;

        pud = pud_offset(pgd, addr);
        if (pud_none(*pud))
                return 0;

        pmd = pmd_offset(pud, addr);
        if (pmd_none(*pmd))
                return 0;
        if (pmd_large(*pmd))
                return pfn_valid(pmd_pfn(*pmd));

        pte = pte_offset_kernel(pmd, addr);
        if (pte_none(*pte))
                return 0;
        return pfn_valid(pte_pfn(*pte));
}
/* A pseudo VMA to allow ptrace access to the vsyscall page. This only
   covers the 64-bit vsyscall page now. 32-bit has a real VMA now and does
   not need special handling anymore. */
static struct vm_area_struct gate_vma = {
        .vm_start = VSYSCALL_START,
        .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
        .vm_page_prot = PAGE_READONLY_EXEC,
        .vm_flags = VM_READ | VM_EXEC
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
        if (test_tsk_thread_flag(tsk, TIF_IA32))
                return NULL;
#endif
        return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
        struct vm_area_struct *vma = get_gate_vma(task);
        if (!vma)
                return 0;
        return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/* Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives.
 */
int in_gate_area_no_task(unsigned long addr)
{
        return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}
const char *arch_vma_name(struct vm_area_struct *vma)
{
        if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
                return "[vdso]";
        if (vma == &gate_vma)
                return "[vsyscall]";
        return NULL;
}
#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge pages at the PMD level.
 */
int __meminit vmemmap_populate(struct page *start_page,
                                                unsigned long size, int node)
{
        unsigned long addr = (unsigned long)start_page;
        unsigned long end = (unsigned long)(start_page + size);
        unsigned long next;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;

        for (; addr < end; addr = next) {
                next = pmd_addr_end(addr, end);

                pgd = vmemmap_pgd_populate(addr, node);
                if (!pgd)
                        return -ENOMEM;
                pud = vmemmap_pud_populate(pgd, addr, node);
                if (!pud)
                        return -ENOMEM;

                pmd = pmd_offset(pud, addr);
                if (pmd_none(*pmd)) {
                        pte_t entry;
                        void *p = vmemmap_alloc_block(PMD_SIZE, node);
                        if (!p)
                                return -ENOMEM;

                        entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
                        mk_pte_huge(entry);
                        set_pmd(pmd, __pmd(pte_val(entry)));

                        printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
                                addr, addr + PMD_SIZE - 1, p, node);
                } else
                        vmemmap_verify((pte_t *)pmd, node, addr, next);
        }

        return 0;
}
#endif