2 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
4 * Copyright (C) 2003 David Gibson, IBM Corporation.
6 * Based on the IA-32 version:
7 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
12 #include <linux/slab.h>
13 #include <linux/hugetlb.h>
14 #include <asm/pgtable.h>
15 #include <asm/pgalloc.h>
/* log2 of the huge page sizes named here: 1 << 16 = 64K, 1 << 24 = 16M,
 * 1 << 34 = 16G. */
18 #define PAGE_SHIFT_64K 16
19 #define PAGE_SHIFT_16M 24
20 #define PAGE_SHIFT_16G 34
/* Capacity of the gpage_freearray[] table declared below. */
22 #define MAX_NUMBER_GPAGES 1024
/* Early-boot stash of gigantic page addresses, with nr_gpages used as the
 * index into it: add_gpage() stores at gpage_freearray[nr_gpages] and
 * alloc_bootmem_huge_page() takes the entry at --nr_gpages and zeroes it. */
24 /* Tracks the 16G pages after the device tree is scanned and before the
25 * huge_boot_pages list is ready. */
26 static unsigned long gpage_freearray
[MAX_NUMBER_GPAGES
];
27 static unsigned nr_gpages
;
29 /* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad()
30 * will choke on pointers to hugepte tables, which is handy for
31 * catching screwups early. */
33 static inline int shift_to_mmu_psize(unsigned int shift
)
37 for (psize
= 0; psize
< MMU_PAGE_COUNT
; ++psize
)
38 if (mmu_psize_defs
[psize
].shift
== shift
)
43 static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize
)
45 if (mmu_psize_defs
[mmu_psize
].shift
)
46 return mmu_psize_defs
[mmu_psize
].shift
;
/* True when the hugepd entry is empty, i.e. its pd word is zero. */
50 #define hugepd_none(hpd) ((hpd).pd == 0)
52 static inline pte_t
*hugepd_page(hugepd_t hpd
)
54 BUG_ON(!hugepd_ok(hpd
));
55 return (pte_t
*)((hpd
.pd
& ~HUGEPD_SHIFT_MASK
) | 0xc000000000000000);
58 static inline unsigned int hugepd_shift(hugepd_t hpd
)
60 return hpd
.pd
& HUGEPD_SHIFT_MASK
;
63 static inline pte_t
*hugepte_offset(hugepd_t
*hpdp
, unsigned long addr
, unsigned pdshift
)
65 unsigned long idx
= (addr
& ((1UL << pdshift
) - 1)) >> hugepd_shift(*hpdp
);
66 pte_t
*dir
= hugepd_page(*hpdp
);
71 pte_t
*find_linux_pte_or_hugepte(pgd_t
*pgdir
, unsigned long ea
, unsigned *shift
)
76 hugepd_t
*hpdp
= NULL
;
77 unsigned pdshift
= PGDIR_SHIFT
;
82 pg
= pgdir
+ pgd_index(ea
);
84 hpdp
= (hugepd_t
*)pg
;
85 } else if (!pgd_none(*pg
)) {
87 pu
= pud_offset(pg
, ea
);
89 hpdp
= (hugepd_t
*)pu
;
90 else if (!pud_none(*pu
)) {
92 pm
= pmd_offset(pu
, ea
);
94 hpdp
= (hugepd_t
*)pm
;
95 else if (!pmd_none(*pm
)) {
96 return pte_offset_map(pm
, ea
);
105 *shift
= hugepd_shift(*hpdp
);
106 return hugepte_offset(hpdp
, ea
, pdshift
);
109 pte_t
*huge_pte_offset(struct mm_struct
*mm
, unsigned long addr
)
111 return find_linux_pte_or_hugepte(mm
->pgd
, addr
, NULL
);
114 static int __hugepte_alloc(struct mm_struct
*mm
, hugepd_t
*hpdp
,
115 unsigned long address
, unsigned pdshift
, unsigned pshift
)
117 pte_t
*new = kmem_cache_zalloc(PGT_CACHE(pdshift
- pshift
),
118 GFP_KERNEL
|__GFP_REPEAT
);
120 BUG_ON(pshift
> HUGEPD_SHIFT_MASK
);
121 BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK
);
126 spin_lock(&mm
->page_table_lock
);
127 if (!hugepd_none(*hpdp
))
128 kmem_cache_free(PGT_CACHE(pdshift
- pshift
), new);
130 hpdp
->pd
= ((unsigned long)new & ~0x8000000000000000) | pshift
;
131 spin_unlock(&mm
->page_table_lock
);
135 pte_t
*huge_pte_alloc(struct mm_struct
*mm
, unsigned long addr
, unsigned long sz
)
140 hugepd_t
*hpdp
= NULL
;
141 unsigned pshift
= __ffs(sz
);
142 unsigned pdshift
= PGDIR_SHIFT
;
146 pg
= pgd_offset(mm
, addr
);
147 if (pshift
>= PUD_SHIFT
) {
148 hpdp
= (hugepd_t
*)pg
;
151 pu
= pud_alloc(mm
, pg
, addr
);
152 if (pshift
>= PMD_SHIFT
) {
153 hpdp
= (hugepd_t
*)pu
;
156 pm
= pmd_alloc(mm
, pu
, addr
);
157 hpdp
= (hugepd_t
*)pm
;
164 BUG_ON(!hugepd_none(*hpdp
) && !hugepd_ok(*hpdp
));
166 if (hugepd_none(*hpdp
) && __hugepte_alloc(mm
, hpdp
, addr
, pdshift
, pshift
))
169 return hugepte_offset(hpdp
, addr
, pdshift
);
172 /* Build list of addresses of gigantic pages. This function is used in early
173 * boot before the buddy or bootmem allocator is set up.
175 void add_gpage(unsigned long addr
, unsigned long page_size
,
176 unsigned long number_of_pages
)
180 while (number_of_pages
> 0) {
181 gpage_freearray
[nr_gpages
] = addr
;
188 /* Moves the gigantic page addresses from the temporary list to the
189 * huge_boot_pages list.
191 int alloc_bootmem_huge_page(struct hstate
*hstate
)
193 struct huge_bootmem_page
*m
;
196 m
= phys_to_virt(gpage_freearray
[--nr_gpages
]);
197 gpage_freearray
[nr_gpages
] = 0;
198 list_add(&m
->list
, &huge_boot_pages
);
203 int huge_pmd_unshare(struct mm_struct
*mm
, unsigned long *addr
, pte_t
*ptep
)
208 static void free_hugepd_range(struct mmu_gather
*tlb
, hugepd_t
*hpdp
, int pdshift
,
209 unsigned long start
, unsigned long end
,
210 unsigned long floor
, unsigned long ceiling
)
212 pte_t
*hugepte
= hugepd_page(*hpdp
);
213 unsigned shift
= hugepd_shift(*hpdp
);
214 unsigned long pdmask
= ~((1UL << pdshift
) - 1);
224 if (end
- 1 > ceiling
- 1)
229 pgtable_free_tlb(tlb
, hugepte
, pdshift
- shift
);
232 static void hugetlb_free_pmd_range(struct mmu_gather
*tlb
, pud_t
*pud
,
233 unsigned long addr
, unsigned long end
,
234 unsigned long floor
, unsigned long ceiling
)
241 pmd
= pmd_offset(pud
, addr
);
243 next
= pmd_addr_end(addr
, end
);
246 free_hugepd_range(tlb
, (hugepd_t
*)pmd
, PMD_SHIFT
,
247 addr
, next
, floor
, ceiling
);
248 } while (pmd
++, addr
= next
, addr
!= end
);
258 if (end
- 1 > ceiling
- 1)
261 pmd
= pmd_offset(pud
, start
);
263 pmd_free_tlb(tlb
, pmd
, start
);
266 static void hugetlb_free_pud_range(struct mmu_gather
*tlb
, pgd_t
*pgd
,
267 unsigned long addr
, unsigned long end
,
268 unsigned long floor
, unsigned long ceiling
)
275 pud
= pud_offset(pgd
, addr
);
277 next
= pud_addr_end(addr
, end
);
278 if (!is_hugepd(pud
)) {
279 if (pud_none_or_clear_bad(pud
))
281 hugetlb_free_pmd_range(tlb
, pud
, addr
, next
, floor
,
284 free_hugepd_range(tlb
, (hugepd_t
*)pud
, PUD_SHIFT
,
285 addr
, next
, floor
, ceiling
);
287 } while (pud
++, addr
= next
, addr
!= end
);
293 ceiling
&= PGDIR_MASK
;
297 if (end
- 1 > ceiling
- 1)
300 pud
= pud_offset(pgd
, start
);
302 pud_free_tlb(tlb
, pud
, start
);
306 * This function frees user-level page tables of a process.
308 * Must be called with pagetable lock held.
310 void hugetlb_free_pgd_range(struct mmu_gather
*tlb
,
311 unsigned long addr
, unsigned long end
,
312 unsigned long floor
, unsigned long ceiling
)
318 * Because there are a number of different possible pagetable
319 * layouts for hugepage ranges, we limit knowledge of how
320 * things should be laid out to the allocation path
321 * (huge_pte_alloc(), above). Everything else works out the
322 * structure as it goes from information in the hugepd
323 * pointers. That means that we can't here use the
324 * optimization used in the normal page free_pgd_range(), of
325 * checking whether we're actually covering a large enough
326 * range to have to do anything at the top level of the walk
327 * instead of at the bottom.
329 * To make sense of this, you should probably go read the big
330 * block comment at the top of the normal free_pgd_range(),
334 pgd
= pgd_offset(tlb
->mm
, addr
);
336 next
= pgd_addr_end(addr
, end
);
337 if (!is_hugepd(pgd
)) {
338 if (pgd_none_or_clear_bad(pgd
))
340 hugetlb_free_pud_range(tlb
, pgd
, addr
, next
, floor
, ceiling
);
342 free_hugepd_range(tlb
, (hugepd_t
*)pgd
, PGDIR_SHIFT
,
343 addr
, next
, floor
, ceiling
);
345 } while (pgd
++, addr
= next
, addr
!= end
);
349 follow_huge_addr(struct mm_struct
*mm
, unsigned long address
, int write
)
356 ptep
= find_linux_pte_or_hugepte(mm
->pgd
, address
, &shift
);
358 /* Verify it is a huge page else bail. */
360 return ERR_PTR(-EINVAL
);
362 mask
= (1UL << shift
) - 1;
363 page
= pte_page(*ptep
);
365 page
+= (address
& mask
) / PAGE_SIZE
;
370 int pmd_huge(pmd_t pmd
)
375 int pud_huge(pud_t pud
)
381 follow_huge_pmd(struct mm_struct
*mm
, unsigned long address
,
382 pmd_t
*pmd
, int write
)
388 static noinline
int gup_hugepte(pte_t
*ptep
, unsigned long sz
, unsigned long addr
,
389 unsigned long end
, int write
, struct page
**pages
, int *nr
)
392 unsigned long pte_end
;
393 struct page
*head
, *page
;
397 pte_end
= (addr
+ sz
) & ~(sz
-1);
402 mask
= _PAGE_PRESENT
| _PAGE_USER
;
406 if ((pte_val(pte
) & mask
) != mask
)
409 /* hugepages are never "special" */
410 VM_BUG_ON(!pfn_valid(pte_pfn(pte
)));
413 head
= pte_page(pte
);
415 page
= head
+ ((addr
& (sz
-1)) >> PAGE_SHIFT
);
417 VM_BUG_ON(compound_head(page
) != head
);
422 } while (addr
+= PAGE_SIZE
, addr
!= end
);
424 if (!page_cache_add_speculative(head
, refs
)) {
429 if (unlikely(pte_val(pte
) != pte_val(*ptep
))) {
430 /* Could be optimized better */
440 static unsigned long hugepte_addr_end(unsigned long addr
, unsigned long end
,
443 unsigned long __boundary
= (addr
+ sz
) & ~(sz
-1);
444 return (__boundary
- 1 < end
- 1) ? __boundary
: end
;
447 int gup_hugepd(hugepd_t
*hugepd
, unsigned pdshift
,
448 unsigned long addr
, unsigned long end
,
449 int write
, struct page
**pages
, int *nr
)
452 unsigned long sz
= 1UL << hugepd_shift(*hugepd
);
455 ptep
= hugepte_offset(hugepd
, addr
, pdshift
);
457 next
= hugepte_addr_end(addr
, end
, sz
);
458 if (!gup_hugepte(ptep
, sz
, addr
, end
, write
, pages
, nr
))
460 } while (ptep
++, addr
= next
, addr
!= end
);
465 unsigned long hugetlb_get_unmapped_area(struct file
*file
, unsigned long addr
,
466 unsigned long len
, unsigned long pgoff
,
469 struct hstate
*hstate
= hstate_file(file
);
470 int mmu_psize
= shift_to_mmu_psize(huge_page_shift(hstate
));
472 return slice_get_unmapped_area(addr
, len
, flags
, mmu_psize
, 1, 0);
475 unsigned long vma_mmu_pagesize(struct vm_area_struct
*vma
)
477 unsigned int psize
= get_slice_psize(vma
->vm_mm
, vma
->vm_start
);
479 return 1UL << mmu_psize_to_shift(psize
);
482 static int __init
add_huge_page_size(unsigned long long size
)
484 int shift
= __ffs(size
);
487 /* Check that it is a page size supported by the hardware and
488 * that it fits within pagetable and slice limits. */
489 if (!is_power_of_2(size
)
490 || (shift
> SLICE_HIGH_SHIFT
) || (shift
<= PAGE_SHIFT
))
493 if ((mmu_psize
= shift_to_mmu_psize(shift
)) < 0)
496 #ifdef CONFIG_SPU_FS_64K_LS
497 /* Disable support for 64K huge pages when 64K SPU local store
498 * support is enabled as the current implementation conflicts.
500 if (shift
== PAGE_SHIFT_64K
)
502 #endif /* CONFIG_SPU_FS_64K_LS */
504 BUG_ON(mmu_psize_defs
[mmu_psize
].shift
!= shift
);
506 /* Return if huge page size has already been set up */
507 if (size_to_hstate(size
))
510 hugetlb_add_hstate(shift
- PAGE_SHIFT
);
515 static int __init
hugepage_setup_sz(char *str
)
517 unsigned long long size
;
519 size
= memparse(str
, &str
);
521 if (add_huge_page_size(size
) != 0)
522 printk(KERN_WARNING
"Invalid huge page size specified(%llu)\n", size
);
526 __setup("hugepagesz=", hugepage_setup_sz
);
528 static int __init
hugetlbpage_init(void)
532 if (!mmu_has_feature(MMU_FTR_16M_PAGE
))
535 for (psize
= 0; psize
< MMU_PAGE_COUNT
; ++psize
) {
539 if (!mmu_psize_defs
[psize
].shift
)
542 shift
= mmu_psize_to_shift(psize
);
544 if (add_huge_page_size(1ULL << shift
) < 0)
547 if (shift
< PMD_SHIFT
)
549 else if (shift
< PUD_SHIFT
)
552 pdshift
= PGDIR_SHIFT
;
554 pgtable_cache_add(pdshift
- shift
, NULL
);
555 if (!PGT_CACHE(pdshift
- shift
))
556 panic("hugetlbpage_init(): could not create "
557 "pgtable cache for %d bit pagesize\n", shift
);
560 /* Set default large page size. Currently, we pick 16M or 1M
561 * depending on what is available
563 if (mmu_psize_defs
[MMU_PAGE_16M
].shift
)
564 HPAGE_SHIFT
= mmu_psize_defs
[MMU_PAGE_16M
].shift
;
565 else if (mmu_psize_defs
[MMU_PAGE_1M
].shift
)
566 HPAGE_SHIFT
= mmu_psize_defs
[MMU_PAGE_1M
].shift
;
571 module_init(hugetlbpage_init
);
573 void flush_dcache_icache_hugepage(struct page
*page
)
577 BUG_ON(!PageCompound(page
));
579 for (i
= 0; i
< (1UL << compound_order(page
)); i
++)
580 __flush_dcache_icache(page_address(page
+i
));