/*
 * IA-32 Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
static unsigned long page_table_shareable(struct vm_area_struct *svma,
				struct vm_area_struct *vma,
				unsigned long addr, pgoff_t idx)
{
	unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
				svma->vm_start;
	unsigned long sbase = saddr & PUD_MASK;
	unsigned long s_end = sbase + PUD_SIZE;

	/* Allow segments to share if only one is marked locked */
	unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
	unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;

	/*
	 * match the virtual addresses, permission and the alignment of the
	 * page table page.
	 */
	if (pmd_index(addr) != pmd_index(saddr) ||
	    vm_flags != svm_flags ||
	    sbase < svma->vm_start || svma->vm_end < s_end)
		return 0;

	return saddr;
}
static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
{
	unsigned long base = addr & PUD_MASK;
	unsigned long end = base + PUD_SIZE;

	/*
	 * check on proper vm_flags and page table alignment
	 */
	if (vma->vm_flags & VM_MAYSHARE &&
	    vma->vm_start <= base && end <= vma->vm_end)
		return 1;
	return 0;
}
/*
 * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
 * and returns the corresponding pte. While this is not necessary for the
 * !shared pmd case because we can allocate the pmd later as well, it makes the
 * code much cleaner. pmd allocation is essential for the shared case because
 * pud has to be populated inside the same i_mmap_mutex section - otherwise
 * racing tasks could either miss the sharing (see huge_pte_offset) or select a
 * bad pmd for sharing.
 */
static pte_t *
huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{
	struct vm_area_struct *vma = find_vma(mm, addr);
	struct address_space *mapping = vma->vm_file->f_mapping;
	pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
			vma->vm_pgoff;
	struct vm_area_struct *svma;
	unsigned long saddr;
	pte_t *spte = NULL;
	pte_t *pte;

	if (!vma_shareable(vma, addr))
		return (pte_t *)pmd_alloc(mm, pud, addr);

	mutex_lock(&mapping->i_mmap_mutex);
	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
		if (svma == vma)
			continue;

		saddr = page_table_shareable(svma, vma, addr, idx);
		if (saddr) {
			spte = huge_pte_offset(svma->vm_mm, saddr);
			if (spte) {
				get_page(virt_to_page(spte));
				break;
			}
		}
	}

	if (!spte)
		goto out;

	spin_lock(&mm->page_table_lock);
	if (pud_none(*pud))
		pud_populate(mm, pud,
				(pmd_t *)((unsigned long)spte & PAGE_MASK));
	else
		put_page(virt_to_page(spte));
	spin_unlock(&mm->page_table_lock);
out:
	pte = (pte_t *)pmd_alloc(mm, pud, addr);
	mutex_unlock(&mapping->i_mmap_mutex);
	return pte;
}
/*
 * unmap huge page backed by shared pte.
 *
 * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
 * indicated by page_count > 1, unmap is achieved by clearing pud and
 * decrementing the ref count. If count == 1, the pte page is not shared.
 *
 * called with vma->vm_mm->page_table_lock held.
 *
 * returns: 1 successfully unmapped a shared pte page
 *	    0 the underlying pte page is not shared, or it is the last user
 */
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
	pgd_t *pgd = pgd_offset(mm, *addr);
	pud_t *pud = pud_offset(pgd, *addr);

	BUG_ON(page_count(virt_to_page(ptep)) == 0);
	if (page_count(virt_to_page(ptep)) == 1)
		return 0;

	pud_clear(pud);
	put_page(virt_to_page(ptep));
	*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
	return 1;
}
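
/*
 * Allocate the page-table entry used to map a huge page of size sz at addr.
 * For PUD_SIZE (1GB) pages the pud entry itself serves as the pte; for
 * PMD_SIZE (2MB) pages a pmd is used, trying huge_pmd_share() first when the
 * pud is still empty.
 */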
pte_t *huge_pte_alloc(struct mm_struct *mm,
			unsigned long addr, unsigned long sz)
{
	pgd_t *pgd;
	pud_t *pud;
	pte_t *pte = NULL;

	pgd = pgd_offset(mm, addr);
	pud = pud_alloc(mm, pgd, addr);
	if (pud) {
		if (sz == PUD_SIZE) {
			pte = (pte_t *)pud;
		} else {
			BUG_ON(sz != PMD_SIZE);
			if (pud_none(*pud))
				pte = huge_pmd_share(mm, addr, pud);
			else
				pte = (pte_t *)pmd_alloc(mm, pud, addr);
		}
	}
	BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));

	return pte;
}
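
/*
 * Walk the page tables and return the entry that maps addr: the pud for a
 * 1GB mapping, otherwise the pmd slot (or NULL if nothing is mapped there).
 */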
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd = NULL;

	pgd = pgd_offset(mm, addr);
	if (pgd_present(*pgd)) {
		pud = pud_offset(pgd, addr);
		if (pud_present(*pud)) {
			if (pud_large(*pud))
				return (pte_t *)pud;
			pmd = pmd_offset(pud, addr);
		}
	}
	return (pte_t *) pmd;
}
#if 0	/* This is just for testing */
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	unsigned long start = address;
	int length = 1;
	int nr;
	struct page *page;
	struct vm_area_struct *vma;

	vma = find_vma(mm, addr);
	if (!vma || !is_vm_hugetlb_page(vma))
		return ERR_PTR(-EINVAL);

	pte = huge_pte_offset(mm, address);

	/* hugetlb should be locked, and hence, prefaulted */
	WARN_ON(!pte || pte_none(*pte));

	page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];

	WARN_ON(!PageHead(page));

	return page;
}

int pmd_huge(pmd_t pmd)
{
	return 0;
}

int pud_huge(pud_t pud)
{
	return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	return NULL;
}

#else
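
/*
 * Production definitions: a pmd/pud maps a huge page when its PSE bit is
 * set, and follow_huge_pmd()/follow_huge_pud() return the struct page of
 * the base page within the huge page that contains the given address.
 */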
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	return ERR_PTR(-EINVAL);
}

int pmd_huge(pmd_t pmd)
{
	return !!(pmd_val(pmd) & _PAGE_PSE);
}

int pud_huge(pud_t pud)
{
	return !!(pud_val(pud) & _PAGE_PSE);
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	struct page *page;

	page = pte_page(*(pte_t *)pmd);
	if (page)
		page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
	return page;
}

struct page *
follow_huge_pud(struct mm_struct *mm, unsigned long address,
		pud_t *pud, int write)
{
	struct page *page;

	page = pte_page(*(pte_t *)pud);
	if (page)
		page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
	return page;
}

#endif
/* x86_64 also uses this file */

#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
		unsigned long addr, unsigned long len,
		unsigned long pgoff, unsigned long flags)
{
	struct hstate *h = hstate_file(file);
	struct vm_unmapped_area_info info;

	info.flags = 0;
	info.length = len;
	info.low_limit = TASK_UNMAPPED_BASE;
	info.high_limit = TASK_SIZE;
	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
	info.align_offset = 0;
	return vm_unmapped_area(&info);
}
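
/*
 * Top-down placement: search below mm->mmap_base first and fall back to a
 * bottom-up search if no suitable gap is found (see comment below).
 */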
static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
		unsigned long addr0, unsigned long len,
		unsigned long pgoff, unsigned long flags)
{
	struct hstate *h = hstate_file(file);
	struct vm_unmapped_area_info info;
	unsigned long addr;

	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
	info.length = len;
	info.low_limit = PAGE_SIZE;
	info.high_limit = current->mm->mmap_base;
	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
	info.align_offset = 0;
	addr = vm_unmapped_area(&info);

	/*
	 * A failed mmap() very likely causes application failure,
	 * so fall back to the bottom-up function here. This scenario
	 * can happen with large stack limits and large mmap()
	 * allocations.
	 */
	if (addr & ~PAGE_MASK) {
		VM_BUG_ON(addr != -ENOMEM);
		info.flags = 0;
		info.low_limit = TASK_UNMAPPED_BASE;
		info.high_limit = TASK_SIZE;
		addr = vm_unmapped_area(&info);
	}

	return addr;
}
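
/*
 * Arch hook for hugetlb mmap(): reject misaligned or oversized lengths,
 * validate MAP_FIXED requests, honour an address hint when the range is
 * free, then choose bottom-up or top-down placement to match the mm's
 * normal get_unmapped_area policy.
 */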
unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	struct hstate *h = hstate_file(file);
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;

	if (len & ~huge_page_mask(h))
		return -EINVAL;
	if (len > TASK_SIZE)
		return -ENOMEM;

	if (flags & MAP_FIXED) {
		if (prepare_hugepage_range(file, addr, len))
			return -EINVAL;
		return addr;
	}

	if (addr) {
		addr = ALIGN(addr, huge_page_size(h));
		vma = find_vma(mm, addr);
		if (TASK_SIZE - len >= addr &&
		    (!vma || addr + len <= vma->vm_start))
			return addr;
	}
	if (mm->get_unmapped_area == arch_get_unmapped_area)
		return hugetlb_get_unmapped_area_bottomup(file, addr, len,
				pgoff, flags);
	else
		return hugetlb_get_unmapped_area_topdown(file, addr, len,
				pgoff, flags);
}

#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
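
/*
 * Parse the hugepagesz= boot parameter: PMD_SIZE (2MB) pages are always
 * accepted, PUD_SIZE (1GB) pages only when the CPU supports GB pages.
 */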
#ifdef CONFIG_X86_64
static __init int setup_hugepagesz(char *opt)
{
	unsigned long ps = memparse(opt, &opt);
	if (ps == PMD_SIZE) {
		hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
	} else if (ps == PUD_SIZE && cpu_has_gbpages) {
		hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
	} else {
		printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
			ps >> 20);
		return 0;
	}
	return 1;
}
__setup("hugepagesz=", setup_hugepagesz);
#endif