2 * Generic hugetlb support.
3 * (C) William Irwin, April 2004
6 #include <linux/list.h>
7 #include <linux/init.h>
8 #include <linux/module.h>
10 #include <linux/sysctl.h>
11 #include <linux/highmem.h>
12 #include <linux/nodemask.h>
13 #include <linux/pagemap.h>
15 #include <asm/pgtable.h>
17 #include <linux/hugetlb.h>
19 const unsigned long hugetlb_zero
= 0, hugetlb_infinity
= ~0UL;
20 static unsigned long nr_huge_pages
, free_huge_pages
;
21 unsigned long max_huge_pages
;
22 static struct list_head hugepage_freelists
[MAX_NUMNODES
];
23 static unsigned int nr_huge_pages_node
[MAX_NUMNODES
];
24 static unsigned int free_huge_pages_node
[MAX_NUMNODES
];
25 static DEFINE_SPINLOCK(hugetlb_lock
);
27 static void enqueue_huge_page(struct page
*page
)
29 int nid
= page_to_nid(page
);
30 list_add(&page
->lru
, &hugepage_freelists
[nid
]);
32 free_huge_pages_node
[nid
]++;
35 static struct page
*dequeue_huge_page(void)
37 int nid
= numa_node_id();
38 struct page
*page
= NULL
;
40 if (list_empty(&hugepage_freelists
[nid
])) {
41 for (nid
= 0; nid
< MAX_NUMNODES
; ++nid
)
42 if (!list_empty(&hugepage_freelists
[nid
]))
45 if (nid
>= 0 && nid
< MAX_NUMNODES
&&
46 !list_empty(&hugepage_freelists
[nid
])) {
47 page
= list_entry(hugepage_freelists
[nid
].next
,
51 free_huge_pages_node
[nid
]--;
56 static struct page
*alloc_fresh_huge_page(void)
60 page
= alloc_pages_node(nid
, GFP_HIGHUSER
|__GFP_COMP
|__GFP_NOWARN
,
62 nid
= (nid
+ 1) % num_online_nodes();
65 nr_huge_pages_node
[page_to_nid(page
)]++;
70 void free_huge_page(struct page
*page
)
72 BUG_ON(page_count(page
));
74 INIT_LIST_HEAD(&page
->lru
);
75 page
[1].mapping
= NULL
;
77 spin_lock(&hugetlb_lock
);
78 enqueue_huge_page(page
);
79 spin_unlock(&hugetlb_lock
);
82 struct page
*alloc_huge_page(void)
87 spin_lock(&hugetlb_lock
);
88 page
= dequeue_huge_page();
90 spin_unlock(&hugetlb_lock
);
93 spin_unlock(&hugetlb_lock
);
94 set_page_count(page
, 1);
95 page
[1].mapping
= (void *)free_huge_page
;
96 for (i
= 0; i
< (HPAGE_SIZE
/PAGE_SIZE
); ++i
)
97 clear_highpage(&page
[i
]);
101 static int __init
hugetlb_init(void)
106 if (HPAGE_SHIFT
== 0)
109 for (i
= 0; i
< MAX_NUMNODES
; ++i
)
110 INIT_LIST_HEAD(&hugepage_freelists
[i
]);
112 for (i
= 0; i
< max_huge_pages
; ++i
) {
113 page
= alloc_fresh_huge_page();
116 spin_lock(&hugetlb_lock
);
117 enqueue_huge_page(page
);
118 spin_unlock(&hugetlb_lock
);
120 max_huge_pages
= free_huge_pages
= nr_huge_pages
= i
;
121 printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages
);
124 module_init(hugetlb_init
);
126 static int __init
hugetlb_setup(char *s
)
128 if (sscanf(s
, "%lu", &max_huge_pages
) <= 0)
132 __setup("hugepages=", hugetlb_setup
);
135 static void update_and_free_page(struct page
*page
)
139 nr_huge_pages_node
[page_zone(page
)->zone_pgdat
->node_id
]--;
140 for (i
= 0; i
< (HPAGE_SIZE
/ PAGE_SIZE
); i
++) {
141 page
[i
].flags
&= ~(1 << PG_locked
| 1 << PG_error
| 1 << PG_referenced
|
142 1 << PG_dirty
| 1 << PG_active
| 1 << PG_reserved
|
143 1 << PG_private
| 1<< PG_writeback
);
144 set_page_count(&page
[i
], 0);
146 set_page_count(page
, 1);
147 __free_pages(page
, HUGETLB_PAGE_ORDER
);
150 #ifdef CONFIG_HIGHMEM
151 static void try_to_free_low(unsigned long count
)
154 for (i
= 0; i
< MAX_NUMNODES
; ++i
) {
155 struct page
*page
, *next
;
156 list_for_each_entry_safe(page
, next
, &hugepage_freelists
[i
], lru
) {
157 if (PageHighMem(page
))
159 list_del(&page
->lru
);
160 update_and_free_page(page
);
161 nid
= page_zone(page
)->zone_pgdat
->node_id
;
163 free_huge_pages_node
[nid
]--;
164 if (count
>= nr_huge_pages
)
170 static inline void try_to_free_low(unsigned long count
)
175 static unsigned long set_max_huge_pages(unsigned long count
)
177 while (count
> nr_huge_pages
) {
178 struct page
*page
= alloc_fresh_huge_page();
180 return nr_huge_pages
;
181 spin_lock(&hugetlb_lock
);
182 enqueue_huge_page(page
);
183 spin_unlock(&hugetlb_lock
);
185 if (count
>= nr_huge_pages
)
186 return nr_huge_pages
;
188 spin_lock(&hugetlb_lock
);
189 try_to_free_low(count
);
190 while (count
< nr_huge_pages
) {
191 struct page
*page
= dequeue_huge_page();
194 update_and_free_page(page
);
196 spin_unlock(&hugetlb_lock
);
197 return nr_huge_pages
;
200 int hugetlb_sysctl_handler(struct ctl_table
*table
, int write
,
201 struct file
*file
, void __user
*buffer
,
202 size_t *length
, loff_t
*ppos
)
204 proc_doulongvec_minmax(table
, write
, file
, buffer
, length
, ppos
);
205 max_huge_pages
= set_max_huge_pages(max_huge_pages
);
208 #endif /* CONFIG_SYSCTL */
210 int hugetlb_report_meminfo(char *buf
)
213 "HugePages_Total: %5lu\n"
214 "HugePages_Free: %5lu\n"
215 "Hugepagesize: %5lu kB\n",
221 int hugetlb_report_node_meminfo(int nid
, char *buf
)
224 "Node %d HugePages_Total: %5u\n"
225 "Node %d HugePages_Free: %5u\n",
226 nid
, nr_huge_pages_node
[nid
],
227 nid
, free_huge_pages_node
[nid
]);
230 int is_hugepage_mem_enough(size_t size
)
232 return (size
+ ~HPAGE_MASK
)/HPAGE_SIZE
<= free_huge_pages
;
235 /* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
236 unsigned long hugetlb_total_pages(void)
238 return nr_huge_pages
* (HPAGE_SIZE
/ PAGE_SIZE
);
240 EXPORT_SYMBOL(hugetlb_total_pages
);
243 * We cannot handle pagefaults against hugetlb pages at all. They cause
244 * handle_mm_fault() to try to instantiate regular-sized pages in the
245 * hugepage VMA. do_page_fault() is supposed to trap this, so BUG if we get
248 static struct page
*hugetlb_nopage(struct vm_area_struct
*vma
,
249 unsigned long address
, int *unused
)
255 struct vm_operations_struct hugetlb_vm_ops
= {
256 .nopage
= hugetlb_nopage
,
259 static pte_t
make_huge_pte(struct vm_area_struct
*vma
, struct page
*page
)
263 if (vma
->vm_flags
& VM_WRITE
) {
265 pte_mkwrite(pte_mkdirty(mk_pte(page
, vma
->vm_page_prot
)));
267 entry
= pte_wrprotect(mk_pte(page
, vma
->vm_page_prot
));
269 entry
= pte_mkyoung(entry
);
270 entry
= pte_mkhuge(entry
);
275 int copy_hugetlb_page_range(struct mm_struct
*dst
, struct mm_struct
*src
,
276 struct vm_area_struct
*vma
)
278 pte_t
*src_pte
, *dst_pte
, entry
;
279 struct page
*ptepage
;
282 for (addr
= vma
->vm_start
; addr
< vma
->vm_end
; addr
+= HPAGE_SIZE
) {
283 src_pte
= huge_pte_offset(src
, addr
);
286 dst_pte
= huge_pte_alloc(dst
, addr
);
289 spin_lock(&dst
->page_table_lock
);
290 spin_lock(&src
->page_table_lock
);
291 if (!pte_none(*src_pte
)) {
293 ptepage
= pte_page(entry
);
295 add_mm_counter(dst
, file_rss
, HPAGE_SIZE
/ PAGE_SIZE
);
296 set_huge_pte_at(dst
, addr
, dst_pte
, entry
);
298 spin_unlock(&src
->page_table_lock
);
299 spin_unlock(&dst
->page_table_lock
);
307 void unmap_hugepage_range(struct vm_area_struct
*vma
, unsigned long start
,
310 struct mm_struct
*mm
= vma
->vm_mm
;
311 unsigned long address
;
316 WARN_ON(!is_vm_hugetlb_page(vma
));
317 BUG_ON(start
& ~HPAGE_MASK
);
318 BUG_ON(end
& ~HPAGE_MASK
);
320 spin_lock(&mm
->page_table_lock
);
322 /* Update high watermark before we lower rss */
323 update_hiwater_rss(mm
);
325 for (address
= start
; address
< end
; address
+= HPAGE_SIZE
) {
326 ptep
= huge_pte_offset(mm
, address
);
330 pte
= huge_ptep_get_and_clear(mm
, address
, ptep
);
334 page
= pte_page(pte
);
336 add_mm_counter(mm
, file_rss
, (int) -(HPAGE_SIZE
/ PAGE_SIZE
));
339 spin_unlock(&mm
->page_table_lock
);
340 flush_tlb_range(vma
, start
, end
);
343 static struct page
*find_lock_huge_page(struct address_space
*mapping
,
348 struct inode
*inode
= mapping
->host
;
352 page
= find_lock_page(mapping
, idx
);
356 /* Check to make sure the mapping hasn't been truncated */
357 size
= i_size_read(inode
) >> HPAGE_SHIFT
;
361 if (hugetlb_get_quota(mapping
))
363 page
= alloc_huge_page();
365 hugetlb_put_quota(mapping
);
369 err
= add_to_page_cache(page
, mapping
, idx
, GFP_KERNEL
);
372 hugetlb_put_quota(mapping
);
381 int hugetlb_fault(struct mm_struct
*mm
, struct vm_area_struct
*vma
,
382 unsigned long address
, int write_access
)
384 int ret
= VM_FAULT_SIGBUS
;
389 struct address_space
*mapping
;
391 pte
= huge_pte_alloc(mm
, address
);
395 mapping
= vma
->vm_file
->f_mapping
;
396 idx
= ((address
- vma
->vm_start
) >> HPAGE_SHIFT
)
397 + (vma
->vm_pgoff
>> (HPAGE_SHIFT
- PAGE_SHIFT
));
400 * Use page lock to guard against racing truncation
401 * before we get page_table_lock.
403 page
= find_lock_huge_page(mapping
, idx
);
407 spin_lock(&mm
->page_table_lock
);
408 size
= i_size_read(mapping
->host
) >> HPAGE_SHIFT
;
412 ret
= VM_FAULT_MINOR
;
416 add_mm_counter(mm
, file_rss
, HPAGE_SIZE
/ PAGE_SIZE
);
417 set_huge_pte_at(mm
, address
, pte
, make_huge_pte(vma
, page
));
418 spin_unlock(&mm
->page_table_lock
);
424 spin_unlock(&mm
->page_table_lock
);
425 hugetlb_put_quota(mapping
);
431 int follow_hugetlb_page(struct mm_struct
*mm
, struct vm_area_struct
*vma
,
432 struct page
**pages
, struct vm_area_struct
**vmas
,
433 unsigned long *position
, int *length
, int i
)
435 unsigned long vpfn
, vaddr
= *position
;
436 int remainder
= *length
;
438 vpfn
= vaddr
/PAGE_SIZE
;
439 spin_lock(&mm
->page_table_lock
);
440 while (vaddr
< vma
->vm_end
&& remainder
) {
445 * Some archs (sparc64, sh*) have multiple pte_ts to
446 * each hugepage. We have to make sure we get the
447 * first, for the page indexing below to work.
449 pte
= huge_pte_offset(mm
, vaddr
& HPAGE_MASK
);
451 if (!pte
|| pte_none(*pte
)) {
454 spin_unlock(&mm
->page_table_lock
);
455 ret
= hugetlb_fault(mm
, vma
, vaddr
, 0);
456 spin_lock(&mm
->page_table_lock
);
457 if (ret
== VM_FAULT_MINOR
)
467 page
= &pte_page(*pte
)[vpfn
% (HPAGE_SIZE
/PAGE_SIZE
)];
480 spin_unlock(&mm
->page_table_lock
);