// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/mm_inline.h>
#include <linux/kthread.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/mman.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/page_table_check.h>
#include <linux/rcupdate_wait.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/ksm.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
#include "mm_slot.h"
enum scan_result {
	SCAN_FAIL,
	SCAN_SUCCEED,
	SCAN_PMD_NULL,
	SCAN_PMD_NONE,
	SCAN_PMD_MAPPED,
	SCAN_EXCEED_NONE_PTE,
	SCAN_EXCEED_SWAP_PTE,
	SCAN_EXCEED_SHARED_PTE,
	SCAN_PTE_NON_PRESENT,
	SCAN_PTE_UFFD_WP,
	SCAN_PTE_MAPPED_HUGEPAGE,
	SCAN_PAGE_RO,
	SCAN_LACK_REFERENCED_PAGE,
	SCAN_PAGE_NULL,
	SCAN_SCAN_ABORT,
	SCAN_PAGE_COUNT,
	SCAN_PAGE_LRU,
	SCAN_PAGE_LOCK,
	SCAN_PAGE_ANON,
	SCAN_PAGE_COMPOUND,
	SCAN_ANY_PROCESS,
	SCAN_VMA_NULL,
	SCAN_VMA_CHECK,
	SCAN_ADDRESS_RANGE,
	SCAN_DEL_PAGE_LRU,
	SCAN_ALLOC_HUGE_PAGE_FAIL,
	SCAN_CGROUP_CHARGE_FAIL,
	SCAN_TRUNCATED,
	SCAN_PAGE_HAS_PRIVATE,
	SCAN_STORE_FAILED,
	SCAN_COPY_MC,
	SCAN_PAGE_FILLED,
};

#define CREATE_TRACE_POINTS
#include <trace/events/huge_memory.h>
static struct task_struct *khugepaged_thread __read_mostly;
static DEFINE_MUTEX(khugepaged_mutex);

/* default scan 8*512 pte (or vmas) every 10 seconds */
static unsigned int khugepaged_pages_to_scan __read_mostly;
static unsigned int khugepaged_pages_collapsed;
static unsigned int khugepaged_full_scans;
static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
/* during fragmentation poll the hugepage allocator once every minute */
static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
static unsigned long khugepaged_sleep_expire;
static DEFINE_SPINLOCK(khugepaged_mm_lock);
static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
/*
 * default collapse hugepages if there is at least one pte mapped like
 * it would have happened if the vma was large enough during page
 * fault.
 *
 * Note that these are only respected if collapse was initiated by khugepaged.
 */
unsigned int khugepaged_max_ptes_none __read_mostly;
static unsigned int khugepaged_max_ptes_swap __read_mostly;
static unsigned int khugepaged_max_ptes_shared __read_mostly;

#define MM_SLOTS_HASH_BITS 10
static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

static struct kmem_cache *mm_slot_cache __ro_after_init;

struct collapse_control {
	bool is_khugepaged;

	/* Num pages scanned per node */
	u32 node_load[MAX_NUMNODES];

	/* nodemask for allocation fallback */
	nodemask_t alloc_nmask;
};
/**
 * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned
 * @slot: hash lookup from mm to mm_slot
 */
struct khugepaged_mm_slot {
	struct mm_slot slot;
};

/**
 * struct khugepaged_scan - cursor for scanning
 * @mm_head: the head of the mm list to scan
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 *
 * There is only the one khugepaged_scan instance of this cursor structure.
 */
struct khugepaged_scan {
	struct list_head mm_head;
	struct khugepaged_mm_slot *mm_slot;
	unsigned long address;
};

static struct khugepaged_scan khugepaged_scan = {
	.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};
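
/*
 * Big picture: an mm is registered via __khugepaged_enter() and hashed into
 * mm_slots_hash; the khugepaged thread walks khugepaged_scan.mm_head,
 * scanning up to khugepaged_pages_to_scan PTEs per pass with
 * hpage_collapse_scan_pmd() and hpage_collapse_scan_file(), and collapses
 * eligible ranges in collapse_huge_page() or collapse_file(). The sleep
 * intervals and max_ptes_* limits above are tunable through the sysfs
 * attributes that follow.
 */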
#ifdef CONFIG_SYSFS
static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
					 struct kobj_attribute *attr,
					 char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs);
}

static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
					  struct kobj_attribute *attr,
					  const char *buf, size_t count)
{
	unsigned int msecs;
	int err;

	err = kstrtouint(buf, 10, &msecs);
	if (err)
		return -EINVAL;

	khugepaged_scan_sleep_millisecs = msecs;
	khugepaged_sleep_expire = 0;
	wake_up_interruptible(&khugepaged_wait);

	return count;
}
static struct kobj_attribute scan_sleep_millisecs_attr =
	__ATTR_RW(scan_sleep_millisecs);
static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
					  struct kobj_attribute *attr,
					  char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
}

static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
					   struct kobj_attribute *attr,
					   const char *buf, size_t count)
{
	unsigned int msecs;
	int err;

	err = kstrtouint(buf, 10, &msecs);
	if (err)
		return -EINVAL;

	khugepaged_alloc_sleep_millisecs = msecs;
	khugepaged_sleep_expire = 0;
	wake_up_interruptible(&khugepaged_wait);

	return count;
}
static struct kobj_attribute alloc_sleep_millisecs_attr =
	__ATTR_RW(alloc_sleep_millisecs);
static ssize_t pages_to_scan_show(struct kobject *kobj,
				  struct kobj_attribute *attr,
				  char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan);
}

static ssize_t pages_to_scan_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	unsigned int pages;
	int err;

	err = kstrtouint(buf, 10, &pages);
	if (err || !pages)
		return -EINVAL;

	khugepaged_pages_to_scan = pages;

	return count;
}
static struct kobj_attribute pages_to_scan_attr =
	__ATTR_RW(pages_to_scan);
static ssize_t pages_collapsed_show(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed);
}
static struct kobj_attribute pages_collapsed_attr =
	__ATTR_RO(pages_collapsed);

static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr,
			       char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_full_scans);
}
static struct kobj_attribute full_scans_attr =
	__ATTR_RO(full_scans);
static ssize_t defrag_show(struct kobject *kobj,
			   struct kobj_attribute *attr, char *buf)
{
	return single_hugepage_flag_show(kobj, attr, buf,
					 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
static ssize_t defrag_store(struct kobject *kobj,
			    struct kobj_attribute *attr,
			    const char *buf, size_t count)
{
	return single_hugepage_flag_store(kobj, attr, buf, count,
					  TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
static struct kobj_attribute khugepaged_defrag_attr =
	__ATTR_RW(defrag);
/*
 * max_ptes_none controls if khugepaged should collapse hugepages over
 * any unmapped ptes in turn potentially increasing the memory
 * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
 * reduce the available free memory in the system as it
 * runs. Increasing max_ptes_none will instead potentially reduce the
 * free memory in the system during the khugepaged scan.
 */
static ssize_t max_ptes_none_show(struct kobject *kobj,
				  struct kobj_attribute *attr,
				  char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none);
}
static ssize_t max_ptes_none_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long max_ptes_none;

	err = kstrtoul(buf, 10, &max_ptes_none);
	if (err || max_ptes_none > HPAGE_PMD_NR - 1)
		return -EINVAL;

	khugepaged_max_ptes_none = max_ptes_none;

	return count;
}
static struct kobj_attribute khugepaged_max_ptes_none_attr =
	__ATTR_RW(max_ptes_none);
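
/*
 * Worked example, assuming 4K base pages so HPAGE_PMD_NR == 512: with the
 * default max_ptes_none of 511 (set in khugepaged_init()), a PMD range with
 * a single present pte may still be collapsed, so up to 511 formerly
 * unmapped 4K pages (~2MB minus 4KB) can become resident. Setting
 * max_ptes_none to 0 collapses only fully populated ranges.
 */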
static ssize_t max_ptes_swap_show(struct kobject *kobj,
				  struct kobj_attribute *attr,
				  char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap);
}

static ssize_t max_ptes_swap_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long max_ptes_swap;

	err = kstrtoul(buf, 10, &max_ptes_swap);
	if (err || max_ptes_swap > HPAGE_PMD_NR - 1)
		return -EINVAL;

	khugepaged_max_ptes_swap = max_ptes_swap;

	return count;
}

static struct kobj_attribute khugepaged_max_ptes_swap_attr =
	__ATTR_RW(max_ptes_swap);
static ssize_t max_ptes_shared_show(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared);
}

static ssize_t max_ptes_shared_store(struct kobject *kobj,
				     struct kobj_attribute *attr,
				     const char *buf, size_t count)
{
	int err;
	unsigned long max_ptes_shared;

	err = kstrtoul(buf, 10, &max_ptes_shared);
	if (err || max_ptes_shared > HPAGE_PMD_NR - 1)
		return -EINVAL;

	khugepaged_max_ptes_shared = max_ptes_shared;

	return count;
}

static struct kobj_attribute khugepaged_max_ptes_shared_attr =
	__ATTR_RW(max_ptes_shared);
static struct attribute *khugepaged_attr[] = {
	&khugepaged_defrag_attr.attr,
	&khugepaged_max_ptes_none_attr.attr,
	&khugepaged_max_ptes_swap_attr.attr,
	&khugepaged_max_ptes_shared_attr.attr,
	&pages_to_scan_attr.attr,
	&pages_collapsed_attr.attr,
	&full_scans_attr.attr,
	&scan_sleep_millisecs_attr.attr,
	&alloc_sleep_millisecs_attr.attr,
	NULL,
};

struct attribute_group khugepaged_attr_group = {
	.attrs = khugepaged_attr,
	.name = "khugepaged",
};
#endif /* CONFIG_SYSFS */
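
/*
 * This group is exposed under the THP sysfs directory, i.e.
 * /sys/kernel/mm/transparent_hugepage/khugepaged/. A minimal tuning sketch
 * from userspace (illustrative values only):
 *
 *	echo 4096 > /sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan
 *	echo 0 > /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none
 *	cat /sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
 */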
int hugepage_madvise(struct vm_area_struct *vma,
		     unsigned long *vm_flags, int advice)
{
	switch (advice) {
	case MADV_HUGEPAGE:
#ifdef CONFIG_S390
		/*
		 * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
		 * can't handle this properly after s390_enable_sie, so we simply
		 * ignore the madvise to prevent qemu from causing a SIGSEGV.
		 */
		if (mm_has_pgste(vma->vm_mm))
			return 0;
#endif
		*vm_flags &= ~VM_NOHUGEPAGE;
		*vm_flags |= VM_HUGEPAGE;
		/*
		 * If the vma becomes good for khugepaged to scan,
		 * register it here without waiting for a page fault that
		 * may not happen any time soon.
		 */
		khugepaged_enter_vma(vma, *vm_flags);
		break;
	case MADV_NOHUGEPAGE:
		*vm_flags &= ~VM_HUGEPAGE;
		*vm_flags |= VM_NOHUGEPAGE;
		/*
		 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
		 * this vma even if we leave the mm registered in khugepaged if
		 * it got registered before VM_NOHUGEPAGE was set.
		 */
		break;
	}

	return 0;
}
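
/*
 * For reference, the usual userspace trigger for the MADV_HUGEPAGE branch
 * above (illustrative sketch, not kernel code):
 *
 *	void *p = mmap(NULL, 16UL << 20, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	madvise(p, 16UL << 20, MADV_HUGEPAGE);
 *
 * After this, khugepaged may collapse the populated parts of the range in
 * the background, subject to the eligibility checks done when the mm is
 * registered below.
 */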
int __init khugepaged_init(void)
{
	mm_slot_cache = KMEM_CACHE(khugepaged_mm_slot, 0);
	if (!mm_slot_cache)
		return -ENOMEM;

	khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
	khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
	khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
	khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;

	return 0;
}

void __init khugepaged_destroy(void)
{
	kmem_cache_destroy(mm_slot_cache);
}
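
/*
 * Exit/disable tests used throughout this file: mm_users == 0 means the
 * process is exiting and its page tables are about to be torn down, so
 * khugepaged must back off; MMF_DISABLE_THP is set when userspace disables
 * THP for the mm via prctl(PR_SET_THP_DISABLE).
 */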
static inline int hpage_collapse_test_exit(struct mm_struct *mm)
{
	return atomic_read(&mm->mm_users) == 0;
}

static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm)
{
	return hpage_collapse_test_exit(mm) ||
	       test_bit(MMF_DISABLE_THP, &mm->flags);
}
static bool hugepage_pmd_enabled(void)
{
	/*
	 * We cover the anon, shmem and the file-backed case here; file-backed
	 * hugepages, when configured in, are determined by the global control.
	 * Anon pmd-sized hugepages are determined by the pmd-size control.
	 * Shmem pmd-sized hugepages are also determined by its pmd-size control,
	 * except when the global shmem_huge is set to SHMEM_HUGE_DENY.
	 */
	if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
	    hugepage_global_enabled())
		return true;
	if (test_bit(PMD_ORDER, &huge_anon_orders_always))
		return true;
	if (test_bit(PMD_ORDER, &huge_anon_orders_madvise))
		return true;
	if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) &&
	    hugepage_global_enabled())
		return true;
	if (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled())
		return true;
	return false;
}
void __khugepaged_enter(struct mm_struct *mm)
{
	struct khugepaged_mm_slot *mm_slot;
	struct mm_slot *slot;
	int wakeup;

	/* __khugepaged_exit() must not run from under us */
	VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
	if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags)))
		return;

	mm_slot = mm_slot_alloc(mm_slot_cache);
	if (!mm_slot)
		return;

	slot = &mm_slot->slot;

	spin_lock(&khugepaged_mm_lock);
	mm_slot_insert(mm_slots_hash, mm, slot);

	/*
	 * Insert just behind the scanning cursor, to let the area settle
	 * down a little.
	 */
	wakeup = list_empty(&khugepaged_scan.mm_head);
	list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head);
	spin_unlock(&khugepaged_mm_lock);

	mmgrab(mm);
	if (wakeup)
		wake_up_interruptible(&khugepaged_wait);
}
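
/*
 * Register @vma's mm with khugepaged when the VMA becomes eligible for a
 * PMD-sized hugepage (e.g. from hugepage_madvise() above). The
 * MMF_VM_HUGEPAGE bit keeps repeated calls cheap once the mm is registered.
 */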
void khugepaged_enter_vma(struct vm_area_struct *vma,
			  unsigned long vm_flags)
{
	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
	    hugepage_pmd_enabled()) {
		if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS,
					    PMD_ORDER))
			__khugepaged_enter(vma->vm_mm);
	}
}
void __khugepaged_exit(struct mm_struct *mm)
{
	struct khugepaged_mm_slot *mm_slot;
	struct mm_slot *slot;
	int free = 0;

	spin_lock(&khugepaged_mm_lock);
	slot = mm_slot_lookup(mm_slots_hash, mm);
	mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
	if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
		hash_del(&slot->hash);
		list_del(&slot->mm_node);
		free = 1;
	}
	spin_unlock(&khugepaged_mm_lock);

	if (free) {
		clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
		mm_slot_free(mm_slot_cache, mm_slot);
		mmdrop(mm);
	} else if (mm_slot) {
		/*
		 * This is required to serialize against
		 * hpage_collapse_test_exit() (which is guaranteed to run
		 * under mmap sem read mode). Stop here (after we return all
		 * pagetables will be destroyed) until khugepaged has finished
		 * working on the pagetables under the mmap_lock.
		 */
		mmap_write_lock(mm);
		mmap_write_unlock(mm);
	}
}
static void release_pte_folio(struct folio *folio)
{
	node_stat_mod_folio(folio,
			NR_ISOLATED_ANON + folio_is_file_lru(folio),
			-folio_nr_pages(folio));
	folio_unlock(folio);
	folio_putback_lru(folio);
}
static void release_pte_pages(pte_t *pte, pte_t *_pte,
		struct list_head *compound_pagelist)
{
	struct folio *folio, *tmp;

	while (--_pte >= pte) {
		pte_t pteval = ptep_get(_pte);
		unsigned long pfn;

		if (pte_none(pteval))
			continue;
		pfn = pte_pfn(pteval);
		if (is_zero_pfn(pfn))
			continue;
		folio = pfn_folio(pfn);
		if (folio_test_large(folio))
			continue;
		release_pte_folio(folio);
	}

	list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) {
		list_del(&folio->lru);
		release_pte_folio(folio);
	}
}
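
/*
 * Reference accounting expected by is_refcount_suitable(): one reference per
 * PTE mapping (folio_mapcount()), plus folio_nr_pages() for the page cache
 * (non-anon folios) or swap cache, plus one for fs-private data. Example: an
 * order-0 anon page mapped once and also in the swap cache is suitable at a
 * refcount of exactly 2; any extra pin (e.g. GUP) makes it unsuitable for
 * collapse.
 */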
static bool is_refcount_suitable(struct folio *folio)
{
	int expected_refcount = folio_mapcount(folio);

	if (!folio_test_anon(folio) || folio_test_swapcache(folio))
		expected_refcount += folio_nr_pages(folio);

	if (folio_test_private(folio))
		expected_refcount++;

	return folio_ref_count(folio) == expected_refcount;
}
563 static int __collapse_huge_page_isolate(struct vm_area_struct
*vma
,
564 unsigned long address
,
566 struct collapse_control
*cc
,
567 struct list_head
*compound_pagelist
)
569 struct page
*page
= NULL
;
570 struct folio
*folio
= NULL
;
572 int none_or_zero
= 0, shared
= 0, result
= SCAN_FAIL
, referenced
= 0;
573 bool writable
= false;
575 for (_pte
= pte
; _pte
< pte
+ HPAGE_PMD_NR
;
576 _pte
++, address
+= PAGE_SIZE
) {
577 pte_t pteval
= ptep_get(_pte
);
578 if (pte_none(pteval
) || (pte_present(pteval
) &&
579 is_zero_pfn(pte_pfn(pteval
)))) {
581 if (!userfaultfd_armed(vma
) &&
582 (!cc
->is_khugepaged
||
583 none_or_zero
<= khugepaged_max_ptes_none
)) {
586 result
= SCAN_EXCEED_NONE_PTE
;
587 count_vm_event(THP_SCAN_EXCEED_NONE_PTE
);
591 if (!pte_present(pteval
)) {
592 result
= SCAN_PTE_NON_PRESENT
;
595 if (pte_uffd_wp(pteval
)) {
596 result
= SCAN_PTE_UFFD_WP
;
599 page
= vm_normal_page(vma
, address
, pteval
);
600 if (unlikely(!page
) || unlikely(is_zone_device_page(page
))) {
601 result
= SCAN_PAGE_NULL
;
605 folio
= page_folio(page
);
606 VM_BUG_ON_FOLIO(!folio_test_anon(folio
), folio
);
608 /* See hpage_collapse_scan_pmd(). */
609 if (folio_likely_mapped_shared(folio
)) {
611 if (cc
->is_khugepaged
&&
612 shared
> khugepaged_max_ptes_shared
) {
613 result
= SCAN_EXCEED_SHARED_PTE
;
614 count_vm_event(THP_SCAN_EXCEED_SHARED_PTE
);
619 if (folio_test_large(folio
)) {
623 * Check if we have dealt with the compound page
626 list_for_each_entry(f
, compound_pagelist
, lru
) {
633 * We can do it before folio_isolate_lru because the
634 * folio can't be freed from under us. NOTE: PG_lock
635 * is needed to serialize against split_huge_page
636 * when invoked from the VM.
638 if (!folio_trylock(folio
)) {
639 result
= SCAN_PAGE_LOCK
;
644 * Check if the page has any GUP (or other external) pins.
646 * The page table that maps the page has been already unlinked
647 * from the page table tree and this process cannot get
648 * an additional pin on the page.
650 * New pins can come later if the page is shared across fork,
651 * but not from this process. The other process cannot write to
652 * the page, only trigger CoW.
654 if (!is_refcount_suitable(folio
)) {
656 result
= SCAN_PAGE_COUNT
;
661 * Isolate the page to avoid collapsing an hugepage
662 * currently in use by the VM.
664 if (!folio_isolate_lru(folio
)) {
666 result
= SCAN_DEL_PAGE_LRU
;
669 node_stat_mod_folio(folio
,
670 NR_ISOLATED_ANON
+ folio_is_file_lru(folio
),
671 folio_nr_pages(folio
));
672 VM_BUG_ON_FOLIO(!folio_test_locked(folio
), folio
);
673 VM_BUG_ON_FOLIO(folio_test_lru(folio
), folio
);
675 if (folio_test_large(folio
))
676 list_add_tail(&folio
->lru
, compound_pagelist
);
679 * If collapse was initiated by khugepaged, check that there is
680 * enough young pte to justify collapsing the page
682 if (cc
->is_khugepaged
&&
683 (pte_young(pteval
) || folio_test_young(folio
) ||
684 folio_test_referenced(folio
) || mmu_notifier_test_young(vma
->vm_mm
,
688 if (pte_write(pteval
))
692 if (unlikely(!writable
)) {
693 result
= SCAN_PAGE_RO
;
694 } else if (unlikely(cc
->is_khugepaged
&& !referenced
)) {
695 result
= SCAN_LACK_REFERENCED_PAGE
;
697 result
= SCAN_SUCCEED
;
698 trace_mm_collapse_huge_page_isolate(&folio
->page
, none_or_zero
,
699 referenced
, writable
, result
);
703 release_pte_pages(pte
, _pte
, compound_pagelist
);
704 trace_mm_collapse_huge_page_isolate(&folio
->page
, none_or_zero
,
705 referenced
, writable
, result
);
709 static void __collapse_huge_page_copy_succeeded(pte_t
*pte
,
710 struct vm_area_struct
*vma
,
711 unsigned long address
,
713 struct list_head
*compound_pagelist
)
715 struct folio
*src
, *tmp
;
719 for (_pte
= pte
; _pte
< pte
+ HPAGE_PMD_NR
;
720 _pte
++, address
+= PAGE_SIZE
) {
721 pteval
= ptep_get(_pte
);
722 if (pte_none(pteval
) || is_zero_pfn(pte_pfn(pteval
))) {
723 add_mm_counter(vma
->vm_mm
, MM_ANONPAGES
, 1);
724 if (is_zero_pfn(pte_pfn(pteval
))) {
726 * ptl mostly unnecessary.
729 ptep_clear(vma
->vm_mm
, address
, _pte
);
731 ksm_might_unmap_zero_page(vma
->vm_mm
, pteval
);
734 struct page
*src_page
= pte_page(pteval
);
736 src
= page_folio(src_page
);
737 if (!folio_test_large(src
))
738 release_pte_folio(src
);
740 * ptl mostly unnecessary, but preempt has to
741 * be disabled to update the per-cpu stats
742 * inside folio_remove_rmap_pte().
745 ptep_clear(vma
->vm_mm
, address
, _pte
);
746 folio_remove_rmap_pte(src
, src_page
, vma
);
748 free_page_and_swap_cache(src_page
);
752 list_for_each_entry_safe(src
, tmp
, compound_pagelist
, lru
) {
754 node_stat_sub_folio(src
, NR_ISOLATED_ANON
+
755 folio_is_file_lru(src
));
757 free_swap_cache(src
);
758 folio_putback_lru(src
);
762 static void __collapse_huge_page_copy_failed(pte_t
*pte
,
765 struct vm_area_struct
*vma
,
766 struct list_head
*compound_pagelist
)
771 * Re-establish the PMD to point to the original page table
772 * entry. Restoring PMD needs to be done prior to releasing
773 * pages. Since pages are still isolated and locked here,
774 * acquiring anon_vma_lock_write is unnecessary.
776 pmd_ptl
= pmd_lock(vma
->vm_mm
, pmd
);
777 pmd_populate(vma
->vm_mm
, pmd
, pmd_pgtable(orig_pmd
));
778 spin_unlock(pmd_ptl
);
780 * Release both raw and compound pages isolated
781 * in __collapse_huge_page_isolate.
783 release_pte_pages(pte
, pte
+ HPAGE_PMD_NR
, compound_pagelist
);
787 * __collapse_huge_page_copy - attempts to copy memory contents from raw
788 * pages to a hugepage. Cleans up the raw pages if copying succeeds;
789 * otherwise restores the original page table and releases isolated raw pages.
790 * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC.
792 * @pte: starting of the PTEs to copy from
793 * @folio: the new hugepage to copy contents to
794 * @pmd: pointer to the new hugepage's PMD
795 * @orig_pmd: the original raw pages' PMD
796 * @vma: the original raw pages' virtual memory area
797 * @address: starting address to copy
798 * @ptl: lock on raw pages' PTEs
799 * @compound_pagelist: list that stores compound pages
801 static int __collapse_huge_page_copy(pte_t
*pte
, struct folio
*folio
,
802 pmd_t
*pmd
, pmd_t orig_pmd
, struct vm_area_struct
*vma
,
803 unsigned long address
, spinlock_t
*ptl
,
804 struct list_head
*compound_pagelist
)
807 int result
= SCAN_SUCCEED
;
810 * Copying pages' contents is subject to memory poison at any iteration.
812 for (i
= 0; i
< HPAGE_PMD_NR
; i
++) {
813 pte_t pteval
= ptep_get(pte
+ i
);
814 struct page
*page
= folio_page(folio
, i
);
815 unsigned long src_addr
= address
+ i
* PAGE_SIZE
;
816 struct page
*src_page
;
818 if (pte_none(pteval
) || is_zero_pfn(pte_pfn(pteval
))) {
819 clear_user_highpage(page
, src_addr
);
822 src_page
= pte_page(pteval
);
823 if (copy_mc_user_highpage(page
, src_page
, src_addr
, vma
) > 0) {
824 result
= SCAN_COPY_MC
;
829 if (likely(result
== SCAN_SUCCEED
))
830 __collapse_huge_page_copy_succeeded(pte
, vma
, address
, ptl
,
833 __collapse_huge_page_copy_failed(pte
, pmd
, orig_pmd
, vma
,
839 static void khugepaged_alloc_sleep(void)
843 add_wait_queue(&khugepaged_wait
, &wait
);
844 __set_current_state(TASK_INTERRUPTIBLE
|TASK_FREEZABLE
);
845 schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs
));
846 remove_wait_queue(&khugepaged_wait
, &wait
);
849 struct collapse_control khugepaged_collapse_control
= {
850 .is_khugepaged
= true,
853 static bool hpage_collapse_scan_abort(int nid
, struct collapse_control
*cc
)
858 * If node_reclaim_mode is disabled, then no extra effort is made to
859 * allocate memory locally.
861 if (!node_reclaim_enabled())
864 /* If there is a count for this node already, it must be acceptable */
865 if (cc
->node_load
[nid
])
868 for (i
= 0; i
< MAX_NUMNODES
; i
++) {
869 if (!cc
->node_load
[i
])
871 if (node_distance(nid
, i
) > node_reclaim_distance
)
877 #define khugepaged_defrag() \
878 (transparent_hugepage_flags & \
879 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG))
881 /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
882 static inline gfp_t
alloc_hugepage_khugepaged_gfpmask(void)
884 return khugepaged_defrag() ? GFP_TRANSHUGE
: GFP_TRANSHUGE_LIGHT
;
888 static int hpage_collapse_find_target_node(struct collapse_control
*cc
)
890 int nid
, target_node
= 0, max_value
= 0;
892 /* find first node with max normal pages hit */
893 for (nid
= 0; nid
< MAX_NUMNODES
; nid
++)
894 if (cc
->node_load
[nid
] > max_value
) {
895 max_value
= cc
->node_load
[nid
];
899 for_each_online_node(nid
) {
900 if (max_value
== cc
->node_load
[nid
])
901 node_set(nid
, cc
->alloc_nmask
);
907 static int hpage_collapse_find_target_node(struct collapse_control
*cc
)
914 * If mmap_lock temporarily dropped, revalidate vma
915 * before taking mmap_lock.
916 * Returns enum scan_result value.
919 static int hugepage_vma_revalidate(struct mm_struct
*mm
, unsigned long address
,
921 struct vm_area_struct
**vmap
,
922 struct collapse_control
*cc
)
924 struct vm_area_struct
*vma
;
925 unsigned long tva_flags
= cc
->is_khugepaged
? TVA_ENFORCE_SYSFS
: 0;
927 if (unlikely(hpage_collapse_test_exit_or_disable(mm
)))
928 return SCAN_ANY_PROCESS
;
930 *vmap
= vma
= find_vma(mm
, address
);
932 return SCAN_VMA_NULL
;
934 if (!thp_vma_suitable_order(vma
, address
, PMD_ORDER
))
935 return SCAN_ADDRESS_RANGE
;
936 if (!thp_vma_allowable_order(vma
, vma
->vm_flags
, tva_flags
, PMD_ORDER
))
937 return SCAN_VMA_CHECK
;
939 * Anon VMA expected, the address may be unmapped then
940 * remapped to file after khugepaged reaquired the mmap_lock.
942 * thp_vma_allowable_order may return true for qualified file
945 if (expect_anon
&& (!(*vmap
)->anon_vma
|| !vma_is_anonymous(*vmap
)))
946 return SCAN_PAGE_ANON
;
950 static int find_pmd_or_thp_or_none(struct mm_struct
*mm
,
951 unsigned long address
,
956 *pmd
= mm_find_pmd(mm
, address
);
958 return SCAN_PMD_NULL
;
960 pmde
= pmdp_get_lockless(*pmd
);
962 return SCAN_PMD_NONE
;
963 if (!pmd_present(pmde
))
964 return SCAN_PMD_NULL
;
965 if (pmd_trans_huge(pmde
))
966 return SCAN_PMD_MAPPED
;
967 if (pmd_devmap(pmde
))
968 return SCAN_PMD_NULL
;
970 return SCAN_PMD_NULL
;
974 static int check_pmd_still_valid(struct mm_struct
*mm
,
975 unsigned long address
,
979 int result
= find_pmd_or_thp_or_none(mm
, address
, &new_pmd
);
981 if (result
!= SCAN_SUCCEED
)
989 * Bring missing pages in from swap, to complete THP collapse.
990 * Only done if hpage_collapse_scan_pmd believes it is worthwhile.
992 * Called and returns without pte mapped or spinlocks held.
993 * Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
995 static int __collapse_huge_page_swapin(struct mm_struct
*mm
,
996 struct vm_area_struct
*vma
,
997 unsigned long haddr
, pmd_t
*pmd
,
1002 unsigned long address
, end
= haddr
+ (HPAGE_PMD_NR
* PAGE_SIZE
);
1007 for (address
= haddr
; address
< end
; address
+= PAGE_SIZE
) {
1008 struct vm_fault vmf
= {
1011 .pgoff
= linear_page_index(vma
, address
),
1012 .flags
= FAULT_FLAG_ALLOW_RETRY
,
1018 * Here the ptl is only used to check pte_same() in
1019 * do_swap_page(), so readonly version is enough.
1021 pte
= pte_offset_map_ro_nolock(mm
, pmd
, address
, &ptl
);
1023 mmap_read_unlock(mm
);
1024 result
= SCAN_PMD_NULL
;
1029 vmf
.orig_pte
= ptep_get_lockless(pte
);
1030 if (!is_swap_pte(vmf
.orig_pte
))
1035 ret
= do_swap_page(&vmf
);
1036 /* Which unmaps pte (after perhaps re-checking the entry) */
1040 * do_swap_page returns VM_FAULT_RETRY with released mmap_lock.
1041 * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because
1042 * we do not retry here and swap entry will remain in pagetable
1043 * resulting in later failure.
1045 if (ret
& VM_FAULT_RETRY
) {
1046 /* Likely, but not guaranteed, that page lock failed */
1047 result
= SCAN_PAGE_LOCK
;
1050 if (ret
& VM_FAULT_ERROR
) {
1051 mmap_read_unlock(mm
);
1061 /* Drain LRU cache to remove extra pin on the swapped in pages */
1065 result
= SCAN_SUCCEED
;
1067 trace_mm_collapse_huge_page_swapin(mm
, swapped_in
, referenced
, result
);
1071 static int alloc_charge_folio(struct folio
**foliop
, struct mm_struct
*mm
,
1072 struct collapse_control
*cc
)
1074 gfp_t gfp
= (cc
->is_khugepaged
? alloc_hugepage_khugepaged_gfpmask() :
1076 int node
= hpage_collapse_find_target_node(cc
);
1077 struct folio
*folio
;
1079 folio
= __folio_alloc(gfp
, HPAGE_PMD_ORDER
, node
, &cc
->alloc_nmask
);
1082 count_vm_event(THP_COLLAPSE_ALLOC_FAILED
);
1083 return SCAN_ALLOC_HUGE_PAGE_FAIL
;
1086 count_vm_event(THP_COLLAPSE_ALLOC
);
1087 if (unlikely(mem_cgroup_charge(folio
, mm
, gfp
))) {
1090 return SCAN_CGROUP_CHARGE_FAIL
;
1093 count_memcg_folio_events(folio
, THP_COLLAPSE_ALLOC
, 1);
1096 return SCAN_SUCCEED
;
1099 static int collapse_huge_page(struct mm_struct
*mm
, unsigned long address
,
1100 int referenced
, int unmapped
,
1101 struct collapse_control
*cc
)
1103 LIST_HEAD(compound_pagelist
);
1107 struct folio
*folio
;
1108 spinlock_t
*pmd_ptl
, *pte_ptl
;
1109 int result
= SCAN_FAIL
;
1110 struct vm_area_struct
*vma
;
1111 struct mmu_notifier_range range
;
1113 VM_BUG_ON(address
& ~HPAGE_PMD_MASK
);
1116 * Before allocating the hugepage, release the mmap_lock read lock.
1117 * The allocation can take potentially a long time if it involves
1118 * sync compaction, and we do not need to hold the mmap_lock during
1119 * that. We will recheck the vma after taking it again in write mode.
1121 mmap_read_unlock(mm
);
1123 result
= alloc_charge_folio(&folio
, mm
, cc
);
1124 if (result
!= SCAN_SUCCEED
)
1128 result
= hugepage_vma_revalidate(mm
, address
, true, &vma
, cc
);
1129 if (result
!= SCAN_SUCCEED
) {
1130 mmap_read_unlock(mm
);
1134 result
= find_pmd_or_thp_or_none(mm
, address
, &pmd
);
1135 if (result
!= SCAN_SUCCEED
) {
1136 mmap_read_unlock(mm
);
1142 * __collapse_huge_page_swapin will return with mmap_lock
1143 * released when it fails. So we jump out_nolock directly in
1144 * that case. Continuing to collapse causes inconsistency.
1146 result
= __collapse_huge_page_swapin(mm
, vma
, address
, pmd
,
1148 if (result
!= SCAN_SUCCEED
)
1152 mmap_read_unlock(mm
);
1154 * Prevent all access to pagetables with the exception of
1155 * gup_fast later handled by the ptep_clear_flush and the VM
1156 * handled by the anon_vma lock + PG_lock.
1158 * UFFDIO_MOVE is prevented to race as well thanks to the
1161 mmap_write_lock(mm
);
1162 result
= hugepage_vma_revalidate(mm
, address
, true, &vma
, cc
);
1163 if (result
!= SCAN_SUCCEED
)
1165 /* check if the pmd is still valid */
1166 result
= check_pmd_still_valid(mm
, address
, pmd
);
1167 if (result
!= SCAN_SUCCEED
)
1170 vma_start_write(vma
);
1171 anon_vma_lock_write(vma
->anon_vma
);
1173 mmu_notifier_range_init(&range
, MMU_NOTIFY_CLEAR
, 0, mm
, address
,
1174 address
+ HPAGE_PMD_SIZE
);
1175 mmu_notifier_invalidate_range_start(&range
);
1177 pmd_ptl
= pmd_lock(mm
, pmd
); /* probably unnecessary */
1179 * This removes any huge TLB entry from the CPU so we won't allow
1180 * huge and small TLB entries for the same virtual address to
1181 * avoid the risk of CPU bugs in that area.
1183 * Parallel GUP-fast is fine since GUP-fast will back off when
1184 * it detects PMD is changed.
1186 _pmd
= pmdp_collapse_flush(vma
, address
, pmd
);
1187 spin_unlock(pmd_ptl
);
1188 mmu_notifier_invalidate_range_end(&range
);
1189 tlb_remove_table_sync_one();
1191 pte
= pte_offset_map_lock(mm
, &_pmd
, address
, &pte_ptl
);
1193 result
= __collapse_huge_page_isolate(vma
, address
, pte
, cc
,
1194 &compound_pagelist
);
1195 spin_unlock(pte_ptl
);
1197 result
= SCAN_PMD_NULL
;
1200 if (unlikely(result
!= SCAN_SUCCEED
)) {
1204 BUG_ON(!pmd_none(*pmd
));
1206 * We can only use set_pmd_at when establishing
1207 * hugepmds and never for establishing regular pmds that
1208 * points to regular pagetables. Use pmd_populate for that
1210 pmd_populate(mm
, pmd
, pmd_pgtable(_pmd
));
1211 spin_unlock(pmd_ptl
);
1212 anon_vma_unlock_write(vma
->anon_vma
);
1217 * All pages are isolated and locked so anon_vma rmap
1218 * can't run anymore.
1220 anon_vma_unlock_write(vma
->anon_vma
);
1222 result
= __collapse_huge_page_copy(pte
, folio
, pmd
, _pmd
,
1223 vma
, address
, pte_ptl
,
1224 &compound_pagelist
);
1226 if (unlikely(result
!= SCAN_SUCCEED
))
1230 * The smp_wmb() inside __folio_mark_uptodate() ensures the
1231 * copy_huge_page writes become visible before the set_pmd_at()
1234 __folio_mark_uptodate(folio
);
1235 pgtable
= pmd_pgtable(_pmd
);
1237 _pmd
= mk_huge_pmd(&folio
->page
, vma
->vm_page_prot
);
1238 _pmd
= maybe_pmd_mkwrite(pmd_mkdirty(_pmd
), vma
);
1241 BUG_ON(!pmd_none(*pmd
));
1242 folio_add_new_anon_rmap(folio
, vma
, address
, RMAP_EXCLUSIVE
);
1243 folio_add_lru_vma(folio
, vma
);
1244 pgtable_trans_huge_deposit(mm
, pmd
, pgtable
);
1245 set_pmd_at(mm
, address
, pmd
, _pmd
);
1246 update_mmu_cache_pmd(vma
, address
, pmd
);
1247 deferred_split_folio(folio
, false);
1248 spin_unlock(pmd_ptl
);
1252 result
= SCAN_SUCCEED
;
1254 mmap_write_unlock(mm
);
1258 trace_mm_collapse_huge_page(mm
, result
== SCAN_SUCCEED
, result
);
1262 static int hpage_collapse_scan_pmd(struct mm_struct
*mm
,
1263 struct vm_area_struct
*vma
,
1264 unsigned long address
, bool *mmap_locked
,
1265 struct collapse_control
*cc
)
1269 int result
= SCAN_FAIL
, referenced
= 0;
1270 int none_or_zero
= 0, shared
= 0;
1271 struct page
*page
= NULL
;
1272 struct folio
*folio
= NULL
;
1273 unsigned long _address
;
1275 int node
= NUMA_NO_NODE
, unmapped
= 0;
1276 bool writable
= false;
1278 VM_BUG_ON(address
& ~HPAGE_PMD_MASK
);
1280 result
= find_pmd_or_thp_or_none(mm
, address
, &pmd
);
1281 if (result
!= SCAN_SUCCEED
)
1284 memset(cc
->node_load
, 0, sizeof(cc
->node_load
));
1285 nodes_clear(cc
->alloc_nmask
);
1286 pte
= pte_offset_map_lock(mm
, pmd
, address
, &ptl
);
1288 result
= SCAN_PMD_NULL
;
1292 for (_address
= address
, _pte
= pte
; _pte
< pte
+ HPAGE_PMD_NR
;
1293 _pte
++, _address
+= PAGE_SIZE
) {
1294 pte_t pteval
= ptep_get(_pte
);
1295 if (is_swap_pte(pteval
)) {
1297 if (!cc
->is_khugepaged
||
1298 unmapped
<= khugepaged_max_ptes_swap
) {
1300 * Always be strict with uffd-wp
1301 * enabled swap entries. Please see
1302 * comment below for pte_uffd_wp().
1304 if (pte_swp_uffd_wp_any(pteval
)) {
1305 result
= SCAN_PTE_UFFD_WP
;
1310 result
= SCAN_EXCEED_SWAP_PTE
;
1311 count_vm_event(THP_SCAN_EXCEED_SWAP_PTE
);
1315 if (pte_none(pteval
) || is_zero_pfn(pte_pfn(pteval
))) {
1317 if (!userfaultfd_armed(vma
) &&
1318 (!cc
->is_khugepaged
||
1319 none_or_zero
<= khugepaged_max_ptes_none
)) {
1322 result
= SCAN_EXCEED_NONE_PTE
;
1323 count_vm_event(THP_SCAN_EXCEED_NONE_PTE
);
1327 if (pte_uffd_wp(pteval
)) {
1329 * Don't collapse the page if any of the small
1330 * PTEs are armed with uffd write protection.
1331 * Here we can also mark the new huge pmd as
1332 * write protected if any of the small ones is
1333 * marked but that could bring unknown
1334 * userfault messages that falls outside of
1335 * the registered range. So, just be simple.
1337 result
= SCAN_PTE_UFFD_WP
;
1340 if (pte_write(pteval
))
1343 page
= vm_normal_page(vma
, _address
, pteval
);
1344 if (unlikely(!page
) || unlikely(is_zone_device_page(page
))) {
1345 result
= SCAN_PAGE_NULL
;
1348 folio
= page_folio(page
);
1350 if (!folio_test_anon(folio
)) {
1351 result
= SCAN_PAGE_ANON
;
1356 * We treat a single page as shared if any part of the THP
1357 * is shared. "False negatives" from
1358 * folio_likely_mapped_shared() are not expected to matter
1361 if (folio_likely_mapped_shared(folio
)) {
1363 if (cc
->is_khugepaged
&&
1364 shared
> khugepaged_max_ptes_shared
) {
1365 result
= SCAN_EXCEED_SHARED_PTE
;
1366 count_vm_event(THP_SCAN_EXCEED_SHARED_PTE
);
1372 * Record which node the original page is from and save this
1373 * information to cc->node_load[].
1374 * Khugepaged will allocate hugepage from the node has the max
1377 node
= folio_nid(folio
);
1378 if (hpage_collapse_scan_abort(node
, cc
)) {
1379 result
= SCAN_SCAN_ABORT
;
1382 cc
->node_load
[node
]++;
1383 if (!folio_test_lru(folio
)) {
1384 result
= SCAN_PAGE_LRU
;
1387 if (folio_test_locked(folio
)) {
1388 result
= SCAN_PAGE_LOCK
;
1393 * Check if the page has any GUP (or other external) pins.
1395 * Here the check may be racy:
1396 * it may see folio_mapcount() > folio_ref_count().
1397 * But such case is ephemeral we could always retry collapse
1398 * later. However it may report false positive if the page
1399 * has excessive GUP pins (i.e. 512). Anyway the same check
1400 * will be done again later the risk seems low.
1402 if (!is_refcount_suitable(folio
)) {
1403 result
= SCAN_PAGE_COUNT
;
1408 * If collapse was initiated by khugepaged, check that there is
1409 * enough young pte to justify collapsing the page
1411 if (cc
->is_khugepaged
&&
1412 (pte_young(pteval
) || folio_test_young(folio
) ||
1413 folio_test_referenced(folio
) || mmu_notifier_test_young(vma
->vm_mm
,
1418 result
= SCAN_PAGE_RO
;
1419 } else if (cc
->is_khugepaged
&&
1421 (unmapped
&& referenced
< HPAGE_PMD_NR
/ 2))) {
1422 result
= SCAN_LACK_REFERENCED_PAGE
;
1424 result
= SCAN_SUCCEED
;
1427 pte_unmap_unlock(pte
, ptl
);
1428 if (result
== SCAN_SUCCEED
) {
1429 result
= collapse_huge_page(mm
, address
, referenced
,
1431 /* collapse_huge_page will return with the mmap_lock released */
1432 *mmap_locked
= false;
1435 trace_mm_khugepaged_scan_pmd(mm
, &folio
->page
, writable
, referenced
,
1436 none_or_zero
, result
, unmapped
);
1440 static void collect_mm_slot(struct khugepaged_mm_slot
*mm_slot
)
1442 struct mm_slot
*slot
= &mm_slot
->slot
;
1443 struct mm_struct
*mm
= slot
->mm
;
1445 lockdep_assert_held(&khugepaged_mm_lock
);
1447 if (hpage_collapse_test_exit(mm
)) {
1449 hash_del(&slot
->hash
);
1450 list_del(&slot
->mm_node
);
1453 * Not strictly needed because the mm exited already.
1455 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1458 /* khugepaged_mm_lock actually not necessary for the below */
1459 mm_slot_free(mm_slot_cache
, mm_slot
);
1465 /* hpage must be locked, and mmap_lock must be held */
1466 static int set_huge_pmd(struct vm_area_struct
*vma
, unsigned long addr
,
1467 pmd_t
*pmdp
, struct page
*hpage
)
1469 struct vm_fault vmf
= {
1476 VM_BUG_ON(!PageTransHuge(hpage
));
1477 mmap_assert_locked(vma
->vm_mm
);
1479 if (do_set_pmd(&vmf
, hpage
))
1483 return SCAN_SUCCEED
;
1487 * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
1490 * @mm: process address space where collapse happens
1491 * @addr: THP collapse address
1492 * @install_pmd: If a huge PMD should be installed
1494 * This function checks whether all the PTEs in the PMD are pointing to the
1495 * right THP. If so, retract the page table so the THP can refault in with
1496 * as pmd-mapped. Possibly install a huge PMD mapping the THP.
1498 int collapse_pte_mapped_thp(struct mm_struct
*mm
, unsigned long addr
,
1501 struct mmu_notifier_range range
;
1502 bool notified
= false;
1503 unsigned long haddr
= addr
& HPAGE_PMD_MASK
;
1504 struct vm_area_struct
*vma
= vma_lookup(mm
, haddr
);
1505 struct folio
*folio
;
1506 pte_t
*start_pte
, *pte
;
1507 pmd_t
*pmd
, pgt_pmd
;
1508 spinlock_t
*pml
= NULL
, *ptl
;
1509 int nr_ptes
= 0, result
= SCAN_FAIL
;
1512 mmap_assert_locked(mm
);
1514 /* First check VMA found, in case page tables are being torn down */
1515 if (!vma
|| !vma
->vm_file
||
1516 !range_in_vma(vma
, haddr
, haddr
+ HPAGE_PMD_SIZE
))
1517 return SCAN_VMA_CHECK
;
1519 /* Fast check before locking page if already PMD-mapped */
1520 result
= find_pmd_or_thp_or_none(mm
, haddr
, &pmd
);
1521 if (result
== SCAN_PMD_MAPPED
)
1525 * If we are here, we've succeeded in replacing all the native pages
1526 * in the page cache with a single hugepage. If a mm were to fault-in
1527 * this memory (mapped by a suitably aligned VMA), we'd get the hugepage
1528 * and map it by a PMD, regardless of sysfs THP settings. As such, let's
1529 * analogously elide sysfs THP settings here.
1531 if (!thp_vma_allowable_order(vma
, vma
->vm_flags
, 0, PMD_ORDER
))
1532 return SCAN_VMA_CHECK
;
1534 /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
1535 if (userfaultfd_wp(vma
))
1536 return SCAN_PTE_UFFD_WP
;
1538 folio
= filemap_lock_folio(vma
->vm_file
->f_mapping
,
1539 linear_page_index(vma
, haddr
));
1541 return SCAN_PAGE_NULL
;
1543 if (folio_order(folio
) != HPAGE_PMD_ORDER
) {
1544 result
= SCAN_PAGE_COMPOUND
;
1548 result
= find_pmd_or_thp_or_none(mm
, haddr
, &pmd
);
1554 * All pte entries have been removed and pmd cleared.
1555 * Skip all the pte checks and just update the pmd mapping.
1557 goto maybe_install_pmd
;
1563 start_pte
= pte_offset_map_lock(mm
, pmd
, haddr
, &ptl
);
1564 if (!start_pte
) /* mmap_lock + page lock should prevent this */
1567 /* step 1: check all mapped PTEs are to the right huge page */
1568 for (i
= 0, addr
= haddr
, pte
= start_pte
;
1569 i
< HPAGE_PMD_NR
; i
++, addr
+= PAGE_SIZE
, pte
++) {
1571 pte_t ptent
= ptep_get(pte
);
1573 /* empty pte, skip */
1574 if (pte_none(ptent
))
1577 /* page swapped out, abort */
1578 if (!pte_present(ptent
)) {
1579 result
= SCAN_PTE_NON_PRESENT
;
1583 page
= vm_normal_page(vma
, addr
, ptent
);
1584 if (WARN_ON_ONCE(page
&& is_zone_device_page(page
)))
1587 * Note that uprobe, debugger, or MAP_PRIVATE may change the
1588 * page table, but the new page will not be a subpage of hpage.
1590 if (folio_page(folio
, i
) != page
)
1594 pte_unmap_unlock(start_pte
, ptl
);
1595 mmu_notifier_range_init(&range
, MMU_NOTIFY_CLEAR
, 0, mm
,
1596 haddr
, haddr
+ HPAGE_PMD_SIZE
);
1597 mmu_notifier_invalidate_range_start(&range
);
1601 * pmd_lock covers a wider range than ptl, and (if split from mm's
1602 * page_table_lock) ptl nests inside pml. The less time we hold pml,
1603 * the better; but userfaultfd's mfill_atomic_pte() on a private VMA
1604 * inserts a valid as-if-COWed PTE without even looking up page cache.
1605 * So page lock of folio does not protect from it, so we must not drop
1606 * ptl before pgt_pmd is removed, so uffd private needs pml taken now.
1608 if (userfaultfd_armed(vma
) && !(vma
->vm_flags
& VM_SHARED
))
1609 pml
= pmd_lock(mm
, pmd
);
1611 start_pte
= pte_offset_map_rw_nolock(mm
, pmd
, haddr
, &pgt_pmd
, &ptl
);
1612 if (!start_pte
) /* mmap_lock + page lock should prevent this */
1616 else if (ptl
!= pml
)
1617 spin_lock_nested(ptl
, SINGLE_DEPTH_NESTING
);
1619 if (unlikely(!pmd_same(pgt_pmd
, pmdp_get_lockless(pmd
))))
1622 /* step 2: clear page table and adjust rmap */
1623 for (i
= 0, addr
= haddr
, pte
= start_pte
;
1624 i
< HPAGE_PMD_NR
; i
++, addr
+= PAGE_SIZE
, pte
++) {
1626 pte_t ptent
= ptep_get(pte
);
1628 if (pte_none(ptent
))
1631 * We dropped ptl after the first scan, to do the mmu_notifier:
1632 * page lock stops more PTEs of the folio being faulted in, but
1633 * does not stop write faults COWing anon copies from existing
1634 * PTEs; and does not stop those being swapped out or migrated.
1636 if (!pte_present(ptent
)) {
1637 result
= SCAN_PTE_NON_PRESENT
;
1640 page
= vm_normal_page(vma
, addr
, ptent
);
1641 if (folio_page(folio
, i
) != page
)
1645 * Must clear entry, or a racing truncate may re-remove it.
1646 * TLB flush can be left until pmdp_collapse_flush() does it.
1647 * PTE dirty? Shmem page is already dirty; file is read-only.
1649 ptep_clear(mm
, addr
, pte
);
1650 folio_remove_rmap_pte(folio
, page
, vma
);
1657 /* step 3: set proper refcount and mm_counters. */
1659 folio_ref_sub(folio
, nr_ptes
);
1660 add_mm_counter(mm
, mm_counter_file(folio
), -nr_ptes
);
1663 /* step 4: remove empty page table */
1665 pml
= pmd_lock(mm
, pmd
);
1667 spin_lock_nested(ptl
, SINGLE_DEPTH_NESTING
);
1668 if (unlikely(!pmd_same(pgt_pmd
, pmdp_get_lockless(pmd
)))) {
1674 pgt_pmd
= pmdp_collapse_flush(vma
, haddr
, pmd
);
1675 pmdp_get_lockless_sync();
1676 pte_unmap_unlock(start_pte
, ptl
);
1680 mmu_notifier_invalidate_range_end(&range
);
1683 page_table_check_pte_clear_range(mm
, haddr
, pgt_pmd
);
1684 pte_free_defer(mm
, pmd_pgtable(pgt_pmd
));
1687 /* step 5: install pmd entry */
1688 result
= install_pmd
1689 ? set_huge_pmd(vma
, haddr
, pmd
, &folio
->page
)
1695 folio_ref_sub(folio
, nr_ptes
);
1696 add_mm_counter(mm
, mm_counter_file(folio
), -nr_ptes
);
1700 pte_unmap_unlock(start_pte
, ptl
);
1701 if (pml
&& pml
!= ptl
)
1704 mmu_notifier_invalidate_range_end(&range
);
1706 folio_unlock(folio
);
1711 static void retract_page_tables(struct address_space
*mapping
, pgoff_t pgoff
)
1713 struct vm_area_struct
*vma
;
1715 i_mmap_lock_read(mapping
);
1716 vma_interval_tree_foreach(vma
, &mapping
->i_mmap
, pgoff
, pgoff
) {
1717 struct mmu_notifier_range range
;
1718 struct mm_struct
*mm
;
1720 pmd_t
*pmd
, pgt_pmd
;
1723 bool skipped_uffd
= false;
1726 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
1727 * got written to. These VMAs are likely not worth removing
1728 * page tables from, as PMD-mapping is likely to be split later.
1730 if (READ_ONCE(vma
->anon_vma
))
1733 addr
= vma
->vm_start
+ ((pgoff
- vma
->vm_pgoff
) << PAGE_SHIFT
);
1734 if (addr
& ~HPAGE_PMD_MASK
||
1735 vma
->vm_end
< addr
+ HPAGE_PMD_SIZE
)
1739 if (find_pmd_or_thp_or_none(mm
, addr
, &pmd
) != SCAN_SUCCEED
)
1742 if (hpage_collapse_test_exit(mm
))
1745 * When a vma is registered with uffd-wp, we cannot recycle
1746 * the page table because there may be pte markers installed.
1747 * Other vmas can still have the same file mapped hugely, but
1748 * skip this one: it will always be mapped in small page size
1749 * for uffd-wp registered ranges.
1751 if (userfaultfd_wp(vma
))
1754 /* PTEs were notified when unmapped; but now for the PMD? */
1755 mmu_notifier_range_init(&range
, MMU_NOTIFY_CLEAR
, 0, mm
,
1756 addr
, addr
+ HPAGE_PMD_SIZE
);
1757 mmu_notifier_invalidate_range_start(&range
);
1759 pml
= pmd_lock(mm
, pmd
);
1760 ptl
= pte_lockptr(mm
, pmd
);
1762 spin_lock_nested(ptl
, SINGLE_DEPTH_NESTING
);
1765 * Huge page lock is still held, so normally the page table
1766 * must remain empty; and we have already skipped anon_vma
1767 * and userfaultfd_wp() vmas. But since the mmap_lock is not
1768 * held, it is still possible for a racing userfaultfd_ioctl()
1769 * to have inserted ptes or markers. Now that we hold ptlock,
1770 * repeating the anon_vma check protects from one category,
1771 * and repeating the userfaultfd_wp() check from another.
1773 if (unlikely(vma
->anon_vma
|| userfaultfd_wp(vma
))) {
1774 skipped_uffd
= true;
1776 pgt_pmd
= pmdp_collapse_flush(vma
, addr
, pmd
);
1777 pmdp_get_lockless_sync();
1784 mmu_notifier_invalidate_range_end(&range
);
1786 if (!skipped_uffd
) {
1788 page_table_check_pte_clear_range(mm
, addr
, pgt_pmd
);
1789 pte_free_defer(mm
, pmd_pgtable(pgt_pmd
));
1792 i_mmap_unlock_read(mapping
);
1796 * collapse_file - collapse filemap/tmpfs/shmem pages into huge one.
1798 * @mm: process address space where collapse happens
1799 * @addr: virtual collapse start address
1800 * @file: file that collapse on
1801 * @start: collapse start address
1802 * @cc: collapse context and scratchpad
1804 * Basic scheme is simple, details are more complex:
1805 * - allocate and lock a new huge page;
1806 * - scan page cache, locking old pages
1807 * + swap/gup in pages if necessary;
1808 * - copy data to new page
1809 * - handle shmem holes
1810 * + re-validate that holes weren't filled by someone else
1811 * + check for userfaultfd
1812 * - finalize updates to the page cache;
1813 * - if replacing succeeds:
1814 * + unlock huge page;
1816 * - if replacing failed;
1817 * + unlock old pages
1818 * + unlock and free huge page;
1820 static int collapse_file(struct mm_struct
*mm
, unsigned long addr
,
1821 struct file
*file
, pgoff_t start
,
1822 struct collapse_control
*cc
)
1824 struct address_space
*mapping
= file
->f_mapping
;
1826 struct folio
*folio
, *tmp
, *new_folio
;
1827 pgoff_t index
= 0, end
= start
+ HPAGE_PMD_NR
;
1828 LIST_HEAD(pagelist
);
1829 XA_STATE_ORDER(xas
, &mapping
->i_pages
, start
, HPAGE_PMD_ORDER
);
1830 int nr_none
= 0, result
= SCAN_SUCCEED
;
1831 bool is_shmem
= shmem_file(file
);
1833 VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS
) && !is_shmem
);
1834 VM_BUG_ON(start
& (HPAGE_PMD_NR
- 1));
1836 result
= alloc_charge_folio(&new_folio
, mm
, cc
);
1837 if (result
!= SCAN_SUCCEED
)
1840 __folio_set_locked(new_folio
);
1842 __folio_set_swapbacked(new_folio
);
1843 new_folio
->index
= start
;
1844 new_folio
->mapping
= mapping
;
1847 * Ensure we have slots for all the pages in the range. This is
1848 * almost certainly a no-op because most of the pages must be present
1852 xas_create_range(&xas
);
1853 if (!xas_error(&xas
))
1855 xas_unlock_irq(&xas
);
1856 if (!xas_nomem(&xas
, GFP_KERNEL
)) {
1862 for (index
= start
; index
< end
;) {
1863 xas_set(&xas
, index
);
1864 folio
= xas_load(&xas
);
1866 VM_BUG_ON(index
!= xas
.xa_index
);
1870 * Stop if extent has been truncated or
1871 * hole-punched, and is now completely
1874 if (index
== start
) {
1875 if (!xas_next_entry(&xas
, end
- 1)) {
1876 result
= SCAN_TRUNCATED
;
1885 if (xa_is_value(folio
) || !folio_test_uptodate(folio
)) {
1886 xas_unlock_irq(&xas
);
1887 /* swap in or instantiate fallocated page */
1888 if (shmem_get_folio(mapping
->host
, index
, 0,
1889 &folio
, SGP_NOALLOC
)) {
1893 /* drain lru cache to help folio_isolate_lru() */
1895 } else if (folio_trylock(folio
)) {
1897 xas_unlock_irq(&xas
);
1899 result
= SCAN_PAGE_LOCK
;
1902 } else { /* !is_shmem */
1903 if (!folio
|| xa_is_value(folio
)) {
1904 xas_unlock_irq(&xas
);
1905 page_cache_sync_readahead(mapping
, &file
->f_ra
,
1908 /* drain lru cache to help folio_isolate_lru() */
1910 folio
= filemap_lock_folio(mapping
, index
);
1911 if (IS_ERR(folio
)) {
1915 } else if (folio_test_dirty(folio
)) {
1917 * khugepaged only works on read-only fd,
1918 * so this page is dirty because it hasn't
1919 * been flushed since first write. There
1920 * won't be new dirty pages.
1922 * Trigger async flush here and hope the
1923 * writeback is done when khugepaged
1924 * revisits this page.
1926 * This is a one-off situation. We are not
1927 * forcing writeback in loop.
1929 xas_unlock_irq(&xas
);
1930 filemap_flush(mapping
);
1933 } else if (folio_test_writeback(folio
)) {
1934 xas_unlock_irq(&xas
);
1937 } else if (folio_trylock(folio
)) {
1939 xas_unlock_irq(&xas
);
1941 result
= SCAN_PAGE_LOCK
;
1947 * The folio must be locked, so we can drop the i_pages lock
1948 * without racing with truncate.
1950 VM_BUG_ON_FOLIO(!folio_test_locked(folio
), folio
);
1952 /* make sure the folio is up to date */
1953 if (unlikely(!folio_test_uptodate(folio
))) {
1959 * If file was truncated then extended, or hole-punched, before
1960 * we locked the first folio, then a THP might be there already.
1961 * This will be discovered on the first iteration.
1963 if (folio_order(folio
) == HPAGE_PMD_ORDER
&&
1964 folio
->index
== start
) {
1965 /* Maybe PMD-mapped */
1966 result
= SCAN_PTE_MAPPED_HUGEPAGE
;
1970 if (folio_mapping(folio
) != mapping
) {
1971 result
= SCAN_TRUNCATED
;
1975 if (!is_shmem
&& (folio_test_dirty(folio
) ||
1976 folio_test_writeback(folio
))) {
1978 * khugepaged only works on read-only fd, so this
1979 * folio is dirty because it hasn't been flushed
1980 * since first write.
1986 if (!folio_isolate_lru(folio
)) {
1987 result
= SCAN_DEL_PAGE_LRU
;
1991 if (!filemap_release_folio(folio
, GFP_KERNEL
)) {
1992 result
= SCAN_PAGE_HAS_PRIVATE
;
1993 folio_putback_lru(folio
);
1997 if (folio_mapped(folio
))
1999 TTU_IGNORE_MLOCK
| TTU_BATCH_FLUSH
);
2003 VM_BUG_ON_FOLIO(folio
!= xa_load(xas
.xa
, index
), folio
);
2006 * We control 2 + nr_pages references to the folio:
2007 * - we hold a pin on it;
2008 * - nr_pages reference from page cache;
2009 * - one from lru_isolate_folio;
2010 * If those are the only references, then any new usage
2011 * of the folio will have to fetch it from the page
2012 * cache. That requires locking the folio to handle
2013 * truncate, so any new usage will be blocked until we
2014 * unlock folio after collapse/during rollback.
2016 if (folio_ref_count(folio
) != 2 + folio_nr_pages(folio
)) {
2017 result
= SCAN_PAGE_COUNT
;
2018 xas_unlock_irq(&xas
);
2019 folio_putback_lru(folio
);
2024 * Accumulate the folios that are being collapsed.
2026 list_add_tail(&folio
->lru
, &pagelist
);
2027 index
+= folio_nr_pages(folio
);
2030 folio_unlock(folio
);
2036 filemap_nr_thps_inc(mapping
);
2038 * Paired with the fence in do_dentry_open() -> get_write_access()
2039 * to ensure i_writecount is up to date and the update to nr_thps
2040 * is visible. Ensures the page cache will be truncated if the
2041 * file is opened writable.
2044 if (inode_is_open_for_write(mapping
->host
)) {
2046 filemap_nr_thps_dec(mapping
);
2051 xas_unlock_irq(&xas
);
2055 * If collapse is successful, flush must be done now before copying.
2056 * If collapse is unsuccessful, does flush actually need to be done?
2057 * Do it anyway, to clear the state.
2059 try_to_unmap_flush();
2061 if (result
== SCAN_SUCCEED
&& nr_none
&&
2062 !shmem_charge(mapping
->host
, nr_none
))
2064 if (result
!= SCAN_SUCCEED
) {
2070 * The old folios are locked, so they won't change anymore.
2073 dst
= folio_page(new_folio
, 0);
2074 list_for_each_entry(folio
, &pagelist
, lru
) {
2075 int i
, nr_pages
= folio_nr_pages(folio
);
2077 while (index
< folio
->index
) {
2078 clear_highpage(dst
);
2083 for (i
= 0; i
< nr_pages
; i
++) {
2084 if (copy_mc_highpage(dst
, folio_page(folio
, i
)) > 0) {
2085 result
= SCAN_COPY_MC
;
2092 while (index
< end
) {
2093 clear_highpage(dst
);
2099 struct vm_area_struct
*vma
;
2100 int nr_none_check
= 0;
2102 i_mmap_lock_read(mapping
);
2105 xas_set(&xas
, start
);
2106 for (index
= start
; index
< end
; index
++) {
2107 if (!xas_next(&xas
)) {
2108 xas_store(&xas
, XA_RETRY_ENTRY
);
2109 if (xas_error(&xas
)) {
2110 result
= SCAN_STORE_FAILED
;
2117 if (nr_none
!= nr_none_check
) {
2118 result
= SCAN_PAGE_FILLED
;
2123 * If userspace observed a missing page in a VMA with
2124 * a MODE_MISSING userfaultfd, then it might expect a
2125 * UFFD_EVENT_PAGEFAULT for that page. If so, we need to
2126 * roll back to avoid suppressing such an event. Since
2127 * wp/minor userfaultfds don't give userspace any
2128 * guarantees that the kernel doesn't fill a missing
2129 * page with a zero page, so they don't matter here.
2131 * Any userfaultfds registered after this point will
2132 * not be able to observe any missing pages due to the
2133 * previously inserted retry entries.
2135 vma_interval_tree_foreach(vma
, &mapping
->i_mmap
, start
, end
) {
2136 if (userfaultfd_missing(vma
)) {
2137 result
= SCAN_EXCEED_NONE_PTE
;
2143 i_mmap_unlock_read(mapping
);
2144 if (result
!= SCAN_SUCCEED
) {
2145 xas_set(&xas
, start
);
2146 for (index
= start
; index
< end
; index
++) {
2147 if (xas_next(&xas
) == XA_RETRY_ENTRY
)
2148 xas_store(&xas
, NULL
);
2151 xas_unlock_irq(&xas
);
2159 __lruvec_stat_mod_folio(new_folio
, NR_SHMEM_THPS
, HPAGE_PMD_NR
);
2161 __lruvec_stat_mod_folio(new_folio
, NR_FILE_THPS
, HPAGE_PMD_NR
);
2164 __lruvec_stat_mod_folio(new_folio
, NR_FILE_PAGES
, nr_none
);
2165 /* nr_none is always 0 for non-shmem. */
2166 __lruvec_stat_mod_folio(new_folio
, NR_SHMEM
, nr_none
);
2170 * Mark new_folio as uptodate before inserting it into the
2171 * page cache so that it isn't mistaken for an fallocated but
2174 folio_mark_uptodate(new_folio
);
2175 folio_ref_add(new_folio
, HPAGE_PMD_NR
- 1);
2178 folio_mark_dirty(new_folio
);
2179 folio_add_lru(new_folio
);
2181 /* Join all the small entries into a single multi-index entry. */
2182 xas_set_order(&xas
, start
, HPAGE_PMD_ORDER
);
2183 xas_store(&xas
, new_folio
);
2184 WARN_ON_ONCE(xas_error(&xas
));
2185 xas_unlock_irq(&xas
);
2188 * Remove pte page tables, so we can re-fault the page as huge.
2189 * If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp().
2191 retract_page_tables(mapping
, start
);
2192 if (cc
&& !cc
->is_khugepaged
)
2193 result
= SCAN_PTE_MAPPED_HUGEPAGE
;
2194 folio_unlock(new_folio
);
2197 * The collapse has succeeded, so free the old folios.
2199 list_for_each_entry_safe(folio
, tmp
, &pagelist
, lru
) {
2200 list_del(&folio
->lru
);
2201 folio
->mapping
= NULL
;
2202 folio_clear_active(folio
);
2203 folio_clear_unevictable(folio
);
2204 folio_unlock(folio
);
2205 folio_put_refs(folio
, 2 + folio_nr_pages(folio
));
2211 /* Something went wrong: roll back page cache changes */
2214 mapping
->nrpages
-= nr_none
;
2215 xas_unlock_irq(&xas
);
2216 shmem_uncharge(mapping
->host
, nr_none
);
2219 list_for_each_entry_safe(folio
, tmp
, &pagelist
, lru
) {
2220 list_del(&folio
->lru
);
2221 folio_unlock(folio
);
2222 folio_putback_lru(folio
);
2226 * Undo the updates of filemap_nr_thps_inc for non-SHMEM
2227 * file only. This undo is not needed unless failure is
2228 * due to SCAN_COPY_MC.
2230 if (!is_shmem
&& result
== SCAN_COPY_MC
) {
2231 filemap_nr_thps_dec(mapping
);
2233 * Paired with the fence in do_dentry_open() -> get_write_access()
2234 * to ensure the update to nr_thps is visible.
2239 new_folio
->mapping
= NULL
;
2241 folio_unlock(new_folio
);
2242 folio_put(new_folio
);
2244 VM_BUG_ON(!list_empty(&pagelist
));
2245 trace_mm_khugepaged_collapse_file(mm
, new_folio
, index
, addr
, is_shmem
, file
, HPAGE_PMD_NR
, result
);
2249 static int hpage_collapse_scan_file(struct mm_struct
*mm
, unsigned long addr
,
2250 struct file
*file
, pgoff_t start
,
2251 struct collapse_control
*cc
)
2253 struct folio
*folio
= NULL
;
2254 struct address_space
*mapping
= file
->f_mapping
;
2255 XA_STATE(xas
, &mapping
->i_pages
, start
);
2257 int node
= NUMA_NO_NODE
;
2258 int result
= SCAN_SUCCEED
;
2262 memset(cc
->node_load
, 0, sizeof(cc
->node_load
));
2263 nodes_clear(cc
->alloc_nmask
);
2265 xas_for_each(&xas
, folio
, start
+ HPAGE_PMD_NR
- 1) {
2266 if (xas_retry(&xas
, folio
))
2269 if (xa_is_value(folio
)) {
2270 swap
+= 1 << xas_get_order(&xas
);
2271 if (cc
->is_khugepaged
&&
2272 swap
> khugepaged_max_ptes_swap
) {
2273 result
= SCAN_EXCEED_SWAP_PTE
;
2274 count_vm_event(THP_SCAN_EXCEED_SWAP_PTE
);
2280 if (folio_order(folio
) == HPAGE_PMD_ORDER
&&
2281 folio
->index
== start
) {
2282 /* Maybe PMD-mapped */
2283 result
= SCAN_PTE_MAPPED_HUGEPAGE
;
2285 * For SCAN_PTE_MAPPED_HUGEPAGE, further processing
2286 * by the caller won't touch the page cache, and so
2287 * it's safe to skip LRU and refcount checks before
2293 node
= folio_nid(folio
);
2294 if (hpage_collapse_scan_abort(node
, cc
)) {
2295 result
= SCAN_SCAN_ABORT
;
2298 cc
->node_load
[node
]++;
2300 if (!folio_test_lru(folio
)) {
2301 result
= SCAN_PAGE_LRU
;
2305 if (!is_refcount_suitable(folio
)) {
2306 result
= SCAN_PAGE_COUNT
;
2311 * We probably should check if the folio is referenced
2312 * here, but nobody would transfer pte_young() to
2313 * folio_test_referenced() for us. And rmap walk here
2314 * is just too costly...
2317 present
+= folio_nr_pages(folio
);
2319 if (need_resched()) {
2326 if (result
== SCAN_SUCCEED
) {
2327 if (cc
->is_khugepaged
&&
2328 present
< HPAGE_PMD_NR
- khugepaged_max_ptes_none
) {
2329 result
= SCAN_EXCEED_NONE_PTE
;
2330 count_vm_event(THP_SCAN_EXCEED_NONE_PTE
);
2332 result
= collapse_file(mm
, addr
, file
, start
, cc
);
2336 trace_mm_khugepaged_scan_file(mm
, folio
, file
, present
, swap
, result
);
#else
static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
				    struct file *file, pgoff_t start,
				    struct collapse_control *cc)
{
	BUILD_BUG();
}
#endif
static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
					    struct collapse_control *cc)
	__releases(&khugepaged_mm_lock)
	__acquires(&khugepaged_mm_lock)
{
	struct vma_iterator vmi;
	struct khugepaged_mm_slot *mm_slot;
	struct mm_slot *slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int progress = 0;

	VM_BUG_ON(!pages);
	lockdep_assert_held(&khugepaged_mm_lock);
	*result = SCAN_FAIL;

	if (khugepaged_scan.mm_slot) {
		mm_slot = khugepaged_scan.mm_slot;
		slot = &mm_slot->slot;
	} else {
		slot = list_entry(khugepaged_scan.mm_head.next,
				  struct mm_slot, mm_node);
		mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
		khugepaged_scan.address = 0;
		khugepaged_scan.mm_slot = mm_slot;
	}
	spin_unlock(&khugepaged_mm_lock);

	mm = slot->mm;
	/*
	 * Don't wait for semaphore (to avoid long wait times). Just move to
	 * the next mm on the list.
	 */
	vma = NULL;
	if (unlikely(!mmap_read_trylock(mm)))
		goto breakouterloop_mmap_lock;

	progress++;
	if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
		goto breakouterloop;

	vma_iter_init(&vmi, mm, khugepaged_scan.address);
	for_each_vma(vmi, vma) {
		unsigned long hstart, hend;

		cond_resched();
		if (unlikely(hpage_collapse_test_exit_or_disable(mm))) {
			progress++;
			break;
		}
		if (!thp_vma_allowable_order(vma, vma->vm_flags,
					     TVA_ENFORCE_SYSFS, PMD_ORDER)) {
skip:
			progress++;
			continue;
		}
		hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
		hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
		if (khugepaged_scan.address > hend)
			goto skip;
		if (khugepaged_scan.address < hstart)
			khugepaged_scan.address = hstart;
		VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);

		while (khugepaged_scan.address < hend) {
			bool mmap_locked = true;

			cond_resched();
			if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
				goto breakouterloop;

			VM_BUG_ON(khugepaged_scan.address < hstart ||
				  khugepaged_scan.address + HPAGE_PMD_SIZE >
				  hend);
			if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
				struct file *file = get_file(vma->vm_file);
				pgoff_t pgoff = linear_page_index(vma,
						khugepaged_scan.address);

				mmap_read_unlock(mm);
				mmap_locked = false;
				*result = hpage_collapse_scan_file(mm,
					khugepaged_scan.address, file, pgoff, cc);
				fput(file);
				if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
					mmap_read_lock(mm);
					if (hpage_collapse_test_exit_or_disable(mm))
						goto breakouterloop;
					*result = collapse_pte_mapped_thp(mm,
						khugepaged_scan.address, false);
					if (*result == SCAN_PMD_MAPPED)
						*result = SCAN_SUCCEED;
					mmap_read_unlock(mm);
				}
			} else {
				*result = hpage_collapse_scan_pmd(mm, vma,
					khugepaged_scan.address, &mmap_locked, cc);
			}

			if (*result == SCAN_SUCCEED)
				++khugepaged_pages_collapsed;

			/* move to next address */
			khugepaged_scan.address += HPAGE_PMD_SIZE;
			progress += HPAGE_PMD_NR;
			if (!mmap_locked)
				/*
				 * We released mmap_lock so break loop. Note
				 * that we drop mmap_lock before all hugepage
				 * allocations, so if allocation fails, we are
				 * guaranteed to break here and report the
				 * correct result back to caller.
				 */
				goto breakouterloop_mmap_lock;
			if (progress >= pages)
				goto breakouterloop;
		}
	}
breakouterloop:
	mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
breakouterloop_mmap_lock:

	spin_lock(&khugepaged_mm_lock);
	VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
	/*
	 * Release the current mm_slot if this mm is about to die, or
	 * if we scanned all vmas of this mm.
	 */
	if (hpage_collapse_test_exit(mm) || !vma) {
		/*
		 * Make sure that if mm_users is reaching zero while
		 * khugepaged runs here, khugepaged_exit will find
		 * mm_slot not pointing to the exiting mm.
		 */
		if (slot->mm_node.next != &khugepaged_scan.mm_head) {
			slot = list_entry(slot->mm_node.next,
					  struct mm_slot, mm_node);
			khugepaged_scan.mm_slot =
				mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
			khugepaged_scan.address = 0;
		} else {
			khugepaged_scan.mm_slot = NULL;
			khugepaged_full_scans++;
		}

		collect_mm_slot(mm_slot);
	}

	return progress;
}
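/*
 * Progress accounting note: each PMD-sized range scanned above adds
 * HPAGE_PMD_NR to "progress", so a single call into this function covers at
 * most roughly pages / HPAGE_PMD_NR such ranges before it bails out through
 * breakouterloop and returns the progress made so far to
 * khugepaged_do_scan(), which then deducts it from its per-pass budget.
 */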
static int khugepaged_has_work(void)
{
	return !list_empty(&khugepaged_scan.mm_head) && hugepage_pmd_enabled();
}

static int khugepaged_wait_event(void)
{
	return !list_empty(&khugepaged_scan.mm_head) ||
	       kthread_should_stop();
}
static void khugepaged_do_scan(struct collapse_control *cc)
{
	unsigned int progress = 0, pass_through_head = 0;
	unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
	bool wait = true;
	int result = SCAN_SUCCEED;

	lru_add_drain_all();

	while (true) {
		cond_resched();

		if (unlikely(kthread_should_stop()))
			break;

		spin_lock(&khugepaged_mm_lock);
		if (!khugepaged_scan.mm_slot)
			pass_through_head++;
		if (khugepaged_has_work() &&
		    pass_through_head < 2)
			progress += khugepaged_scan_mm_slot(pages - progress,
							    &result, cc);
		else
			progress = pages;
		spin_unlock(&khugepaged_mm_lock);

		if (progress >= pages)
			break;

		if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) {
			/*
			 * If fail to allocate the first time, try to sleep for
			 * a while. When hit again, cancel the scan.
			 */
			if (!wait)
				break;
			wait = false;
			khugepaged_alloc_sleep();
		}
	}
}
static bool khugepaged_should_wakeup(void)
{
	return kthread_should_stop() ||
	       time_after_eq(jiffies, khugepaged_sleep_expire);
}

static void khugepaged_wait_work(void)
{
	if (khugepaged_has_work()) {
		const unsigned long scan_sleep_jiffies =
			msecs_to_jiffies(khugepaged_scan_sleep_millisecs);

		if (!scan_sleep_jiffies)
			return;

		khugepaged_sleep_expire = jiffies + scan_sleep_jiffies;
		wait_event_freezable_timeout(khugepaged_wait,
					     khugepaged_should_wakeup(),
					     scan_sleep_jiffies);
		return;
	}

	if (hugepage_pmd_enabled())
		wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
}
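/*
 * Illustrative tuning note (assumes the standard transparent hugepage sysfs
 * layout): the sleep between scan passes used above can be shortened at
 * runtime, e.g. from a shell:
 *
 *	echo 1000 > /sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs
 *
 * so a shorter interval takes effect at the next wakeup.
 */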
static int khugepaged(void *none)
{
	struct khugepaged_mm_slot *mm_slot;

	set_freezable();
	set_user_nice(current, MAX_NICE);

	while (!kthread_should_stop()) {
		khugepaged_do_scan(&khugepaged_collapse_control);
		khugepaged_wait_work();
	}

	spin_lock(&khugepaged_mm_lock);
	mm_slot = khugepaged_scan.mm_slot;
	khugepaged_scan.mm_slot = NULL;
	if (mm_slot)
		collect_mm_slot(mm_slot);
	spin_unlock(&khugepaged_mm_lock);
	return 0;
}
static void set_recommended_min_free_kbytes(void)
{
	struct zone *zone;
	int nr_zones = 0;
	unsigned long recommended_min;

	if (!hugepage_pmd_enabled()) {
		calculate_min_free_kbytes();
		goto update_wmarks;
	}

	for_each_populated_zone(zone) {
		/*
		 * We don't need to worry about fragmentation of
		 * ZONE_MOVABLE since it only has movable pages.
		 */
		if (zone_idx(zone) > gfp_zone(GFP_USER))
			continue;

		nr_zones++;
	}

	/* Ensure 2 pageblocks are free to assist fragmentation avoidance */
	recommended_min = pageblock_nr_pages * nr_zones * 2;

	/*
	 * Make sure that on average at least two pageblocks are almost free
	 * of another type, one for a migratetype to fall back to and a
	 * second to avoid subsequent fallbacks of other types. There are 3
	 * MIGRATE_TYPES we care about.
	 */
	recommended_min += pageblock_nr_pages * nr_zones *
			   MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;

	/* never reserve more than 5% of the lowmem */
	recommended_min = min(recommended_min,
			      (unsigned long) nr_free_buffer_pages() / 20);
	recommended_min <<= (PAGE_SHIFT-10);

	if (recommended_min > min_free_kbytes) {
		if (user_min_free_kbytes >= 0)
			pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
				min_free_kbytes, recommended_min);

		min_free_kbytes = recommended_min;
	}

update_wmarks:
	setup_per_zone_wmarks();
}
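/*
 * Worked example (illustrative; assumes 4KiB base pages, 2MiB pageblocks so
 * pageblock_nr_pages == 512, MIGRATE_PCPTYPES == 3, and a single eligible
 * zone): the reservation computed above is 512 * 1 * 2 + 512 * 1 * 3 * 3 =
 * 5632 pages, which after the shift by PAGE_SHIFT - 10 is 22528 kB, still
 * subject to the 5%-of-lowmem cap before min_free_kbytes is raised.
 */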
int start_stop_khugepaged(void)
{
	int err = 0;

	mutex_lock(&khugepaged_mutex);
	if (hugepage_pmd_enabled()) {
		if (!khugepaged_thread)
			khugepaged_thread = kthread_run(khugepaged, NULL,
							"khugepaged");
		if (IS_ERR(khugepaged_thread)) {
			pr_err("khugepaged: kthread_run(khugepaged) failed\n");
			err = PTR_ERR(khugepaged_thread);
			khugepaged_thread = NULL;
			goto fail;
		}

		if (!list_empty(&khugepaged_scan.mm_head))
			wake_up_interruptible(&khugepaged_wait);
	} else if (khugepaged_thread) {
		kthread_stop(khugepaged_thread);
		khugepaged_thread = NULL;
	}
	set_recommended_min_free_kbytes();
fail:
	mutex_unlock(&khugepaged_mutex);
	return err;
}
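/*
 * Note that set_recommended_min_free_kbytes() above runs in both directions:
 * when the daemon is (re)started it raises min_free_kbytes, and once
 * hugepage_pmd_enabled() turns false it falls back to
 * calculate_min_free_kbytes(), so the watermark bump is undone when PMD-sized
 * THP is disabled.
 */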
void khugepaged_min_free_kbytes_update(void)
{
	mutex_lock(&khugepaged_mutex);
	if (hugepage_pmd_enabled() && khugepaged_thread)
		set_recommended_min_free_kbytes();
	mutex_unlock(&khugepaged_mutex);
}
bool current_is_khugepaged(void)
{
	return kthread_func(current) == khugepaged;
}
static int madvise_collapse_errno(enum scan_result r)
{
	/*
	 * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide
	 * actionable feedback to the caller, so they may take an appropriate
	 * fallback measure depending on the nature of the failure.
	 */
	switch (r) {
	case SCAN_ALLOC_HUGE_PAGE_FAIL:
		return -ENOMEM;
	case SCAN_CGROUP_CHARGE_FAIL:
	case SCAN_EXCEED_NONE_PTE:
		return -EBUSY;
	/* Resource temporarily unavailable - trying again might succeed */
	case SCAN_PAGE_COUNT:
	case SCAN_PAGE_LOCK:
	case SCAN_PAGE_LRU:
	case SCAN_DEL_PAGE_LRU:
	case SCAN_PAGE_FILLED:
		return -EAGAIN;
	/*
	 * Other: trying again is unlikely to succeed, or the error is
	 * intrinsic to the specified memory range. khugepaged likely won't be
	 * able to collapse it either.
	 */
	default:
		return -EINVAL;
	}
}
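/*
 * Illustrative userspace sketch (not kernel code): given the mapping above, a
 * caller might treat EAGAIN as transient and retry once, while treating
 * EINVAL and ENOMEM as final for this range:
 *
 *	if (madvise(addr, len, MADV_COLLAPSE) == -1 && errno == EAGAIN) {
 *		usleep(10000);				// let LRU/refcounts settle
 *		madvise(addr, len, MADV_COLLAPSE);	// one best-effort retry
 *	}
 */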
int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
		     unsigned long start, unsigned long end)
{
	struct collapse_control *cc;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long hstart, hend, addr;
	int thps = 0, last_fail = SCAN_FAIL;
	bool mmap_locked = true;

	BUG_ON(vma->vm_start > start);
	BUG_ON(vma->vm_end < end);

	*prev = vma;

	if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
		return -EINVAL;

	cc = kmalloc(sizeof(*cc), GFP_KERNEL);
	if (!cc)
		return -ENOMEM;
	cc->is_khugepaged = false;

	mmgrab(mm);
	lru_add_drain_all();

	hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
	hend = end & HPAGE_PMD_MASK;

	for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
		int result = SCAN_FAIL;

		if (!mmap_locked) {
			cond_resched();
			mmap_read_lock(mm);
			mmap_locked = true;
			result = hugepage_vma_revalidate(mm, addr, false, &vma,
							 cc);
			if (result != SCAN_SUCCEED) {
				last_fail = result;
				goto out_nolock;
			}

			hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
		}
		mmap_assert_locked(mm);
		memset(cc->node_load, 0, sizeof(cc->node_load));
		nodes_clear(cc->alloc_nmask);
		if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
			struct file *file = get_file(vma->vm_file);
			pgoff_t pgoff = linear_page_index(vma, addr);

			mmap_read_unlock(mm);
			mmap_locked = false;
			result = hpage_collapse_scan_file(mm, addr, file, pgoff,
							  cc);
			fput(file);
		} else {
			result = hpage_collapse_scan_pmd(mm, vma, addr,
							 &mmap_locked, cc);
		}
		if (!mmap_locked)
			*prev = NULL;  /* Tell caller we dropped mmap_lock */

handle_result:
		switch (result) {
		case SCAN_SUCCEED:
		case SCAN_PMD_MAPPED:
			++thps;
			break;
		case SCAN_PTE_MAPPED_HUGEPAGE:
			BUG_ON(mmap_locked);
			BUG_ON(*prev);
			mmap_read_lock(mm);
			result = collapse_pte_mapped_thp(mm, addr, true);
			mmap_read_unlock(mm);
			goto handle_result;
		/* Whitelisted set of results where continuing is OK */
		case SCAN_PMD_NULL:
		case SCAN_PTE_NON_PRESENT:
		case SCAN_PTE_UFFD_WP:
		case SCAN_PAGE_RO:
		case SCAN_LACK_REFERENCED_PAGE:
		case SCAN_PAGE_NULL:
		case SCAN_PAGE_COUNT:
		case SCAN_PAGE_LOCK:
		case SCAN_PAGE_COMPOUND:
		case SCAN_PAGE_LRU:
		case SCAN_DEL_PAGE_LRU:
			last_fail = result;
			break;
		default:
			last_fail = result;
			/* Other error, exit */
			goto out_maybelock;
		}
	}

out_maybelock:
	/* Caller expects us to hold mmap_lock on return */
	if (!mmap_locked)
		mmap_read_lock(mm);
out_nolock:
	mmap_assert_locked(mm);
	mmdrop(mm);
	kfree(cc);

	return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0
			: madvise_collapse_errno(last_fail);
}
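/*
 * Illustrative usage sketch (userspace, not kernel code): madvise_collapse()
 * returns 0 only if every PMD-sized range it examined ends up backed by a
 * huge page, so callers typically align and pre-fault their request, e.g.
 * assuming 4KiB base pages and a 2MiB PMD size:
 *
 *	size_t len = 2UL << 20;			// one PMD-sized range
 *	void *buf = aligned_alloc(len, len);
 *	memset(buf, 0, len);			// populate the range first
 *	if (madvise(buf, len, MADV_COLLAPSE))
 *		perror("MADV_COLLAPSE");
 */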