// SPDX-License-Identifier: GPL-2.0
/*
 * Memory Migration functionality - linux/mm/migrate.c
 *
 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
 *
 * Page migration was first developed in the context of the memory hotplug
 * project. The main authors of the migration code are:
 *
 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
 * Hirokazu Takahashi <taka@valinux.co.jp>
 * Dave Hansen <haveblue@us.ibm.com>
 * Christoph Lameter
 */
#include <linux/migrate.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/mm_inline.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/writeback.h>
#include <linux/mempolicy.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
#include <linux/compaction.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <linux/hugetlb.h>
#include <linux/gfp.h>
#include <linux/pfn_t.h>
#include <linux/page_idle.h>
#include <linux/page_owner.h>
#include <linux/sched/mm.h>
#include <linux/ptrace.h>
#include <linux/memory.h>
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>
#include <linux/pagewalk.h>

#include <asm/tlbflush.h>

#include <trace/events/migrate.h>

#include "internal.h"
bool isolate_movable_page(struct page *page, isolate_mode_t mode)
{
	struct folio *folio = folio_get_nontail_page(page);
	const struct movable_operations *mops;

	/*
	 * Avoid burning cycles with pages that are yet under __free_pages(),
	 * or just got freed under us.
	 *
	 * In case we 'win' a race for a movable page being freed under us and
	 * raise its refcount preventing __free_pages() from doing its job
	 * the put_page() at the end of this block will take care of
	 * release this page, thus avoiding a nasty leakage.
	 */
	if (!folio)
		goto out;

	if (unlikely(folio_test_slab(folio)))
		goto out_putfolio;
	/* Pairs with smp_wmb() in slab freeing, e.g. SLUB's __free_slab() */
	smp_rmb();
	/*
	 * Check movable flag before taking the page lock because
	 * we use non-atomic bitops on newly allocated page flags so
	 * unconditionally grabbing the lock ruins page's owner side.
	 */
	if (unlikely(!__folio_test_movable(folio)))
		goto out_putfolio;
	/* Pairs with smp_wmb() in slab allocation, e.g. SLUB's alloc_slab_page() */
	smp_rmb();
	if (unlikely(folio_test_slab(folio)))
		goto out_putfolio;

	/*
	 * As movable pages are not isolated from LRU lists, concurrent
	 * compaction threads can race against page migration functions
	 * as well as race against the releasing a page.
	 *
	 * In order to avoid having an already isolated movable page
	 * being (wrongly) re-isolated while it is under migration,
	 * or to avoid attempting to isolate pages being released,
	 * lets be sure we have the page lock
	 * before proceeding with the movable page isolation steps.
	 */
	if (unlikely(!folio_trylock(folio)))
		goto out_putfolio;

	if (!folio_test_movable(folio) || folio_test_isolated(folio))
		goto out_no_isolated;

	mops = folio_movable_ops(folio);
	VM_BUG_ON_FOLIO(!mops, folio);

	if (!mops->isolate_page(&folio->page, mode))
		goto out_no_isolated;

	/* Driver shouldn't use the isolated flag */
	WARN_ON_ONCE(folio_test_isolated(folio));
	folio_set_isolated(folio);
	folio_unlock(folio);

	return true;

out_no_isolated:
	folio_unlock(folio);
out_putfolio:
	folio_put(folio);
out:
	return false;
}
static void putback_movable_folio(struct folio *folio)
{
	const struct movable_operations *mops = folio_movable_ops(folio);

	mops->putback_page(&folio->page);
	folio_clear_isolated(folio);
}
/*
 * Put previously isolated pages back onto the appropriate lists
 * from where they were once taken off for compaction/migration.
 *
 * This function shall be used whenever the isolated pageset has been
 * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
 * and isolate_hugetlb().
 */
void putback_movable_pages(struct list_head *l)
{
	struct folio *folio;
	struct folio *folio2;

	list_for_each_entry_safe(folio, folio2, l, lru) {
		if (unlikely(folio_test_hugetlb(folio))) {
			folio_putback_active_hugetlb(folio);
			continue;
		}
		list_del(&folio->lru);
		/*
		 * We isolated non-lru movable folio so here we can use
		 * __folio_test_movable because LRU folio's mapping cannot
		 * have PAGE_MAPPING_MOVABLE.
		 */
		if (unlikely(__folio_test_movable(folio))) {
			VM_BUG_ON_FOLIO(!folio_test_isolated(folio), folio);
			folio_lock(folio);
			if (folio_test_movable(folio))
				putback_movable_folio(folio);
			else
				folio_clear_isolated(folio);
			folio_unlock(folio);
			folio_put(folio);
		} else {
			node_stat_mod_folio(folio, NR_ISOLATED_ANON +
					folio_is_file_lru(folio), -folio_nr_pages(folio));
			folio_putback_lru(folio);
		}
	}
}
/* Must be called with an elevated refcount on the non-hugetlb folio */
bool isolate_folio_to_list(struct folio *folio, struct list_head *list)
{
	bool isolated, lru;

	if (folio_test_hugetlb(folio))
		return isolate_hugetlb(folio, list);

	lru = !__folio_test_movable(folio);
	if (lru)
		isolated = folio_isolate_lru(folio);
	else
		isolated = isolate_movable_page(&folio->page,
						ISOLATE_UNEVICTABLE);

	if (!isolated)
		return false;

	list_add(&folio->lru, list);
	if (lru)
		node_stat_add_folio(folio, NR_ISOLATED_ANON +
				    folio_is_file_lru(folio));

	return true;
}
static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
					  struct folio *folio,
					  unsigned long idx)
{
	struct page *page = folio_page(folio, idx);
	bool contains_data;
	pte_t newpte;
	void *addr;

	if (PageCompound(page))
		return false;
	VM_BUG_ON_PAGE(!PageAnon(page), page);
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);

	if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags & VM_LOCKED) ||
	    mm_forbids_zeropage(pvmw->vma->vm_mm))
		return false;

	/*
	 * The pmd entry mapping the old thp was flushed and the pte mapping
	 * this subpage has been non present. If the subpage is only zero-filled
	 * then map it to the shared zeropage.
	 */
	addr = kmap_local_page(page);
	contains_data = memchr_inv(addr, 0, PAGE_SIZE);
	kunmap_local(addr);

	if (contains_data)
		return false;

	newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address),
					pvmw->vma->vm_page_prot));
	set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, newpte);

	dec_mm_counter(pvmw->vma->vm_mm, mm_counter(folio));
	return true;
}
struct rmap_walk_arg {
	struct folio *folio;
	bool map_unused_to_zeropage;
};
/*
 * Restore a potential migration pte to a working pte entry
 */
static bool remove_migration_pte(struct folio *folio,
		struct vm_area_struct *vma, unsigned long addr, void *arg)
{
	struct rmap_walk_arg *rmap_walk_arg = arg;
	DEFINE_FOLIO_VMA_WALK(pvmw, rmap_walk_arg->folio, vma, addr, PVMW_SYNC | PVMW_MIGRATION);

	while (page_vma_mapped_walk(&pvmw)) {
		rmap_t rmap_flags = RMAP_NONE;
		pte_t old_pte;
		pte_t pte;
		swp_entry_t entry;
		struct page *new;
		unsigned long idx = 0;

		/* pgoff is invalid for ksm pages, but they are never large */
		if (folio_test_large(folio) && !folio_test_hugetlb(folio))
			idx = linear_page_index(vma, pvmw.address) - pvmw.pgoff;
		new = folio_page(folio, idx);

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
		/* PMD-mapped THP migration entry */
		if (!pvmw.pte) {
			VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
					!folio_test_pmd_mappable(folio), folio);
			remove_migration_pmd(&pvmw, new);
			continue;
		}
#endif
		if (rmap_walk_arg->map_unused_to_zeropage &&
		    try_to_map_unused_to_zeropage(&pvmw, folio, idx))
			continue;

		folio_get(folio);
		pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
		old_pte = ptep_get(pvmw.pte);

		entry = pte_to_swp_entry(old_pte);
		if (!is_migration_entry_young(entry))
			pte = pte_mkold(pte);
		if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
			pte = pte_mkdirty(pte);
		if (pte_swp_soft_dirty(old_pte))
			pte = pte_mksoft_dirty(pte);
		else
			pte = pte_clear_soft_dirty(pte);

		if (is_writable_migration_entry(entry))
			pte = pte_mkwrite(pte, vma);
		else if (pte_swp_uffd_wp(old_pte))
			pte = pte_mkuffd_wp(pte);

		if (folio_test_anon(folio) && !is_readable_migration_entry(entry))
			rmap_flags |= RMAP_EXCLUSIVE;

		if (unlikely(is_device_private_page(new))) {
			if (pte_write(pte))
				entry = make_writable_device_private_entry(
							page_to_pfn(new));
			else
				entry = make_readable_device_private_entry(
							page_to_pfn(new));
			pte = swp_entry_to_pte(entry);
			if (pte_swp_soft_dirty(old_pte))
				pte = pte_swp_mksoft_dirty(pte);
			if (pte_swp_uffd_wp(old_pte))
				pte = pte_swp_mkuffd_wp(pte);
		}

#ifdef CONFIG_HUGETLB_PAGE
		if (folio_test_hugetlb(folio)) {
			struct hstate *h = hstate_vma(vma);
			unsigned int shift = huge_page_shift(h);
			unsigned long psize = huge_page_size(h);

			pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
			if (folio_test_anon(folio))
				hugetlb_add_anon_rmap(folio, vma, pvmw.address,
						      rmap_flags);
			else
				hugetlb_add_file_rmap(folio);
			set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte,
					psize);
		} else
#endif
		{
			if (folio_test_anon(folio))
				folio_add_anon_rmap_pte(folio, new, vma,
							pvmw.address, rmap_flags);
			else
				folio_add_file_rmap_pte(folio, new, vma);
			set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
		}
		if (vma->vm_flags & VM_LOCKED)
			mlock_drain_local();

		trace_remove_migration_pte(pvmw.address, pte_val(pte),
					   compound_order(new));

		/* No need to invalidate - it was non-present before */
		update_mmu_cache(vma, pvmw.address, pvmw.pte);
	}

	return true;
}
/*
 * Get rid of all migration entries and replace them by
 * references to the indicated page.
 */
void remove_migration_ptes(struct folio *src, struct folio *dst, int flags)
{
	struct rmap_walk_arg rmap_walk_arg = {
		.folio = src,
		.map_unused_to_zeropage = flags & RMP_USE_SHARED_ZEROPAGE,
	};

	struct rmap_walk_control rwc = {
		.rmap_one = remove_migration_pte,
		.arg = &rmap_walk_arg,
	};

	VM_BUG_ON_FOLIO((flags & RMP_USE_SHARED_ZEROPAGE) && (src != dst), src);

	if (flags & RMP_LOCKED)
		rmap_walk_locked(dst, &rwc);
	else
		rmap_walk(dst, &rwc);
}
/*
 * Something used the pte of a page under migration. We need to
 * get to the page and wait until migration is finished.
 * When we return from this function the fault will be retried.
 */
void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
			  unsigned long address)
{
	spinlock_t *ptl;
	pte_t *ptep;
	pte_t pte;
	swp_entry_t entry;

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (!ptep)
		return;

	pte = ptep_get(ptep);
	pte_unmap(ptep);

	if (!is_swap_pte(pte))
		goto out;

	entry = pte_to_swp_entry(pte);
	if (!is_migration_entry(entry))
		goto out;

	migration_entry_wait_on_locked(entry, ptl);
	return;
out:
	spin_unlock(ptl);
}
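/*
 * Added commentary (not from the original source): page fault handlers use
 * this when they run into a migration entry; do_swap_page(), for instance,
 * calls migration_entry_wait() and then lets the fault be retried once the
 * migration has completed.
 */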
#ifdef CONFIG_HUGETLB_PAGE
/*
 * The vma read lock must be held upon entry. Holding that lock prevents either
 * the pte or the ptl from being freed.
 *
 * This function will release the vma lock before returning.
 */
void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
{
	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, ptep);
	pte_t pte;

	hugetlb_vma_assert_locked(vma);
	spin_lock(ptl);
	pte = huge_ptep_get(vma->vm_mm, addr, ptep);

	if (unlikely(!is_hugetlb_entry_migration(pte))) {
		spin_unlock(ptl);
		hugetlb_vma_unlock_read(vma);
	} else {
		/*
		 * If migration entry existed, safe to release vma lock
		 * here because the pgtable page won't be freed without the
		 * pgtable lock released. See comment right above pgtable
		 * lock release in migration_entry_wait_on_locked().
		 */
		hugetlb_vma_unlock_read(vma);
		migration_entry_wait_on_locked(pte_to_swp_entry(pte), ptl);
	}
}
#endif
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
{
	spinlock_t *ptl;

	ptl = pmd_lock(mm, pmd);
	if (!is_pmd_migration_entry(*pmd))
		goto unlock;
	migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), ptl);
	return;
unlock:
	spin_unlock(ptl);
}
#endif
static int folio_expected_refs(struct address_space *mapping,
		struct folio *folio)
{
	int refs = 1;

	if (!mapping)
		return refs;

	refs += folio_nr_pages(folio);
	if (folio_test_private(folio))
		refs++;

	return refs;
}
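/*
 * Illustrative example (added commentary, not from the original source): for
 * an order-2 pagecache folio with private data attached, the expected count
 * works out to 1 + folio_nr_pages() + 1 = 1 + 4 + 1 = 6 references.
 */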
/*
 * Replace the folio in the mapping.
 *
 * The number of remaining references must be:
 * 1 for anonymous folios without a mapping
 * 2 for folios with a mapping
 * 3 for folios with a mapping and the private flag set.
 */
static int __folio_migrate_mapping(struct address_space *mapping,
		struct folio *newfolio, struct folio *folio, int expected_count)
{
	XA_STATE(xas, &mapping->i_pages, folio_index(folio));
	struct zone *oldzone, *newzone;
	int dirty;
	long nr = folio_nr_pages(folio);
	long entries, i;

	if (!mapping) {
		/* Take off deferred split queue while frozen and memcg set */
		if (folio_test_large(folio) &&
		    folio_test_large_rmappable(folio)) {
			if (!folio_ref_freeze(folio, expected_count))
				return -EAGAIN;
			folio_unqueue_deferred_split(folio);
			folio_ref_unfreeze(folio, expected_count);
		}

		/* No turning back from here */
		newfolio->index = folio->index;
		newfolio->mapping = folio->mapping;
		if (folio_test_anon(folio) && folio_test_large(folio))
			mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
		if (folio_test_swapbacked(folio))
			__folio_set_swapbacked(newfolio);

		return MIGRATEPAGE_SUCCESS;
	}

	oldzone = folio_zone(folio);
	newzone = folio_zone(newfolio);

	xas_lock_irq(&xas);
	if (!folio_ref_freeze(folio, expected_count)) {
		xas_unlock_irq(&xas);
		return -EAGAIN;
	}

	/* Take off deferred split queue while frozen and memcg set */
	folio_unqueue_deferred_split(folio);

	/*
	 * Now we know that no one else is looking at the folio:
	 * no turning back from here.
	 */
	newfolio->index = folio->index;
	newfolio->mapping = folio->mapping;
	if (folio_test_anon(folio) && folio_test_large(folio))
		mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
	folio_ref_add(newfolio, nr); /* add cache reference */
	if (folio_test_swapbacked(folio)) {
		__folio_set_swapbacked(newfolio);
		if (folio_test_swapcache(folio)) {
			folio_set_swapcache(newfolio);
			newfolio->private = folio_get_private(folio);
		}
		entries = nr;
	} else {
		VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
		entries = 1;
	}

	/* Move dirty while folio refs frozen and newfolio not yet exposed */
	dirty = folio_test_dirty(folio);
	if (dirty) {
		folio_clear_dirty(folio);
		folio_set_dirty(newfolio);
	}

	/* Swap cache still stores N entries instead of a high-order entry */
	for (i = 0; i < entries; i++) {
		xas_store(&xas, newfolio);
		xas_next(&xas);
	}

	/*
	 * Drop cache reference from old folio by unfreezing
	 * to one less reference.
	 * We know this isn't the last reference.
	 */
	folio_ref_unfreeze(folio, expected_count - nr);

	xas_unlock(&xas);
	/* Leave irq disabled to prevent preemption while updating stats */

	/*
	 * If moved to a different zone then also account
	 * the folio for that zone. Other VM counters will be
	 * taken care of when we establish references to the
	 * new folio and drop references to the old folio.
	 *
	 * Note that anonymous folios are accounted for
	 * via NR_FILE_PAGES and NR_ANON_MAPPED if they
	 * are mapped to swap space.
	 */
	if (newzone != oldzone) {
		struct lruvec *old_lruvec, *new_lruvec;
		struct mem_cgroup *memcg;

		memcg = folio_memcg(folio);
		old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
		new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);

		__mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
		__mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
		if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) {
			__mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
			__mod_lruvec_state(new_lruvec, NR_SHMEM, nr);

			if (folio_test_pmd_mappable(folio)) {
				__mod_lruvec_state(old_lruvec, NR_SHMEM_THPS, -nr);
				__mod_lruvec_state(new_lruvec, NR_SHMEM_THPS, nr);
			}
		}
#ifdef CONFIG_SWAP
		if (folio_test_swapcache(folio)) {
			__mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
			__mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
		}
#endif
		if (dirty && mapping_can_writeback(mapping)) {
			__mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
			__mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
			__mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
			__mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
		}
	}
	local_irq_enable();

	return MIGRATEPAGE_SUCCESS;
}
int folio_migrate_mapping(struct address_space *mapping,
		struct folio *newfolio, struct folio *folio, int extra_count)
{
	int expected_count = folio_expected_refs(mapping, folio) + extra_count;

	if (folio_ref_count(folio) != expected_count)
		return -EAGAIN;

	return __folio_migrate_mapping(mapping, newfolio, folio, expected_count);
}
EXPORT_SYMBOL(folio_migrate_mapping);
/*
 * The expected number of remaining references is the same as that
 * of folio_migrate_mapping().
 */
int migrate_huge_page_move_mapping(struct address_space *mapping,
				   struct folio *dst, struct folio *src)
{
	XA_STATE(xas, &mapping->i_pages, folio_index(src));
	int rc, expected_count = folio_expected_refs(mapping, src);

	if (folio_ref_count(src) != expected_count)
		return -EAGAIN;

	rc = folio_mc_copy(dst, src);
	if (unlikely(rc))
		return rc;

	xas_lock_irq(&xas);
	if (!folio_ref_freeze(src, expected_count)) {
		xas_unlock_irq(&xas);
		return -EAGAIN;
	}

	dst->index = src->index;
	dst->mapping = src->mapping;

	folio_ref_add(dst, folio_nr_pages(dst));

	xas_store(&xas, dst);

	folio_ref_unfreeze(src, expected_count - folio_nr_pages(src));

	xas_unlock_irq(&xas);

	return MIGRATEPAGE_SUCCESS;
}
/*
 * Copy the flags and some other ancillary information
 */
void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
{
	int cpupid;

	if (folio_test_referenced(folio))
		folio_set_referenced(newfolio);
	if (folio_test_uptodate(folio))
		folio_mark_uptodate(newfolio);
	if (folio_test_clear_active(folio)) {
		VM_BUG_ON_FOLIO(folio_test_unevictable(folio), folio);
		folio_set_active(newfolio);
	} else if (folio_test_clear_unevictable(folio))
		folio_set_unevictable(newfolio);
	if (folio_test_workingset(folio))
		folio_set_workingset(newfolio);
	if (folio_test_checked(folio))
		folio_set_checked(newfolio);
	/*
	 * PG_anon_exclusive (-> PG_mappedtodisk) is always migrated via
	 * migration entries. We can still have PG_anon_exclusive set on an
	 * effectively unmapped and unreferenced first sub-pages of an
	 * anonymous THP: we can simply copy it here via PG_mappedtodisk.
	 */
	if (folio_test_mappedtodisk(folio))
		folio_set_mappedtodisk(newfolio);

	/* Move dirty on pages not done by folio_migrate_mapping() */
	if (folio_test_dirty(folio))
		folio_set_dirty(newfolio);

	if (folio_test_young(folio))
		folio_set_young(newfolio);
	if (folio_test_idle(folio))
		folio_set_idle(newfolio);

	folio_migrate_refs(newfolio, folio);
	/*
	 * Copy NUMA information to the new page, to prevent over-eager
	 * future migrations of this same page.
	 */
	cpupid = folio_xchg_last_cpupid(folio, -1);
	/*
	 * For memory tiering mode, when migrate between slow and fast
	 * memory node, reset cpupid, because that is used to record
	 * page access time in slow memory node.
	 */
	if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) {
		bool f_toptier = node_is_toptier(folio_nid(folio));
		bool t_toptier = node_is_toptier(folio_nid(newfolio));

		if (f_toptier != t_toptier)
			cpupid = -1;
	}
	folio_xchg_last_cpupid(newfolio, cpupid);

	folio_migrate_ksm(newfolio, folio);
	/*
	 * Please do not reorder this without considering how mm/ksm.c's
	 * ksm_get_folio() depends upon ksm_migrate_page() and the
	 * swapcache flag.
	 */
	if (folio_test_swapcache(folio))
		folio_clear_swapcache(folio);
	folio_clear_private(folio);

	/* page->private contains hugetlb specific flags */
	if (!folio_test_hugetlb(folio))
		folio->private = NULL;

	/*
	 * If any waiters have accumulated on the new page then
	 * wake them up.
	 */
	if (folio_test_writeback(newfolio))
		folio_end_writeback(newfolio);

	/*
	 * PG_readahead shares the same bit with PG_reclaim. The above
	 * end_page_writeback() may clear PG_readahead mistakenly, so set the
	 * bit after that.
	 */
	if (folio_test_readahead(folio))
		folio_set_readahead(newfolio);

	folio_copy_owner(newfolio, folio);
	pgalloc_tag_copy(newfolio, folio);

	mem_cgroup_migrate(folio, newfolio);
}
EXPORT_SYMBOL(folio_migrate_flags);
/************************************************************
 *                    Migration functions
 ***********************************************************/

static int __migrate_folio(struct address_space *mapping, struct folio *dst,
			   struct folio *src, void *src_private,
			   enum migrate_mode mode)
{
	int rc, expected_count = folio_expected_refs(mapping, src);

	/* Check whether src does not have extra refs before we do more work */
	if (folio_ref_count(src) != expected_count)
		return -EAGAIN;

	rc = folio_mc_copy(dst, src);
	if (unlikely(rc))
		return rc;

	rc = __folio_migrate_mapping(mapping, dst, src, expected_count);
	if (rc != MIGRATEPAGE_SUCCESS)
		return rc;

	if (src_private)
		folio_attach_private(dst, folio_detach_private(src));

	folio_migrate_flags(dst, src);
	return MIGRATEPAGE_SUCCESS;
}
/**
 * migrate_folio() - Simple folio migration.
 * @mapping: The address_space containing the folio.
 * @dst: The folio to migrate the data to.
 * @src: The folio containing the current data.
 * @mode: How to migrate the page.
 *
 * Common logic to directly migrate a single LRU folio suitable for
 * folios that do not have private data.
 *
 * Folios are locked upon entry and exit.
 */
int migrate_folio(struct address_space *mapping, struct folio *dst,
		  struct folio *src, enum migrate_mode mode)
{
	BUG_ON(folio_test_writeback(src));	/* Writeback must be complete */
	return __migrate_folio(mapping, dst, src, NULL, mode);
}
EXPORT_SYMBOL(migrate_folio);
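/*
 * Illustrative sketch (added commentary, not from the original source): a
 * filesystem whose folios never carry private data can typically point its
 * address_space_operations straight at this helper, e.g.:
 *
 *	static const struct address_space_operations foo_aops = {
 *		.migrate_folio	= migrate_folio,
 *	};
 *
 * where foo_aops is a placeholder name.
 */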
#ifdef CONFIG_BUFFER_HEAD
/* Returns true if all buffers are successfully locked */
static bool buffer_migrate_lock_buffers(struct buffer_head *head,
							enum migrate_mode mode)
{
	struct buffer_head *bh = head;
	struct buffer_head *failed_bh;

	do {
		if (!trylock_buffer(bh)) {
			if (mode == MIGRATE_ASYNC)
				goto unlock;
			if (mode == MIGRATE_SYNC_LIGHT && !buffer_uptodate(bh))
				goto unlock;
			lock_buffer(bh);
		}

		bh = bh->b_this_page;
	} while (bh != head);

	return true;

unlock:
	/* We failed to lock the buffer and cannot stall. */
	failed_bh = bh;
	bh = head;
	while (bh != failed_bh) {
		unlock_buffer(bh);
		bh = bh->b_this_page;
	}

	return false;
}
static int __buffer_migrate_folio(struct address_space *mapping,
		struct folio *dst, struct folio *src, enum migrate_mode mode,
		bool check_refs)
{
	struct buffer_head *bh, *head;
	int rc;
	int expected_count;

	head = folio_buffers(src);
	if (!head)
		return migrate_folio(mapping, dst, src, mode);

	/* Check whether page does not have extra refs before we do more work */
	expected_count = folio_expected_refs(mapping, src);
	if (folio_ref_count(src) != expected_count)
		return -EAGAIN;

	if (!buffer_migrate_lock_buffers(head, mode))
		return -EAGAIN;

	if (check_refs) {
		bool busy;
		bool invalidated = false;

recheck_buffers:
		busy = false;
		spin_lock(&mapping->i_private_lock);
		bh = head;
		do {
			if (atomic_read(&bh->b_count)) {
				busy = true;
				break;
			}
			bh = bh->b_this_page;
		} while (bh != head);
		if (busy) {
			if (invalidated) {
				rc = -EAGAIN;
				goto unlock_buffers;
			}
			spin_unlock(&mapping->i_private_lock);
			invalidate_bh_lrus();
			invalidated = true;
			goto recheck_buffers;
		}
	}

	rc = filemap_migrate_folio(mapping, dst, src, mode);
	if (rc != MIGRATEPAGE_SUCCESS)
		goto unlock_buffers;

	bh = head;
	do {
		folio_set_bh(bh, dst, bh_offset(bh));
		bh = bh->b_this_page;
	} while (bh != head);

unlock_buffers:
	if (check_refs)
		spin_unlock(&mapping->i_private_lock);
	bh = head;
	do {
		unlock_buffer(bh);
		bh = bh->b_this_page;
	} while (bh != head);

	return rc;
}
/**
 * buffer_migrate_folio() - Migration function for folios with buffers.
 * @mapping: The address space containing @src.
 * @dst: The folio to migrate to.
 * @src: The folio to migrate from.
 * @mode: How to migrate the folio.
 *
 * This function can only be used if the underlying filesystem guarantees
 * that no other references to @src exist. For example attached buffer
 * heads are accessed only under the folio lock. If your filesystem cannot
 * provide this guarantee, buffer_migrate_folio_norefs() may be more
 * appropriate.
 *
 * Return: 0 on success or a negative errno on failure.
 */
int buffer_migrate_folio(struct address_space *mapping,
		struct folio *dst, struct folio *src, enum migrate_mode mode)
{
	return __buffer_migrate_folio(mapping, dst, src, mode, false);
}
EXPORT_SYMBOL(buffer_migrate_folio);

/**
 * buffer_migrate_folio_norefs() - Migration function for folios with buffers.
 * @mapping: The address space containing @src.
 * @dst: The folio to migrate to.
 * @src: The folio to migrate from.
 * @mode: How to migrate the folio.
 *
 * Like buffer_migrate_folio() except that this variant is more careful
 * and checks that there are also no buffer head references. This function
 * is the right one for mappings where buffer heads are directly looked
 * up and referenced (such as block device mappings).
 *
 * Return: 0 on success or a negative errno on failure.
 */
int buffer_migrate_folio_norefs(struct address_space *mapping,
		struct folio *dst, struct folio *src, enum migrate_mode mode)
{
	return __buffer_migrate_folio(mapping, dst, src, mode, true);
}
EXPORT_SYMBOL_GPL(buffer_migrate_folio_norefs);
#endif /* CONFIG_BUFFER_HEAD */
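/*
 * Illustrative sketch (added commentary, not from the original source): an
 * address_space_operations for a buffer-head based mapping would pick one of
 * the two variants above, e.g.:
 *
 *	static const struct address_space_operations foo_blk_aops = {
 *		.migrate_folio	= buffer_migrate_folio_norefs,
 *	};
 *
 * where foo_blk_aops is a placeholder name; the _norefs variant is the safer
 * choice when buffer heads may be looked up and referenced directly.
 */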
int filemap_migrate_folio(struct address_space *mapping,
		struct folio *dst, struct folio *src, enum migrate_mode mode)
{
	return __migrate_folio(mapping, dst, src, folio_get_private(src), mode);
}
EXPORT_SYMBOL_GPL(filemap_migrate_folio);
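/*
 * Added commentary (not from the original source): unlike migrate_folio(),
 * this helper passes folio->private into __migrate_folio(), which then moves
 * the private data to the destination folio, so it suits mappings that stash
 * per-folio state in the private field.
 */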
/*
 * Writeback a folio to clean the dirty state
 */
static int writeout(struct address_space *mapping, struct folio *folio)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
		.nr_to_write = 1,
		.range_start = 0,
		.range_end = LLONG_MAX,
		.for_reclaim = 1
	};
	int rc;

	if (!mapping->a_ops->writepage)
		/* No write method for the address space */
		return -EINVAL;

	if (!folio_clear_dirty_for_io(folio))
		/* Someone else already triggered a write */
		return -EAGAIN;

	/*
	 * A dirty folio may imply that the underlying filesystem has
	 * the folio on some queue. So the folio must be clean for
	 * migration. Writeout may mean we lose the lock and the
	 * folio state is no longer what we checked for earlier.
	 * At this point we know that the migration attempt cannot
	 * be successful.
	 */
	remove_migration_ptes(folio, folio, 0);

	rc = mapping->a_ops->writepage(&folio->page, &wbc);

	if (rc != AOP_WRITEPAGE_ACTIVATE)
		/* unlocked. Relock */
		folio_lock(folio);

	return (rc < 0) ? -EIO : -EAGAIN;
}
/*
 * Default handling if a filesystem does not provide a migration function.
 */
static int fallback_migrate_folio(struct address_space *mapping,
		struct folio *dst, struct folio *src, enum migrate_mode mode)
{
	if (folio_test_dirty(src)) {
		/* Only writeback folios in full synchronous migration */
		switch (mode) {
		case MIGRATE_SYNC:
			break;
		default:
			return -EBUSY;
		}
		return writeout(mapping, src);
	}

	/*
	 * Buffers may be managed in a filesystem specific way.
	 * We must have no buffers or drop them.
	 */
	if (!filemap_release_folio(src, GFP_KERNEL))
		return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;

	return migrate_folio(mapping, dst, src, mode);
}
/*
 * Move a page to a newly allocated page
 * The page is locked and all ptes have been successfully removed.
 *
 * The new page will have replaced the old page if this function
 * is successful.
 *
 * Return value:
 *   < 0 - error code
 *  MIGRATEPAGE_SUCCESS - success
 */
static int move_to_new_folio(struct folio *dst, struct folio *src,
				enum migrate_mode mode)
{
	int rc = -EAGAIN;
	bool is_lru = !__folio_test_movable(src);

	VM_BUG_ON_FOLIO(!folio_test_locked(src), src);
	VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst);

	if (likely(is_lru)) {
		struct address_space *mapping = folio_mapping(src);

		if (!mapping)
			rc = migrate_folio(mapping, dst, src, mode);
		else if (mapping_inaccessible(mapping))
			rc = -EOPNOTSUPP;
		else if (mapping->a_ops->migrate_folio)
			/*
			 * Most folios have a mapping and most filesystems
			 * provide a migrate_folio callback. Anonymous folios
			 * are part of swap space which also has its own
			 * migrate_folio callback. This is the most common path
			 * for page migration.
			 */
			rc = mapping->a_ops->migrate_folio(mapping, dst, src,
								mode);
		else
			rc = fallback_migrate_folio(mapping, dst, src, mode);
	} else {
		const struct movable_operations *mops;

		/*
		 * In case of non-lru page, it could be released after
		 * isolation step. In that case, we shouldn't try migration.
		 */
		VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);
		if (!folio_test_movable(src)) {
			rc = MIGRATEPAGE_SUCCESS;
			folio_clear_isolated(src);
			goto out;
		}

		mops = folio_movable_ops(src);
		rc = mops->migrate_page(&dst->page, &src->page, mode);
		WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
				!folio_test_isolated(src));
	}

	/*
	 * When successful, old pagecache src->mapping must be cleared before
	 * src is freed; but stats require that PageAnon be left as PageAnon.
	 */
	if (rc == MIGRATEPAGE_SUCCESS) {
		if (__folio_test_movable(src)) {
			VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);

			/*
			 * We clear PG_movable under page_lock so any compactor
			 * cannot try to migrate this page.
			 */
			folio_clear_isolated(src);
		}

		/*
		 * Anonymous and movable src->mapping will be cleared by
		 * free_pages_prepare so don't reset it here for keeping
		 * the type to work PageAnon, for example.
		 */
		if (!folio_mapping_flags(src))
			src->mapping = NULL;

		if (likely(!folio_is_zone_device(dst)))
			flush_dcache_folio(dst);
	}
out:
	return rc;
}
/*
 * To record some information during migration, we use unused private
 * field of struct folio of the newly allocated destination folio.
 * This is safe because nobody is using it except us.
 */
enum {
	PAGE_WAS_MAPPED = BIT(0),
	PAGE_WAS_MLOCKED = BIT(1),
	PAGE_OLD_STATES = PAGE_WAS_MAPPED | PAGE_WAS_MLOCKED,
};

static void __migrate_folio_record(struct folio *dst,
				   int old_page_state,
				   struct anon_vma *anon_vma)
{
	dst->private = (void *)anon_vma + old_page_state;
}

static void __migrate_folio_extract(struct folio *dst,
				   int *old_page_state,
				   struct anon_vma **anon_vmap)
{
	unsigned long private = (unsigned long)dst->private;

	*anon_vmap = (struct anon_vma *)(private & ~PAGE_OLD_STATES);
	*old_page_state = private & PAGE_OLD_STATES;
	dst->private = NULL;
}
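/*
 * Added commentary (not from the original source): packing the two state bits
 * into the low bits of the anon_vma pointer works because anon_vma objects
 * come from the slab allocator and are at least word aligned, so bits 0-1 of
 * a valid pointer are always clear.
 */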
/* Restore the source folio to the original state upon failure */
static void migrate_folio_undo_src(struct folio *src,
				   int page_was_mapped,
				   struct anon_vma *anon_vma,
				   bool locked,
				   struct list_head *ret)
{
	if (page_was_mapped)
		remove_migration_ptes(src, src, 0);
	/* Drop an anon_vma reference if we took one */
	if (anon_vma)
		put_anon_vma(anon_vma);
	if (locked)
		folio_unlock(src);
	if (ret)
		list_move_tail(&src->lru, ret);
}

/* Restore the destination folio to the original state upon failure */
static void migrate_folio_undo_dst(struct folio *dst, bool locked,
		free_folio_t put_new_folio, unsigned long private)
{
	if (locked)
		folio_unlock(dst);
	if (put_new_folio)
		put_new_folio(dst, private);
	else
		folio_put(dst);
}
/* Cleanup src folio upon migration success */
static void migrate_folio_done(struct folio *src,
			       enum migrate_reason reason)
{
	/*
	 * Compaction can migrate also non-LRU pages which are
	 * not accounted to NR_ISOLATED_*. They can be recognized
	 * as __folio_test_movable
	 */
	if (likely(!__folio_test_movable(src)) && reason != MR_DEMOTION)
		mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON +
				    folio_is_file_lru(src), -folio_nr_pages(src));

	if (reason != MR_MEMORY_FAILURE)
		/* We release the page in page_handle_poison. */
		folio_put(src);
}
/* Obtain the lock on page, remove all ptes. */
static int migrate_folio_unmap(new_folio_t get_new_folio,
		free_folio_t put_new_folio, unsigned long private,
		struct folio *src, struct folio **dstp, enum migrate_mode mode,
		enum migrate_reason reason, struct list_head *ret)
{
	struct folio *dst;
	int rc = -EAGAIN;
	int old_page_state = 0;
	struct anon_vma *anon_vma = NULL;
	bool is_lru = data_race(!__folio_test_movable(src));
	bool locked = false;
	bool dst_locked = false;

	if (folio_ref_count(src) == 1) {
		/* Folio was freed from under us. So we are done. */
		folio_clear_active(src);
		folio_clear_unevictable(src);
		/* free_pages_prepare() will clear PG_isolated. */
		list_del(&src->lru);
		migrate_folio_done(src, reason);
		return MIGRATEPAGE_SUCCESS;
	}

	dst = get_new_folio(src, private);
	if (!dst)
		return -ENOMEM;
	*dstp = dst;

	dst->private = NULL;

	if (!folio_trylock(src)) {
		if (mode == MIGRATE_ASYNC)
			goto out;

		/*
		 * It's not safe for direct compaction to call lock_page.
		 * For example, during page readahead pages are added locked
		 * to the LRU. Later, when the IO completes the pages are
		 * marked uptodate and unlocked. However, the queueing
		 * could be merging multiple pages for one bio (e.g.
		 * mpage_readahead). If an allocation happens for the
		 * second or third page, the process can end up locking
		 * the same page twice and deadlocking. Rather than
		 * trying to be clever about what pages can be locked,
		 * avoid the use of lock_page for direct compaction
		 * altogether.
		 */
		if (current->flags & PF_MEMALLOC)
			goto out;

		/*
		 * In "light" mode, we can wait for transient locks (eg
		 * inserting a page into the page table), but it's not
		 * worth waiting for I/O.
		 */
		if (mode == MIGRATE_SYNC_LIGHT && !folio_test_uptodate(src))
			goto out;

		folio_lock(src);
	}
	locked = true;
	if (folio_test_mlocked(src))
		old_page_state |= PAGE_WAS_MLOCKED;

	if (folio_test_writeback(src)) {
		/*
		 * Only in the case of a full synchronous migration is it
		 * necessary to wait for PageWriteback. In the async case,
		 * the retry loop is too short and in the sync-light case,
		 * the overhead of stalling is too much
		 */
		switch (mode) {
		case MIGRATE_SYNC:
			break;
		default:
			rc = -EBUSY;
			goto out;
		}
		folio_wait_writeback(src);
	}

	/*
	 * By try_to_migrate(), src->mapcount goes down to 0 here. In this case,
	 * we cannot notice that anon_vma is freed while we migrate a page.
	 * This get_anon_vma() delays freeing anon_vma pointer until the end
	 * of migration. File cache pages are no problem because of page_lock()
	 * File Caches may use write_page() or lock_page() in migration, then,
	 * just care Anon page here.
	 *
	 * Only folio_get_anon_vma() understands the subtleties of
	 * getting a hold on an anon_vma from outside one of its mms.
	 * But if we cannot get anon_vma, then we won't need it anyway,
	 * because that implies that the anon page is no longer mapped
	 * (and cannot be remapped so long as we hold the page lock).
	 */
	if (folio_test_anon(src) && !folio_test_ksm(src))
		anon_vma = folio_get_anon_vma(src);

	/*
	 * Block others from accessing the new page when we get around to
	 * establishing additional references. We are usually the only one
	 * holding a reference to dst at this point. We used to have a BUG
	 * here if folio_trylock(dst) fails, but would like to allow for
	 * cases where there might be a race with the previous use of dst.
	 * This is much like races on refcount of oldpage: just don't BUG().
	 */
	if (unlikely(!folio_trylock(dst)))
		goto out;
	dst_locked = true;

	if (unlikely(!is_lru)) {
		__migrate_folio_record(dst, old_page_state, anon_vma);
		return MIGRATEPAGE_UNMAP;
	}

	/*
	 * Corner case handling:
	 * 1. When a new swap-cache page is read into, it is added to the LRU
	 * and treated as swapcache but it has no rmap yet.
	 * Calling try_to_unmap() against a src->mapping==NULL page will
	 * trigger a BUG. So handle it here.
	 * 2. An orphaned page (see truncate_cleanup_page) might have
	 * fs-private metadata. The page can be picked up due to memory
	 * offlining. Everywhere else except page reclaim, the page is
	 * invisible to the vm, so the page can not be migrated. So try to
	 * free the metadata, so the page can be freed.
	 */
	if (!src->mapping) {
		if (folio_test_private(src)) {
			try_to_free_buffers(src);
			goto out;
		}
	} else if (folio_mapped(src)) {
		/* Establish migration ptes */
		VM_BUG_ON_FOLIO(folio_test_anon(src) &&
			       !folio_test_ksm(src) && !anon_vma, src);
		try_to_migrate(src, mode == MIGRATE_ASYNC ? TTU_BATCH_FLUSH : 0);
		old_page_state |= PAGE_WAS_MAPPED;
	}

	if (!folio_mapped(src)) {
		__migrate_folio_record(dst, old_page_state, anon_vma);
		return MIGRATEPAGE_UNMAP;
	}

out:
	/*
	 * A folio that has not been unmapped will be restored to
	 * right list unless we want to retry.
	 */
	if (rc == -EAGAIN)
		ret = NULL;

	migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED,
			       anon_vma, locked, ret);
	migrate_folio_undo_dst(dst, dst_locked, put_new_folio, private);

	return rc;
}
/* Migrate the folio to the newly allocated folio in dst. */
static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
			      struct folio *src, struct folio *dst,
			      enum migrate_mode mode, enum migrate_reason reason,
			      struct list_head *ret)
{
	int rc;
	int old_page_state = 0;
	struct anon_vma *anon_vma = NULL;
	bool is_lru = !__folio_test_movable(src);
	struct list_head *prev;

	__migrate_folio_extract(dst, &old_page_state, &anon_vma);
	prev = dst->lru.prev;
	list_del(&dst->lru);

	rc = move_to_new_folio(dst, src, mode);
	if (rc)
		goto out;

	if (unlikely(!is_lru))
		goto out_unlock_both;

	/*
	 * When successful, push dst to LRU immediately: so that if it
	 * turns out to be an mlocked page, remove_migration_ptes() will
	 * automatically build up the correct dst->mlock_count for it.
	 *
	 * We would like to do something similar for the old page, when
	 * unsuccessful, and other cases when a page has been temporarily
	 * isolated from the unevictable LRU: but this case is the easiest.
	 */
	folio_add_lru(dst);
	if (old_page_state & PAGE_WAS_MLOCKED)
		lru_add_drain();

	if (old_page_state & PAGE_WAS_MAPPED)
		remove_migration_ptes(src, dst, 0);

out_unlock_both:
	folio_unlock(dst);
	set_page_owner_migrate_reason(&dst->page, reason);
	/*
	 * If migration is successful, decrease refcount of dst,
	 * which will not free the page because new page owner increased
	 * refcounter.
	 */
	folio_put(dst);

	/*
	 * A folio that has been migrated has all references removed
	 * and will be freed.
	 */
	list_del(&src->lru);
	/* Drop an anon_vma reference if we took one */
	if (anon_vma)
		put_anon_vma(anon_vma);
	folio_unlock(src);
	migrate_folio_done(src, reason);

	return rc;
out:
	/*
	 * A folio that has not been migrated will be restored to
	 * right list unless we want to retry.
	 */
	if (rc == -EAGAIN) {
		list_add(&dst->lru, prev);
		__migrate_folio_record(dst, old_page_state, anon_vma);
		return rc;
	}

	migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED,
			       anon_vma, true, ret);
	migrate_folio_undo_dst(dst, true, put_new_folio, private);

	return rc;
}
/*
 * Counterpart of unmap_and_move_page() for hugepage migration.
 *
 * This function doesn't wait the completion of hugepage I/O
 * because there is no race between I/O and migration for hugepage.
 * Note that currently hugepage I/O occurs only in direct I/O
 * where no lock is held and PG_writeback is irrelevant,
 * and writeback status of all subpages are counted in the reference
 * count of the head page (i.e. if all subpages of a 2MB hugepage are
 * under direct I/O, the reference of the head page is 512 and a bit more.)
 * This means that when we try to migrate hugepage whose subpages are
 * doing direct I/O, some references remain after try_to_unmap() and
 * hugepage migration fails without data corruption.
 *
 * There is also no race when direct I/O is issued on the page under migration,
 * because then pte is replaced with migration swap entry and direct I/O code
 * will wait in the page fault for migration to complete.
 */
static int unmap_and_move_huge_page(new_folio_t get_new_folio,
		free_folio_t put_new_folio, unsigned long private,
		struct folio *src, int force, enum migrate_mode mode,
		int reason, struct list_head *ret)
{
	struct folio *dst;
	int rc = -EAGAIN;
	int page_was_mapped = 0;
	struct anon_vma *anon_vma = NULL;
	struct address_space *mapping = NULL;

	if (folio_ref_count(src) == 1) {
		/* page was freed from under us. So we are done. */
		folio_putback_active_hugetlb(src);
		return MIGRATEPAGE_SUCCESS;
	}

	dst = get_new_folio(src, private);
	if (!dst)
		return -ENOMEM;

	if (!folio_trylock(src)) {
		if (!force)
			goto out;
		switch (mode) {
		case MIGRATE_SYNC:
			break;
		default:
			goto out;
		}
		folio_lock(src);
	}

	/*
	 * Check for pages which are in the process of being freed. Without
	 * folio_mapping() set, hugetlbfs specific move page routine will not
	 * be called and we could leak usage counts for subpools.
	 */
	if (hugetlb_folio_subpool(src) && !folio_mapping(src)) {
		rc = -EBUSY;
		goto out_unlock;
	}

	if (folio_test_anon(src))
		anon_vma = folio_get_anon_vma(src);

	if (unlikely(!folio_trylock(dst)))
		goto put_anon;

	if (folio_mapped(src)) {
		enum ttu_flags ttu = 0;

		if (!folio_test_anon(src)) {
			/*
			 * In shared mappings, try_to_unmap could potentially
			 * call huge_pmd_unshare. Because of this, take
			 * semaphore in write mode here and set TTU_RMAP_LOCKED
			 * to let lower levels know we have taken the lock.
			 */
			mapping = hugetlb_folio_mapping_lock_write(src);
			if (unlikely(!mapping))
				goto unlock_put_anon;

			ttu = TTU_RMAP_LOCKED;
		}

		try_to_migrate(src, ttu);
		page_was_mapped = 1;

		if (ttu & TTU_RMAP_LOCKED)
			i_mmap_unlock_write(mapping);
	}

	if (!folio_mapped(src))
		rc = move_to_new_folio(dst, src, mode);

	if (page_was_mapped)
		remove_migration_ptes(src,
			rc == MIGRATEPAGE_SUCCESS ? dst : src, 0);

unlock_put_anon:
	folio_unlock(dst);

put_anon:
	if (anon_vma)
		put_anon_vma(anon_vma);

	if (rc == MIGRATEPAGE_SUCCESS) {
		move_hugetlb_state(src, dst, reason);
		put_new_folio = NULL;
	}

out_unlock:
	folio_unlock(src);
out:
	if (rc == MIGRATEPAGE_SUCCESS)
		folio_putback_active_hugetlb(src);
	else if (rc != -EAGAIN)
		list_move_tail(&src->lru, ret);

	/*
	 * If migration was not successful and there's a freeing callback, use
	 * it. Otherwise, put_page() will drop the reference grabbed during
	 * isolation.
	 */
	if (put_new_folio)
		put_new_folio(dst, private);
	else
		folio_putback_active_hugetlb(dst);

	return rc;
}
static inline int try_split_folio(struct folio *folio, struct list_head *split_folios,
				  enum migrate_mode mode)
{
	int rc;

	if (mode == MIGRATE_ASYNC) {
		if (!folio_trylock(folio))
			return -EAGAIN;
	} else {
		folio_lock(folio);
	}
	rc = split_folio_to_list(folio, split_folios);
	folio_unlock(folio);
	if (!rc)
		list_move_tail(&folio->lru, split_folios);

	return rc;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define NR_MAX_BATCHED_MIGRATION	HPAGE_PMD_NR
#else
#define NR_MAX_BATCHED_MIGRATION	512
#endif
#define NR_MAX_MIGRATE_PAGES_RETRY	10
#define NR_MAX_MIGRATE_ASYNC_RETRY	3
#define NR_MAX_MIGRATE_SYNC_RETRY					\
	(NR_MAX_MIGRATE_PAGES_RETRY - NR_MAX_MIGRATE_ASYNC_RETRY)
struct migrate_pages_stats {
	int nr_succeeded;	/* Normal and large folios migrated successfully, in
				   units of base pages */
	int nr_failed_pages;	/* Normal and large folios failed to be migrated, in
				   units of base pages. Untried folios aren't counted */
	int nr_thp_succeeded;	/* THP migrated successfully */
	int nr_thp_failed;	/* THP failed to be migrated */
	int nr_thp_split;	/* THP split before migrating */
	int nr_split;	/* Large folio (include THP) split before migrating */
};
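/*
 * Added commentary (not from the original source): nr_succeeded and
 * nr_failed_pages are in units of base pages while the nr_thp_* fields count
 * whole THPs, so one successfully migrated PMD-sized THP bumps nr_succeeded
 * by HPAGE_PMD_NR and nr_thp_succeeded by 1.
 */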
/*
 * Returns the number of hugetlb folios that were not migrated, or an error code
 * after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no hugetlb folios are movable
 * any more because the list has become empty or no retryable hugetlb folios
 * exist any more. It is caller's responsibility to call putback_movable_pages()
 * only if ret != 0.
 */
static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio,
			    free_folio_t put_new_folio, unsigned long private,
			    enum migrate_mode mode, int reason,
			    struct migrate_pages_stats *stats,
			    struct list_head *ret_folios)
{
	int retry = 1;
	int nr_failed = 0;
	int nr_retry_pages = 0;
	int pass = 0;
	struct folio *folio, *folio2;
	int rc, nr_pages;

	for (pass = 0; pass < NR_MAX_MIGRATE_PAGES_RETRY && retry; pass++) {
		retry = 0;
		nr_retry_pages = 0;

		list_for_each_entry_safe(folio, folio2, from, lru) {
			if (!folio_test_hugetlb(folio))
				continue;

			nr_pages = folio_nr_pages(folio);

			cond_resched();

			/*
			 * Migratability of hugepages depends on architectures and
			 * their size. This check is necessary because some callers
			 * of hugepage migration like soft offline and memory
			 * hotremove don't walk through page tables or check whether
			 * the hugepage is pmd-based or not before kicking migration.
			 */
			if (!hugepage_migration_supported(folio_hstate(folio))) {
				nr_failed++;
				stats->nr_failed_pages += nr_pages;
				list_move_tail(&folio->lru, ret_folios);
				continue;
			}

			rc = unmap_and_move_huge_page(get_new_folio,
						      put_new_folio, private,
						      folio, pass > 2, mode,
						      reason, ret_folios);
			/*
			 * The rules are:
			 *	Success: hugetlb folio will be put back
			 *	-EAGAIN: stay on the from list
			 *	-ENOMEM: stay on the from list
			 *	Other errno: put on ret_folios list
			 */
			switch(rc) {
			case -ENOMEM:
				/*
				 * When memory is low, don't bother to try to migrate
				 * other folios, just exit.
				 */
				stats->nr_failed_pages += nr_pages + nr_retry_pages;
				return -ENOMEM;
			case -EAGAIN:
				retry++;
				nr_retry_pages += nr_pages;
				break;
			case MIGRATEPAGE_SUCCESS:
				stats->nr_succeeded += nr_pages;
				break;
			default:
				/*
				 * Permanent failure (-EBUSY, etc.):
				 * unlike -EAGAIN case, the failed folio is
				 * removed from migration folio list and not
				 * retried in the next outer loop.
				 */
				nr_failed++;
				stats->nr_failed_pages += nr_pages;
				break;
			}
		}
	}
	/*
	 * nr_failed is number of hugetlb folios failed to be migrated. After
	 * NR_MAX_MIGRATE_PAGES_RETRY attempts, give up and count retried hugetlb
	 * folios as failed.
	 */
	nr_failed += retry;
	stats->nr_failed_pages += nr_retry_pages;

	return nr_failed;
}
/*
 * migrate_pages_batch() first unmaps folios in the from list as many as
 * possible, then move the unmapped folios.
 *
 * We only batch migration if mode == MIGRATE_ASYNC to avoid to wait a
 * lock or bit when we have locked more than one folio. Which may cause
 * deadlock (e.g., for loop device). So, if mode != MIGRATE_ASYNC, the
 * length of the from list must be <= 1.
 */
static int migrate_pages_batch(struct list_head *from,
		new_folio_t get_new_folio, free_folio_t put_new_folio,
		unsigned long private, enum migrate_mode mode, int reason,
		struct list_head *ret_folios, struct list_head *split_folios,
		struct migrate_pages_stats *stats, int nr_pass)
{
	int retry = 1;
	int thp_retry = 1;
	int nr_failed = 0;
	int nr_retry_pages = 0;
	int pass = 0;
	bool is_thp = false;
	bool is_large = false;
	struct folio *folio, *folio2, *dst = NULL, *dst2;
	int rc, rc_saved = 0, nr_pages;
	LIST_HEAD(unmap_folios);
	LIST_HEAD(dst_folios);
	bool nosplit = (reason == MR_NUMA_MISPLACED);

	VM_WARN_ON_ONCE(mode != MIGRATE_ASYNC &&
			!list_empty(from) && !list_is_singular(from));

	for (pass = 0; pass < nr_pass && retry; pass++) {
		retry = 0;
		thp_retry = 0;
		nr_retry_pages = 0;

		list_for_each_entry_safe(folio, folio2, from, lru) {
			is_large = folio_test_large(folio);
			is_thp = folio_test_pmd_mappable(folio);
			nr_pages = folio_nr_pages(folio);

			cond_resched();

			/*
			 * The rare folio on the deferred split list should
			 * be split now. It should not count as a failure:
			 * but increment nr_failed because, without doing so,
			 * migrate_pages() may report success with (split but
			 * unmigrated) pages still on its fromlist; whereas it
			 * always reports success when its fromlist is empty.
			 * stats->nr_thp_failed should be increased too,
			 * otherwise stats inconsistency will happen when
			 * migrate_pages_batch is called via migrate_pages()
			 * with MIGRATE_SYNC and MIGRATE_ASYNC.
			 *
			 * Only check it without removing it from the list.
			 * Since the folio can be on deferred_split_scan()
			 * local list and removing it can cause the local list
			 * corruption. Folio split process below can handle it
			 * with the help of folio_ref_freeze().
			 *
			 * nr_pages > 2 is needed to avoid checking order-1
			 * page cache folios. They exist, in contrast to
			 * non-existent order-1 anonymous folios, and do not
			 * use _deferred_list.
			 */
			if (nr_pages > 2 &&
			   !list_empty(&folio->_deferred_list) &&
			   folio_test_partially_mapped(folio)) {
				if (!try_split_folio(folio, split_folios, mode)) {
					nr_failed++;
					stats->nr_thp_failed += is_thp;
					stats->nr_thp_split += is_thp;
					stats->nr_split++;
					continue;
				}
			}

			/*
			 * Large folio migration might be unsupported or
			 * the allocation might be failed so we should retry
			 * on the same folio with the large folio split
			 * to normal folios.
			 *
			 * Split folios are put in split_folios, and
			 * we will migrate them after the rest of the
			 * list is processed.
			 */
			if (!thp_migration_supported() && is_thp) {
				nr_failed++;
				stats->nr_thp_failed++;
				if (!try_split_folio(folio, split_folios, mode)) {
					stats->nr_thp_split++;
					stats->nr_split++;
					continue;
				}
				stats->nr_failed_pages += nr_pages;
				list_move_tail(&folio->lru, ret_folios);
				continue;
			}

			rc = migrate_folio_unmap(get_new_folio, put_new_folio,
					private, folio, &dst, mode, reason,
					ret_folios);
			/*
			 * The rules are:
			 *	Success: folio will be freed
			 *	Unmap: folio will be put on unmap_folios list,
			 *	       dst folio put on dst_folios list
			 *	-EAGAIN: stay on the from list
			 *	-ENOMEM: stay on the from list
			 *	Other errno: put on ret_folios list
			 */
			switch(rc) {
			case -ENOMEM:
				/*
				 * When memory is low, don't bother to try to migrate
				 * other folios, move unmapped folios, then exit.
				 */
				nr_failed++;
				stats->nr_thp_failed += is_thp;
				/* Large folio NUMA faulting doesn't split to retry. */
				if (is_large && !nosplit) {
					int ret = try_split_folio(folio, split_folios, mode);

					if (!ret) {
						stats->nr_thp_split += is_thp;
						stats->nr_split++;
						break;
					} else if (reason == MR_LONGTERM_PIN &&
						   ret == -EAGAIN) {
						/*
						 * Try again to split large folio to
						 * mitigate the failure of longterm pinning.
						 */
						retry++;
						thp_retry += is_thp;
						nr_retry_pages += nr_pages;
						/* Undo duplicated failure counting. */
						nr_failed--;
						stats->nr_thp_failed -= is_thp;
						break;
					}
				}

				stats->nr_failed_pages += nr_pages + nr_retry_pages;
				/* nr_failed isn't updated for not used */
				stats->nr_thp_failed += thp_retry;
				rc_saved = rc;
				if (list_empty(&unmap_folios))
					goto out;
				else
					goto move;
			case -EAGAIN:
				retry++;
				thp_retry += is_thp;
				nr_retry_pages += nr_pages;
				break;
			case MIGRATEPAGE_SUCCESS:
				stats->nr_succeeded += nr_pages;
				stats->nr_thp_succeeded += is_thp;
				break;
			case MIGRATEPAGE_UNMAP:
				list_move_tail(&folio->lru, &unmap_folios);
				list_add_tail(&dst->lru, &dst_folios);
				break;
			default:
				/*
				 * Permanent failure (-EBUSY, etc.):
				 * unlike -EAGAIN case, the failed folio is
				 * removed from migration folio list and not
				 * retried in the next outer loop.
				 */
				nr_failed++;
				stats->nr_thp_failed += is_thp;
				stats->nr_failed_pages += nr_pages;
				break;
			}
		}
	}
	nr_failed += retry;
	stats->nr_thp_failed += thp_retry;
	stats->nr_failed_pages += nr_retry_pages;
move:
	/* Flush TLBs for all unmapped folios */
	try_to_unmap_flush();

	retry = 1;
	for (pass = 0; pass < nr_pass && retry; pass++) {
		retry = 0;
		thp_retry = 0;
		nr_retry_pages = 0;

		dst = list_first_entry(&dst_folios, struct folio, lru);
		dst2 = list_next_entry(dst, lru);
		list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
			is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio);
			nr_pages = folio_nr_pages(folio);

			cond_resched();

			rc = migrate_folio_move(put_new_folio, private,
						folio, dst, mode,
						reason, ret_folios);
			/*
			 * The rules are:
			 *	Success: folio will be freed
			 *	-EAGAIN: stay on the unmap_folios list
			 *	Other errno: put on ret_folios list
			 */
			switch(rc) {
			case -EAGAIN:
				retry++;
				thp_retry += is_thp;
				nr_retry_pages += nr_pages;
				break;
			case MIGRATEPAGE_SUCCESS:
				stats->nr_succeeded += nr_pages;
				stats->nr_thp_succeeded += is_thp;
				break;
			default:
				nr_failed++;
				stats->nr_thp_failed += is_thp;
				stats->nr_failed_pages += nr_pages;
				break;
			}
			dst = dst2;
			dst2 = list_next_entry(dst, lru);
		}
	}
	nr_failed += retry;
	stats->nr_thp_failed += thp_retry;
	stats->nr_failed_pages += nr_retry_pages;

	rc = rc_saved ? : nr_failed;
out:
	/* Cleanup remaining folios */
	dst = list_first_entry(&dst_folios, struct folio, lru);
	dst2 = list_next_entry(dst, lru);
	list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
		int old_page_state = 0;
		struct anon_vma *anon_vma = NULL;

		__migrate_folio_extract(dst, &old_page_state, &anon_vma);
		migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED,
				       anon_vma, true, ret_folios);
		list_del(&dst->lru);
		migrate_folio_undo_dst(dst, true, put_new_folio, private);
		dst = dst2;
		dst2 = list_next_entry(dst, lru);
	}

	return rc;
}
static int migrate_pages_sync(struct list_head *from, new_folio_t get_new_folio,
		free_folio_t put_new_folio, unsigned long private,
		enum migrate_mode mode, int reason,
		struct list_head *ret_folios, struct list_head *split_folios,
		struct migrate_pages_stats *stats)
{
	int rc, nr_failed = 0;
	LIST_HEAD(folios);
	struct migrate_pages_stats astats;

	memset(&astats, 0, sizeof(astats));
	/* Try to migrate in batch with MIGRATE_ASYNC mode firstly */
	rc = migrate_pages_batch(from, get_new_folio, put_new_folio, private, MIGRATE_ASYNC,
				 reason, &folios, split_folios, &astats,
				 NR_MAX_MIGRATE_ASYNC_RETRY);
	stats->nr_succeeded += astats.nr_succeeded;
	stats->nr_thp_succeeded += astats.nr_thp_succeeded;
	stats->nr_thp_split += astats.nr_thp_split;
	stats->nr_split += astats.nr_split;
	if (rc < 0) {
		stats->nr_failed_pages += astats.nr_failed_pages;
		stats->nr_thp_failed += astats.nr_thp_failed;
		list_splice_tail(&folios, ret_folios);
		return rc;
	}
	stats->nr_thp_failed += astats.nr_thp_split;
	/*
	 * Do not count rc, as pages will be retried below.
	 * Count nr_split only, since it includes nr_thp_split.
	 */
	nr_failed += astats.nr_split;
	/*
	 * Fall back to migrate all failed folios one by one synchronously. All
	 * failed folios except split THPs will be retried, so their failure
	 * isn't counted
	 */
	list_splice_tail_init(&folios, from);
	while (!list_empty(from)) {
		list_move(from->next, &folios);
		rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio,
					 private, mode, reason, ret_folios,
					 split_folios, stats, NR_MAX_MIGRATE_SYNC_RETRY);
		list_splice_tail_init(&folios, ret_folios);
		if (rc < 0)
			return rc;
		nr_failed += rc;
	}

	return nr_failed;
}
2005 * migrate_pages - migrate the folios specified in a list, to the free folios
2006 * supplied as the target for the page migration
2008 * @from: The list of folios to be migrated.
2009 * @get_new_folio: The function used to allocate free folios to be used
2010 * as the target of the folio migration.
2011 * @put_new_folio: The function used to free target folios if migration
2012 * fails, or NULL if no special handling is necessary.
2013 * @private: Private data to be passed on to get_new_folio()
2014 * @mode: The migration mode that specifies the constraints for
2015 * folio migration, if any.
2016 * @reason: The reason for folio migration.
2017 * @ret_succeeded: Set to the number of folios migrated successfully if
2018 * the caller passes a non-NULL pointer.
2020 * The function returns after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no folios
2021 * are movable any more because the list has become empty or no retryable folios
2022 * exist any more. It is caller's responsibility to call putback_movable_pages()
2025 * Returns the number of {normal folio, large folio, hugetlb} that were not
2026 * migrated, or an error code. The number of large folio splits will be
2027 * considered as the number of non-migrated large folio, no matter how many
2028 * split folios of the large folio are migrated successfully.
int migrate_pages(struct list_head *from, new_folio_t get_new_folio,
		free_folio_t put_new_folio, unsigned long private,
		enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
{
	int rc, rc_gather;
	int nr_pages;
	struct folio *folio, *folio2;
	LIST_HEAD(folios);
	LIST_HEAD(ret_folios);
	LIST_HEAD(split_folios);
	struct migrate_pages_stats stats;

	trace_mm_migrate_pages_start(mode, reason);

	memset(&stats, 0, sizeof(stats));

	rc_gather = migrate_hugetlbs(from, get_new_folio, put_new_folio, private,
				     mode, reason, &stats, &ret_folios);
	if (rc_gather < 0)
		goto out;

again:
	nr_pages = 0;
	list_for_each_entry_safe(folio, folio2, from, lru) {
		/* Retried hugetlb folios will be kept in the list */
		if (folio_test_hugetlb(folio)) {
			list_move_tail(&folio->lru, &ret_folios);
			continue;
		}

		nr_pages += folio_nr_pages(folio);
		if (nr_pages >= NR_MAX_BATCHED_MIGRATION)
			break;
	}
	if (nr_pages >= NR_MAX_BATCHED_MIGRATION)
		list_cut_before(&folios, from, &folio2->lru);
	else
		list_splice_init(from, &folios);
	if (mode == MIGRATE_ASYNC)
		rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio,
				private, mode, reason, &ret_folios,
				&split_folios, &stats,
				NR_MAX_MIGRATE_PAGES_RETRY);
	else
		rc = migrate_pages_sync(&folios, get_new_folio, put_new_folio,
				private, mode, reason, &ret_folios,
				&split_folios, &stats);
	list_splice_tail_init(&folios, &ret_folios);
	if (rc < 0) {
		rc_gather = rc;
		list_splice_tail(&split_folios, &ret_folios);
		goto out;
	}
	if (!list_empty(&split_folios)) {
		/*
		 * Failure isn't counted since all split folios of a large folio
		 * are already counted as one failure. And, we only try to migrate
		 * with minimal effort, force MIGRATE_ASYNC mode and retry once.
		 */
		migrate_pages_batch(&split_folios, get_new_folio,
				    put_new_folio, private, MIGRATE_ASYNC, reason,
				    &ret_folios, NULL, &stats, 1);
		list_splice_tail_init(&split_folios, &ret_folios);
	}
	rc_gather += rc;
	if (!list_empty(from))
		goto again;
out:
	/*
	 * Put the permanently failed folios back on the migration list; they
	 * will be put back on the right list by the caller.
	 */
	list_splice(&ret_folios, from);

	/*
	 * Return 0 in case all split folios of fail-to-migrate large folios
	 * are migrated successfully.
	 */
	if (list_empty(from))
		rc_gather = 0;

	count_vm_events(PGMIGRATE_SUCCESS, stats.nr_succeeded);
	count_vm_events(PGMIGRATE_FAIL, stats.nr_failed_pages);
	count_vm_events(THP_MIGRATION_SUCCESS, stats.nr_thp_succeeded);
	count_vm_events(THP_MIGRATION_FAIL, stats.nr_thp_failed);
	count_vm_events(THP_MIGRATION_SPLIT, stats.nr_thp_split);
	trace_mm_migrate_pages(stats.nr_succeeded, stats.nr_failed_pages,
			       stats.nr_thp_succeeded, stats.nr_thp_failed,
			       stats.nr_thp_split, stats.nr_split, mode,
			       reason);
	trace_mm_migrate_pages_end(reason);

	if (ret_succeeded)
		*ret_succeeded = stats.nr_succeeded;

	return rc_gather;
}
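
/*
 * Illustrative sketch (not part of the kernel tree): a minimal caller of
 * migrate_pages(). The names demo_alloc_to_node() and demo_migrate_list() are
 * hypothetical; they only demonstrate the new_folio_t callback signature and
 * the rule from the comment above that putback_movable_pages() must be called
 * when a non-zero count is returned.
 *
 *	static struct folio *demo_alloc_to_node(struct folio *src,
 *						unsigned long private)
 *	{
 *		// Allocate a same-order folio on the node passed via 'private'.
 *		return __folio_alloc_node(GFP_HIGHUSER_MOVABLE,
 *					  folio_order(src), (int)private);
 *	}
 *
 *	static void demo_migrate_list(struct list_head *folios, int nid)
 *	{
 *		unsigned int nr_succeeded = 0;
 *		int ret;
 *
 *		// 'folios' holds folios already isolated from their LRU lists.
 *		ret = migrate_pages(folios, demo_alloc_to_node, NULL,
 *				    (unsigned long)nid, MIGRATE_SYNC,
 *				    MR_SYSCALL, &nr_succeeded);
 *		// Anything still on the list was not migrated; put it back.
 *		if (ret)
 *			putback_movable_pages(folios);
 *	}
 */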
struct folio *alloc_migration_target(struct folio *src, unsigned long private)
{
	struct migration_target_control *mtc;
	gfp_t gfp_mask;
	unsigned int order = 0;
	int nid;
	int zidx;

	mtc = (struct migration_target_control *)private;
	gfp_mask = mtc->gfp_mask;
	nid = mtc->nid;
	if (nid == NUMA_NO_NODE)
		nid = folio_nid(src);

	if (folio_test_hugetlb(src)) {
		struct hstate *h = folio_hstate(src);

		gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
		return alloc_hugetlb_folio_nodemask(h, nid,
						mtc->nmask, gfp_mask,
						htlb_allow_alloc_fallback(mtc->reason));
	}

	if (folio_test_large(src)) {
		/*
		 * clear __GFP_RECLAIM to make the migration callback
		 * consistent with regular THP allocations.
		 */
		gfp_mask &= ~__GFP_RECLAIM;
		gfp_mask |= GFP_TRANSHUGE;
		order = folio_order(src);
	}
	zidx = zone_idx(folio_zone(src));
	if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
		gfp_mask |= __GFP_HIGHMEM;

	return __folio_alloc(gfp_mask, order, nid, mtc->nmask);
}
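
/*
 * Illustrative sketch (not part of the kernel tree): alloc_migration_target()
 * is meant to be passed as the new_folio_t callback, with a pointer to a
 * migration_target_control cast into the 'private' argument, much as
 * do_move_pages_to_node() below does. 'allowed_nodes' and 'source_folios'
 * are hypothetical variables.
 *
 *	struct migration_target_control mtc = {
 *		.nid = nid,
 *		.nmask = &allowed_nodes,	// optional nodemask_t restriction
 *		.gfp_mask = GFP_HIGHUSER_MOVABLE,
 *		.reason = MR_MEMORY_HOTPLUG,
 *	};
 *	int ret;
 *
 *	ret = migrate_pages(&source_folios, alloc_migration_target, NULL,
 *			    (unsigned long)&mtc, MIGRATE_SYNC,
 *			    MR_MEMORY_HOTPLUG, NULL);
 */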
static int store_status(int __user *status, int start, int value, int nr)
{
	while (nr-- > 0) {
		if (put_user(value, status + start))
			return -EFAULT;
		start++;
	}

	return 0;
}
static int do_move_pages_to_node(struct list_head *pagelist, int node)
{
	int err;
	struct migration_target_control mtc = {
		.nid = node,
		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
		.reason = MR_SYSCALL,
	};

	err = migrate_pages(pagelist, alloc_migration_target, NULL,
			(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
	if (err)
		putback_movable_pages(pagelist);
	return err;
}
static int __add_folio_for_migration(struct folio *folio, int node,
		struct list_head *pagelist, bool migrate_all)
{
	if (is_zero_folio(folio) || is_huge_zero_folio(folio))
		return -EFAULT;

	if (folio_is_zone_device(folio))
		return -ENOENT;

	if (folio_nid(folio) == node)
		return 0;

	if (folio_likely_mapped_shared(folio) && !migrate_all)
		return -EACCES;

	if (folio_test_hugetlb(folio)) {
		if (isolate_hugetlb(folio, pagelist))
			return 1;
	} else if (folio_isolate_lru(folio)) {
		list_add_tail(&folio->lru, pagelist);
		node_stat_mod_folio(folio,
			NR_ISOLATED_ANON + folio_is_file_lru(folio),
			folio_nr_pages(folio));
		return 1;
	}
	return -EBUSY;
}
/*
 * Resolves the given address to a struct folio, isolates it from the LRU and
 * puts it to the given pagelist.
 * Returns:
 *     errno - if the folio cannot be found/isolated
 *     0 - when it doesn't have to be migrated because it is already on the
 *         target node
 *     1 - when it has been queued
 */
static int add_folio_for_migration(struct mm_struct *mm, const void __user *p,
		int node, struct list_head *pagelist, bool migrate_all)
{
	struct vm_area_struct *vma;
	struct folio_walk fw;
	struct folio *folio;
	unsigned long addr;
	int err = -EFAULT;

	mmap_read_lock(mm);
	addr = (unsigned long)untagged_addr_remote(mm, p);

	vma = vma_lookup(mm, addr);
	if (vma && vma_migratable(vma)) {
		folio = folio_walk_start(&fw, vma, addr, FW_ZEROPAGE);
		if (folio) {
			err = __add_folio_for_migration(folio, node, pagelist,
							migrate_all);
			folio_walk_end(&fw, vma);
		} else {
			err = -ENOENT;
		}
	}
	mmap_read_unlock(mm);
	return err;
}
static int move_pages_and_store_status(int node,
		struct list_head *pagelist, int __user *status,
		int start, int i, unsigned long nr_pages)
{
	int err;

	if (list_empty(pagelist))
		return 0;

	err = do_move_pages_to_node(pagelist, node);
	if (err) {
		/*
		 * Positive err means the number of failed
		 * pages to migrate. Since we are going to
		 * abort and return the number of non-migrated
		 * pages, we need to include the rest of the
		 * nr_pages that have not been attempted as
		 * well.
		 */
		if (err > 0)
			err += nr_pages - i;
		return err;
	}
	return store_status(status, start, node, i - start);
}
/*
 * Migrate an array of page addresses to an array of nodes and fill in
 * the corresponding array of status.
 */
static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
			 unsigned long nr_pages,
			 const void __user * __user *pages,
			 const int __user *nodes,
			 int __user *status, int flags)
{
	compat_uptr_t __user *compat_pages = (void __user *)pages;
	int current_node = NUMA_NO_NODE;
	LIST_HEAD(pagelist);
	int start, i;
	int err = 0, err1;

	lru_cache_disable();

	for (i = start = 0; i < nr_pages; i++) {
		const void __user *p;
		int node;

		err = -EFAULT;
		if (in_compat_syscall()) {
			compat_uptr_t cp;

			if (get_user(cp, compat_pages + i))
				goto out_flush;

			p = compat_ptr(cp);
		} else {
			if (get_user(p, pages + i))
				goto out_flush;
		}
		if (get_user(node, nodes + i))
			goto out_flush;

		err = -ENODEV;
		if (node < 0 || node >= MAX_NUMNODES)
			goto out_flush;
		if (!node_state(node, N_MEMORY))
			goto out_flush;

		err = -EACCES;
		if (!node_isset(node, task_nodes))
			goto out_flush;

		if (current_node == NUMA_NO_NODE) {
			current_node = node;
			start = i;
		} else if (node != current_node) {
			err = move_pages_and_store_status(current_node,
					&pagelist, status, start, i, nr_pages);
			if (err)
				goto out;
			start = i;
			current_node = node;
		}

		/*
		 * Errors in the page lookup or isolation are not fatal and we
		 * simply report them via status.
		 */
		err = add_folio_for_migration(mm, p, current_node, &pagelist,
					      flags & MPOL_MF_MOVE_ALL);

		if (err > 0) {
			/* The page is successfully queued for migration */
			continue;
		}

		/*
		 * The move_pages() man page does not have an -EEXIST choice, so
		 * use -EFAULT instead.
		 */
		if (err == -EEXIST)
			err = -EFAULT;

		/*
		 * If the page is already on the target node (!err), store the
		 * node; otherwise, store the err.
		 */
		err = store_status(status, i, err ? : current_node, 1);
		if (err)
			goto out_flush;

		err = move_pages_and_store_status(current_node, &pagelist,
				status, start, i, nr_pages);
		if (err) {
			/* We have accounted for page i */
			if (err > 0)
				err--;
			goto out;
		}
		current_node = NUMA_NO_NODE;
	}
out_flush:
	/* Make sure we do not overwrite the existing error */
	err1 = move_pages_and_store_status(current_node, &pagelist,
				status, start, i, nr_pages);
	if (err >= 0)
		err = err1;
out:
	lru_cache_enable();
	return err;
}
/*
 * Determine the nodes of an array of pages and store it in an array of status.
 */
static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
				const void __user **pages, int *status)
{
	unsigned long i;

	mmap_read_lock(mm);

	for (i = 0; i < nr_pages; i++) {
		unsigned long addr = (unsigned long)(*pages);
		struct vm_area_struct *vma;
		struct folio_walk fw;
		struct folio *folio;
		int err = -EFAULT;

		vma = vma_lookup(mm, addr);
		if (!vma)
			goto set_status;

		folio = folio_walk_start(&fw, vma, addr, FW_ZEROPAGE);
		if (folio) {
			if (is_zero_folio(folio) || is_huge_zero_folio(folio))
				err = -EFAULT;
			else if (folio_is_zone_device(folio))
				err = -ENOENT;
			else
				err = folio_nid(folio);
			folio_walk_end(&fw, vma);
		} else {
			err = -ENOENT;
		}
set_status:
		*status = err;

		pages++;
		status++;
	}

	mmap_read_unlock(mm);
}
static int get_compat_pages_array(const void __user *chunk_pages[],
				  const void __user * __user *pages,
				  unsigned long chunk_nr)
{
	compat_uptr_t __user *pages32 = (compat_uptr_t __user *)pages;
	compat_uptr_t p;
	int i;

	for (i = 0; i < chunk_nr; i++) {
		if (get_user(p, pages32 + i))
			return -EFAULT;
		chunk_pages[i] = compat_ptr(p);
	}

	return 0;
}
/*
 * Determine the nodes of a user array of pages and store it in
 * a user array of status.
 */
static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
			 const void __user * __user *pages,
			 int __user *status)
{
#define DO_PAGES_STAT_CHUNK_NR 16UL
	const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
	int chunk_status[DO_PAGES_STAT_CHUNK_NR];

	while (nr_pages) {
		unsigned long chunk_nr = min(nr_pages, DO_PAGES_STAT_CHUNK_NR);

		if (in_compat_syscall()) {
			if (get_compat_pages_array(chunk_pages, pages,
						   chunk_nr))
				break;
		} else {
			if (copy_from_user(chunk_pages, pages,
				      chunk_nr * sizeof(*chunk_pages)))
				break;
		}

		do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);

		if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
			break;

		pages += chunk_nr;
		status += chunk_nr;
		nr_pages -= chunk_nr;
	}
	return nr_pages ? -EFAULT : 0;
}
static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
{
	struct task_struct *task;
	struct mm_struct *mm;

	/*
	 * There is no need to check if the current process has the right to
	 * modify the specified process when they are the same.
	 */
	if (!pid) {
		mmget(current->mm);
		*mem_nodes = cpuset_mems_allowed(current);
		return current->mm;
	}

	task = find_get_task_by_vpid(pid);
	if (!task)
		return ERR_PTR(-ESRCH);

	/*
	 * Check if this process has the right to modify the specified
	 * process. Use the regular "ptrace_may_access()" checks.
	 */
	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
		mm = ERR_PTR(-EPERM);
		goto out;
	}

	mm = ERR_PTR(security_task_movememory(task));
	if (IS_ERR(mm))
		goto out;
	*mem_nodes = cpuset_mems_allowed(task);
	mm = get_task_mm(task);
out:
	put_task_struct(task);
	if (!mm)
		mm = ERR_PTR(-EINVAL);
	return mm;
}
/*
 * Move a list of pages in the address space of the currently executing
 * process.
 */
static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
			     const void __user * __user *pages,
			     const int __user *nodes,
			     int __user *status, int flags)
{
	struct mm_struct *mm;
	int err;
	nodemask_t task_nodes;

	/* Check flags */
	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
		return -EINVAL;

	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	mm = find_mm_struct(pid, &task_nodes);
	if (IS_ERR(mm))
		return PTR_ERR(mm);

	if (nodes)
		err = do_pages_move(mm, task_nodes, nr_pages, pages,
				    nodes, status, flags);
	else
		err = do_pages_stat(mm, nr_pages, pages, status);

	mmput(mm);
	return err;
}
SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
		const void __user * __user *, pages,
		const int __user *, nodes,
		int __user *, status, int, flags)
{
	return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
}
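
/*
 * Illustrative sketch (not part of the kernel tree): how the syscall above is
 * typically reached from userspace through the libnuma wrapper declared in
 * <numaif.h>. Passing nodes == NULL turns the call into the status-query mode
 * handled by do_pages_stat() above. 'buf' is a hypothetical pointer into an
 * existing, faulted-in mapping of the calling process.
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *
 *	void *pages[1] = { buf };
 *	int nodes[1] = { 1 };
 *	int status[1];
 *
 *	// Move one page of 'buf' to node 1 in the current process (pid 0).
 *	if (move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE) == 0)
 *		printf("page now on node %d\n", status[0]);
 *
 *	// Query placement only: nodes == NULL fills status with node IDs.
 *	move_pages(0, 1, pages, NULL, status, 0);
 */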
#ifdef CONFIG_NUMA_BALANCING
/*
 * Returns true if this is a safe migration target node for misplaced NUMA
 * pages. Currently it only checks the watermarks, which is crude.
 */
static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
				   unsigned long nr_migrate_pages)
{
	int z;

	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
		struct zone *zone = pgdat->node_zones + z;

		if (!managed_zone(zone))
			continue;

		/* Avoid waking kswapd by allocating pages_to_migrate pages. */
		if (!zone_watermark_ok(zone, 0,
				       high_wmark_pages(zone) +
				       nr_migrate_pages,
				       ZONE_MOVABLE, ALLOC_CMA))
			continue;
		return true;
	}
	return false;
}
static struct folio *alloc_misplaced_dst_folio(struct folio *src,
					       unsigned long data)
{
	int nid = (int) data;
	int order = folio_order(src);
	gfp_t gfp = __GFP_THISNODE;

	if (order > 0)
		gfp |= GFP_TRANSHUGE_LIGHT;
	else {
		gfp |= GFP_HIGHUSER_MOVABLE | __GFP_NOMEMALLOC | __GFP_NORETRY |
			__GFP_NOWARN;
		gfp &= ~__GFP_RECLAIM;
	}
	return __folio_alloc_node(gfp, order, nid);
}
/*
 * Prepare for calling migrate_misplaced_folio() by isolating the folio if
 * permitted. Must be called with the PTL still held.
 */
int migrate_misplaced_folio_prepare(struct folio *folio,
		struct vm_area_struct *vma, int node)
{
	int nr_pages = folio_nr_pages(folio);
	pg_data_t *pgdat = NODE_DATA(node);

	if (folio_is_file_lru(folio)) {
		/*
		 * Do not migrate file folios that are mapped in multiple
		 * processes with execute permissions as they are probably
		 * shared libraries.
		 *
		 * See folio_likely_mapped_shared() on possible imprecision
		 * when we cannot easily detect if a folio is shared.
		 */
		if ((vma->vm_flags & VM_EXEC) &&
		    folio_likely_mapped_shared(folio))
			return -EACCES;

		/*
		 * Do not migrate dirty folios as not all filesystems can move
		 * dirty folios in MIGRATE_ASYNC mode, which is a waste of
		 * cycles.
		 */
		if (folio_test_dirty(folio))
			return -EAGAIN;
	}

	/* Avoid migrating to a node that is nearly full */
	if (!migrate_balanced_pgdat(pgdat, nr_pages)) {
		int z;

		if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING))
			return -EAGAIN;
		for (z = pgdat->nr_zones - 1; z >= 0; z--) {
			if (managed_zone(pgdat->node_zones + z))
				break;
		}

		/*
		 * If there are no managed zones, it should not proceed
		 * further.
		 */
		if (z < 0)
			return -EAGAIN;

		wakeup_kswapd(pgdat->node_zones + z, 0,
			      folio_order(folio), ZONE_MOVABLE);
		return -EAGAIN;
	}

	if (!folio_isolate_lru(folio))
		return -EAGAIN;

	node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio),
			    nr_pages);
	return 0;
}
/*
 * Attempt to migrate a misplaced folio to the specified destination
 * node. Caller is expected to have isolated the folio by calling
 * migrate_misplaced_folio_prepare(), which will result in an
 * elevated reference count on the folio. This function will un-isolate the
 * folio, dropping that reference before returning.
 */
int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma,
			    int node)
{
	pg_data_t *pgdat = NODE_DATA(node);
	int nr_remaining;
	unsigned int nr_succeeded;
	LIST_HEAD(migratepages);
	struct mem_cgroup *memcg = get_mem_cgroup_from_folio(folio);
	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);

	list_add(&folio->lru, &migratepages);
	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_folio,
				     NULL, node, MIGRATE_ASYNC,
				     MR_NUMA_MISPLACED, &nr_succeeded);
	if (nr_remaining && !list_empty(&migratepages))
		putback_movable_pages(&migratepages);
	if (nr_succeeded) {
		count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded);
		count_memcg_events(memcg, NUMA_PAGE_MIGRATE, nr_succeeded);
		if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
		    && !node_is_toptier(folio_nid(folio))
		    && node_is_toptier(node))
			mod_lruvec_state(lruvec, PGPROMOTE_SUCCESS,
					 nr_succeeded);
	}
	mem_cgroup_put(memcg);
	BUG_ON(!list_empty(&migratepages));
	return nr_remaining ? -EAGAIN : 0;
}
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_NUMA */