// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 1999 Linus Torvalds
 * Copyright (C) 2002 Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mm_inline.h>
#include <linux/string.h>
#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>
/*
 * Maximum number of attempts we make to install guard pages before we give up
 * and return -ERESTARTNOINTR to have userspace try again.
 */
#define MAX_MADVISE_GUARD_RETRIES 3

struct madvise_walk_private {
	struct mmu_gather *tlb;
	bool pageout;
};
/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_lock for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_DONTNEED_LOCKED:
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
	case MADV_GUARD_INSTALL:
	case MADV_GUARD_REMOVE:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
}
#ifdef CONFIG_ANON_VMA_NAME
struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
	struct anon_vma_name *anon_name;
	size_t count;

	/* Add 1 for NUL terminator at the end of the anon_name->name */
	count = strlen(name) + 1;
	anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
	if (anon_name) {
		kref_init(&anon_name->kref);
		memcpy(anon_name->name, name, count);
	}

	return anon_name;
}

void anon_vma_name_free(struct kref *kref)
{
	struct anon_vma_name *anon_name =
			container_of(kref, struct anon_vma_name, kref);
	kfree(anon_name);
}

struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
	mmap_assert_locked(vma->vm_mm);

	return vma->anon_name;
}

/* mmap_lock should be write-locked */
static int replace_anon_vma_name(struct vm_area_struct *vma,
				 struct anon_vma_name *anon_name)
{
	struct anon_vma_name *orig_name = anon_vma_name(vma);

	if (!anon_name) {
		vma->anon_name = NULL;
		anon_vma_name_put(orig_name);
		return 0;
	}

	if (anon_vma_name_eq(orig_name, anon_name))
		return 0;

	vma->anon_name = anon_vma_name_reuse(anon_name);
	anon_vma_name_put(orig_name);

	return 0;
}
#else /* CONFIG_ANON_VMA_NAME */
static int replace_anon_vma_name(struct vm_area_struct *vma,
				 struct anon_vma_name *anon_name)
{
	if (anon_name)
		return -EINVAL;

	return 0;
}
#endif /* CONFIG_ANON_VMA_NAME */
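
/*
 * Userspace sketch (illustrative only, not part of this file): anonymous
 * mappings typically receive a name via prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
 * ...), which reaches madvise_set_anon_name() further down and allocates an
 * anon_vma_name with anon_vma_name_alloc(). The mapping size and name string
 * below are made up; PR_SET_VMA_ANON_NAME requires CONFIG_ANON_VMA_NAME.
 *
 *	#include <sys/mman.h>
 *	#include <sys/prctl.h>
 *	#include <linux/prctl.h>
 *
 *	void *p = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
 *	      (unsigned long)p, 1 << 20, (unsigned long)"my-heap");
 */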
/*
 * Update the vm_flags on region of a vma, splitting it or merging it as
 * necessary. Must be called with mmap_lock held for writing.
 * Caller should ensure anon_name stability by raising its refcount even when
 * anon_name belongs to a valid vma because this function might free that vma.
 */
static int madvise_update_vma(struct vm_area_struct *vma,
			      struct vm_area_struct **prev, unsigned long start,
			      unsigned long end, unsigned long new_flags,
			      struct anon_vma_name *anon_name)
{
	struct mm_struct *mm = vma->vm_mm;
	int error;
	VMA_ITERATOR(vmi, mm, start);

	if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
		*prev = vma;
		return 0;
	}

	vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags,
				    anon_name);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	*prev = vma;

	/* vm_flags is protected by the mmap_lock held in write mode. */
	vma_start_write(vma);
	vm_flags_reset(vma, new_flags);
	if (!vma->vm_file || vma_is_anon_shmem(vma)) {
		error = replace_anon_vma_name(vma, anon_name);
		if (error)
			return error;
	}

	return 0;
}
#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
		unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->private;
	struct swap_iocb *splug = NULL;
	pte_t *ptep = NULL;
	spinlock_t *ptl;
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		pte_t pte;
		swp_entry_t entry;
		struct folio *folio;

		if (!ptep++) {
			ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
			if (!ptep)
				break;
		}

		pte = ptep_get(ptep);
		if (!is_swap_pte(pte))
			continue;
		entry = pte_to_swp_entry(pte);
		if (unlikely(non_swap_entry(entry)))
			continue;

		pte_unmap_unlock(ptep, ptl);
		ptep = NULL;

		folio = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
					      vma, addr, &splug);
		if (folio)
			folio_put(folio);
	}

	if (ptep)
		pte_unmap_unlock(ptep, ptl);
	swap_read_unplug(splug);
	cond_resched();

	return 0;
}

static const struct mm_walk_ops swapin_walk_ops = {
	.pmd_entry		= swapin_walk_pmd_entry,
	.walk_lock		= PGWALK_RDLOCK,
};
static void shmem_swapin_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct address_space *mapping)
{
	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
	pgoff_t end_index = linear_page_index(vma, end) - 1;
	struct folio *folio;
	struct swap_iocb *splug = NULL;

	rcu_read_lock();
	xas_for_each(&xas, folio, end_index) {
		unsigned long addr;
		swp_entry_t entry;

		if (!xa_is_value(folio))
			continue;
		entry = radix_to_swp_entry(folio);
		/* There might be swapin error entries in shmem mapping. */
		if (non_swap_entry(entry))
			continue;

		addr = vma->vm_start +
			((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT);
		xas_pause(&xas);
		rcu_read_unlock();

		folio = read_swap_cache_async(entry, mapping_gfp_mask(mapping),
					      vma, addr, &splug);
		if (folio)
			folio_put(folio);

		rcu_read_lock();
	}
	rcu_read_unlock();

	swap_read_unplug(splug);
}
#endif	/* CONFIG_SWAP */
/*
 * Schedule all required I/O operations. Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	struct file *file = vma->vm_file;
	loff_t offset;

	*prev = vma;
#ifdef CONFIG_SWAP
	if (!file) {
		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}

	if (shmem_mapping(file->f_mapping)) {
		shmem_swapin_range(vma, start, end, file->f_mapping);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	/*
	 * Filesystem's fadvise may need to take various locks. We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
	get_file(file);
	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	mmap_read_unlock(mm);
	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
	fput(file);
	mmap_read_lock(mm);
	return 0;
}
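
/*
 * Userspace sketch (illustrative only): MADV_WILLNEED asks the kernel to start
 * readahead for the range now so later faults are cheaper. The file name and
 * mapping length below are made up for the example.
 *
 *	#include <fcntl.h>
 *	#include <sys/mman.h>
 *
 *	int fd = open("data.bin", O_RDONLY);
 *	size_t len = 64 << 20;
 *	void *p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
 *	madvise(p, len, MADV_WILLNEED);		schedules I/O, returns immediately
 */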
static inline bool can_do_file_pageout(struct vm_area_struct *vma)
{
	if (!vma->vm_file)
		return false;
	/*
	 * paging out pagecache only for non-anonymous mappings that correspond
	 * to the files the calling process could (if tried) open for writing;
	 * otherwise we'd be including shared non-exclusive mappings, which
	 * opens a side channel.
	 */
	return inode_owner_or_capable(&nop_mnt_idmap,
				      file_inode(vma->vm_file)) ||
	       file_permission(vma->vm_file, MAY_WRITE) == 0;
}
static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end,
					  struct folio *folio, pte_t *ptep,
					  pte_t pte, bool *any_young,
					  bool *any_dirty)
{
	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
	int max_nr = (end - addr) / PAGE_SIZE;

	return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
			       any_young, any_dirty);
}
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	struct madvise_walk_private *private = walk->private;
	struct mmu_gather *tlb = private->tlb;
	bool pageout = private->pageout;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	pte_t *start_pte, *pte, ptent;
	spinlock_t *ptl;
	struct folio *folio = NULL;
	LIST_HEAD(folio_list);
	bool pageout_anon_only_filter;
	unsigned int batch_count = 0;
	int nr;

	if (fatal_signal_pending(current))
		return -EINTR;

	pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) &&
					!can_do_file_pageout(vma);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge(*pmd)) {
		pmd_t orig_pmd;
		unsigned long next = pmd_addr_end(addr, end);

		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
		ptl = pmd_trans_huge_lock(pmd, vma);
		if (!ptl)
			return 0;

		orig_pmd = *pmd;
		if (is_huge_zero_pmd(orig_pmd))
			goto huge_unlock;

		if (unlikely(!pmd_present(orig_pmd))) {
			VM_BUG_ON(thp_migration_supported() &&
					!is_pmd_migration_entry(orig_pmd));
			goto huge_unlock;
		}

		folio = pmd_folio(orig_pmd);

		/* Do not interfere with other mappings of this folio */
		if (folio_likely_mapped_shared(folio))
			goto huge_unlock;

		if (pageout_anon_only_filter && !folio_test_anon(folio))
			goto huge_unlock;

		if (next - addr != HPAGE_PMD_SIZE) {
			int err;

			folio_get(folio);
			spin_unlock(ptl);
			folio_lock(folio);
			err = split_folio(folio);
			folio_unlock(folio);
			folio_put(folio);
			if (!err)
				goto regular_folio;
			return 0;
		}

		if (!pageout && pmd_young(orig_pmd)) {
			pmdp_invalidate(vma, addr, pmd);
			orig_pmd = pmd_mkold(orig_pmd);

			set_pmd_at(mm, addr, pmd, orig_pmd);
			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
		}

		folio_clear_referenced(folio);
		folio_test_clear_young(folio);
		if (folio_test_active(folio))
			folio_set_workingset(folio);
		if (pageout) {
			if (folio_isolate_lru(folio)) {
				if (folio_test_unevictable(folio))
					folio_putback_lru(folio);
				else
					list_add(&folio->lru, &folio_list);
			}
		} else
			folio_deactivate(folio);
huge_unlock:
		spin_unlock(ptl);
		if (pageout)
			reclaim_pages(&folio_list);
		return 0;
	}

regular_folio:
#endif
	tlb_change_page_size(tlb, PAGE_SIZE);
restart:
	start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!start_pte)
		return 0;
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) {
		nr = 1;
		ptent = ptep_get(pte);

		if (++batch_count == SWAP_CLUSTER_MAX) {
			batch_count = 0;
			if (need_resched()) {
				arch_leave_lazy_mmu_mode();
				pte_unmap_unlock(start_pte, ptl);
				cond_resched();
				goto restart;
			}
		}

		if (!pte_present(ptent))
			continue;

		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;

		/*
		 * If we encounter a large folio, only split it if it is not
		 * fully mapped within the range we are operating on. Otherwise
		 * leave it as is so that it can be swapped out whole. If we
		 * fail to split a folio, leave it in place and advance to the
		 * next pte in the range.
		 */
		if (folio_test_large(folio)) {
			bool any_young;

			nr = madvise_folio_pte_batch(addr, end, folio, pte,
						     ptent, &any_young, NULL);
			if (any_young)
				ptent = pte_mkyoung(ptent);

			if (nr < folio_nr_pages(folio)) {
				int err;

				if (folio_likely_mapped_shared(folio))
					continue;
				if (pageout_anon_only_filter && !folio_test_anon(folio))
					continue;
				if (!folio_trylock(folio))
					continue;
				folio_get(folio);
				arch_leave_lazy_mmu_mode();
				pte_unmap_unlock(start_pte, ptl);
				start_pte = NULL;
				err = split_folio(folio);
				folio_unlock(folio);
				folio_put(folio);
				start_pte = pte =
					pte_offset_map_lock(mm, pmd, addr, &ptl);
				if (!start_pte)
					break;
				arch_enter_lazy_mmu_mode();
				if (!err)
					nr = 0;
				continue;
			}
		}

		/*
		 * Do not interfere with other mappings of this folio and
		 * non-LRU folio. If we have a large folio at this point, we
		 * know it is fully mapped so if its mapcount is the same as its
		 * number of pages, it must be exclusive.
		 */
		if (!folio_test_lru(folio) ||
		    folio_mapcount(folio) != folio_nr_pages(folio))
			continue;

		if (pageout_anon_only_filter && !folio_test_anon(folio))
			continue;

		if (!pageout && pte_young(ptent)) {
			clear_young_dirty_ptes(vma, addr, pte, nr,
					       CYDP_CLEAR_YOUNG);
			tlb_remove_tlb_entries(tlb, pte, nr, addr);
		}

		/*
		 * We are deactivating a folio for accelerating reclaiming.
		 * VM couldn't reclaim the folio unless we clear PG_young.
		 * As a side effect, it confuses idle-page tracking,
		 * which will miss recent referenced history.
		 */
		folio_clear_referenced(folio);
		folio_test_clear_young(folio);
		if (folio_test_active(folio))
			folio_set_workingset(folio);
		if (pageout) {
			if (folio_isolate_lru(folio)) {
				if (folio_test_unevictable(folio))
					folio_putback_lru(folio);
				else
					list_add(&folio->lru, &folio_list);
			}
		} else
			folio_deactivate(folio);
	}

	if (start_pte) {
		arch_leave_lazy_mmu_mode();
		pte_unmap_unlock(start_pte, ptl);
	}
	if (pageout)
		reclaim_pages(&folio_list);
	cond_resched();

	return 0;
}
static const struct mm_walk_ops cold_walk_ops = {
	.pmd_entry		= madvise_cold_or_pageout_pte_range,
	.walk_lock		= PGWALK_RDLOCK,
};
static void madvise_cold_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = false,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}
static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
{
	return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
}
static long madvise_cold(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb);

	return 0;
}
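
/*
 * Userspace sketch (illustrative only): MADV_COLD deactivates the pages so
 * they become preferred reclaim candidates under memory pressure, without
 * discarding their contents. The buffer below is made up for the example.
 *
 *	#include <sys/mman.h>
 *
 *	size_t len = 8 << 20;
 *	void *cache = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	... fill "cache", then stop using it for a while ...
 *	madvise(cache, len, MADV_COLD);
 */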
static void madvise_pageout_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = true,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}
static long madvise_pageout(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	/*
	 * If the VMA belongs to a private file mapping, there can be private
	 * dirty pages which can be paged out if even this process is neither
	 * owner nor write capable of the file. We allow private file mappings
	 * further to pageout dirty anon pages.
	 */
	if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) &&
				(vma->vm_flags & VM_MAYSHARE)))
		return 0;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb);

	return 0;
}
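
/*
 * Userspace sketch (illustrative only): unlike MADV_COLD in the sketch above,
 * MADV_PAGEOUT asks for the pages to be reclaimed right away (written to swap
 * or dropped if clean), trading immediate CPU/IO cost for freed memory.
 *
 *	madvise(cache, len, MADV_PAGEOUT);	same mapping as the MADV_COLD sketch
 */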
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY;
	struct mmu_gather *tlb = walk->private;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *start_pte, *pte, ptent;
	struct folio *folio;
	int nr_swap = 0;
	unsigned long next;
	int nr, max_nr;

	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd))
		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
			return 0;

	tlb_change_page_size(tlb, PAGE_SIZE);
	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!start_pte)
		return 0;
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
		nr = 1;
		ptent = ptep_get(pte);

		if (pte_none(ptent))
			continue;
		/*
		 * If the pte has swp_entry, just clear page table to
		 * prevent swap-in which is more expensive rather than
		 * (page allocation + zeroing).
		 */
		if (!pte_present(ptent)) {
			swp_entry_t entry;

			entry = pte_to_swp_entry(ptent);
			if (!non_swap_entry(entry)) {
				max_nr = (end - addr) / PAGE_SIZE;
				nr = swap_pte_batch(pte, max_nr, ptent);
				nr_swap -= nr;
				free_swap_and_cache_nr(entry, nr);
				clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
			} else if (is_hwpoison_entry(entry) ||
				   is_poisoned_swp_entry(entry)) {
				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			}
			continue;
		}
		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;

		/*
		 * If we encounter a large folio, only split it if it is not
		 * fully mapped within the range we are operating on. Otherwise
		 * leave it as is so that it can be marked as lazyfree. If we
		 * fail to split a folio, leave it in place and advance to the
		 * next pte in the range.
		 */
		if (folio_test_large(folio)) {
			bool any_young, any_dirty;

			nr = madvise_folio_pte_batch(addr, end, folio, pte,
						     ptent, &any_young, &any_dirty);

			if (nr < folio_nr_pages(folio)) {
				int err;

				if (folio_likely_mapped_shared(folio))
					continue;
				if (!folio_trylock(folio))
					continue;
				folio_get(folio);
				arch_leave_lazy_mmu_mode();
				pte_unmap_unlock(start_pte, ptl);
				start_pte = NULL;
				err = split_folio(folio);
				folio_unlock(folio);
				folio_put(folio);
				start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
				if (!start_pte)
					break;
				arch_enter_lazy_mmu_mode();
				if (!err)
					nr = 0;
				continue;
			}

			if (any_young)
				ptent = pte_mkyoung(ptent);
			if (any_dirty)
				ptent = pte_mkdirty(ptent);
		}

		if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
			if (!folio_trylock(folio))
				continue;
			/*
			 * If we have a large folio at this point, we know it is
			 * fully mapped so if its mapcount is the same as its
			 * number of pages, it must be exclusive.
			 */
			if (folio_mapcount(folio) != folio_nr_pages(folio)) {
				folio_unlock(folio);
				continue;
			}

			if (folio_test_swapcache(folio) &&
			    !folio_free_swap(folio)) {
				folio_unlock(folio);
				continue;
			}

			folio_clear_dirty(folio);
			folio_unlock(folio);
		}

		if (pte_young(ptent) || pte_dirty(ptent)) {
			clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags);
			tlb_remove_tlb_entries(tlb, pte, nr, addr);
		}
		folio_mark_lazyfree(folio);
	}

	if (nr_swap)
		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
	if (start_pte) {
		arch_leave_lazy_mmu_mode();
		pte_unmap_unlock(start_pte, ptl);
	}
	cond_resched();

	return 0;
}
static const struct mm_walk_ops madvise_free_walk_ops = {
	.pmd_entry		= madvise_free_pte_range,
	.walk_lock		= PGWALK_RDLOCK,
};
static int madvise_free_single_vma(struct vm_area_struct *vma,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	/* MADV_FREE works for only anon vma at the moment */
	if (!vma_is_anonymous(vma))
		return -EINVAL;

	range.start = max(vma->vm_start, start_addr);
	if (range.start >= vma->vm_end)
		return -EINVAL;
	range.end = min(vma->vm_end, end_addr);
	if (range.end <= vma->vm_start)
		return -EINVAL;
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				range.start, range.end);

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	update_hiwater_rss(mm);

	mmu_notifier_invalidate_range_start(&range);
	tlb_start_vma(&tlb, vma);
	walk_page_range(vma->vm_mm, range.start, range.end,
			&madvise_free_walk_ops, &tlb);
	tlb_end_vma(&tlb, vma);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb);

	return 0;
}
/*
 * Application no longer needs these pages. If the pages are dirty,
 * it's OK to just throw them away. The app will be more careful about
 * data it wants to keep. Be sure to free swap resources too. The
 * zap_page_range_single call sets things up for shrink_active_list to actually
 * free these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do. This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them. There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
{
	zap_page_range_single(vma, start, end - start, NULL);
	return 0;
}
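
/*
 * Userspace sketch (illustrative only): MADV_DONTNEED drops the range
 * immediately (subsequent reads of anonymous memory see zero-filled pages),
 * while MADV_FREE only marks the pages lazily freeable, so the data survives
 * until reclaim actually needs the memory.
 *
 *	#include <sys/mman.h>
 *
 *	madvise(buf, len, MADV_DONTNEED);	destructive, frees now
 *	madvise(buf, len, MADV_FREE);		lazy, reclaimed under pressure
 */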
static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
					    unsigned long start,
					    unsigned long *end,
					    int behavior)
{
	if (!is_vm_hugetlb_page(vma)) {
		unsigned int forbidden = VM_PFNMAP;

		if (behavior != MADV_DONTNEED_LOCKED)
			forbidden |= VM_LOCKED;

		return !(vma->vm_flags & forbidden);
	}

	if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED)
		return false;
	if (start & ~huge_page_mask(hstate_vma(vma)))
		return false;

	/*
	 * Madvise callers expect the length to be rounded up to PAGE_SIZE
	 * boundaries, and may be unaware that this VMA uses huge pages.
	 * Avoid unexpected data loss by rounding down the number of
	 * huge pages freed.
	 */
	*end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma)));

	return true;
}
static long madvise_dontneed_free(struct vm_area_struct *vma,
				  struct vm_area_struct **prev,
				  unsigned long start, unsigned long end,
				  int behavior)
{
	struct mm_struct *mm = vma->vm_mm;

	*prev = vma;
	if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
		return -EINVAL;

	if (start == end)
		return 0;

	if (!userfaultfd_remove(vma, start, end)) {
		*prev = NULL; /* mmap_lock has been dropped, prev is stale */

		mmap_read_lock(mm);
		vma = vma_lookup(mm, start);
		if (!vma)
			return -ENOMEM;
		/*
		 * Potential end adjustment for hugetlb vma is OK as
		 * the check below keeps end within vma.
		 */
		if (!madvise_dontneed_free_valid_vma(vma, start, &end,
						     behavior))
			return -EINVAL;
		if (end > vma->vm_end) {
			/*
			 * Don't fail if end > vma->vm_end. If the old
			 * vma was split while the mmap_lock was
			 * released the effect of the concurrent
			 * operation may not cause madvise() to
			 * have an undefined result. There may be an
			 * adjacent next vma that we'll walk
			 * next. userfaultfd_remove() will generate an
			 * UFFD_EVENT_REMOVE repetition on the
			 * end-vma->vm_end range, but the manager can
			 * handle a repetition fine.
			 */
			end = vma->vm_end;
		}
		VM_WARN_ON(start >= end);
	}

	if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
		return madvise_dontneed_single_vma(vma, start, end);
	else if (behavior == MADV_FREE)
		return madvise_free_single_vma(vma, start, end);
	else
		return -EINVAL;
}
static long madvise_populate(struct mm_struct *mm, unsigned long start,
		unsigned long end, int behavior)
{
	const bool write = behavior == MADV_POPULATE_WRITE;
	int locked = 1;
	long pages;

	while (start < end) {
		/* Populate (prefault) page tables readable/writable. */
		pages = faultin_page_range(mm, start, end, write, &locked);
		if (!locked) {
			mmap_read_lock(mm);
			locked = 1;
		}
		if (pages < 0) {
			switch (pages) {
			case -EINTR:
				return -EINTR;
			case -EINVAL: /* Incompatible mappings / permissions. */
				return -EINVAL;
			case -EHWPOISON:
				return -EHWPOISON;
			case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
				return -EFAULT;
			default:
				pr_warn_once("%s: unhandled return value: %ld\n",
					     __func__, pages);
				fallthrough;
			case -ENOMEM: /* No VMA or out of memory. */
				return -ENOMEM;
			}
		}
		start += pages * PAGE_SIZE;
	}
	return 0;
}
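
/*
 * Userspace sketch (illustrative only): MADV_POPULATE_READ/WRITE prefault the
 * page tables so later accesses avoid minor faults; MADV_POPULATE_WRITE also
 * triggers write faults up front, breaking COW for private mappings. The
 * mapping below is made up for the example.
 *
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	madvise(p, len, MADV_POPULATE_WRITE);	prefault writable, like MAP_POPULATE
 */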
/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */
static long madvise_remove(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end)
{
	loff_t offset;
	int error;
	struct file *f;
	struct mm_struct *mm = vma->vm_mm;

	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */

	if (vma->vm_flags & VM_LOCKED)
		return -EINVAL;

	f = vma->vm_file;

	if (!f || !f->f_mapping || !f->f_mapping->host) {
		return -EINVAL;
	}

	if (!vma_is_shared_maywrite(vma))
		return -EACCES;

	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/*
	 * Filesystem's fallocate may need to take i_rwsem. We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	get_file(f);
	if (userfaultfd_remove(vma, start, end)) {
		/* mmap_lock was not released by userfaultfd_remove() */
		mmap_read_unlock(mm);
	}
	error = vfs_fallocate(f,
				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				offset, end - start);
	fput(f);
	mmap_read_lock(mm);
	return error;
}
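
/*
 * Userspace sketch (illustrative only): MADV_REMOVE punches a hole in the
 * backing object of a shared, writable mapping (e.g. tmpfs), much like
 * fallocate(FALLOC_FL_PUNCH_HOLE). The file and offsets are made up.
 *
 *	int fd = memfd_create("scratch", 0);
 *	ftruncate(fd, 16 << 20);
 *	void *p = mmap(NULL, 16 << 20, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	madvise((char *)p + (4 << 20), 4 << 20, MADV_REMOVE);
 */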
static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked)
{
	vm_flags_t disallowed = VM_SPECIAL | VM_HUGETLB;

	/*
	 * A user could lock after setting a guard range but that's fine, as
	 * they'd not be able to fault in. The issue arises when we try to zap
	 * existing locked VMAs. We don't want to do that.
	 */
	if (!allow_locked)
		disallowed |= VM_LOCKED;

	if (!vma_is_anonymous(vma))
		return false;

	if ((vma->vm_flags & (VM_MAYWRITE | disallowed)) != VM_MAYWRITE)
		return false;

	return true;
}
static bool is_guard_pte_marker(pte_t ptent)
{
	return is_pte_marker(ptent) &&
	       is_guard_swp_entry(pte_to_swp_entry(ptent));
}
static int guard_install_pud_entry(pud_t *pud, unsigned long addr,
				   unsigned long next, struct mm_walk *walk)
{
	pud_t pudval = pudp_get(pud);

	/* If huge return >0 so we abort the operation + zap. */
	return pud_trans_huge(pudval) || pud_devmap(pudval);
}

static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr,
				   unsigned long next, struct mm_walk *walk)
{
	pmd_t pmdval = pmdp_get(pmd);

	/* If huge return >0 so we abort the operation + zap. */
	return pmd_trans_huge(pmdval) || pmd_devmap(pmdval);
}

static int guard_install_pte_entry(pte_t *pte, unsigned long addr,
				   unsigned long next, struct mm_walk *walk)
{
	pte_t pteval = ptep_get(pte);
	unsigned long *nr_pages = (unsigned long *)walk->private;

	/* If there is already a guard page marker, we have nothing to do. */
	if (is_guard_pte_marker(pteval)) {
		(*nr_pages)++;

		return 0;
	}

	/* If populated return >0 so we abort the operation + zap. */
	return 1;
}

static int guard_install_set_pte(unsigned long addr, unsigned long next,
				 pte_t *ptep, struct mm_walk *walk)
{
	unsigned long *nr_pages = (unsigned long *)walk->private;

	/* Simply install a PTE marker, this causes segfault on access. */
	*ptep = make_pte_marker(PTE_MARKER_GUARD);
	(*nr_pages)++;

	return 0;
}

static const struct mm_walk_ops guard_install_walk_ops = {
	.pud_entry		= guard_install_pud_entry,
	.pmd_entry		= guard_install_pmd_entry,
	.pte_entry		= guard_install_pte_entry,
	.install_pte		= guard_install_set_pte,
	.walk_lock		= PGWALK_RDLOCK,
};
static long madvise_guard_install(struct vm_area_struct *vma,
				 struct vm_area_struct **prev,
				 unsigned long start, unsigned long end)
{
	long err;
	int i;

	*prev = vma;
	if (!is_valid_guard_vma(vma, /* allow_locked = */false))
		return -EINVAL;

	/*
	 * If we install guard markers, then the range is no longer
	 * empty from a page table perspective and therefore it's
	 * appropriate to have an anon_vma.
	 *
	 * This ensures that on fork, we copy page tables correctly.
	 */
	err = anon_vma_prepare(vma);
	if (err)
		return err;

	/*
	 * Optimistically try to install the guard marker pages first. If any
	 * non-guard pages are encountered, give up and zap the range before
	 * trying again.
	 *
	 * We try a few times before giving up and releasing back to userland to
	 * loop around, releasing locks in the process to avoid contention. This
	 * would only happen if there was a great many racing page faults.
	 *
	 * In most cases we should simply install the guard markers immediately
	 * with no zap or looping.
	 */
	for (i = 0; i < MAX_MADVISE_GUARD_RETRIES; i++) {
		unsigned long nr_pages = 0;

		/* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
		err = walk_page_range_mm(vma->vm_mm, start, end,
					 &guard_install_walk_ops, &nr_pages);
		if (err < 0)
			return err;

		if (err == 0) {
			unsigned long nr_expected_pages = PHYS_PFN(end - start);

			VM_WARN_ON(nr_pages != nr_expected_pages);
			return 0;
		}

		/*
		 * OK some of the range have non-guard pages mapped, zap
		 * them. This leaves existing guard pages in place.
		 */
		zap_page_range_single(vma, start, end - start, NULL);
	}

	/*
	 * We were unable to install the guard pages due to being raced by page
	 * faults. This should not happen ordinarily. We return to userspace and
	 * immediately retry, relieving lock contention.
	 */
	return restart_syscall();
}
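
/*
 * Userspace sketch (illustrative only): MADV_GUARD_INSTALL turns a sub-range
 * of an anonymous mapping into a cheap guard region that raises SIGSEGV on any
 * access, without needing a separate PROT_NONE VMA; MADV_GUARD_REMOVE undoes
 * it. The layout below (one guard page below a stack area) is made up, and on
 * older userspace the MADV_GUARD_* constants may need <linux/mman.h>.
 *
 *	size_t page = 4096;
 *	char *stk = mmap(NULL, 64 * page, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	madvise(stk, page, MADV_GUARD_INSTALL);	lowest page becomes a guard
 *	...
 *	madvise(stk, page, MADV_GUARD_REMOVE);
 */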
static int guard_remove_pud_entry(pud_t *pud, unsigned long addr,
				  unsigned long next, struct mm_walk *walk)
{
	pud_t pudval = pudp_get(pud);

	/* If huge, cannot have guard pages present, so no-op - skip. */
	if (pud_trans_huge(pudval) || pud_devmap(pudval))
		walk->action = ACTION_CONTINUE;

	return 0;
}

static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr,
				  unsigned long next, struct mm_walk *walk)
{
	pmd_t pmdval = pmdp_get(pmd);

	/* If huge, cannot have guard pages present, so no-op - skip. */
	if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval))
		walk->action = ACTION_CONTINUE;

	return 0;
}

static int guard_remove_pte_entry(pte_t *pte, unsigned long addr,
				  unsigned long next, struct mm_walk *walk)
{
	pte_t ptent = ptep_get(pte);

	if (is_guard_pte_marker(ptent)) {
		/* Simply clear the PTE marker. */
		pte_clear_not_present_full(walk->mm, addr, pte, false);
		update_mmu_cache(walk->vma, addr, pte);
	}

	return 0;
}

static const struct mm_walk_ops guard_remove_walk_ops = {
	.pud_entry		= guard_remove_pud_entry,
	.pmd_entry		= guard_remove_pmd_entry,
	.pte_entry		= guard_remove_pte_entry,
	.walk_lock		= PGWALK_RDLOCK,
};
static long madvise_guard_remove(struct vm_area_struct *vma,
				 struct vm_area_struct **prev,
				 unsigned long start, unsigned long end)
{
	*prev = vma;
	/*
	 * We're ok with removing guards in mlock()'d ranges, as this is a
	 * non-destructive action.
	 */
	if (!is_valid_guard_vma(vma, /* allow_locked = */true))
		return -EINVAL;

	return walk_page_range(vma->vm_mm, start, end,
			       &guard_remove_walk_ops, NULL);
}
/*
 * Apply an madvise behavior to a region of a vma. madvise_update_vma
 * will handle splitting a vm area into separate areas, each area with its own
 * behavior.
 */
static int madvise_vma_behavior(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end,
				unsigned long behavior)
{
	int error;
	struct anon_vma_name *anon_name;
	unsigned long new_flags = vma->vm_flags;

	if (unlikely(!can_modify_vma_madv(vma, behavior)))
		return -EPERM;

	switch (behavior) {
	case MADV_REMOVE:
		return madvise_remove(vma, prev, start, end);
	case MADV_WILLNEED:
		return madvise_willneed(vma, prev, start, end);
	case MADV_COLD:
		return madvise_cold(vma, prev, start, end);
	case MADV_PAGEOUT:
		return madvise_pageout(vma, prev, start, end);
	case MADV_FREE:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
		return madvise_dontneed_free(vma, prev, start, end, behavior);
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO)
			return -EINVAL;
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_WIPEONFORK:
		/* MADV_WIPEONFORK is only supported on anonymous memory. */
		if (vma->vm_file || vma->vm_flags & VM_SHARED)
			return -EINVAL;
		new_flags |= VM_WIPEONFORK;
		break;
	case MADV_KEEPONFORK:
		if (vma->vm_flags & VM_DROPPABLE)
			return -EINVAL;
		new_flags &= ~VM_WIPEONFORK;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_DONTDUMP;
		break;
	case MADV_DODUMP:
		if ((!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) ||
		    (vma->vm_flags & VM_DROPPABLE))
			return -EINVAL;
		new_flags &= ~VM_DONTDUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error)
			goto out;
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error)
			goto out;
		break;
	case MADV_COLLAPSE:
		return madvise_collapse(vma, prev, start, end);
	case MADV_GUARD_INSTALL:
		return madvise_guard_install(vma, prev, start, end);
	case MADV_GUARD_REMOVE:
		return madvise_guard_remove(vma, prev, start, end);
	}

	anon_name = anon_vma_name(vma);
	anon_vma_name_get(anon_name);
	error = madvise_update_vma(vma, prev, start, end, new_flags,
				   anon_name);
	anon_vma_name_put(anon_name);

out:
	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_inject_error(int behavior,
		unsigned long start, unsigned long end)
{
	unsigned long size;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	for (; start < end; start += size) {
		unsigned long pfn;
		struct page *page;
		int ret;

		ret = get_user_pages_fast(start, 1, 0, &page);
		if (ret != 1)
			return ret;
		pfn = page_to_pfn(page);

		/*
		 * When soft offlining hugepages, after migrating the page
		 * we dissolve it, therefore in the second loop "page" will
		 * no longer be a compound page.
		 */
		size = page_size(compound_head(page));

		if (behavior == MADV_SOFT_OFFLINE) {
			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
				pfn, start);
			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
		} else {
			pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
				pfn, start);
			ret = memory_failure(pfn, MF_ACTION_REQUIRED | MF_COUNT_INCREASED | MF_SW_SIMULATED);
			if (ret == -EOPNOTSUPP)
				ret = 0;
		}

		if (ret)
			return ret;
	}

	return 0;
}
#endif	/* CONFIG_MEMORY_FAILURE */
static bool
madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_SEQUENTIAL:
	case MADV_DONTNEED_LOCKED:
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_NOHUGEPAGE:
#endif
	case MADV_WIPEONFORK:
	case MADV_KEEPONFORK:
	case MADV_GUARD_INSTALL:
	case MADV_GUARD_REMOVE:
#ifdef CONFIG_MEMORY_FAILURE
	case MADV_SOFT_OFFLINE:
#endif
		return true;

	default:
		return false;
	}
}
/* Can we invoke process_madvise() on a remote mm for the specified behavior? */
static bool process_madvise_remote_valid(int behavior)
/*
 * Walk the vmas in range [start,end), and call the visit function on each one.
 * The visit function will get start and end parameters that cover the overlap
 * between the current vma and the original range. Any unmapped regions in the
 * original range will result in this function returning -ENOMEM while still
 * calling the visit function on all of the existing vmas in the range.
 * Must be called with the mmap_lock held for reading or writing.
 */
static int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
		      unsigned long end, unsigned long arg,
		      int (*visit)(struct vm_area_struct *vma,
				   struct vm_area_struct **prev, unsigned long start,
				   unsigned long end, unsigned long arg))
{
	struct vm_area_struct *vma;
	struct vm_area_struct *prev;
	unsigned long tmp;
	int unmapped_error = 0;

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
	 */
	vma = find_vma_prev(mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	for (;;) {
		int error;

		/* Still start < end. */
		if (!vma)
			return -ENOMEM;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				break;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end) */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = visit(vma, &prev, start, tmp, arg);
		if (error)
			return error;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		if (start >= end)
			break;
		if (prev)
			vma = find_vma(mm, prev->vm_end);
		else	/* madvise_remove dropped mmap_lock */
			vma = find_vma(mm, start);
	}

	return unmapped_error;
}
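
/*
 * Note on the visitor contract above (informal, derived from the code): each
 * callback sees a range clamped to a single VMA, i.e. [max(start, vma->vm_start),
 * min(end, vma->vm_end)), and may set *prev to NULL if it dropped mmap_lock.
 * A hypothetical visitor (names made up) would look like:
 *
 *	static int my_visit(struct vm_area_struct *vma,
 *			    struct vm_area_struct **prev,
 *			    unsigned long start, unsigned long end,
 *			    unsigned long arg)
 *	{
 *		*prev = vma;
 *		return 0;	return 0 to keep walking
 *	}
 *
 *	madvise_walk_vmas(mm, start, end, 0, my_visit);
 *
 * madvise_vma_behavior() and madvise_vma_anon_name() are the two visitors
 * actually passed in from this file.
 */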
#ifdef CONFIG_ANON_VMA_NAME
static int madvise_vma_anon_name(struct vm_area_struct *vma,
				 struct vm_area_struct **prev,
				 unsigned long start, unsigned long end,
				 unsigned long anon_name)
{
	int error;

	/* Only anonymous mappings can be named */
	if (vma->vm_file && !vma_is_anon_shmem(vma))
		return -EBADF;

	error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
				   (struct anon_vma_name *)anon_name);

	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}

int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
			  unsigned long len_in, struct anon_vma_name *anon_name)
{
	unsigned long end;
	unsigned long len;

	if (start & ~PAGE_MASK)
		return -EINVAL;
	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

	return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
				 madvise_vma_anon_name);
}
#endif /* CONFIG_ANON_VMA_NAME */
/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area. The idea is to help the kernel
 * use appropriate read-ahead and caching techniques. The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters. This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the appli-
 *		cation will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_FREE - the application marks pages in the given range as lazy free,
 *		where actual purges are postponed until memory pressure happens.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *		typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
 *		range after a fork.
 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
 *  MADV_HWPOISON - trigger memory error handler as if the given memory range
 *		were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
 *		huge pages in the future. Existing pages might be coalesced and
 *		new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *		transparent huge pages so the existing pages will not be
 *		coalesced into THP and new pages will not be allocated as THP.
 *  MADV_COLLAPSE - synchronously coalesce pages into new THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *		from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *  MADV_COLD - the application is not expected to use this memory soon,
 *		deactivate pages in this range so that they can be reclaimed
 *		easily if memory pressure happens.
 *  MADV_PAGEOUT - the application is not expected to use this memory soon,
 *		page out the pages in this range immediately.
 *  MADV_POPULATE_READ - populate (prefault) page tables readable by
 *		triggering read faults if required
 *  MADV_POPULATE_WRITE - populate (prefault) page tables writable by
 *		triggering write faults if required
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages,
 *		or the specified address range includes file, Huge TLB,
 *		MAP_SHARED or VM_PFNMAP range.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 *  -EPERM  - memory is sealed.
 */
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
	unsigned long end;
	int error;
	int write;
	size_t len;
	struct blk_plug plug;

	if (!madvise_behavior_valid(behavior))
		return -EINVAL;

	if (!PAGE_ALIGNED(start))
		return -EINVAL;
	len = PAGE_ALIGN(len_in);

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

#ifdef CONFIG_MEMORY_FAILURE
	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
		return madvise_inject_error(behavior, start, start + len_in);
#endif

	write = madvise_need_mmap_write(behavior);
	if (write) {
		if (mmap_write_lock_killable(mm))
			return -EINTR;
	} else {
		mmap_read_lock(mm);
	}

	start = untagged_addr_remote(mm, start);
	end = start + len;

	blk_start_plug(&plug);
	switch (behavior) {
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
		error = madvise_populate(mm, start, end, behavior);
		break;
	default:
		error = madvise_walk_vmas(mm, start, end, behavior,
					  madvise_vma_behavior);
		break;
	}
	blk_finish_plug(&plug);

	if (write)
		mmap_write_unlock(mm);
	else
		mmap_read_unlock(mm);

	return error;
}
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	return do_madvise(current->mm, start, len_in, behavior);
}
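
/*
 * Userspace sketch (illustrative only): start must be page-aligned and the
 * length is rounded up to a page boundary by do_madvise(); an unaligned start
 * fails with EINVAL.
 *
 *	#include <unistd.h>
 *	#include <sys/mman.h>
 *
 *	long pagesz = sysconf(_SC_PAGESIZE);
 *	if (madvise(p, 3 * pagesz, MADV_DONTNEED) != 0)
 *		perror("madvise");
 */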
/* Perform an madvise operation over a vector of addresses and lengths. */
static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
			      int behavior)
{
	ssize_t ret = 0;
	size_t total_len;

	total_len = iov_iter_count(iter);

	while (iov_iter_count(iter)) {
		ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter),
				 iter_iov_len(iter), behavior);
		/*
		 * An madvise operation is attempting to restart the syscall,
		 * but we cannot proceed as it would not be correct to repeat
		 * the operation in aggregate, and would be surprising to the
		 * user.
		 *
		 * As we have already dropped locks, it is safe to just loop and
		 * try again. We check for fatal signals in case we need to exit
		 * early anyway.
		 */
		if (ret == -ERESTARTNOINTR) {
			if (fatal_signal_pending(current)) {
				ret = -EINTR;
				break;
			}
			continue;
		}
		if (ret < 0)
			break;
		iov_iter_advance(iter, iter_iov_len(iter));
	}

	ret = (total_len - iov_iter_count(iter)) ? : ret;

	return ret;
}
SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
		size_t, vlen, int, behavior, unsigned int, flags)
{
	ssize_t ret;
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	struct task_struct *task;
	struct mm_struct *mm;
	unsigned int f_flags;

	if (flags != 0) {
		ret = -EINVAL;
		goto out;
	}

	ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
	if (ret < 0)
		goto out;

	task = pidfd_get_task(pidfd, &f_flags);
	if (IS_ERR(task)) {
		ret = PTR_ERR(task);
		goto free_iov;
	}

	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
	if (IS_ERR_OR_NULL(mm)) {
		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
		goto release_task;
	}

	/*
	 * We need only perform this check if we are attempting to manipulate a
	 * remote process's address space.
	 */
	if (mm != current->mm && !process_madvise_remote_valid(behavior)) {
		ret = -EINVAL;
		goto release_mm;
	}

	/*
	 * Require CAP_SYS_NICE for influencing process performance. Note that
	 * only non-destructive hints are currently supported for remote
	 * processes.
	 */
	if (mm != current->mm && !capable(CAP_SYS_NICE)) {
		ret = -EPERM;
		goto release_mm;
	}

	ret = vector_madvise(mm, &iter, behavior);

release_mm:
	mmput(mm);
release_task:
	put_task_struct(task);
free_iov:
	kfree(iov);
out:
	return ret;
}
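
/*
 * Userspace sketch (illustrative only): process_madvise() applies a hint to
 * another process identified by a pidfd; remote callers need CAP_SYS_NICE and
 * PTRACE_MODE_READ, and only non-destructive hints are allowed remotely. The
 * target pid and range below are made up; older libcs may need syscall(2).
 *
 *	#include <sys/uio.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	int pidfd = syscall(SYS_pidfd_open, target_pid, 0);
 *	struct iovec iov = { .iov_base = (void *)remote_addr, .iov_len = remote_len };
 *	syscall(SYS_process_madvise, pidfd, &iov, 1, MADV_PAGEOUT, 0);
 */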