// SPDX-License-Identifier: GPL-2.0
/*
 *	mm/mremap.c
 *
 *	(C) Copyright 1996 Linus Torvalds
 *
 *	Address space accounting code	<alan@lxorguk.ukuu.org.uk>
 *	(C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/ksm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/swapops.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>
#include <linux/uaccess.h>
#include <linux/userfaultfd_k.h>
#include <linux/mempolicy.h>

#include <asm/cacheflush.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>

#include "internal.h"
static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	pgd = pgd_offset(mm, addr);
	if (pgd_none_or_clear_bad(pgd))
		return NULL;

	p4d = p4d_offset(pgd, addr);
	if (p4d_none_or_clear_bad(p4d))
		return NULL;

	pud = pud_offset(p4d, addr);
	if (pud_none_or_clear_bad(pud))
		return NULL;

	return pud;
}
static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
{
	pud_t *pud;
	pmd_t *pmd;

	pud = get_old_pud(mm, addr);
	if (!pud)
		return NULL;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return NULL;

	return pmd;
}
static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma,
			    unsigned long addr)
{
	pgd_t *pgd;
	p4d_t *p4d;

	pgd = pgd_offset(mm, addr);
	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return NULL;

	return pud_alloc(mm, p4d, addr);
}
static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
			    unsigned long addr)
{
	pud_t *pud;
	pmd_t *pmd;

	pud = alloc_new_pud(mm, vma, addr);
	if (!pud)
		return NULL;

	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return NULL;

	VM_BUG_ON(pmd_trans_huge(*pmd));

	return pmd;
}
static void take_rmap_locks(struct vm_area_struct *vma)
{
	if (vma->vm_file)
		i_mmap_lock_write(vma->vm_file->f_mapping);
	if (vma->anon_vma)
		anon_vma_lock_write(vma->anon_vma);
}
static void drop_rmap_locks(struct vm_area_struct *vma)
{
	if (vma->anon_vma)
		anon_vma_unlock_write(vma->anon_vma);
	if (vma->vm_file)
		i_mmap_unlock_write(vma->vm_file->f_mapping);
}
static pte_t move_soft_dirty_pte(pte_t pte)
{
	/*
	 * Set soft dirty bit so we can notice
	 * in userspace the ptes were moved.
	 */
#ifdef CONFIG_MEM_SOFT_DIRTY
	if (pte_present(pte))
		pte = pte_mksoft_dirty(pte);
	else if (is_swap_pte(pte))
		pte = pte_swp_mksoft_dirty(pte);
#endif
	return pte;
}
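
/*
 * Illustrative note (not part of the original source): userspace typically
 * observes the soft-dirty bit set above as bit 55 of the per-page entries in
 * /proc/<pid>/pagemap, after first clearing soft-dirty state by writing "4"
 * to /proc/<pid>/clear_refs.  See Documentation/admin-guide/mm/soft-dirty.rst.
 */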
static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
		unsigned long old_addr, unsigned long old_end,
		struct vm_area_struct *new_vma, pmd_t *new_pmd,
		unsigned long new_addr, bool need_rmap_locks)
{
	bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
	struct mm_struct *mm = vma->vm_mm;
	pte_t *old_pte, *new_pte, pte;
	pmd_t dummy_pmdval;
	spinlock_t *old_ptl, *new_ptl;
	bool force_flush = false;
	unsigned long len = old_end - old_addr;
	int err = 0;

	/*
	 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
	 * locks to ensure that rmap will always observe either the old or the
	 * new ptes. This is the easiest way to avoid races with
	 * truncate_pagecache(), page migration, etc...
	 *
	 * When need_rmap_locks is false, we use other ways to avoid
	 * such races:
	 *
	 * - During exec() shift_arg_pages(), we use a specially tagged vma
	 *   which rmap call sites look for using vma_is_temporary_stack().
	 *
	 * - During mremap(), new_vma is often known to be placed after vma
	 *   in rmap traversal order. This ensures rmap will always observe
	 *   either the old pte, or the new pte, or both (the page table locks
	 *   serialize access to individual ptes, but only rmap traversal
	 *   order guarantees that we won't miss both the old and new ptes).
	 */
	if (need_rmap_locks)
		take_rmap_locks(vma);

	/*
	 * We don't have to worry about the ordering of src and dst
	 * pte locks because exclusive mmap_lock prevents deadlock.
	 */
	old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
	if (!old_pte) {
		err = -EAGAIN;
		goto out;
	}
	/*
	 * Now new_pte is none, so hpage_collapse_scan_file() path can not find
	 * this by traversing file->f_mapping, so there is no concurrency with
	 * retract_page_tables(). In addition, we already hold the exclusive
	 * mmap_lock, so this new_pte page is stable, so there is no need to get
	 * pmdval and do pmd_same() check.
	 */
	new_pte = pte_offset_map_rw_nolock(mm, new_pmd, new_addr, &dummy_pmdval,
					   &new_ptl);
	if (!new_pte) {
		pte_unmap_unlock(old_pte, old_ptl);
		err = -EAGAIN;
		goto out;
	}
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
	flush_tlb_batched_pending(vma->vm_mm);
	arch_enter_lazy_mmu_mode();

	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
				   new_pte++, new_addr += PAGE_SIZE) {
		if (pte_none(ptep_get(old_pte)))
			continue;

		pte = ptep_get_and_clear(mm, old_addr, old_pte);
		/*
		 * If we are remapping a valid PTE, make sure
		 * to flush TLB before we drop the PTL for the
		 * PTE.
		 *
		 * NOTE! Both old and new PTL matter: the old one
		 * for racing with folio_mkclean(), the new one to
		 * make sure the physical page stays valid until
		 * the TLB entry for the old mapping has been
		 * flushed.
		 */
		if (pte_present(pte))
			force_flush = true;
		pte = move_pte(pte, old_addr, new_addr);
		pte = move_soft_dirty_pte(pte);

		if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
			pte_clear(mm, new_addr, new_pte);
		else {
			if (need_clear_uffd_wp) {
				if (pte_present(pte))
					pte = pte_clear_uffd_wp(pte);
				else if (is_swap_pte(pte))
					pte = pte_swp_clear_uffd_wp(pte);
			}
			set_pte_at(mm, new_addr, new_pte, pte);
		}
	}

	arch_leave_lazy_mmu_mode();
	if (force_flush)
		flush_tlb_range(vma, old_end - len, old_end);
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	pte_unmap(new_pte - 1);
	pte_unmap_unlock(old_pte - 1, old_ptl);
out:
	if (need_rmap_locks)
		drop_rmap_locks(vma);
	return err;
}
#ifndef arch_supports_page_table_move
#define arch_supports_page_table_move arch_supports_page_table_move
static inline bool arch_supports_page_table_move(void)
{
	return IS_ENABLED(CONFIG_HAVE_MOVE_PMD) ||
		IS_ENABLED(CONFIG_HAVE_MOVE_PUD);
}
#endif
#ifdef CONFIG_HAVE_MOVE_PMD
static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
{
	spinlock_t *old_ptl, *new_ptl;
	struct mm_struct *mm = vma->vm_mm;
	bool res = false;
	pmd_t pmd;

	if (!arch_supports_page_table_move())
		return false;
	/*
	 * The destination pmd shouldn't be established, free_pgtables()
	 * should have released it.
	 *
	 * However, there's a case during execve() where we use mremap
	 * to move the initial stack, and in that case the target area
	 * may overlap the source area (always moving down).
	 *
	 * If everything is PMD-aligned, that works fine, as moving
	 * each pmd down will clear the source pmd. But if we first
	 * have a few 4kB-only pages that get moved down, and then
	 * hit the "now the rest is PMD-aligned, let's do everything
	 * one pmd at a time", we will still have the old (now empty
	 * of any 4kB pages, but still there) PMD in the page table
	 * tree.
	 *
	 * Warn on it once - because we really should try to figure
	 * out how to do this better - but then say "I won't move
	 * this pmd".
	 *
	 * One alternative might be to just unmap the target pmd at
	 * this point, and verify that it really is empty. We'll see.
	 */
	if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
		return false;

	/* If this pmd belongs to a uffd vma with remap events disabled, we need
	 * to ensure that the uffd-wp state is cleared from all pgtables. This
	 * means recursing into lower page tables in move_page_tables(), and we
	 * can reuse the existing code if we simply treat the entry as "not
	 * moved".
	 */
	if (vma_has_uffd_without_event_remap(vma))
		return false;

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_lock prevents deadlock.
	 */
	old_ptl = pmd_lock(vma->vm_mm, old_pmd);
	new_ptl = pmd_lockptr(mm, new_pmd);
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);

	pmd = *old_pmd;

	/* Racing with collapse? */
	if (unlikely(!pmd_present(pmd) || pmd_leaf(pmd)))
		goto out_unlock;
	/* Clear the pmd */
	pmd_clear(old_pmd);
	res = true;

	VM_BUG_ON(!pmd_none(*new_pmd));

	pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
	flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
out_unlock:
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	spin_unlock(old_ptl);

	return res;
}
#else
static inline bool move_normal_pmd(struct vm_area_struct *vma,
		unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd,
		pmd_t *new_pmd)
{
	return false;
}
#endif
#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
		  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
{
	spinlock_t *old_ptl, *new_ptl;
	struct mm_struct *mm = vma->vm_mm;
	pud_t pud;

	if (!arch_supports_page_table_move())
		return false;
	/*
	 * The destination pud shouldn't be established, free_pgtables()
	 * should have released it.
	 */
	if (WARN_ON_ONCE(!pud_none(*new_pud)))
		return false;

	/* If this pud belongs to a uffd vma with remap events disabled, we need
	 * to ensure that the uffd-wp state is cleared from all pgtables. This
	 * means recursing into lower page tables in move_page_tables(), and we
	 * can reuse the existing code if we simply treat the entry as "not
	 * moved".
	 */
	if (vma_has_uffd_without_event_remap(vma))
		return false;

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_lock prevents deadlock.
	 */
	old_ptl = pud_lock(vma->vm_mm, old_pud);
	new_ptl = pud_lockptr(mm, new_pud);
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);

	/* Clear the pud */
	pud = *old_pud;
	pud_clear(old_pud);

	VM_BUG_ON(!pud_none(*new_pud));

	pud_populate(mm, new_pud, pud_pgtable(pud));
	flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	spin_unlock(old_ptl);

	return true;
}
#else
static inline bool move_normal_pud(struct vm_area_struct *vma,
		unsigned long old_addr, unsigned long new_addr, pud_t *old_pud,
		pud_t *new_pud)
{
	return false;
}
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
			  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
{
	spinlock_t *old_ptl, *new_ptl;
	struct mm_struct *mm = vma->vm_mm;
	pud_t pud;

	/*
	 * The destination pud shouldn't be established, free_pgtables()
	 * should have released it.
	 */
	if (WARN_ON_ONCE(!pud_none(*new_pud)))
		return false;

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_lock prevents deadlock.
	 */
	old_ptl = pud_lock(vma->vm_mm, old_pud);
	new_ptl = pud_lockptr(mm, new_pud);
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);

	/* Clear the pud */
	pud = *old_pud;
	pud_clear(old_pud);

	VM_BUG_ON(!pud_none(*new_pud));

	/* Set the new pud */
	/* mark soft_dirty when we add pud level soft dirty support */
	set_pud_at(mm, new_addr, new_pud, pud);
	flush_pud_tlb_range(vma, old_addr, old_addr + HPAGE_PUD_SIZE);
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	spin_unlock(old_ptl);

	return true;
}
#else
static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
			  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
{
	WARN_ON_ONCE(1);
	return false;
}
#endif

enum pgt_entry {
	NORMAL_PMD,
	HPAGE_PMD,
	NORMAL_PUD,
	HPAGE_PUD,
};
/*
 * Returns an extent of the corresponding size for the pgt_entry specified if
 * valid. Else returns a smaller extent bounded by the end of the source and
 * destination pgt_entry.
 */
static __always_inline unsigned long get_extent(enum pgt_entry entry,
			unsigned long old_addr, unsigned long old_end,
			unsigned long new_addr)
{
	unsigned long next, extent, mask, size;

	switch (entry) {
	case HPAGE_PMD:
	case NORMAL_PMD:
		mask = PMD_MASK;
		size = PMD_SIZE;
		break;
	case HPAGE_PUD:
	case NORMAL_PUD:
		mask = PUD_MASK;
		size = PUD_SIZE;
		break;
	default:
		BUILD_BUG();
		break;
	}

	next = (old_addr + size) & mask;
	/* even if next overflowed, extent below will be ok */
	extent = next - old_addr;
	if (extent > old_end - old_addr)
		extent = old_end - old_addr;
	next = (new_addr + size) & mask;
	if (extent > next - new_addr)
		extent = next - new_addr;

	return extent;
}
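
/*
 * Worked example for get_extent() (illustrative only; assumes x86-64 with
 * PMD_SIZE == 2MB): with entry == NORMAL_PMD, old_addr == 0x1200000,
 * old_end == 0x1600000 and new_addr == 0x3200000, both addresses are
 * PMD-aligned, so next == 0x1400000 and extent == 0x200000 (one full PMD).
 * If instead old_addr == 0x1201000, extent is capped at 0x1ff000, i.e. the
 * distance from the source address to its next PMD boundary.
 */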
/*
 * Attempts to speedup the move by moving entry at the level corresponding to
 * pgt_entry. Returns true if the move was successful, else false.
 */
static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
			unsigned long old_addr, unsigned long new_addr,
			void *old_entry, void *new_entry, bool need_rmap_locks)
{
	bool moved = false;

	/* See comment in move_ptes() */
	if (need_rmap_locks)
		take_rmap_locks(vma);

	switch (entry) {
	case NORMAL_PMD:
		moved = move_normal_pmd(vma, old_addr, new_addr, old_entry,
					new_entry);
		break;
	case NORMAL_PUD:
		moved = move_normal_pud(vma, old_addr, new_addr, old_entry,
					new_entry);
		break;
	case HPAGE_PMD:
		moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
			move_huge_pmd(vma, old_addr, new_addr, old_entry,
				      new_entry);
		break;
	case HPAGE_PUD:
		moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
			move_huge_pud(vma, old_addr, new_addr, old_entry,
				      new_entry);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

	if (need_rmap_locks)
		drop_rmap_locks(vma);

	return moved;
}
/*
 * A helper to check if aligning down is OK. The aligned address should fall
 * on *no mapping*. For the stack moving down, that's a special move within
 * the VMA that is created to span the source and destination of the move,
 * so we make an exception for it.
 */
static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_align,
			    unsigned long mask, bool for_stack)
{
	unsigned long addr_masked = addr_to_align & mask;

	/*
	 * If @addr_to_align of either source or destination is not the beginning
	 * of the corresponding VMA, we can't align down or we will destroy part
	 * of the current mapping.
	 */
	if (!for_stack && vma->vm_start != addr_to_align)
		return false;

	/* In the stack case we explicitly permit in-VMA alignment. */
	if (for_stack && addr_masked >= vma->vm_start)
		return true;

	/*
	 * Make sure the realignment doesn't cause the address to fall on an
	 * existing mapping.
	 */
	return find_vma_intersection(vma->vm_mm, addr_masked, vma->vm_start) == NULL;
}
/* Opportunistically realign to specified boundary for faster copy. */
static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old_vma,
			     unsigned long *new_addr, struct vm_area_struct *new_vma,
			     unsigned long mask, bool for_stack)
{
	/* Skip if the addresses are already aligned. */
	if ((*old_addr & ~mask) == 0)
		return;

	/* Only realign if the new and old addresses are mutually aligned. */
	if ((*old_addr & ~mask) != (*new_addr & ~mask))
		return;

	/* Ensure realignment doesn't cause overlap with existing mappings. */
	if (!can_align_down(old_vma, *old_addr, mask, for_stack) ||
	    !can_align_down(new_vma, *new_addr, mask, for_stack))
		return;

	*old_addr = *old_addr & mask;
	*new_addr = *new_addr & mask;
}
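
/*
 * Illustrative example (assumes a 2MB PMD boundary, i.e. mask == PMD_MASK on
 * x86-64; not from the original source): old_addr == 0x123456000 and
 * new_addr == 0x7f0012456000 share the same offset 0x56000 within their 2MB
 * regions, so both may be realigned down by 0x56000 and whole PMDs moved.
 * Had the offsets differed, try_realign_addr() would leave both addresses
 * untouched.
 */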
unsigned long move_page_tables(struct vm_area_struct *vma,
		unsigned long old_addr, struct vm_area_struct *new_vma,
		unsigned long new_addr, unsigned long len,
		bool need_rmap_locks, bool for_stack)
{
	unsigned long extent, old_end;
	struct mmu_notifier_range range;
	pmd_t *old_pmd, *new_pmd;
	pud_t *old_pud, *new_pud;

	if (!len)
		return 0;

	old_end = old_addr + len;

	if (is_vm_hugetlb_page(vma))
		return move_hugetlb_page_tables(vma, new_vma, old_addr,
						new_addr, len);

	/*
	 * If possible, realign addresses to PMD boundary for faster copy.
	 * Only realign if the mremap copying hits a PMD boundary.
	 */
	if (len >= PMD_SIZE - (old_addr & ~PMD_MASK))
		try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK,
				 for_stack);

	flush_cache_range(vma, old_addr, old_end);
	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
				old_addr, old_end);
	mmu_notifier_invalidate_range_start(&range);

	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
		cond_resched();
		/*
		 * If extent is PUD-sized try to speed up the move by moving at the
		 * PUD level if possible.
		 */
		extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr);

		old_pud = get_old_pud(vma->vm_mm, old_addr);
		if (!old_pud)
			continue;
		new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
		if (!new_pud)
			break;
		if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
			if (extent == HPAGE_PUD_SIZE) {
				move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr,
					       old_pud, new_pud, need_rmap_locks);
				/* We ignore and continue on error? */
				continue;
			}
		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {

			if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
					   old_pud, new_pud, true))
				continue;
		}

		extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr);
		old_pmd = get_old_pmd(vma->vm_mm, old_addr);
		if (!old_pmd)
			continue;
		new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
		if (!new_pmd)
			break;
again:
		if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
		    pmd_devmap(*old_pmd)) {
			if (extent == HPAGE_PMD_SIZE &&
			    move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr,
					   old_pmd, new_pmd, need_rmap_locks))
				continue;
			split_huge_pmd(vma, old_pmd, old_addr);
		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
			   extent == PMD_SIZE) {
			/*
			 * If the extent is PMD-sized, try to speed the move by
			 * moving at the PMD level if possible.
			 */
			if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr,
					   old_pmd, new_pmd, true))
				continue;
		}
		if (pmd_none(*old_pmd))
			continue;
		if (pte_alloc(new_vma->vm_mm, new_pmd))
			break;
		if (move_ptes(vma, old_pmd, old_addr, old_addr + extent,
			      new_vma, new_pmd, new_addr, need_rmap_locks) < 0)
			goto again;
	}

	mmu_notifier_invalidate_range_end(&range);

	/*
	 * Prevent negative return values when {old,new}_addr was realigned
	 * but we broke out of the above loop for the first PMD itself.
	 */
	if (old_addr < old_end - len)
		return 0;

	return len + old_addr - old_end;	/* how much done */
}
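
/*
 * Illustrative walk of the loop above (assumes x86-64 with
 * CONFIG_HAVE_MOVE_PMD and mutually PMD-aligned addresses; not from the
 * original source): for a 4MB + 8KB move, the first two iterations move
 * whole page tables via move_pgt_entry(NORMAL_PMD, ...) with a 2MB extent
 * each, and the final iteration falls back to move_ptes() for the remaining
 * two pages, whose extent is capped by old_end.
 */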
static unsigned long move_vma(struct vm_area_struct *vma,
		unsigned long old_addr, unsigned long old_len,
		unsigned long new_len, unsigned long new_addr,
		bool *locked, unsigned long flags,
		struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
{
	long to_account = new_len - old_len;
	struct mm_struct *mm = vma->vm_mm;
	struct vm_area_struct *new_vma;
	unsigned long vm_flags = vma->vm_flags;
	unsigned long new_pgoff;
	unsigned long moved_len;
	unsigned long account_start = 0;
	unsigned long account_end = 0;
	unsigned long hiwater_vm;
	int err = 0;
	bool need_rmap_locks;
	struct vma_iterator vmi;

	/*
	 * We'd prefer to avoid failure later on in do_munmap:
	 * which may split one vma into three before unmapping.
	 */
	if (mm->map_count >= sysctl_max_map_count - 3)
		return -ENOMEM;

	if (unlikely(flags & MREMAP_DONTUNMAP))
		to_account = new_len;

	if (vma->vm_ops && vma->vm_ops->may_split) {
		if (vma->vm_start != old_addr)
			err = vma->vm_ops->may_split(vma, old_addr);
		if (!err && vma->vm_end != old_addr + old_len)
			err = vma->vm_ops->may_split(vma, old_addr + old_len);
		if (err)
			return err;
	}

	/*
	 * Advise KSM to break any KSM pages in the area to be moved:
	 * it would be confusing if they were to turn up at the new
	 * location, where they happen to coincide with different KSM
	 * pages recently unmapped. But leave vma->vm_flags as it was,
	 * so KSM can come around to merge on vma and new_vma afterwards.
	 */
	err = ksm_madvise(vma, old_addr, old_addr + old_len,
						MADV_UNMERGEABLE, &vm_flags);
	if (err)
		return err;

	if (vm_flags & VM_ACCOUNT) {
		if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT))
			return -ENOMEM;
	}

	vma_start_write(vma);
	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
			   &need_rmap_locks);
	if (!new_vma) {
		if (vm_flags & VM_ACCOUNT)
			vm_unacct_memory(to_account >> PAGE_SHIFT);
		return -ENOMEM;
	}

	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
				     need_rmap_locks, false);
	if (moved_len < old_len) {
		err = -ENOMEM;
	} else if (vma->vm_ops && vma->vm_ops->mremap) {
		err = vma->vm_ops->mremap(new_vma);
	}

	if (unlikely(err)) {
		/*
		 * On error, move entries back from new area to old,
		 * which will succeed since page tables still there,
		 * and then proceed to unmap new area instead of old.
		 */
		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
				 true, false);
		vma = new_vma;
		old_len = new_len;
		old_addr = new_addr;
		new_addr = err;
	} else {
		mremap_userfaultfd_prep(new_vma, uf);
	}

	if (is_vm_hugetlb_page(vma)) {
		clear_vma_resv_huge_pages(vma);
	}

	/* Conceal VM_ACCOUNT so old reservation is not undone */
	if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
		vm_flags_clear(vma, VM_ACCOUNT);
		if (vma->vm_start < old_addr)
			account_start = vma->vm_start;
		if (vma->vm_end > old_addr + old_len)
			account_end = vma->vm_end;
	}

	/*
	 * If we failed to move page tables we still do total_vm increment
	 * since do_munmap() will decrement it by old_len == new_len.
	 *
	 * Since total_vm is about to be raised artificially high for a
	 * moment, we need to restore high watermark afterwards: if stats
	 * are taken meanwhile, total_vm and hiwater_vm appear too high.
	 * If this were a serious issue, we'd add a flag to do_munmap().
	 */
	hiwater_vm = mm->hiwater_vm;
	vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);

	/* Tell pfnmap has moved from this vma */
	if (unlikely(vma->vm_flags & VM_PFNMAP))
		untrack_pfn_clear(vma);

	if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
		/* We always clear VM_LOCKED[ONFAULT] on the old vma */
		vm_flags_clear(vma, VM_LOCKED_MASK);

		/*
		 * anon_vma links of the old vma is no longer needed after its page
		 * table has been moved.
		 */
		if (new_vma != vma && vma->vm_start == old_addr &&
			vma->vm_end == (old_addr + old_len))
			unlink_anon_vmas(vma);

		/* Because we won't unmap we don't need to touch locked_vm */
		return new_addr;
	}

	vma_iter_init(&vmi, mm, old_addr);
	if (do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false) < 0) {
		/* OOM: unable to split vma, just get accounts right */
		if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
			vm_acct_memory(old_len >> PAGE_SHIFT);
		account_start = account_end = 0;
	}

	if (vm_flags & VM_LOCKED) {
		mm->locked_vm += new_len >> PAGE_SHIFT;
		*locked = true;
	}

	mm->hiwater_vm = hiwater_vm;

	/* Restore VM_ACCOUNT if one or two pieces of vma left */
	if (account_start) {
		vma = vma_prev(&vmi);
		vm_flags_set(vma, VM_ACCOUNT);
	}

	if (account_end) {
		vma = vma_next(&vmi);
		vm_flags_set(vma, VM_ACCOUNT);
	}

	return new_addr;
}
/**
 * resize_is_valid() - Ensure the vma can be resized to the new length at the
 * given address.
 *
 * @vma: The vma to resize
 * @addr: The old address
 * @old_len: The current size
 * @new_len: The desired size
 * @flags: The vma flags
 *
 * Return 0 on success, error otherwise.
 */
static int resize_is_valid(struct vm_area_struct *vma, unsigned long addr,
	unsigned long old_len, unsigned long new_len, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	unsigned long pgoff;

	/*
	 * !old_len is a special case where an attempt is made to 'duplicate'
	 * a mapping. This makes no sense for private mappings as it will
	 * instead create a fresh/new mapping unrelated to the original. This
	 * is contrary to the basic idea of mremap which creates new mappings
	 * based on the original. There are no known use cases for this
	 * behavior. As a result, fail such attempts.
	 */
	if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
		pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n",
			     current->comm, current->pid);
		return -EINVAL;
	}

	if ((flags & MREMAP_DONTUNMAP) &&
			(vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
		return -EINVAL;

	/* We can't remap across vm area boundaries */
	if (old_len > vma->vm_end - addr)
		return -EFAULT;

	if (new_len == old_len)
		return 0;

	/* Need to be careful about a growing mapping */
	pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
	pgoff += vma->vm_pgoff;
	if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
		return -EINVAL;

	if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
		return -EFAULT;

	if (!mlock_future_ok(mm, vma->vm_flags, new_len - old_len))
		return -EAGAIN;

	if (!may_expand_vm(mm, vma->vm_flags,
				(new_len - old_len) >> PAGE_SHIFT))
		return -ENOMEM;

	return 0;
}
/**
 * mremap_to() - remap a vma to a new location
 * @addr: The old address
 * @old_len: The old size
 * @new_addr: The target address
 * @new_len: The new size
 * @locked: If the returned vma is locked (VM_LOCKED)
 * @flags: the mremap flags
 * @uf: The mremap userfaultfd context
 * @uf_unmap_early: The userfaultfd unmap early context
 * @uf_unmap: The userfaultfd unmap context
 *
 * Returns: The new address of the vma or an error.
 */
static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
		unsigned long new_addr, unsigned long new_len, bool *locked,
		unsigned long flags, struct vm_userfaultfd_ctx *uf,
		struct list_head *uf_unmap_early,
		struct list_head *uf_unmap)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long ret;
	unsigned long map_flags = 0;

	if (offset_in_page(new_addr))
		return -EINVAL;

	if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
		return -EINVAL;

	/* Ensure the old/new locations do not overlap */
	if (addr + old_len > new_addr && new_addr + new_len > addr)
		return -EINVAL;

	/*
	 * move_vma() need us to stay 4 maps below the threshold, otherwise
	 * it will bail out at the very beginning.
	 * That is a problem if we have already unmapped the regions here
	 * (new_addr, and old_addr), because userspace will not know the
	 * state of the vma's after it gets -ENOMEM.
	 * So, to avoid such scenario we can pre-compute if the whole
	 * operation has high chances to succeed map-wise.
	 * Worst-scenario case is when both vma's (new_addr and old_addr) get
	 * split in 3 before unmapping it.
	 * That means 2 more maps (1 for each) to the ones we already hold.
	 * Check whether current map count plus 2 still leads us to 4 maps below
	 * the threshold, otherwise return -ENOMEM here to be more safe.
	 */
	if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
		return -ENOMEM;

	if (flags & MREMAP_FIXED) {
		/*
		 * VMA is moved to dst address, and munmap dst first.
		 * do_munmap will check if dst is sealed.
		 */
		ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
		if (ret)
			return ret;
	}

	if (old_len > new_len) {
		ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
		if (ret)
			return ret;
		old_len = new_len;
	}

	vma = vma_lookup(mm, addr);
	if (!vma)
		return -EFAULT;

	ret = resize_is_valid(vma, addr, old_len, new_len, flags);
	if (ret)
		return ret;

	/* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
	if (flags & MREMAP_DONTUNMAP &&
		!may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) {
		return -ENOMEM;
	}

	if (flags & MREMAP_FIXED)
		map_flags |= MAP_FIXED;

	if (vma->vm_flags & VM_MAYSHARE)
		map_flags |= MAP_SHARED;

	ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
				((addr - vma->vm_start) >> PAGE_SHIFT),
				map_flags);
	if (IS_ERR_VALUE(ret))
		return ret;

	/* We got a new mapping */
	if (!(flags & MREMAP_FIXED))
		new_addr = ret;

	return move_vma(vma, addr, old_len, new_len, new_addr, locked, flags,
			uf, uf_unmap);
}
static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
{
	unsigned long end = vma->vm_end + delta;

	if (end < vma->vm_end) /* overflow */
		return 0;
	if (find_vma_intersection(vma->vm_mm, vma->vm_end, end))
		return 0;
	if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
			      0, MAP_FIXED) & ~PAGE_MASK)
		return 0;
	return 1;
}
/*
 * Expand (or shrink) an existing mapping, potentially moving it at the
 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
 *
 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
 * This option implies MREMAP_MAYMOVE.
 */
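/*
 * Minimal userspace usage sketch (illustrative only, not part of this file):
 * grow an anonymous mapping, letting the kernel move it if it cannot be
 * expanded in place.
 *
 *	#define _GNU_SOURCE
 *	#include <sys/mman.h>
 *
 *	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	void *q = mremap(p, 4096, 2 * 4096, MREMAP_MAYMOVE);
 *	if (q == MAP_FAILED)
 *		perror("mremap");
 */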
SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
		unsigned long, new_len, unsigned long, flags,
		unsigned long, new_addr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long ret = -EINVAL;
	bool locked = false;
	struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
	LIST_HEAD(uf_unmap_early);
	LIST_HEAD(uf_unmap);

	/*
	 * There is a deliberate asymmetry here: we strip the pointer tag
	 * from the old address but leave the new address alone. This is
	 * for consistency with mmap(), where we prevent the creation of
	 * aliasing mappings in userspace by leaving the tag bits of the
	 * mapping address intact. A non-zero tag will cause the subsequent
	 * range checks to reject the address as invalid.
	 *
	 * See Documentation/arch/arm64/tagged-address-abi.rst for more
	 * information.
	 */
	addr = untagged_addr(addr);

	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
		return ret;

	if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
		return ret;

	/*
	 * MREMAP_DONTUNMAP is always a move and it does not allow resizing
	 * in the process.
	 */
	if (flags & MREMAP_DONTUNMAP &&
			(!(flags & MREMAP_MAYMOVE) || old_len != new_len))
		return ret;

	if (offset_in_page(addr))
		return ret;

	old_len = PAGE_ALIGN(old_len);
	new_len = PAGE_ALIGN(new_len);

	/*
	 * We allow a zero old-len as a special case
	 * for DOS-emu "duplicate shm area" thing. But
	 * a zero new-len is nonsensical.
	 */
	if (!new_len)
		return ret;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;
	vma = vma_lookup(mm, addr);
	if (!vma) {
		ret = -EFAULT;
		goto out;
	}

	/* Don't allow remapping vmas when they have already been sealed */
	if (!can_modify_vma(vma)) {
		ret = -EPERM;
		goto out;
	}

	if (is_vm_hugetlb_page(vma)) {
		struct hstate *h __maybe_unused = hstate_vma(vma);

		old_len = ALIGN(old_len, huge_page_size(h));
		new_len = ALIGN(new_len, huge_page_size(h));

		/* addrs must be huge page aligned */
		if (addr & ~huge_page_mask(h))
			goto out;
		if (new_addr & ~huge_page_mask(h))
			goto out;

		/*
		 * Don't allow remap expansion, because the underlying hugetlb
		 * reservation is not yet capable to handle split reservation.
		 */
		if (new_len > old_len)
			goto out;
	}

	if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
		ret = mremap_to(addr, old_len, new_addr, new_len,
				&locked, flags, &uf, &uf_unmap_early,
				&uf_unmap);
		goto out;
	}

	/*
	 * Always allow a shrinking remap: that just unmaps
	 * the unnecessary pages..
	 * do_vmi_munmap does all the needed commit accounting, and
	 * unlocks the mmap_lock if so directed.
	 */
	if (old_len >= new_len) {
		VMA_ITERATOR(vmi, mm, addr + new_len);

		if (old_len == new_len) {
			ret = addr;
			goto out;
		}

		ret = do_vmi_munmap(&vmi, mm, addr + new_len, old_len - new_len,
				    &uf_unmap, true);
		if (ret)
			goto out;

		ret = addr;
		goto out_unlocked;
	}

	/*
	 * Ok, we need to grow..
	 */
	ret = resize_is_valid(vma, addr, old_len, new_len, flags);
	if (ret)
		goto out;

	/* old_len exactly to the end of the area..
	 */
	if (old_len == vma->vm_end - addr) {
		unsigned long delta = new_len - old_len;

		/* can we just expand the current mapping? */
		if (vma_expandable(vma, delta)) {
			long pages = delta >> PAGE_SHIFT;
			VMA_ITERATOR(vmi, mm, vma->vm_end);
			long charged = 0;

			if (vma->vm_flags & VM_ACCOUNT) {
				if (security_vm_enough_memory_mm(mm, pages)) {
					ret = -ENOMEM;
					goto out;
				}
				charged = pages;
			}

			/*
			 * Function vma_merge_extend() is called on the
			 * extension we are adding to the already existing vma,
			 * vma_merge_extend() will merge this extension with the
			 * already existing vma (expand operation itself) and
			 * possibly also with the next vma if it becomes
			 * adjacent to the expanded vma and otherwise
			 * compatible.
			 */
			vma = vma_merge_extend(&vmi, vma, delta);
			if (!vma) {
				vm_unacct_memory(charged);
				ret = -ENOMEM;
				goto out;
			}

			vm_stat_account(mm, vma->vm_flags, pages);
			if (vma->vm_flags & VM_LOCKED) {
				mm->locked_vm += pages;
				locked = true;
				new_addr = addr;
			}
			ret = addr;
			goto out;
		}
	}

	/*
	 * We weren't able to just expand or shrink the area,
	 * we need to create a new one and move it..
	 */
	ret = -ENOMEM;
	if (flags & MREMAP_MAYMOVE) {
		unsigned long map_flags = 0;
		if (vma->vm_flags & VM_MAYSHARE)
			map_flags |= MAP_SHARED;

		new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
					vma->vm_pgoff +
					((addr - vma->vm_start) >> PAGE_SHIFT),
					map_flags);
		if (IS_ERR_VALUE(new_addr)) {
			ret = new_addr;
			goto out;
		}

		ret = move_vma(vma, addr, old_len, new_len, new_addr,
			       &locked, flags, &uf, &uf_unmap);
	}
out:
	if (offset_in_page(ret))
		locked = false;
	mmap_write_unlock(current->mm);
	if (locked && new_len > old_len)
		mm_populate(new_addr + old_len, new_len - old_len);
out_unlocked:
	userfaultfd_unmap_complete(mm, &uf_unmap_early);
	mremap_userfaultfd_complete(&uf, addr, ret, old_len);
	userfaultfd_unmap_complete(mm, &uf_unmap);
	return ret;
}