arch/powerpc/mm/book3s64/hash_pgtable.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * Copyright 2005, Paul Mackerras, IBM Corporation.
   4  * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation.
   5  * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
   6  */
   7
   8 #include <linux/sched.h>
   9 #include <linux/mm_types.h>
  10 #include <linux/mm.h>
  11 #include <linux/stop_machine.h>
  12
  13 #include <asm/sections.h>
  14 #include <asm/mmu.h>
  15 #include <asm/tlb.h>
  16 #include <asm/firmware.h>
  17
  18 #include <mm/mmu_decl.h>
  19
  20 #include <trace/events/thp.h>
  21
  22 #if H_PGTABLE_RANGE > (USER_VSID_RANGE * (TASK_SIZE_USER64 / TASK_CONTEXT_SIZE))
  23 #warning Limited user VSID range means pagetable space is wasted
  24 #endif
  25
  26 #ifdef CONFIG_SPARSEMEM_VMEMMAP
  27 /*
  28  * vmemmap is the starting address of the virtual address space where
  29  * struct pages are allocated for all possible PFNs present on the system
  30  * including holes and bad memory (hence sparse). These virtual struct
  31  * pages are stored in sequence in this virtual address space irrespective
  32  * of the fact whether the corresponding PFN is valid or not. This achieves
  33  * constant relationship between address of struct page and its PFN.
  34  *
  35  * During boot or memory hotplug operation when a new memory section is
  36  * added, physical memory allocation (including hash table bolting) will
  37  * be performed for the set of struct pages which are part of the memory
  38  * section. This saves memory by not allocating struct pages for PFNs
  39  * which are not valid.
  40  *
  41  *              ----------------------------------------------
  42  *              | PHYSICAL ALLOCATION OF VIRTUAL STRUCT PAGES|
  43  *              ----------------------------------------------
  44  *
  45  *         f000000000000000                  c000000000000000
  46  * vmemmap +--------------+                  +--------------+
  47  *  +      |  page struct | +--------------> |  page struct |
  48  *  |      +--------------+                  +--------------+
  49  *  |      |  page struct | +--------------> |  page struct |
  50  *  |      +--------------+ |                +--------------+
  51  *  |      |  page struct | +       +------> |  page struct |
  52  *  |      +--------------+         |        +--------------+
  53  *  |      |  page struct |         |   +--> |  page struct |
  54  *  |      +--------------+         |   |    +--------------+
  55  *  |      |  page struct |         |   |
  56  *  |      +--------------+         |   |
  57  *  |      |  page struct |         |   |
  58  *  |      +--------------+         |   |
  59  *  |      |  page struct |         |   |
  60  *  |      +--------------+         |   |
  61  *  |      |  page struct |         |   |
  62  *  |      +--------------+         |   |
  63  *  |      |  page struct | +-------+   |
  64  *  |      +--------------+             |
  65  *  |      |  page struct | +-----------+
  66  *  |      +--------------+
  67  *  |      |  page struct | No mapping
  68  *  |      +--------------+
  69  *  |      |  page struct | No mapping
  70  *  v      +--------------+
  71  *
  72  *              -----------------------------------------
  73  *              | RELATION BETWEEN STRUCT PAGES AND PFNS|
  74  *              -----------------------------------------
  75  *
  76  * vmemmap +--------------+                 +---------------+
  77  *  +      |  page struct | +-------------> |      PFN      |
  78  *  |      +--------------+                 +---------------+
  79  *  |      |  page struct | +-------------> |      PFN      |
  80  *  |      +--------------+                 +---------------+
  81  *  |      |  page struct | +-------------> |      PFN      |
  82  *  |      +--------------+                 +---------------+
  83  *  |      |  page struct | +-------------> |      PFN      |
  84  *  |      +--------------+                 +---------------+
  85  *  |      |              |
  86  *  |      +--------------+
  87  *  |      |              |
  88  *  |      +--------------+
  89  *  |      |              |
  90  *  |      +--------------+                 +---------------+
  91  *  |      |  page struct | +-------------> |      PFN      |
  92  *  |      +--------------+                 +---------------+
  93  *  |      |              |
  94  *  |      +--------------+
  95  *  |      |              |
  96  *  |      +--------------+                 +---------------+
  97  *  |      |  page struct | +-------------> |      PFN      |
  98  *  |      +--------------+                 +---------------+
  99  *  |      |  page struct | +-------------> |      PFN      |
 100  *  v      +--------------+                 +---------------+
 101  */
 102 /*
 103  * On hash-based CPUs, the vmemmap is bolted in the hash table.
 104  *
 105  */
 106 int __meminit hash__vmemmap_create_mapping(unsigned long start,
 107                                        unsigned long page_size,
 108                                        unsigned long phys)
 109 {
 110         int rc;
 111
 112         if ((start + page_size) >= H_VMEMMAP_END) {
 113                 pr_warn("Outside the supported range\n");
 114                 return -1;
 115         }
 116
 117         rc = htab_bolt_mapping(start, start + page_size, phys,
 118                                pgprot_val(PAGE_KERNEL),
 119                                mmu_vmemmap_psize, mmu_kernel_ssize);
 120         if (rc < 0) {
 121                 int rc2 = htab_remove_mapping(start, start + page_size,
 122                                               mmu_vmemmap_psize,
 123                                               mmu_kernel_ssize);
 124                 BUG_ON(rc2 && (rc2 != -ENOENT));
 125         }
 126         return rc;
 127 }
 128
 129 #ifdef CONFIG_MEMORY_HOTPLUG
 130 void hash__vmemmap_remove_mapping(unsigned long start,
 131                               unsigned long page_size)
 132 {
 133         int rc = htab_remove_mapping(start, start + page_size,
 134                                      mmu_vmemmap_psize,
 135                                      mmu_kernel_ssize);
 136         BUG_ON((rc < 0) && (rc != -ENOENT));
 137         WARN_ON(rc == -ENOENT);
 138 }
 139 #endif
 140 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 141
 142 /*
 143  * map_kernel_page currently only called by __ioremap
 144  * map_kernel_page adds an entry to the ioremap page table
 145  * and adds an entry to the HPT, possibly bolting it
 146  */
 147 int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
 148 {
 149         pgd_t *pgdp;
 150         p4d_t *p4dp;
 151         pud_t *pudp;
 152         pmd_t *pmdp;
 153         pte_t *ptep;
 154
 155         BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
 156         if (slab_is_available()) {
 157                 pgdp = pgd_offset_k(ea);
 158                 p4dp = p4d_offset(pgdp, ea);
 159                 pudp = pud_alloc(&init_mm, p4dp, ea);
 160                 if (!pudp)
 161                         return -ENOMEM;
 162                 pmdp = pmd_alloc(&init_mm, pudp, ea);
 163                 if (!pmdp)
 164                         return -ENOMEM;
 165                 ptep = pte_alloc_kernel(pmdp, ea);
 166                 if (!ptep)
 167                         return -ENOMEM;
 168                 set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot));
 169         } else {
 170                 /*
 171                  * If the mm subsystem is not fully up, we cannot create a
 172                  * linux page table entry for this mapping.  Simply bolt an
 173                  * entry in the hardware page table.
 174                  *
 175                  */
 176                 if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, pgprot_val(prot),
 177                                       mmu_io_psize, mmu_kernel_ssize)) {
 178                         printk(KERN_ERR "Failed to do bolted mapping IO "
 179                                "memory at %016lx !\n", pa);
 180                         return -ENOMEM;
 181                 }
 182         }
 183
 184         smp_wmb();
 185         return 0;
 186 }
 187
 188 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 189
 190 unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
 191                                     pmd_t *pmdp, unsigned long clr,
 192                                     unsigned long set)
 193 {
 194         __be64 old_be, tmp;
 195         unsigned long old;
 196
 197 #ifdef CONFIG_DEBUG_VM
 198         WARN_ON(!hash__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
 199         assert_spin_locked(pmd_lockptr(mm, pmdp));
 200 #endif
 201
 202         __asm__ __volatile__(
 203         "1:     ldarx   %0,0,%3\n\
 204                 and.    %1,%0,%6\n\
 205                 bne-    1b \n\
 206                 andc    %1,%0,%4 \n\
 207                 or      %1,%1,%7\n\
 208                 stdcx.  %1,0,%3 \n\
 209                 bne-    1b"
 210         : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp)
 211         : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp),
 212           "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
 213         : "cc" );
 214
 215         old = be64_to_cpu(old_be);
 216
 217         trace_hugepage_update_pmd(addr, old, clr, set);
 218         if (old & H_PAGE_HASHPTE)
 219                 hpte_do_hugepage_flush(mm, addr, pmdp, old);
 220         return old;
 221 }
 222
 223 pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
 224                             pmd_t *pmdp)
 225 {
 226         pmd_t pmd;
 227
 228         VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 229         VM_BUG_ON(pmd_trans_huge(*pmdp));
 230         VM_BUG_ON(pmd_devmap(*pmdp));
 231
 232         pmd = *pmdp;
 233         pmd_clear(pmdp);
 234         /*
 235          * Wait for all pending hash_page to finish. This is needed
 236          * in case of subpage collapse. When we collapse normal pages
 237          * to hugepage, we first clear the pmd, then invalidate all
 238          * the PTE entries. The assumption here is that any low level
 239          * page fault will see a none pmd and take the slow path that
 240          * will wait on mmap_lock. But we could very well be in a
 241          * hash_page with local ptep pointer value. Such a hash page
 242          * can result in adding new HPTE entries for normal subpages.
 243          * That means we could be modifying the page content as we
 244          * copy them to a huge page. So wait for parallel hash_page
 245          * to finish before invalidating HPTE entries. We can do this
 246          * by sending an IPI to all the cpus and executing a dummy
 247          * function there.
 248          */
 249         serialize_against_pte_lookup(vma->vm_mm);
 250         /*
 251          * Now invalidate the hpte entries in the range
 252          * covered by pmd. This make sure we take a
 253          * fault and will find the pmd as none, which will
 254          * result in a major fault which takes mmap_lock and
 255          * hence wait for collapse to complete. Without this
 256          * the __collapse_huge_page_copy can result in copying
 257          * the old content.
 258          */
 259         flush_hash_table_pmd_range(vma->vm_mm, &pmd, address);
 260         return pmd;
 261 }
 262
 263 /*
 264  * We want to put the pgtable in pmd and use pgtable for tracking
 265  * the base page size hptes
 266  */
 267 void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
 268                                   pgtable_t pgtable)
 269 {
 270         pgtable_t *pgtable_slot;
 271
 272         assert_spin_locked(pmd_lockptr(mm, pmdp));
 273         /*
 274          * we store the pgtable in the second half of PMD
 275          */
 276         pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
 277         *pgtable_slot = pgtable;
 278         /*
 279          * expose the deposited pgtable to other cpus.
 280          * before we set the hugepage PTE at pmd level
 281          * hash fault code looks at the deposted pgtable
 282          * to store hash index values.
 283          */
 284         smp_wmb();
 285 }
 286
 287 pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 288 {
 289         pgtable_t pgtable;
 290         pgtable_t *pgtable_slot;
 291
 292         assert_spin_locked(pmd_lockptr(mm, pmdp));
 293
 294         pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
 295         pgtable = *pgtable_slot;
 296         /*
 297          * Once we withdraw, mark the entry NULL.
 298          */
 299         *pgtable_slot = NULL;
 300         /*
 301          * We store HPTE information in the deposited PTE fragment.
 302          * zero out the content on withdraw.
 303          */
 304         memset(pgtable, 0, PTE_FRAG_SIZE);
 305         return pgtable;
 306 }
 307
 308 /*
 309  * A linux hugepage PMD was changed and the corresponding hash table entries
 310  * neesd to be flushed.
 311  */
 312 void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
 313                             pmd_t *pmdp, unsigned long old_pmd)
 314 {
 315         int ssize;
 316         unsigned int psize;
 317         unsigned long vsid;
 318         unsigned long flags = 0;
 319
 320         /* get the base page size,vsid and segment size */
 321 #ifdef CONFIG_DEBUG_VM
 322         psize = get_slice_psize(mm, addr);
 323         BUG_ON(psize == MMU_PAGE_16M);
 324 #endif
 325         if (old_pmd & H_PAGE_COMBO)
 326                 psize = MMU_PAGE_4K;
 327         else
 328                 psize = MMU_PAGE_64K;
 329
 330         if (!is_kernel_addr(addr)) {
 331                 ssize = user_segment_size(addr);
 332                 vsid = get_user_vsid(&mm->context, addr, ssize);
 333                 WARN_ON(vsid == 0);
 334         } else {
 335                 vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
 336                 ssize = mmu_kernel_ssize;
 337         }
 338
 339         if (mm_is_thread_local(mm))
 340                 flags |= HPTE_LOCAL_UPDATE;
 341
 342         return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
 343 }
 344
 345 pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
 346                                 unsigned long addr, pmd_t *pmdp)
 347 {
 348         pmd_t old_pmd;
 349         pgtable_t pgtable;
 350         unsigned long old;
 351         pgtable_t *pgtable_slot;
 352
 353         old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
 354         old_pmd = __pmd(old);
 355         /*
 356          * We have pmd == none and we are holding page_table_lock.
 357          * So we can safely go and clear the pgtable hash
 358          * index info.
 359          */
 360         pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
 361         pgtable = *pgtable_slot;
 362         /*
 363          * Let's zero out old valid and hash index details
 364          * hash fault look at them.
 365          */
 366         memset(pgtable, 0, PTE_FRAG_SIZE);
 367         return old_pmd;
 368 }
 369
 370 int hash__has_transparent_hugepage(void)
 371 {
 372
 373         if (!mmu_has_feature(MMU_FTR_16M_PAGE))
 374                 return 0;
 375         /*
 376          * We support THP only if PMD_SIZE is 16MB.
 377          */
 378         if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
 379                 return 0;
 380         /*
 381          * We need to make sure that we support 16MB hugepage in a segment
 382          * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
 383          * of 64K.
 384          */
 385         /*
 386          * If we have 64K HPTE, we will be using that by default
 387          */
 388         if (mmu_psize_defs[MMU_PAGE_64K].shift &&
 389             (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
 390                 return 0;
 391         /*
 392          * Ok we only have 4K HPTE
 393          */
 394         if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
 395                 return 0;
 396
 397         return 1;
 398 }
 399 EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage);
 400
 401 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 402
 403 #ifdef CONFIG_STRICT_KERNEL_RWX
 404
 405 struct change_memory_parms {
 406         unsigned long start, end, newpp;
 407         unsigned int step, nr_cpus;
 408         atomic_t master_cpu;
 409         atomic_t cpu_counter;
 410 };
 411
 412 // We'd rather this was on the stack but it has to be in the RMO
 413 static struct change_memory_parms chmem_parms;
 414
 415 // And therefore we need a lock to protect it from concurrent use
 416 static DEFINE_MUTEX(chmem_lock);
 417
 418 static void change_memory_range(unsigned long start, unsigned long end,
 419                                 unsigned int step, unsigned long newpp)
 420 {
 421         unsigned long idx;
 422
 423         pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n",
 424                  start, end, newpp, step);
 425
 426         for (idx = start; idx < end; idx += step)
 427                 /* Not sure if we can do much with the return value */
 428                 mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize,
 429                                                         mmu_kernel_ssize);
 430 }
 431
 432 static int notrace chmem_secondary_loop(struct change_memory_parms *parms)
 433 {
 434         unsigned long msr, tmp, flags;
 435         int *p;
 436
 437         p = &parms->cpu_counter.counter;
 438
 439         local_irq_save(flags);
 440         hard_irq_disable();
 441
 442         asm volatile (
 443         // Switch to real mode and leave interrupts off
 444         "mfmsr  %[msr]                  ;"
 445         "li     %[tmp], %[MSR_IR_DR]    ;"
 446         "andc   %[tmp], %[msr], %[tmp]  ;"
 447         "mtmsrd %[tmp]                  ;"
 448
 449         // Tell the master we are in real mode
 450         "1:                             "
 451         "lwarx  %[tmp], 0, %[p]         ;"
 452         "addic  %[tmp], %[tmp], -1      ;"
 453         "stwcx. %[tmp], 0, %[p]         ;"
 454         "bne-   1b                      ;"
 455
 456         // Spin until the counter goes to zero
 457         "2:                             ;"
 458         "lwz    %[tmp], 0(%[p])         ;"
 459         "cmpwi  %[tmp], 0               ;"
 460         "bne-   2b                      ;"
 461
 462         // Switch back to virtual mode
 463         "mtmsrd %[msr]                  ;"
 464
 465         : // outputs
 466           [msr] "=&r" (msr), [tmp] "=&b" (tmp), "+m" (*p)
 467         : // inputs
 468           [p] "b" (p), [MSR_IR_DR] "i" (MSR_IR | MSR_DR)
 469         : // clobbers
 470           "cc", "xer"
 471         );
 472
 473         local_irq_restore(flags);
 474
 475         return 0;
 476 }
 477
 478 static int change_memory_range_fn(void *data)
 479 {
 480         struct change_memory_parms *parms = data;
 481
 482         // First CPU goes through, all others wait.
 483         if (atomic_xchg(&parms->master_cpu, 1) == 1)
 484                 return chmem_secondary_loop(parms);
 485
 486         // Wait for all but one CPU (this one) to call-in
 487         while (atomic_read(&parms->cpu_counter) > 1)
 488                 barrier();
 489
 490         change_memory_range(parms->start, parms->end, parms->step, parms->newpp);
 491
 492         mb();
 493
 494         // Signal the other CPUs that we're done
 495         atomic_dec(&parms->cpu_counter);
 496
 497         return 0;
 498 }
 499
 500 static bool hash__change_memory_range(unsigned long start, unsigned long end,
 501                                       unsigned long newpp)
 502 {
 503         unsigned int step, shift;
 504
 505         shift = mmu_psize_defs[mmu_linear_psize].shift;
 506         step = 1 << shift;
 507
 508         start = ALIGN_DOWN(start, step);
 509         end = ALIGN(end, step); // aligns up
 510
 511         if (start >= end)
 512                 return false;
 513
 514         if (firmware_has_feature(FW_FEATURE_LPAR)) {
 515                 mutex_lock(&chmem_lock);
 516
 517                 chmem_parms.start = start;
 518                 chmem_parms.end = end;
 519                 chmem_parms.step = step;
 520                 chmem_parms.newpp = newpp;
 521                 atomic_set(&chmem_parms.master_cpu, 0);
 522
 523                 cpus_read_lock();
 524
 525                 atomic_set(&chmem_parms.cpu_counter, num_online_cpus());
 526
 527                 // Ensure state is consistent before we call the other CPUs
 528                 mb();
 529
 530                 stop_machine_cpuslocked(change_memory_range_fn, &chmem_parms,
 531                                         cpu_online_mask);
 532
 533                 cpus_read_unlock();
 534                 mutex_unlock(&chmem_lock);
 535         } else
 536                 change_memory_range(start, end, step, newpp);
 537
 538         return true;
 539 }
 540
 541 void hash__mark_rodata_ro(void)
 542 {
 543         unsigned long start, end, pp;
 544
 545         start = (unsigned long)_stext;
 546         end = (unsigned long)__end_rodata;
 547
 548         pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL_ROX), HPTE_USE_KERNEL_KEY);
 549
 550         WARN_ON(!hash__change_memory_range(start, end, pp));
 551 }
 552
 553 void hash__mark_initmem_nx(void)
 554 {
 555         unsigned long start, end, pp;
 556
 557         start = (unsigned long)__init_begin;
 558         end = (unsigned long)__init_end;
 559
 560         pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL), HPTE_USE_KERNEL_KEY);
 561
 562         WARN_ON(!hash__change_memory_range(start, end, pp));
 563 }
 564 #endif