arch/powerpc/mm/pgtable-hash64.c

   1 /*
   2  * Copyright 2005, Paul Mackerras, IBM Corporation.
   3  * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation.
   4  * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
   5  *
   6  * This program is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU General Public License
   8  * as published by the Free Software Foundation; either version
   9  * 2 of the License, or (at your option) any later version.
  10  */
  11
  12 #include <linux/sched.h>
  13 #include <asm/pgalloc.h>
  14 #include <asm/tlb.h>
  15
  16 #include "mmu_decl.h"
  17
  18 #define CREATE_TRACE_POINTS
  19 #include <trace/events/thp.h>
  20
  21 #ifdef CONFIG_SPARSEMEM_VMEMMAP
  22 /*
  23  * On hash-based CPUs, the vmemmap is bolted in the hash table.
  24  *
  25  */
  26 int __meminit hash__vmemmap_create_mapping(unsigned long start,
  27                                        unsigned long page_size,
  28                                        unsigned long phys)
  29 {
  30         int rc = htab_bolt_mapping(start, start + page_size, phys,
  31                                    pgprot_val(PAGE_KERNEL),
  32                                    mmu_vmemmap_psize, mmu_kernel_ssize);
  33         if (rc < 0) {
  34                 int rc2 = htab_remove_mapping(start, start + page_size,
  35                                               mmu_vmemmap_psize,
  36                                               mmu_kernel_ssize);
  37                 BUG_ON(rc2 && (rc2 != -ENOENT));
  38         }
  39         return rc;
  40 }
  41
  42 #ifdef CONFIG_MEMORY_HOTPLUG
  43 void hash__vmemmap_remove_mapping(unsigned long start,
  44                               unsigned long page_size)
  45 {
  46         int rc = htab_remove_mapping(start, start + page_size,
  47                                      mmu_vmemmap_psize,
  48                                      mmu_kernel_ssize);
  49         BUG_ON((rc < 0) && (rc != -ENOENT));
  50         WARN_ON(rc == -ENOENT);
  51 }
  52 #endif
  53 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
  54
  55 /*
  56  * map_kernel_page currently only called by __ioremap
  57  * map_kernel_page adds an entry to the ioremap page table
  58  * and adds an entry to the HPT, possibly bolting it
  59  */
  60 int hash__map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags)
  61 {
  62         pgd_t *pgdp;
  63         pud_t *pudp;
  64         pmd_t *pmdp;
  65         pte_t *ptep;
  66
  67         BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
  68         if (slab_is_available()) {
  69                 pgdp = pgd_offset_k(ea);
  70                 pudp = pud_alloc(&init_mm, pgdp, ea);
  71                 if (!pudp)
  72                         return -ENOMEM;
  73                 pmdp = pmd_alloc(&init_mm, pudp, ea);
  74                 if (!pmdp)
  75                         return -ENOMEM;
  76                 ptep = pte_alloc_kernel(pmdp, ea);
  77                 if (!ptep)
  78                         return -ENOMEM;
  79                 set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
  80                                                           __pgprot(flags)));
  81         } else {
  82                 /*
  83                  * If the mm subsystem is not fully up, we cannot create a
  84                  * linux page table entry for this mapping.  Simply bolt an
  85                  * entry in the hardware page table.
  86                  *
  87                  */
  88                 if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags,
  89                                       mmu_io_psize, mmu_kernel_ssize)) {
  90                         printk(KERN_ERR "Failed to do bolted mapping IO "
  91                                "memory at %016lx !\n", pa);
  92                         return -ENOMEM;
  93                 }
  94         }
  95
  96         smp_wmb();
  97         return 0;
  98 }
  99
 100 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 101
 102 unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
 103                                     pmd_t *pmdp, unsigned long clr,
 104                                     unsigned long set)
 105 {
 106         __be64 old_be, tmp;
 107         unsigned long old;
 108
 109 #ifdef CONFIG_DEBUG_VM
 110         WARN_ON(!pmd_trans_huge(*pmdp));
 111         assert_spin_locked(&mm->page_table_lock);
 112 #endif
 113
 114         __asm__ __volatile__(
 115         "1:     ldarx   %0,0,%3\n\
 116                 and.    %1,%0,%6\n\
 117                 bne-    1b \n\
 118                 andc    %1,%0,%4 \n\
 119                 or      %1,%1,%7\n\
 120                 stdcx.  %1,0,%3 \n\
 121                 bne-    1b"
 122         : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp)
 123         : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp),
 124           "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
 125         : "cc" );
 126
 127         old = be64_to_cpu(old_be);
 128
 129         trace_hugepage_update(addr, old, clr, set);
 130         if (old & H_PAGE_HASHPTE)
 131                 hpte_do_hugepage_flush(mm, addr, pmdp, old);
 132         return old;
 133 }
 134
 135 pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
 136                             pmd_t *pmdp)
 137 {
 138         pmd_t pmd;
 139
 140         VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 141         VM_BUG_ON(pmd_trans_huge(*pmdp));
 142
 143         pmd = *pmdp;
 144         pmd_clear(pmdp);
 145         /*
 146          * Wait for all pending hash_page to finish. This is needed
 147          * in case of subpage collapse. When we collapse normal pages
 148          * to hugepage, we first clear the pmd, then invalidate all
 149          * the PTE entries. The assumption here is that any low level
 150          * page fault will see a none pmd and take the slow path that
 151          * will wait on mmap_sem. But we could very well be in a
 152          * hash_page with local ptep pointer value. Such a hash page
 153          * can result in adding new HPTE entries for normal subpages.
 154          * That means we could be modifying the page content as we
 155          * copy them to a huge page. So wait for parallel hash_page
 156          * to finish before invalidating HPTE entries. We can do this
 157          * by sending an IPI to all the cpus and executing a dummy
 158          * function there.
 159          */
 160         kick_all_cpus_sync();
 161         /*
 162          * Now invalidate the hpte entries in the range
 163          * covered by pmd. This make sure we take a
 164          * fault and will find the pmd as none, which will
 165          * result in a major fault which takes mmap_sem and
 166          * hence wait for collapse to complete. Without this
 167          * the __collapse_huge_page_copy can result in copying
 168          * the old content.
 169          */
 170         flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
 171         return pmd;
 172 }
 173
 174 /*
 175  * We want to put the pgtable in pmd and use pgtable for tracking
 176  * the base page size hptes
 177  */
 178 void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
 179                                   pgtable_t pgtable)
 180 {
 181         pgtable_t *pgtable_slot;
 182         assert_spin_locked(&mm->page_table_lock);
 183         /*
 184          * we store the pgtable in the second half of PMD
 185          */
 186         pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
 187         *pgtable_slot = pgtable;
 188         /*
 189          * expose the deposited pgtable to other cpus.
 190          * before we set the hugepage PTE at pmd level
 191          * hash fault code looks at the deposted pgtable
 192          * to store hash index values.
 193          */
 194         smp_wmb();
 195 }
 196
 197 pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 198 {
 199         pgtable_t pgtable;
 200         pgtable_t *pgtable_slot;
 201
 202         assert_spin_locked(&mm->page_table_lock);
 203         pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
 204         pgtable = *pgtable_slot;
 205         /*
 206          * Once we withdraw, mark the entry NULL.
 207          */
 208         *pgtable_slot = NULL;
 209         /*
 210          * We store HPTE information in the deposited PTE fragment.
 211          * zero out the content on withdraw.
 212          */
 213         memset(pgtable, 0, PTE_FRAG_SIZE);
 214         return pgtable;
 215 }
 216
 217 void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
 218                                unsigned long address, pmd_t *pmdp)
 219 {
 220         VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 221         VM_BUG_ON(REGION_ID(address) != USER_REGION_ID);
 222
 223         /*
 224          * We can't mark the pmd none here, because that will cause a race
 225          * against exit_mmap. We need to continue mark pmd TRANS HUGE, while
 226          * we spilt, but at the same time we wan't rest of the ppc64 code
 227          * not to insert hash pte on this, because we will be modifying
 228          * the deposited pgtable in the caller of this function. Hence
 229          * clear the _PAGE_USER so that we move the fault handling to
 230          * higher level function and that will serialize against ptl.
 231          * We need to flush existing hash pte entries here even though,
 232          * the translation is still valid, because we will withdraw
 233          * pgtable_t after this.
 234          */
 235         pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED);
 236 }
 237
 238 /*
 239  * A linux hugepage PMD was changed and the corresponding hash table entries
 240  * neesd to be flushed.
 241  */
 242 void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
 243                             pmd_t *pmdp, unsigned long old_pmd)
 244 {
 245         int ssize;
 246         unsigned int psize;
 247         unsigned long vsid;
 248         unsigned long flags = 0;
 249         const struct cpumask *tmp;
 250
 251         /* get the base page size,vsid and segment size */
 252 #ifdef CONFIG_DEBUG_VM
 253         psize = get_slice_psize(mm, addr);
 254         BUG_ON(psize == MMU_PAGE_16M);
 255 #endif
 256         if (old_pmd & H_PAGE_COMBO)
 257                 psize = MMU_PAGE_4K;
 258         else
 259                 psize = MMU_PAGE_64K;
 260
 261         if (!is_kernel_addr(addr)) {
 262                 ssize = user_segment_size(addr);
 263                 vsid = get_vsid(mm->context.id, addr, ssize);
 264                 WARN_ON(vsid == 0);
 265         } else {
 266                 vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
 267                 ssize = mmu_kernel_ssize;
 268         }
 269
 270         tmp = cpumask_of(smp_processor_id());
 271         if (cpumask_equal(mm_cpumask(mm), tmp))
 272                 flags |= HPTE_LOCAL_UPDATE;
 273
 274         return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
 275 }
 276
 277 pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
 278                                 unsigned long addr, pmd_t *pmdp)
 279 {
 280         pmd_t old_pmd;
 281         pgtable_t pgtable;
 282         unsigned long old;
 283         pgtable_t *pgtable_slot;
 284
 285         old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
 286         old_pmd = __pmd(old);
 287         /*
 288          * We have pmd == none and we are holding page_table_lock.
 289          * So we can safely go and clear the pgtable hash
 290          * index info.
 291          */
 292         pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
 293         pgtable = *pgtable_slot;
 294         /*
 295          * Let's zero out old valid and hash index details
 296          * hash fault look at them.
 297          */
 298         memset(pgtable, 0, PTE_FRAG_SIZE);
 299         /*
 300          * Serialize against find_linux_pte_or_hugepte which does lock-less
 301          * lookup in page tables with local interrupts disabled. For huge pages
 302          * it casts pmd_t to pte_t. Since format of pte_t is different from
 303          * pmd_t we want to prevent transit from pmd pointing to page table
 304          * to pmd pointing to huge page (and back) while interrupts are disabled.
 305          * We clear pmd to possibly replace it with page table pointer in
 306          * different code paths. So make sure we wait for the parallel
 307          * find_linux_pte_or_hugepage to finish.
 308          */
 309         kick_all_cpus_sync();
 310         return old_pmd;
 311 }
 312
 313 int hash__has_transparent_hugepage(void)
 314 {
 315
 316         if (!mmu_has_feature(MMU_FTR_16M_PAGE))
 317                 return 0;
 318         /*
 319          * We support THP only if PMD_SIZE is 16MB.
 320          */
 321         if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
 322                 return 0;
 323         /*
 324          * We need to make sure that we support 16MB hugepage in a segement
 325          * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
 326          * of 64K.
 327          */
 328         /*
 329          * If we have 64K HPTE, we will be using that by default
 330          */
 331         if (mmu_psize_defs[MMU_PAGE_64K].shift &&
 332             (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
 333                 return 0;
 334         /*
 335          * Ok we only have 4K HPTE
 336          */
 337         if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
 338                 return 0;
 339
 340         return 1;
 341 }
 342 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */