arch/powerpc/mm/hugetlbpage.c

   1 /*
   2  * PPC Huge TLB Page Support for Kernel.
   3  *
   4  * Copyright (C) 2003 David Gibson, IBM Corporation.
   5  * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
   6  *
   7  * Based on the IA-32 version:
   8  * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
   9  */
  10
  11 #include <linux/mm.h>
  12 #include <linux/io.h>
  13 #include <linux/slab.h>
  14 #include <linux/hugetlb.h>
  15 #include <linux/export.h>
  16 #include <linux/of_fdt.h>
  17 #include <linux/memblock.h>
  18 #include <linux/bootmem.h>
  19 #include <linux/moduleparam.h>
  20 #include <asm/pgtable.h>
  21 #include <asm/pgalloc.h>
  22 #include <asm/tlb.h>
  23 #include <asm/setup.h>
  24 #include <asm/hugetlb.h>
  25
  26 #ifdef CONFIG_HUGETLB_PAGE
  27
  28 #define PAGE_SHIFT_64K  16
  29 #define PAGE_SHIFT_16M  24
  30 #define PAGE_SHIFT_16G  34
  31
  32 unsigned int HPAGE_SHIFT;
  33
  34 /*
  35  * Tracks gpages after the device tree is scanned and before the
  36  * huge_boot_pages list is ready.  On non-Freescale implementations, this is
  37  * just used to track 16G pages and so is a single array.  FSL-based
  38  * implementations may have more than one gpage size, so we need multiple
  39  * arrays
  40  */
  41 #ifdef CONFIG_PPC_FSL_BOOK3E
  42 #define MAX_NUMBER_GPAGES       128
  43 struct psize_gpages {
  44         u64 gpage_list[MAX_NUMBER_GPAGES];
  45         unsigned int nr_gpages;
  46 };
  47 static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT];
  48 #else
  49 #define MAX_NUMBER_GPAGES       1024
  50 static u64 gpage_freearray[MAX_NUMBER_GPAGES];
  51 static unsigned nr_gpages;
  52 #endif
  53
  54 #define hugepd_none(hpd)        ((hpd).pd == 0)
  55
  56 #ifdef CONFIG_PPC_BOOK3S_64
  57 /*
  58  * At this point we do the placement change only for BOOK3S 64. This would
  59  * possibly work on other subarchs.
  60  */
  61
/*
 * We have PGD_INDEX_SIZE = 12 and PTE_INDEX_SIZE = 8, so that we can have
 * a 16GB hugepage pte in the PGD and a 16MB hugepage pte in the PMD.
 *
 * Defined in such a way that we can optimize away the code block at build
 * time if CONFIG_HUGETLB_PAGE=n.
 */
  69 int pmd_huge(pmd_t pmd)
  70 {
  71         /*
  72          * leaf pte for huge page, bottom two bits != 00
  73          */
  74         return ((pmd_val(pmd) & 0x3) != 0x0);
  75 }
  76
  77 int pud_huge(pud_t pud)
  78 {
  79         /*
  80          * leaf pte for huge page, bottom two bits != 00
  81          */
  82         return ((pud_val(pud) & 0x3) != 0x0);
  83 }
  84
  85 int pgd_huge(pgd_t pgd)
  86 {
  87         /*
  88          * leaf pte for huge page, bottom two bits != 00
  89          */
  90         return ((pgd_val(pgd) & 0x3) != 0x0);
  91 }
  92
  93 #if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_DEBUG_VM)
  94 /*
  95  * This enables us to catch the wrong page directory format
  96  * Moved here so that we can use WARN() in the call.
  97  */
  98 int hugepd_ok(hugepd_t hpd)
  99 {
 100         bool is_hugepd;
 101
 102         /*
 103          * We should not find this format in page directory, warn otherwise.
 104          */
 105         is_hugepd = (((hpd.pd & 0x3) == 0x0) && ((hpd.pd & HUGEPD_SHIFT_MASK) != 0));
 106         WARN(is_hugepd, "Found wrong page directory format\n");
 107         return 0;
 108 }
 109 #endif
 110
 111 #else
 112 int pmd_huge(pmd_t pmd)
 113 {
 114         return 0;
 115 }
 116
 117 int pud_huge(pud_t pud)
 118 {
 119         return 0;
 120 }
 121
 122 int pgd_huge(pgd_t pgd)
 123 {
 124         return 0;
 125 }
 126 #endif
 127
 128 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 129 {
 130         /* Only called for hugetlbfs pages, hence can ignore THP */
 131         return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL);
 132 }
 133
 134 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 135                            unsigned long address, unsigned pdshift, unsigned pshift)
 136 {
 137         struct kmem_cache *cachep;
 138         pte_t *new;
 139
 140 #ifdef CONFIG_PPC_FSL_BOOK3E
 141         int i;
 142         int num_hugepd = 1 << (pshift - pdshift);
 143         cachep = hugepte_cache;
 144 #else
 145         cachep = PGT_CACHE(pdshift - pshift);
 146 #endif
 147
 148         new = kmem_cache_zalloc(cachep, GFP_KERNEL|__GFP_REPEAT);
 149
 150         BUG_ON(pshift > HUGEPD_SHIFT_MASK);
 151         BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
 152
 153         if (! new)
 154                 return -ENOMEM;
 155
 156         spin_lock(&mm->page_table_lock);
 157 #ifdef CONFIG_PPC_FSL_BOOK3E
 158         /*
 159          * We have multiple higher-level entries that point to the same
 160          * actual pte location.  Fill in each as we go and backtrack on error.
 161          * We need all of these so the DTLB pgtable walk code can find the
 162          * right higher-level entry without knowing if it's a hugepage or not.
 163          */
 164         for (i = 0; i < num_hugepd; i++, hpdp++) {
 165                 if (unlikely(!hugepd_none(*hpdp)))
 166                         break;
 167                 else
 168                         /* We use the old format for PPC_FSL_BOOK3E */
 169                         hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
 170         }
 171         /* If we bailed from the for loop early, an error occurred, clean up */
 172         if (i < num_hugepd) {
 173                 for (i = i - 1 ; i >= 0; i--, hpdp--)
 174                         hpdp->pd = 0;
 175                 kmem_cache_free(cachep, new);
 176         }
 177 #else
 178         if (!hugepd_none(*hpdp))
 179                 kmem_cache_free(cachep, new);
 180         else {
 181 #ifdef CONFIG_PPC_BOOK3S_64
 182                 hpdp->pd = (unsigned long)new |
 183                             (shift_to_mmu_psize(pshift) << 2);
 184 #else
 185                 hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
 186 #endif
 187         }
 188 #endif
 189         spin_unlock(&mm->page_table_lock);
 190         return 0;
 191 }
 192
 193 /*
 194  * These macros define how to determine which level of the page table holds
 195  * the hpdp.
 196  */
 197 #ifdef CONFIG_PPC_FSL_BOOK3E
 198 #define HUGEPD_PGD_SHIFT PGDIR_SHIFT
 199 #define HUGEPD_PUD_SHIFT PUD_SHIFT
 200 #else
 201 #define HUGEPD_PGD_SHIFT PUD_SHIFT
 202 #define HUGEPD_PUD_SHIFT PMD_SHIFT
 203 #endif
 204
 205 #ifdef CONFIG_PPC_BOOK3S_64
 206 /*
 207  * At this point we do the placement change only for BOOK3S 64. This would
 208  * possibly work on other subarchs.
 209  */
 210 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
 211 {
 212         pgd_t *pg;
 213         pud_t *pu;
 214         pmd_t *pm;
 215         hugepd_t *hpdp = NULL;
 216         unsigned pshift = __ffs(sz);
 217         unsigned pdshift = PGDIR_SHIFT;
 218
 219         addr &= ~(sz-1);
 220         pg = pgd_offset(mm, addr);
 221
 222         if (pshift == PGDIR_SHIFT)
 223                 /* 16GB huge page */
 224                 return (pte_t *) pg;
 225         else if (pshift > PUD_SHIFT)
 226                 /*
 227                  * We need to use hugepd table
 228                  */
 229                 hpdp = (hugepd_t *)pg;
 230         else {
 231                 pdshift = PUD_SHIFT;
 232                 pu = pud_alloc(mm, pg, addr);
 233                 if (pshift == PUD_SHIFT)
 234                         return (pte_t *)pu;
 235                 else if (pshift > PMD_SHIFT)
 236                         hpdp = (hugepd_t *)pu;
 237                 else {
 238                         pdshift = PMD_SHIFT;
 239                         pm = pmd_alloc(mm, pu, addr);
 240                         if (pshift == PMD_SHIFT)
 241                                 /* 16MB hugepage */
 242                                 return (pte_t *)pm;
 243                         else
 244                                 hpdp = (hugepd_t *)pm;
 245                 }
 246         }
 247         if (!hpdp)
 248                 return NULL;
 249
 250         BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
 251
 252         if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
 253                 return NULL;
 254
 255         return hugepte_offset(*hpdp, addr, pdshift);
 256 }
 257
 258 #else
 259
 260 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
 261 {
 262         pgd_t *pg;
 263         pud_t *pu;
 264         pmd_t *pm;
 265         hugepd_t *hpdp = NULL;
 266         unsigned pshift = __ffs(sz);
 267         unsigned pdshift = PGDIR_SHIFT;
 268
 269         addr &= ~(sz-1);
 270
 271         pg = pgd_offset(mm, addr);
 272
 273         if (pshift >= HUGEPD_PGD_SHIFT) {
 274                 hpdp = (hugepd_t *)pg;
 275         } else {
 276                 pdshift = PUD_SHIFT;
 277                 pu = pud_alloc(mm, pg, addr);
 278                 if (pshift >= HUGEPD_PUD_SHIFT) {
 279                         hpdp = (hugepd_t *)pu;
 280                 } else {
 281                         pdshift = PMD_SHIFT;
 282                         pm = pmd_alloc(mm, pu, addr);
 283                         hpdp = (hugepd_t *)pm;
 284                 }
 285         }
 286
 287         if (!hpdp)
 288                 return NULL;
 289
 290         BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
 291
 292         if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
 293                 return NULL;
 294
 295         return hugepte_offset(*hpdp, addr, pdshift);
 296 }
 297 #endif
 298
 299 #ifdef CONFIG_PPC_FSL_BOOK3E
 300 /* Build list of addresses of gigantic pages.  This function is used in early
 301  * boot before the buddy allocator is setup.
 302  */
 303 void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
 304 {
 305         unsigned int idx = shift_to_mmu_psize(__ffs(page_size));
 306         int i;
 307
 308         if (addr == 0)
 309                 return;
 310
 311         gpage_freearray[idx].nr_gpages = number_of_pages;
 312
 313         for (i = 0; i < number_of_pages; i++) {
 314                 gpage_freearray[idx].gpage_list[i] = addr;
 315                 addr += page_size;
 316         }
 317 }
 318
 319 /*
 320  * Moves the gigantic page addresses from the temporary list to the
 321  * huge_boot_pages list.
 322  */
 323 int alloc_bootmem_huge_page(struct hstate *hstate)
 324 {
 325         struct huge_bootmem_page *m;
 326         int idx = shift_to_mmu_psize(huge_page_shift(hstate));
 327         int nr_gpages = gpage_freearray[idx].nr_gpages;
 328
 329         if (nr_gpages == 0)
 330                 return 0;
 331
 332 #ifdef CONFIG_HIGHMEM
 333         /*
 334          * If gpages can be in highmem we can't use the trick of storing the
 335          * data structure in the page; allocate space for this
 336          */
 337         m = memblock_virt_alloc(sizeof(struct huge_bootmem_page), 0);
 338         m->phys = gpage_freearray[idx].gpage_list[--nr_gpages];
 339 #else
 340         m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]);
 341 #endif
 342
 343         list_add(&m->list, &huge_boot_pages);
 344         gpage_freearray[idx].nr_gpages = nr_gpages;
 345         gpage_freearray[idx].gpage_list[nr_gpages] = 0;
 346         m->hstate = hstate;
 347
 348         return 1;
 349 }
 350 /*
 351  * Scan the command line hugepagesz= options for gigantic pages; store those in
 352  * a list that we use to allocate the memory once all options are parsed.
 353  */
 354
 355 unsigned long gpage_npages[MMU_PAGE_COUNT];
 356
 357 static int __init do_gpage_early_setup(char *param, char *val,
 358                                        const char *unused, void *arg)
 359 {
 360         static phys_addr_t size;
 361         unsigned long npages;
 362
 363         /*
 364          * The hugepagesz and hugepages cmdline options are interleaved.  We
 365          * use the size variable to keep track of whether or not this was done
 366          * properly and skip over instances where it is incorrect.  Other
 367          * command-line parsing code will issue warnings, so we don't need to.
 368          *
 369          */
 370         if ((strcmp(param, "default_hugepagesz") == 0) ||
 371             (strcmp(param, "hugepagesz") == 0)) {
 372                 size = memparse(val, NULL);
 373         } else if (strcmp(param, "hugepages") == 0) {
 374                 if (size != 0) {
 375                         if (sscanf(val, "%lu", &npages) <= 0)
 376                                 npages = 0;
 377                         if (npages > MAX_NUMBER_GPAGES) {
 378                                 pr_warn("MMU: %lu pages requested for page "
 379                                         "size %llu KB, limiting to "
 380                                         __stringify(MAX_NUMBER_GPAGES) "\n",
 381                                         npages, size / 1024);
 382                                 npages = MAX_NUMBER_GPAGES;
 383                         }
 384                         gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages;
 385                         size = 0;
 386                 }
 387         }
 388         return 0;
 389 }
 390
 391
 392 /*
 393  * This function allocates physical space for pages that are larger than the
 394  * buddy allocator can handle.  We want to allocate these in highmem because
 395  * the amount of lowmem is limited.  This means that this function MUST be
 396  * called before lowmem_end_addr is set up in MMU_init() in order for the lmb
 397  * allocate to grab highmem.
 398  */
 399 void __init reserve_hugetlb_gpages(void)
 400 {
 401         static __initdata char cmdline[COMMAND_LINE_SIZE];
 402         phys_addr_t size, base;
 403         int i;
 404
 405         strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE);
 406         parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0,
 407                         NULL, &do_gpage_early_setup);
 408
 409         /*
 410          * Walk gpage list in reverse, allocating larger page sizes first.
 411          * Skip over unsupported sizes, or sizes that have 0 gpages allocated.
 412          * When we reach the point in the list where pages are no longer
 413          * considered gpages, we're done.
 414          */
 415         for (i = MMU_PAGE_COUNT-1; i >= 0; i--) {
 416                 if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0)
 417                         continue;
 418                 else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT))
 419                         break;
 420
 421                 size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i));
 422                 base = memblock_alloc_base(size * gpage_npages[i], size,
 423                                            MEMBLOCK_ALLOC_ANYWHERE);
 424                 add_gpage(base, size, gpage_npages[i]);
 425         }
 426 }
 427
 428 #else /* !PPC_FSL_BOOK3E */
 429
 430 /* Build list of addresses of gigantic pages.  This function is used in early
 431  * boot before the buddy allocator is setup.
 432  */
 433 void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
 434 {
 435         if (!addr)
 436                 return;
 437         while (number_of_pages > 0) {
 438                 gpage_freearray[nr_gpages] = addr;
 439                 nr_gpages++;
 440                 number_of_pages--;
 441                 addr += page_size;
 442         }
 443 }
 444
 445 /* Moves the gigantic page addresses from the temporary list to the
 446  * huge_boot_pages list.
 447  */
 448 int alloc_bootmem_huge_page(struct hstate *hstate)
 449 {
 450         struct huge_bootmem_page *m;
 451         if (nr_gpages == 0)
 452                 return 0;
 453         m = phys_to_virt(gpage_freearray[--nr_gpages]);
 454         gpage_freearray[nr_gpages] = 0;
 455         list_add(&m->list, &huge_boot_pages);
 456         m->hstate = hstate;
 457         return 1;
 458 }
 459 #endif
 460
 461 #ifdef CONFIG_PPC_FSL_BOOK3E
 462 #define HUGEPD_FREELIST_SIZE \
 463         ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
 464
 465 struct hugepd_freelist {
 466         struct rcu_head rcu;
 467         unsigned int index;
 468         void *ptes[0];
 469 };
 470
 471 static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);
 472
 473 static void hugepd_free_rcu_callback(struct rcu_head *head)
 474 {
 475         struct hugepd_freelist *batch =
 476                 container_of(head, struct hugepd_freelist, rcu);
 477         unsigned int i;
 478
 479         for (i = 0; i < batch->index; i++)
 480                 kmem_cache_free(hugepte_cache, batch->ptes[i]);
 481
 482         free_page((unsigned long)batch);
 483 }
 484
 485 static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
 486 {
 487         struct hugepd_freelist **batchp;
 488
 489         batchp = this_cpu_ptr(&hugepd_freelist_cur);
 490
 491         if (atomic_read(&tlb->mm->mm_users) < 2 ||
 492             cpumask_equal(mm_cpumask(tlb->mm),
 493                           cpumask_of(smp_processor_id()))) {
 494                 kmem_cache_free(hugepte_cache, hugepte);
 495         put_cpu_var(hugepd_freelist_cur);
 496                 return;
 497         }
 498
 499         if (*batchp == NULL) {
 500                 *batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
 501                 (*batchp)->index = 0;
 502         }
 503
 504         (*batchp)->ptes[(*batchp)->index++] = hugepte;
 505         if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
 506                 call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
 507                 *batchp = NULL;
 508         }
 509         put_cpu_var(hugepd_freelist_cur);
 510 }
 511 #endif
 512
 513 static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
 514                               unsigned long start, unsigned long end,
 515                               unsigned long floor, unsigned long ceiling)
 516 {
 517         pte_t *hugepte = hugepd_page(*hpdp);
 518         int i;
 519
 520         unsigned long pdmask = ~((1UL << pdshift) - 1);
 521         unsigned int num_hugepd = 1;
 522
 523 #ifdef CONFIG_PPC_FSL_BOOK3E
 524         /* Note: On fsl the hpdp may be the first of several */
 525         num_hugepd = (1 << (hugepd_shift(*hpdp) - pdshift));
 526 #else
 527         unsigned int shift = hugepd_shift(*hpdp);
 528 #endif
 529
 530         start &= pdmask;
 531         if (start < floor)
 532                 return;
 533         if (ceiling) {
 534                 ceiling &= pdmask;
 535                 if (! ceiling)
 536                         return;
 537         }
 538         if (end - 1 > ceiling - 1)
 539                 return;
 540
 541         for (i = 0; i < num_hugepd; i++, hpdp++)
 542                 hpdp->pd = 0;
 543
 544 #ifdef CONFIG_PPC_FSL_BOOK3E
 545         hugepd_free(tlb, hugepte);
 546 #else
 547         pgtable_free_tlb(tlb, hugepte, pdshift - shift);
 548 #endif
 549 }
 550
 551 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 552                                    unsigned long addr, unsigned long end,
 553                                    unsigned long floor, unsigned long ceiling)
 554 {
 555         pmd_t *pmd;
 556         unsigned long next;
 557         unsigned long start;
 558
 559         start = addr;
 560         do {
 561                 pmd = pmd_offset(pud, addr);
 562                 next = pmd_addr_end(addr, end);
 563                 if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
 564                         /*
 565                          * if it is not hugepd pointer, we should already find
 566                          * it cleared.
 567                          */
 568                         WARN_ON(!pmd_none_or_clear_bad(pmd));
 569                         continue;
 570                 }
 571 #ifdef CONFIG_PPC_FSL_BOOK3E
 572                 /*
 573                  * Increment next by the size of the huge mapping since
 574                  * there may be more than one entry at this level for a
 575                  * single hugepage, but all of them point to
 576                  * the same kmem cache that holds the hugepte.
 577                  */
 578                 next = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
 579 #endif
 580                 free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
 581                                   addr, next, floor, ceiling);
 582         } while (addr = next, addr != end);
 583
 584         start &= PUD_MASK;
 585         if (start < floor)
 586                 return;
 587         if (ceiling) {
 588                 ceiling &= PUD_MASK;
 589                 if (!ceiling)
 590                         return;
 591         }
 592         if (end - 1 > ceiling - 1)
 593                 return;
 594
 595         pmd = pmd_offset(pud, start);
 596         pud_clear(pud);
 597         pmd_free_tlb(tlb, pmd, start);
 598         mm_dec_nr_pmds(tlb->mm);
 599 }
 600
 601 static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 602                                    unsigned long addr, unsigned long end,
 603                                    unsigned long floor, unsigned long ceiling)
 604 {
 605         pud_t *pud;
 606         unsigned long next;
 607         unsigned long start;
 608
 609         start = addr;
 610         do {
 611                 pud = pud_offset(pgd, addr);
 612                 next = pud_addr_end(addr, end);
 613                 if (!is_hugepd(__hugepd(pud_val(*pud)))) {
 614                         if (pud_none_or_clear_bad(pud))
 615                                 continue;
 616                         hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
 617                                                ceiling);
 618                 } else {
 619 #ifdef CONFIG_PPC_FSL_BOOK3E
 620                         /*
 621                          * Increment next by the size of the huge mapping since
 622                          * there may be more than one entry at this level for a
 623                          * single hugepage, but all of them point to
 624                          * the same kmem cache that holds the hugepte.
 625                          */
 626                         next = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
 627 #endif
 628                         free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
 629                                           addr, next, floor, ceiling);
 630                 }
 631         } while (addr = next, addr != end);
 632
 633         start &= PGDIR_MASK;
 634         if (start < floor)
 635                 return;
 636         if (ceiling) {
 637                 ceiling &= PGDIR_MASK;
 638                 if (!ceiling)
 639                         return;
 640         }
 641         if (end - 1 > ceiling - 1)
 642                 return;
 643
 644         pud = pud_offset(pgd, start);
 645         pgd_clear(pgd);
 646         pud_free_tlb(tlb, pud, start);
 647 }
 648
 649 /*
 650  * This function frees user-level page tables of a process.
 651  */
 652 void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 653                             unsigned long addr, unsigned long end,
 654                             unsigned long floor, unsigned long ceiling)
 655 {
 656         pgd_t *pgd;
 657         unsigned long next;
 658
 659         /*
 660          * Because there are a number of different possible pagetable
 661          * layouts for hugepage ranges, we limit knowledge of how
 662          * things should be laid out to the allocation path
 663          * (huge_pte_alloc(), above).  Everything else works out the
 664          * structure as it goes from information in the hugepd
 665          * pointers.  That means that we can't here use the
 666          * optimization used in the normal page free_pgd_range(), of
 667          * checking whether we're actually covering a large enough
 668          * range to have to do anything at the top level of the walk
 669          * instead of at the bottom.
 670          *
 671          * To make sense of this, you should probably go read the big
 672          * block comment at the top of the normal free_pgd_range(),
 673          * too.
 674          */
 675
 676         do {
 677                 next = pgd_addr_end(addr, end);
 678                 pgd = pgd_offset(tlb->mm, addr);
 679                 if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
 680                         if (pgd_none_or_clear_bad(pgd))
 681                                 continue;
 682                         hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
 683                 } else {
 684 #ifdef CONFIG_PPC_FSL_BOOK3E
 685                         /*
 686                          * Increment next by the size of the huge mapping since
 687                          * there may be more than one entry at the pgd level
 688                          * for a single hugepage, but all of them point to the
 689                          * same kmem cache that holds the hugepte.
 690                          */
 691                         next = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
 692 #endif
 693                         free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
 694                                           addr, next, floor, ceiling);
 695                 }
 696         } while (addr = next, addr != end);
 697 }
 698
 699 /*
 700  * We are holding mmap_sem, so a parallel huge page collapse cannot run.
 701  * To prevent hugepage split, disable irq.
 702  */
 703 struct page *
 704 follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 705 {
 706         bool is_thp;
 707         pte_t *ptep, pte;
 708         unsigned shift;
 709         unsigned long mask, flags;
 710         struct page *page = ERR_PTR(-EINVAL);
 711
 712         local_irq_save(flags);
 713         ptep = find_linux_pte_or_hugepte(mm->pgd, address, &is_thp, &shift);
 714         if (!ptep)
 715                 goto no_page;
 716         pte = READ_ONCE(*ptep);
 717         /*
 718          * Verify it is a huge page else bail.
 719          * Transparent hugepages are handled by generic code. We can skip them
 720          * here.
 721          */
 722         if (!shift || is_thp)
 723                 goto no_page;
 724
 725         if (!pte_present(pte)) {
 726                 page = NULL;
 727                 goto no_page;
 728         }
 729         mask = (1UL << shift) - 1;
 730         page = pte_page(pte);
 731         if (page)
 732                 page += (address & mask) / PAGE_SIZE;
 733
 734 no_page:
 735         local_irq_restore(flags);
 736         return page;
 737 }
 738
 739 struct page *
 740 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 741                 pmd_t *pmd, int write)
 742 {
 743         BUG();
 744         return NULL;
 745 }
 746
 747 struct page *
 748 follow_huge_pud(struct mm_struct *mm, unsigned long address,
 749                 pud_t *pud, int write)
 750 {
 751         BUG();
 752         return NULL;
 753 }
 754
 755 static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
 756                                       unsigned long sz)
 757 {
 758         unsigned long __boundary = (addr + sz) & ~(sz-1);
 759         return (__boundary - 1 < end - 1) ? __boundary : end;
 760 }
 761
 762 int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift,
 763                 unsigned long end, int write, struct page **pages, int *nr)
 764 {
 765         pte_t *ptep;
 766         unsigned long sz = 1UL << hugepd_shift(hugepd);
 767         unsigned long next;
 768
 769         ptep = hugepte_offset(hugepd, addr, pdshift);
 770         do {
 771                 next = hugepte_addr_end(addr, end, sz);
 772                 if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
 773                         return 0;
 774         } while (ptep++, addr = next, addr != end);
 775
 776         return 1;
 777 }
 778
 779 #ifdef CONFIG_PPC_MM_SLICES
 780 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 781                                         unsigned long len, unsigned long pgoff,
 782                                         unsigned long flags)
 783 {
 784         struct hstate *hstate = hstate_file(file);
 785         int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
 786
 787         return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
 788 }
 789 #endif
 790
 791 unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
 792 {
 793 #ifdef CONFIG_PPC_MM_SLICES
 794         unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
 795
 796         return 1UL << mmu_psize_to_shift(psize);
 797 #else
 798         if (!is_vm_hugetlb_page(vma))
 799                 return PAGE_SIZE;
 800
 801         return huge_page_size(hstate_vma(vma));
 802 #endif
 803 }
 804
 805 static inline bool is_power_of_4(unsigned long x)
 806 {
 807         if (is_power_of_2(x))
 808                 return (__ilog2(x) % 2) ? false : true;
 809         return false;
 810 }
 811
 812 static int __init add_huge_page_size(unsigned long long size)
 813 {
 814         int shift = __ffs(size);
 815         int mmu_psize;
 816
 817         /* Check that it is a page size supported by the hardware and
 818          * that it fits within pagetable and slice limits. */
 819 #ifdef CONFIG_PPC_FSL_BOOK3E
 820         if ((size < PAGE_SIZE) || !is_power_of_4(size))
 821                 return -EINVAL;
 822 #else
 823         if (!is_power_of_2(size)
 824             || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
 825                 return -EINVAL;
 826 #endif
 827
 828         if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
 829                 return -EINVAL;
 830
 831         BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
 832
 833         /* Return if huge page size has already been setup */
 834         if (size_to_hstate(size))
 835                 return 0;
 836
 837         hugetlb_add_hstate(shift - PAGE_SHIFT);
 838
 839         return 0;
 840 }
 841
 842 static int __init hugepage_setup_sz(char *str)
 843 {
 844         unsigned long long size;
 845
 846         size = memparse(str, &str);
 847
 848         if (add_huge_page_size(size) != 0)
 849                 printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
 850
 851         return 1;
 852 }
 853 __setup("hugepagesz=", hugepage_setup_sz);
 854
 855 #ifdef CONFIG_PPC_FSL_BOOK3E
 856 struct kmem_cache *hugepte_cache;
 857 static int __init hugetlbpage_init(void)
 858 {
 859         int psize;
 860
 861         for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
 862                 unsigned shift;
 863
 864                 if (!mmu_psize_defs[psize].shift)
 865                         continue;
 866
 867                 shift = mmu_psize_to_shift(psize);
 868
 869                 /* Don't treat normal page sizes as huge... */
 870                 if (shift != PAGE_SHIFT)
 871                         if (add_huge_page_size(1ULL << shift) < 0)
 872                                 continue;
 873         }
 874
 875         /*
 876          * Create a kmem cache for hugeptes.  The bottom bits in the pte have
 877          * size information encoded in them, so align them to allow this
 878          */
 879         hugepte_cache =  kmem_cache_create("hugepte-cache", sizeof(pte_t),
 880                                            HUGEPD_SHIFT_MASK + 1, 0, NULL);
 881         if (hugepte_cache == NULL)
 882                 panic("%s: Unable to create kmem cache for hugeptes\n",
 883                       __func__);
 884
 885         /* Default hpage size = 4M */
 886         if (mmu_psize_defs[MMU_PAGE_4M].shift)
 887                 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
 888         else
 889                 panic("%s: Unable to set default huge page size\n", __func__);
 890
 891
 892         return 0;
 893 }
 894 #else
 895 static int __init hugetlbpage_init(void)
 896 {
 897         int psize;
 898
 899         if (!mmu_has_feature(MMU_FTR_16M_PAGE))
 900                 return -ENODEV;
 901
 902         for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
 903                 unsigned shift;
 904                 unsigned pdshift;
 905
 906                 if (!mmu_psize_defs[psize].shift)
 907                         continue;
 908
 909                 shift = mmu_psize_to_shift(psize);
 910
 911                 if (add_huge_page_size(1ULL << shift) < 0)
 912                         continue;
 913
 914                 if (shift < PMD_SHIFT)
 915                         pdshift = PMD_SHIFT;
 916                 else if (shift < PUD_SHIFT)
 917                         pdshift = PUD_SHIFT;
 918                 else
 919                         pdshift = PGDIR_SHIFT;
 920                 /*
 921                  * if we have pdshift and shift value same, we don't
 922                  * use pgt cache for hugepd.
 923                  */
 924                 if (pdshift != shift) {
 925                         pgtable_cache_add(pdshift - shift, NULL);
 926                         if (!PGT_CACHE(pdshift - shift))
 927                                 panic("hugetlbpage_init(): could not create "
 928                                       "pgtable cache for %d bit pagesize\n", shift);
 929                 }
 930         }
 931
 932         /* Set default large page size. Currently, we pick 16M or 1M
 933          * depending on what is available
 934          */
 935         if (mmu_psize_defs[MMU_PAGE_16M].shift)
 936                 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
 937         else if (mmu_psize_defs[MMU_PAGE_1M].shift)
 938                 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
 939
 940         return 0;
 941 }
 942 #endif
 943 arch_initcall(hugetlbpage_init);
 944
 945 void flush_dcache_icache_hugepage(struct page *page)
 946 {
 947         int i;
 948         void *start;
 949
 950         BUG_ON(!PageCompound(page));
 951
 952         for (i = 0; i < (1UL << compound_order(page)); i++) {
 953                 if (!PageHighMem(page)) {
 954                         __flush_dcache_icache(page_address(page+i));
 955                 } else {
 956                         start = kmap_atomic(page+i);
 957                         __flush_dcache_icache(start);
 958                         kunmap_atomic(start);
 959                 }
 960         }
 961 }
 962
 963 #endif /* CONFIG_HUGETLB_PAGE */
 964
 965 /*
 966  * We have 4 cases for pgds and pmds:
 967  * (1) invalid (all zeroes)
 968  * (2) pointer to next table, as normal; bottom 6 bits == 0
 969  * (3) leaf pte for huge page, bottom two bits != 00
 970  * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table
 971  *
 972  * So long as we atomically load page table pointers we are safe against teardown,
 973  * we can follow the address down to the the page and take a ref on it.
 974  * This function need to be called with interrupts disabled. We use this variant
 975  * when we have MSR[EE] = 0 but the paca->soft_enabled = 1
 976  */
 977
 978 pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
 979                                    bool *is_thp, unsigned *shift)
 980 {
 981         pgd_t pgd, *pgdp;
 982         pud_t pud, *pudp;
 983         pmd_t pmd, *pmdp;
 984         pte_t *ret_pte;
 985         hugepd_t *hpdp = NULL;
 986         unsigned pdshift = PGDIR_SHIFT;
 987
 988         if (shift)
 989                 *shift = 0;
 990
 991         if (is_thp)
 992                 *is_thp = false;
 993
 994         pgdp = pgdir + pgd_index(ea);
 995         pgd  = READ_ONCE(*pgdp);
 996         /*
 997          * Always operate on the local stack value. This make sure the
 998          * value don't get updated by a parallel THP split/collapse,
 999          * page fault or a page unmap. The return pte_t * is still not
1000          * stable. So should be checked there for above conditions.
1001          */
1002         if (pgd_none(pgd))
1003                 return NULL;
1004         else if (pgd_huge(pgd)) {
1005                 ret_pte = (pte_t *) pgdp;
1006                 goto out;
1007         } else if (is_hugepd(__hugepd(pgd_val(pgd))))
1008                 hpdp = (hugepd_t *)&pgd;
1009         else {
1010                 /*
1011                  * Even if we end up with an unmap, the pgtable will not
1012                  * be freed, because we do an rcu free and here we are
1013                  * irq disabled
1014                  */
1015                 pdshift = PUD_SHIFT;
1016                 pudp = pud_offset(&pgd, ea);
1017                 pud  = READ_ONCE(*pudp);
1018
1019                 if (pud_none(pud))
1020                         return NULL;
1021                 else if (pud_huge(pud)) {
1022                         ret_pte = (pte_t *) pudp;
1023                         goto out;
1024                 } else if (is_hugepd(__hugepd(pud_val(pud))))
1025                         hpdp = (hugepd_t *)&pud;
1026                 else {
1027                         pdshift = PMD_SHIFT;
1028                         pmdp = pmd_offset(&pud, ea);
1029                         pmd  = READ_ONCE(*pmdp);
1030                         /*
1031                          * A hugepage collapse is captured by pmd_none, because
1032                          * it mark the pmd none and do a hpte invalidate.
1033                          *
1034                          * We don't worry about pmd_trans_splitting here, The
1035                          * caller if it needs to handle the splitting case
1036                          * should check for that.
1037                          */
1038                         if (pmd_none(pmd))
1039                                 return NULL;
1040
1041                         if (pmd_trans_huge(pmd)) {
1042                                 if (is_thp)
1043                                         *is_thp = true;
1044                                 ret_pte = (pte_t *) pmdp;
1045                                 goto out;
1046                         }
1047
1048                         if (pmd_huge(pmd)) {
1049                                 ret_pte = (pte_t *) pmdp;
1050                                 goto out;
1051                         } else if (is_hugepd(__hugepd(pmd_val(pmd))))
1052                                 hpdp = (hugepd_t *)&pmd;
1053                         else
1054                                 return pte_offset_kernel(&pmd, ea);
1055                 }
1056         }
1057         if (!hpdp)
1058                 return NULL;
1059
1060         ret_pte = hugepte_offset(*hpdp, ea, pdshift);
1061         pdshift = hugepd_shift(*hpdp);
1062 out:
1063         if (shift)
1064                 *shift = pdshift;
1065         return ret_pte;
1066 }
1067 EXPORT_SYMBOL_GPL(__find_linux_pte_or_hugepte);
1068
1069 int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
1070                 unsigned long end, int write, struct page **pages, int *nr)
1071 {
1072         unsigned long mask;
1073         unsigned long pte_end;
1074         struct page *head, *page, *tail;
1075         pte_t pte;
1076         int refs;
1077
1078         pte_end = (addr + sz) & ~(sz-1);
1079         if (pte_end < end)
1080                 end = pte_end;
1081
1082         pte = READ_ONCE(*ptep);
1083         mask = _PAGE_PRESENT | _PAGE_USER;
1084         if (write)
1085                 mask |= _PAGE_RW;
1086
1087         if ((pte_val(pte) & mask) != mask)
1088                 return 0;
1089
1090         /* hugepages are never "special" */
1091         VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
1092
1093         refs = 0;
1094         head = pte_page(pte);
1095
1096         page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
1097         tail = page;
1098         do {
1099                 VM_BUG_ON(compound_head(page) != head);
1100                 pages[*nr] = page;
1101                 (*nr)++;
1102                 page++;
1103                 refs++;
1104         } while (addr += PAGE_SIZE, addr != end);
1105
1106         if (!page_cache_add_speculative(head, refs)) {
1107                 *nr -= refs;
1108                 return 0;
1109         }
1110
1111         if (unlikely(pte_val(pte) != pte_val(*ptep))) {
1112                 /* Could be optimized better */
1113                 *nr -= refs;
1114                 while (refs--)
1115                         put_page(head);
1116                 return 0;
1117         }
1118
1119         /*
1120          * Any tail page need their mapcount reference taken before we
1121          * return.
1122          */
1123         while (refs--) {
1124                 if (PageTail(tail))
1125                         get_huge_page_tail(tail);
1126                 tail++;
1127         }
1128
1129         return 1;
1130 }