arch/powerpc/mm/hugetlbpage.c

   1 /*
   2  * PPC64 (POWER4) Huge TLB Page Support for Kernel.
   3  *
   4  * Copyright (C) 2003 David Gibson, IBM Corporation.
   5  *
   6  * Based on the IA-32 version:
   7  * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
   8  */
   9
  10 #include <linux/init.h>
  11 #include <linux/fs.h>
  12 #include <linux/mm.h>
  13 #include <linux/hugetlb.h>
  14 #include <linux/pagemap.h>
  15 #include <linux/slab.h>
  16 #include <linux/err.h>
  17 #include <linux/sysctl.h>
  18 #include <asm/mman.h>
  19 #include <asm/pgalloc.h>
  20 #include <asm/tlb.h>
  21 #include <asm/tlbflush.h>
  22 #include <asm/mmu_context.h>
  23 #include <asm/machdep.h>
  24 #include <asm/cputable.h>
  25 #include <asm/spu.h>
  26
  /* log2 of the huge page sizes this file knows about: 64K, 16M and 16G. */
  27 #define PAGE_SHIFT_64K  16
  28 #define PAGE_SHIFT_16M  24
  29 #define PAGE_SHIFT_16G  34
  30
     /* NOTE(review): NUM_LOW_AREAS and NUM_HIGH_AREAS are not referenced in the
      * visible part of this file - confirm whether they are still needed. */
  31 #define NUM_LOW_AREAS   (0x100000000UL >> SID_SHIFT)
  32 #define NUM_HIGH_AREAS  (PGTABLE_RANGE >> HTLB_AREA_SHIFT)
     /* Capacity of gpage_freearray[] below. */
  33 #define MAX_NUMBER_GPAGES       1024
  34
  35 /* Tracks the 16G pages after the device tree is scanned and before the
  36  * huge_boot_pages list is ready.  add_gpage() appends addresses and
      * alloc_bootmem_huge_page() pops them off the end again. */
  37 static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
     /* Number of entries of gpage_freearray[] currently in use. */
  38 static unsigned nr_gpages;
  39
  40 /* Array of valid huge page sizes, indexed by MMU_PAGE_* - a non-zero
  41  * value (the hugepte_shift) is stored for the huge page sizes that are
  42  * valid; 0 means the size is not usable as a huge page size.
      */
  43 unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
  44
     /* hugepte_shift is an alias for mmu_huge_psizes: the stored value is
      * log2 of the number of entries in a hugepte table for that page size.
      * PTRS_PER_HUGEPTE is that entry count, HUGEPTE_TABLE_SIZE the table
      * size in bytes. */
  45 #define hugepte_shift                   mmu_huge_psizes
  46 #define PTRS_PER_HUGEPTE(psize)         (1 << hugepte_shift[psize])
  47 #define HUGEPTE_TABLE_SIZE(psize)       (sizeof(pte_t) << hugepte_shift[psize])
  48
     /* Address range covered by one hugepte table: the page shift plus the
      * hugepte_shift, and the size and mask derived from it. */
  49 #define HUGEPD_SHIFT(psize)             (mmu_psize_to_shift(psize) \
  50                                                 + hugepte_shift[psize])
  51 #define HUGEPD_SIZE(psize)              (1UL << HUGEPD_SHIFT(psize))
  52 #define HUGEPD_MASK(psize)              (~(HUGEPD_SIZE(psize)-1))
  53
  /* Index into pgtable_cache[] of the hugepte cache for @psize.  One is
   * subtracted because we don't need a cache for 4K, since it is not a huge
   * page size.  The argument is parenthesised so that an expression passed
   * as @psize is evaluated as a unit before the arithmetic is applied.
   */
  #define HUGE_PGTABLE_INDEX(psize)       (HUGEPTE_CACHE_NUM + (psize) - 1)
  /* Name used when creating the hugepte cache for @psize. */
  #define HUGEPTE_CACHE_NAME(psize)       (huge_pgtable_cache_name[psize])
  58
  /* Names of the per-page-size hugepte caches, indexed by MMU_PAGE_*.
   * Sizes without an entry are NULL; set_huge_psize() refuses to enable a
   * size whose name is NULL. */
  59 static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = {
  60         [MMU_PAGE_64K]  = "hugepte_cache_64K",
  61         [MMU_PAGE_1M]   = "hugepte_cache_1M",
  62         [MMU_PAGE_16M]  = "hugepte_cache_16M",
  63         [MMU_PAGE_16G]  = "hugepte_cache_16G",
  64 };
  65
  66 /* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
  67  * will choke on pointers to hugepte tables, which is handy for
  68  * catching screwups early. */
  69 #define HUGEPD_OK       0x1
  70
     /* A page directory slot that points at a hugepte table: the table
      * address with HUGEPD_OK or'ed in, or 0 when no table is attached. */
  71 typedef struct { unsigned long pd; } hugepd_t;
  72
     /* True when the slot has no hugepte table attached. */
  73 #define hugepd_none(hpd)        ((hpd).pd == 0)
  74
  75 static inline int shift_to_mmu_psize(unsigned int shift)
  76 {
  77         switch (shift) {
  78 #ifndef CONFIG_PPC_64K_PAGES
  79         case PAGE_SHIFT_64K:
  80             return MMU_PAGE_64K;
  81 #endif
  82         case PAGE_SHIFT_16M:
  83             return MMU_PAGE_16M;
  84         case PAGE_SHIFT_16G:
  85             return MMU_PAGE_16G;
  86         }
  87         return -1;
  88 }
  89
  90 static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
  91 {
  92         if (mmu_psize_defs[mmu_psize].shift)
  93                 return mmu_psize_defs[mmu_psize].shift;
  94         BUG();
  95 }
  96
  97 static inline pte_t *hugepd_page(hugepd_t hpd)
  98 {
  99         BUG_ON(!(hpd.pd & HUGEPD_OK));
 100         return (pte_t *)(hpd.pd & ~HUGEPD_OK);
 101 }
 102
 103 static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
 104                                     struct hstate *hstate)
 105 {
 106         unsigned int shift = huge_page_shift(hstate);
 107         int psize = shift_to_mmu_psize(shift);
 108         unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1));
 109         pte_t *dir = hugepd_page(*hpdp);
 110
 111         return dir + idx;
 112 }
 113
 114 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 115                            unsigned long address, unsigned int psize)
 116 {
 117         pte_t *new = kmem_cache_zalloc(pgtable_cache[HUGE_PGTABLE_INDEX(psize)],
 118                                       GFP_KERNEL|__GFP_REPEAT);
 119
 120         if (! new)
 121                 return -ENOMEM;
 122
 123         spin_lock(&mm->page_table_lock);
 124         if (!hugepd_none(*hpdp))
 125                 kmem_cache_free(pgtable_cache[HUGE_PGTABLE_INDEX(psize)], new);
 126         else
 127                 hpdp->pd = (unsigned long)new | HUGEPD_OK;
 128         spin_unlock(&mm->page_table_lock);
 129         return 0;
 130 }
 131
 132
 133 static pud_t *hpud_offset(pgd_t *pgd, unsigned long addr, struct hstate *hstate)
 134 {
 135         if (huge_page_shift(hstate) < PUD_SHIFT)
 136                 return pud_offset(pgd, addr);
 137         else
 138                 return (pud_t *) pgd;
 139 }
 140 static pud_t *hpud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr,
 141                          struct hstate *hstate)
 142 {
 143         if (huge_page_shift(hstate) < PUD_SHIFT)
 144                 return pud_alloc(mm, pgd, addr);
 145         else
 146                 return (pud_t *) pgd;
 147 }
 148 static pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate)
 149 {
 150         if (huge_page_shift(hstate) < PMD_SHIFT)
 151                 return pmd_offset(pud, addr);
 152         else
 153                 return (pmd_t *) pud;
 154 }
 155 static pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr,
 156                          struct hstate *hstate)
 157 {
 158         if (huge_page_shift(hstate) < PMD_SHIFT)
 159                 return pmd_alloc(mm, pud, addr);
 160         else
 161                 return (pmd_t *) pud;
 162 }
 163
 164 /* Build list of addresses of gigantic pages.  This function is used in early
 165  * boot before the buddy or bootmem allocator is setup.
 166  */
 167 void add_gpage(unsigned long addr, unsigned long page_size,
 168         unsigned long number_of_pages)
 169 {
 170         if (!addr)
 171                 return;
 172         while (number_of_pages > 0) {
 173                 gpage_freearray[nr_gpages] = addr;
 174                 nr_gpages++;
 175                 number_of_pages--;
 176                 addr += page_size;
 177         }
 178 }
 179
 180 /* Moves the gigantic page addresses from the temporary list to the
 181  * huge_boot_pages list.
 182  */
 183 int alloc_bootmem_huge_page(struct hstate *hstate)
 184 {
 185         struct huge_bootmem_page *m;
 186         if (nr_gpages == 0)
 187                 return 0;
 188         m = phys_to_virt(gpage_freearray[--nr_gpages]);
 189         gpage_freearray[nr_gpages] = 0;
 190         list_add(&m->list, &huge_boot_pages);
 191         m->hstate = hstate;
 192         return 1;
 193 }
 194
 195
 196 /* Modelled after find_linux_pte() */
 197 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 198 {
 199         pgd_t *pg;
 200         pud_t *pu;
 201         pmd_t *pm;
 202
 203         unsigned int psize;
 204         unsigned int shift;
 205         unsigned long sz;
 206         struct hstate *hstate;
 207         psize = get_slice_psize(mm, addr);
 208         shift = mmu_psize_to_shift(psize);
 209         sz = ((1UL) << shift);
 210         hstate = size_to_hstate(sz);
 211
 212         addr &= hstate->mask;
 213
 214         pg = pgd_offset(mm, addr);
 215         if (!pgd_none(*pg)) {
 216                 pu = hpud_offset(pg, addr, hstate);
 217                 if (!pud_none(*pu)) {
 218                         pm = hpmd_offset(pu, addr, hstate);
 219                         if (!pmd_none(*pm))
 220                                 return hugepte_offset((hugepd_t *)pm, addr,
 221                                                       hstate);
 222                 }
 223         }
 224
 225         return NULL;
 226 }
 227
 228 pte_t *huge_pte_alloc(struct mm_struct *mm,
 229                         unsigned long addr, unsigned long sz)
 230 {
 231         pgd_t *pg;
 232         pud_t *pu;
 233         pmd_t *pm;
 234         hugepd_t *hpdp = NULL;
 235         struct hstate *hstate;
 236         unsigned int psize;
 237         hstate = size_to_hstate(sz);
 238
 239         psize = get_slice_psize(mm, addr);
 240         BUG_ON(!mmu_huge_psizes[psize]);
 241
 242         addr &= hstate->mask;
 243
 244         pg = pgd_offset(mm, addr);
 245         pu = hpud_alloc(mm, pg, addr, hstate);
 246
 247         if (pu) {
 248                 pm = hpmd_alloc(mm, pu, addr, hstate);
 249                 if (pm)
 250                         hpdp = (hugepd_t *)pm;
 251         }
 252
 253         if (! hpdp)
 254                 return NULL;
 255
 256         if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize))
 257                 return NULL;
 258
 259         return hugepte_offset(hpdp, addr, hstate);
 260 }
 261
 /* Stub: always returns 0 and touches none of its arguments.
  * NOTE(review): presumably shared huge page tables are not implemented
  * here - confirm against the generic hugetlb callers. */
 262 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
 263 {
 264         return 0;
 265 }
 266
 267 static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp,
 268                                unsigned int psize)
 269 {
 270         pte_t *hugepte = hugepd_page(*hpdp);
 271
 272         hpdp->pd = 0;
 273         tlb->need_flush = 1;
 274         pgtable_free_tlb(tlb, pgtable_free_cache(hugepte,
 275                                                  HUGEPTE_CACHE_NUM+psize-1,
 276                                                  PGF_CACHENUM_MASK));
 277 }
 278
 /*
  * Free the hugepte tables hanging off the PMD entries under @pud that
  * cover [addr, end), treating every PMD slot as a hugepd_t.  Afterwards,
  * if the PUD-aligned region around the range does not reach below @floor
  * or beyond @ceiling, the PUD entry is cleared and the PMD page itself is
  * freed through @tlb.
  */
 279 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 280                                    unsigned long addr, unsigned long end,
 281                                    unsigned long floor, unsigned long ceiling,
 282                                    unsigned int psize)
 283 {
 284         pmd_t *pmd;
 285         unsigned long next;
 286         unsigned long start;
 287
 288         start = addr;
 289         pmd = pmd_offset(pud, addr);
 290         do {
 291                 next = pmd_addr_end(addr, end);
                     /* Nothing attached to this slot. */
 292                 if (pmd_none(*pmd))
 293                         continue;
 294                 free_hugepte_range(tlb, (hugepd_t *)pmd, psize);
 295         } while (pmd++, addr = next, addr != end);
 296
             /* Only free the PMD page when nothing outside [floor, ceiling)
              * can still be using it.  ceiling == 0 stands for the top of the
              * address space, hence the "- 1" comparison below. */
 297         start &= PUD_MASK;
 298         if (start < floor)
 299                 return;
 300         if (ceiling) {
 301                 ceiling &= PUD_MASK;
 302                 if (!ceiling)
 303                         return;
 304         }
 305         if (end - 1 > ceiling - 1)
 306                 return;
 307
 308         pmd = pmd_offset(pud, start);
 309         pud_clear(pud);
 310         pmd_free_tlb(tlb, pmd, start);
 311 }
 312
 /*
  * Free the huge page tables under @pgd covering [addr, end).  For huge
  * page sizes below PMD_SHIFT each PUD entry leads to a PMD page that is
  * handled by hugetlb_free_pmd_range(); otherwise the PUD slot itself is a
  * hugepd_t and its hugepte table is freed directly.  Afterwards, if the
  * PGDIR-aligned region around the range does not reach below @floor or
  * beyond @ceiling, the PGD entry is cleared and the PUD page is freed.
  *
  * The page size is sampled once, from the slice containing the initial
  * @addr.
  */
 313 static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 314                                    unsigned long addr, unsigned long end,
 315                                    unsigned long floor, unsigned long ceiling)
 316 {
 317         pud_t *pud;
 318         unsigned long next;
 319         unsigned long start;
 320         unsigned int shift;
 321         unsigned int psize = get_slice_psize(tlb->mm, addr);
 322         shift = mmu_psize_to_shift(psize);
 323
 324         start = addr;
 325         pud = pud_offset(pgd, addr);
 326         do {
 327                 next = pud_addr_end(addr, end);
 328                 if (shift < PMD_SHIFT) {
                             /* The PUD entry is an ordinary pointer to a PMD
                              * page whose slots hold the hugepds. */
 329                         if (pud_none_or_clear_bad(pud))
 330                                 continue;
 331                         hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
 332                                                ceiling, psize);
 333                 } else {
                             /* The PUD slot itself is the hugepd. */
 334                         if (pud_none(*pud))
 335                                 continue;
 336                         free_hugepte_range(tlb, (hugepd_t *)pud, psize);
 337                 }
 338         } while (pud++, addr = next, addr != end);
 339
             /* Same floor/ceiling test as one level down, at PGDIR
              * granularity; ceiling == 0 stands for the top of the address
              * space. */
 340         start &= PGDIR_MASK;
 341         if (start < floor)
 342                 return;
 343         if (ceiling) {
 344                 ceiling &= PGDIR_MASK;
 345                 if (!ceiling)
 346                         return;
 347         }
 348         if (end - 1 > ceiling - 1)
 349                 return;
 350
 351         pud = pud_offset(pgd, start);
 352         pgd_clear(pgd);
 353         pud_free_tlb(tlb, pud, start);
 354 }
 355
 356 /*
 357  * This function frees user-level page tables of a process.
 358  *
      * The range [addr, end) is first shrunk to whole hugepd-sized units that
      * lie within [floor, ceiling); the PGD entries covering what remains are
      * then walked.  The huge page size is looked up again for every PGD
      * entry, and the function BUGs if a slice in the range does not use a
      * valid huge page size.
      *
 359  * Must be called with pagetable lock held.
 360  */
 361 void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 362                             unsigned long addr, unsigned long end,
 363                             unsigned long floor, unsigned long ceiling)
 364 {
 365         pgd_t *pgd;
 366         unsigned long next;
 367         unsigned long start;
 368
 369         /*
 370          * Comments below are taken from the normal free_pgd_range().  They
 371          * apply here too.  The tests against HUGEPD_MASK below are
 372          * essential, because we *don't* test for this at the bottom
 373          * level.  Without them we'll attempt to free a hugepte table
 374          * when we unmap just part of it, even if there are other
 375          * active mappings using it.
 376          *
 377          * The next few lines have given us lots of grief...
 378          *
 379          * Why are we testing HUGEPD* at this top level?  Because
 380          * often there will be no work to do at all, and we'd prefer
 381          * not to go all the way down to the bottom just to discover
 382          * that.
 383          *
 384          * Why all these "- 1"s?  Because 0 represents both the bottom
 385          * of the address space and the top of it (using -1 for the
 386          * top wouldn't help much: the masks would do the wrong thing).
 387          * The rule is that addr 0 and floor 0 refer to the bottom of
 388          * the address space, but end 0 and ceiling 0 refer to the top.
 389          * Comparisons need to use "end - 1" and "ceiling - 1" (though
 390          * that end 0 case should be mythical).
 391          *
 392          * Wherever addr is brought up or ceiling brought down, we
 393          * must be careful to reject "the opposite 0" before it
 394          * confuses the subsequent tests.  But what about where end is
 395          * brought down by HUGEPD_SIZE below? no, end can't go down to
 396          * 0 there.
 397          *
 398          * Whereas we round start (addr) and ceiling down, by different
 399          * masks at different levels, in order to test whether a table
 400          * now has no other vmas using it, so can be freed, we don't
 401          * bother to round floor or end up - the tests don't need that.
 402          */
 403         unsigned int psize = get_slice_psize(tlb->mm, addr);
 404
 405         addr &= HUGEPD_MASK(psize);
 406         if (addr < floor) {
 407                 addr += HUGEPD_SIZE(psize);
                     /* Wrapped round to 0: nothing left to free. */
 408                 if (!addr)
 409                         return;
 410         }
 411         if (ceiling) {
 412                 ceiling &= HUGEPD_MASK(psize);
 413                 if (!ceiling)
 414                         return;
 415         }
 416         if (end - 1 > ceiling - 1)
 417                 end -= HUGEPD_SIZE(psize);
 418         if (addr > end - 1)
 419                 return;
 420
             /* NOTE(review): 'start' is assigned here but never read again in
              * this function. */
 421         start = addr;
 422         pgd = pgd_offset(tlb->mm, addr);
 423         do {
 424                 psize = get_slice_psize(tlb->mm, addr);
 425                 BUG_ON(!mmu_huge_psizes[psize]);
 426                 next = pgd_addr_end(addr, end);
 427                 if (mmu_psize_to_shift(psize) < PUD_SHIFT) {
 428                         if (pgd_none_or_clear_bad(pgd))
 429                                 continue;
 430                         hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
 431                 } else {
                             /* The PGD slot itself is the hugepd. */
 432                         if (pgd_none(*pgd))
 433                                 continue;
 434                         free_hugepte_range(tlb, (hugepd_t *)pgd, psize);
 435                 }
 436         } while (pgd++, addr = next, addr != end);
 437 }
 438
 439 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 440                      pte_t *ptep, pte_t pte)
 441 {
 442         if (pte_present(*ptep)) {
 443                 /* We open-code pte_clear because we need to pass the right
 444                  * argument to hpte_need_flush (huge / !huge). Might not be
 445                  * necessary anymore if we make hpte_need_flush() get the
 446                  * page size from the slices
 447                  */
 448                 unsigned int psize = get_slice_psize(mm, addr);
 449                 unsigned int shift = mmu_psize_to_shift(psize);
 450                 unsigned long sz = ((1UL) << shift);
 451                 struct hstate *hstate = size_to_hstate(sz);
 452                 pte_update(mm, addr & hstate->mask, ptep, ~0UL, 1);
 453         }
 454         *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
 455 }
 456
 457 pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 458                               pte_t *ptep)
 459 {
 460         unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
 461         return __pte(old);
 462 }
 463
 464 struct page *
 465 follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 466 {
 467         pte_t *ptep;
 468         struct page *page;
 469         unsigned int mmu_psize = get_slice_psize(mm, address);
 470
 471         /* Verify it is a huge page else bail. */
 472         if (!mmu_huge_psizes[mmu_psize])
 473                 return ERR_PTR(-EINVAL);
 474
 475         ptep = huge_pte_offset(mm, address);
 476         page = pte_page(*ptep);
 477         if (page) {
 478                 unsigned int shift = mmu_psize_to_shift(mmu_psize);
 479                 unsigned long sz = ((1UL) << shift);
 480                 page += (address % sz) / PAGE_SIZE;
 481         }
 482
 483         return page;
 484 }
 485
 /* Always returns 0: no PMD entry is ever reported as a huge mapping. */
 486 int pmd_huge(pmd_t pmd)
 487 {
 488         return 0;
 489 }
 490
 /* Always returns 0: no PUD entry is ever reported as a huge mapping. */
 491 int pud_huge(pud_t pud)
 492 {
 493         return 0;
 494 }
 495
 /* Must never be called: hits BUG() unconditionally.  pmd_huge() in this
  * file always returns 0.  NOTE(review): presumably that is what keeps
  * generic code from reaching this function - confirm against callers. */
 496 struct page *
 497 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 498                 pmd_t *pmd, int write)
 499 {
 500         BUG();
 501         return NULL;
 502 }
 503
 504
 505 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 506                                         unsigned long len, unsigned long pgoff,
 507                                         unsigned long flags)
 508 {
 509         struct hstate *hstate = hstate_file(file);
 510         int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
 511
 512         if (!mmu_huge_psizes[mmu_psize])
 513                 return -EINVAL;
 514         return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
 515 }
 516
 517 unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
 518 {
 519         unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
 520
 521         return 1UL << mmu_psize_to_shift(psize);
 522 }
 523
 524 /*
 525  * Called by asm hashtable.S for doing lazy icache flush
 526  */
 527 static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
 528                                         pte_t pte, int trap, unsigned long sz)
 529 {
 530         struct page *page;
 531         int i;
 532
 533         if (!pfn_valid(pte_pfn(pte)))
 534                 return rflags;
 535
 536         page = pte_page(pte);
 537
 538         /* page is dirty */
 539         if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
 540                 if (trap == 0x400) {
 541                         for (i = 0; i < (sz / PAGE_SIZE); i++)
 542                                 __flush_dcache_icache(page_address(page+i));
 543                         set_bit(PG_arch_1, &page->flags);
 544                 } else {
 545                         rflags |= HPTE_R_N;
 546                 }
 547         }
 548         return rflags;
 549 }
 550
 /*
  * Handle a hash table fault on a huge page: find the Linux huge PTE for
  * @ea, check the requested @access against it, then update the existing
  * hardware HPTE or insert a new one, and finally write the PTE back with
  * the slot information.
  *
  * Returns 0 once the PTE has been written back, and 1 when the fault has
  * to be handled elsewhere: the slice is not a huge page slice, there is no
  * PTE, the access is not permitted, or the PTE is marked busy.
  */
 551 int hash_huge_page(struct mm_struct *mm, unsigned long access,
 552                    unsigned long ea, unsigned long vsid, int local,
 553                    unsigned long trap)
 554 {
 555         pte_t *ptep;
 556         unsigned long old_pte, new_pte;
 557         unsigned long va, rflags, pa, sz;
 558         long slot;
 559         int err = 1;
 560         int ssize = user_segment_size(ea);
 561         unsigned int mmu_psize;
 562         int shift;
 563         mmu_psize = get_slice_psize(mm, ea);
 564
 565         if (!mmu_huge_psizes[mmu_psize])
 566                 goto out;
 567         ptep = huge_pte_offset(mm, ea);
 568
 569         /* Virtual address used below to hash into the hash page table */
 570         va = hpt_va(ea, vsid, ssize);
 571
 572         /*
 573          * If no pte found or not present, send the problem up to
 574          * do_page_fault
 575          */
 576         if (unlikely(!ptep || pte_none(*ptep)))
 577                 goto out;
 578
 579         /*
 580          * Check the user's access rights to the page.  If access should be
 581          * prevented then send the problem up to do_page_fault.
 582          */
 583         if (unlikely(access & ~pte_val(*ptep)))
 584                 goto out;
 585         /*
 586          * At this point, we have a pte (old_pte) which can be used to build
 587          * or update an HPTE. There are 2 cases:
 588          *
 589          * 1. There is a valid (present) pte with no associated HPTE (this is
 590          *      the most common case)
 591          * 2. There is a valid (present) pte with an associated HPTE. The
 592          *      current values of the pp bits in the HPTE prevent access
 593          *      because we are doing software DIRTY bit management and the
 594          *      page is currently not DIRTY.
 595          */
 596
 597
             /* Mark the PTE busy and accessed with a compare-and-exchange,
              * retrying if it changed underneath us.  A PTE that is already
              * busy makes us bail out with err == 1. */
 598         do {
 599                 old_pte = pte_val(*ptep);
 600                 if (old_pte & _PAGE_BUSY)
 601                         goto out;
 602                 new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
 603         } while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
 604                                          old_pte, new_pte));
 605
 606         rflags = 0x2 | (!(new_pte & _PAGE_RW));
 607         /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
 608         rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
 609         shift = mmu_psize_to_shift(mmu_psize);
 610         sz = ((1UL) << shift);
 611         if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
 612                 /* No CPU has hugepages but lacks no execute, so we
 613                  * don't need to worry about that case */
 614                 rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
 615                                                        trap, sz);
 616
 617         /* Check if pte already has an hpte (case 2) */
 618         if (unlikely(old_pte & _PAGE_HASHPTE)) {
 619                 /* There MIGHT be an HPTE for this pte */
                     /* NOTE(review): this 'slot' shadows the function-scope
                      * 'long slot' declared above. */
 620                 unsigned long hash, slot;
 621
 622                 hash = hpt_hash(va, shift, ssize);
 623                 if (old_pte & _PAGE_F_SECOND)
 624                         hash = ~hash;
 625                 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 626                 slot += (old_pte & _PAGE_F_GIX) >> 12;
 627
                     /* If the update fails, forget the recorded HPTE so
                      * that the insert path below runs instead. */
 628                 if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
 629                                          ssize, local) == -1)
 630                         old_pte &= ~_PAGE_HPTEFLAGS;
 631         }
 632
 633         if (likely(!(old_pte & _PAGE_HASHPTE))) {
 634                 unsigned long hash = hpt_hash(va, shift, ssize);
 635                 unsigned long hpte_group;
 636
 637                 pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
 638
 639 repeat:
 640                 hpte_group = ((hash & htab_hash_mask) *
 641                               HPTES_PER_GROUP) & ~0x7UL;
 642
 643                 /* clear HPTE slot informations in new PTE */
 644 #ifdef CONFIG_PPC_64K_PAGES
 645                 new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
 646 #else
 647                 new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
 648 #endif
 649                 /* Add in WIMG bits */
 650                 rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
 651                                       _PAGE_COHERENT | _PAGE_GUARDED));
 652
 653                 /* Insert into the hash table, primary slot */
 654                 slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
 655                                           mmu_psize, ssize);
 656
 657                 /* Primary is full, try the secondary */
 658                 if (unlikely(slot == -1)) {
 659                         hpte_group = ((~hash & htab_hash_mask) *
 660                                       HPTES_PER_GROUP) & ~0x7UL;
 661                         slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
 662                                                   HPTE_V_SECONDARY,
 663                                                   mmu_psize, ssize);
 664                         if (slot == -1) {
                                     /* Both groups are full: remove an entry
                                      * from the secondary group, or from the
                                      * primary group when the low bit of
                                      * mftb() is set, then retry. */
 665                                 if (mftb() & 0x1)
 666                                         hpte_group = ((hash & htab_hash_mask) *
 667                                                       HPTES_PER_GROUP)&~0x7UL;
 668
 669                                 ppc_md.hpte_remove(hpte_group);
 670                                 goto repeat;
 671                         }
 672                 }
 673
                     /* An insert result of -2 is treated as fatal. */
 674                 if (unlikely(slot == -2))
 675                         panic("hash_huge_page: pte_insert failed\n");
 676
                     /* Record the slot bits of the new HPTE in the PTE. */
 677                 new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
 678         }
 679
 680         /*
 681          * No need to use ldarx/stdcx here
 682          */
 683         *ptep = __pte(new_pte & ~_PAGE_BUSY);
 684
 685         err = 0;
 686
 687  out:
 688         return err;
 689 }
 690
 /*
  * Enable @psize as a huge page size if the hardware and the page table
  * layout allow it.  On success an hstate is registered and
  * hugepte_shift[psize] is set to the page table level shift chosen for the
  * size minus the page shift.  Sizes that fail the first test get
  * hugepte_shift[psize] = 0; sizes that are already set up, equal the base
  * page size or have no cache name are left untouched.
  */
 691 static void __init set_huge_psize(int psize)
 692 {
 693         /* Check that it is a page size supported by the hardware and
 694          * that it fits within pagetable limits. */
 695         if (mmu_psize_defs[psize].shift &&
 696                 mmu_psize_defs[psize].shift < SID_SHIFT_1T &&
 697                 (mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT ||
 698                  mmu_psize_defs[psize].shift == PAGE_SHIFT_64K ||
 699                  mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) {
 700                 /* Return if huge page size has already been setup or is the
 701                  * same as the base page size. */
 702                 if (mmu_huge_psizes[psize] ||
 703                    mmu_psize_defs[psize].shift == PAGE_SHIFT)
 704                         return;
 705                 if (WARN_ON(HUGEPTE_CACHE_NAME(psize) == NULL))
 706                         return;
 707                 hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);
 708
                     /* NOTE(review): there is no default case.  A shift other
                      * than the three below would leave hugepte_shift[psize]
                      * at 0 before the subtraction further down - confirm no
                      * caller can pass such a size. */
 709                 switch (mmu_psize_defs[psize].shift) {
 710                 case PAGE_SHIFT_64K:
 711                     /* We only allow 64k hpages with 4k base page,
 712                      * which was checked above, and always put them
 713                      * at the PMD */
 714                     hugepte_shift[psize] = PMD_SHIFT;
 715                     break;
 716                 case PAGE_SHIFT_16M:
 717                     /* 16M pages can be at two different levels
 718                      * of pagetables based on base page size */
 719                     if (PAGE_SHIFT == PAGE_SHIFT_64K)
 720                             hugepte_shift[psize] = PMD_SHIFT;
 721                     else /* 4k base page */
 722                             hugepte_shift[psize] = PUD_SHIFT;
 723                     break;
 724                 case PAGE_SHIFT_16G:
 725                     /* 16G pages are always at PGD level */
 726                     hugepte_shift[psize] = PGDIR_SHIFT;
 727                     break;
 728                 }
                     /* Turn the level shift into log2 of the number of huge
                      * PTEs held by one table at that level. */
 729                 hugepte_shift[psize] -= mmu_psize_defs[psize].shift;
 730         } else
 731                 hugepte_shift[psize] = 0;
 732 }
 733
 734 static int __init hugepage_setup_sz(char *str)
 735 {
 736         unsigned long long size;
 737         int mmu_psize;
 738         int shift;
 739
 740         size = memparse(str, &str);
 741
 742         shift = __ffs(size);
 743         mmu_psize = shift_to_mmu_psize(shift);
 744         if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift)
 745                 set_huge_psize(mmu_psize);
 746         else
 747                 printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
 748
 749         return 1;
 750 }
 751 __setup("hugepagesz=", hugepage_setup_sz);
 752
 753 static int __init hugetlbpage_init(void)
 754 {
 755         unsigned int psize;
 756
 757         if (!cpu_has_feature(CPU_FTR_16M_PAGE))
 758                 return -ENODEV;
 759
 760         /* Add supported huge page sizes.  Need to change HUGE_MAX_HSTATE
 761          * and adjust PTE_NONCACHE_NUM if the number of supported huge page
 762          * sizes changes.
 763          */
 764         set_huge_psize(MMU_PAGE_16M);
 765         set_huge_psize(MMU_PAGE_16G);
 766
 767         /* Temporarily disable support for 64K huge pages when 64K SPU local
 768          * store support is enabled as the current implementation conflicts.
 769          */
 770 #ifndef CONFIG_SPU_FS_64K_LS
 771         set_huge_psize(MMU_PAGE_64K);
 772 #endif
 773
 774         for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
 775                 if (mmu_huge_psizes[psize]) {
 776                         pgtable_cache[HUGE_PGTABLE_INDEX(psize)] =
 777                                 kmem_cache_create(
 778                                         HUGEPTE_CACHE_NAME(psize),
 779                                         HUGEPTE_TABLE_SIZE(psize),
 780                                         HUGEPTE_TABLE_SIZE(psize),
 781                                         0,
 782                                         NULL);
 783                         if (!pgtable_cache[HUGE_PGTABLE_INDEX(psize)])
 784                                 panic("hugetlbpage_init(): could not create %s"\
 785                                       "\n", HUGEPTE_CACHE_NAME(psize));
 786                 }
 787         }
 788
 789         return 0;
 790 }
 791
 792 module_init(hugetlbpage_init);