arch/powerpc/mm/hugetlbpage.c

   1 /*
   2  * PPC64 (POWER4) Huge TLB Page Support for Kernel.
   3  *
   4  * Copyright (C) 2003 David Gibson, IBM Corporation.
   5  *
   6  * Based on the IA-32 version:
   7  * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
   8  */
   9
  10 #include <linux/init.h>
  11 #include <linux/fs.h>
  12 #include <linux/mm.h>
  13 #include <linux/hugetlb.h>
  14 #include <linux/pagemap.h>
  15 #include <linux/slab.h>
  16 #include <linux/err.h>
  17 #include <linux/sysctl.h>
  18 #include <asm/mman.h>
  19 #include <asm/pgalloc.h>
  20 #include <asm/tlb.h>
  21 #include <asm/tlbflush.h>
  22 #include <asm/mmu_context.h>
  23 #include <asm/machdep.h>
  24 #include <asm/cputable.h>
  25 #include <asm/tlb.h>
  26 #include <asm/spu.h>
  27
  28 #include <linux/sysctl.h>
  29
  30 #define NUM_LOW_AREAS   (0x100000000UL >> SID_SHIFT)
  31 #define NUM_HIGH_AREAS  (PGTABLE_RANGE >> HTLB_AREA_SHIFT)
  32
  33 #ifdef CONFIG_PPC_64K_PAGES
  34 #define HUGEPTE_INDEX_SIZE      (PMD_SHIFT-HPAGE_SHIFT)
  35 #else
  36 #define HUGEPTE_INDEX_SIZE      (PUD_SHIFT-HPAGE_SHIFT)
  37 #endif
  38 #define PTRS_PER_HUGEPTE        (1 << HUGEPTE_INDEX_SIZE)
  39 #define HUGEPTE_TABLE_SIZE      (sizeof(pte_t) << HUGEPTE_INDEX_SIZE)
  40
  41 #define HUGEPD_SHIFT            (HPAGE_SHIFT + HUGEPTE_INDEX_SIZE)
  42 #define HUGEPD_SIZE             (1UL << HUGEPD_SHIFT)
  43 #define HUGEPD_MASK             (~(HUGEPD_SIZE-1))
  44
  45 #define huge_pgtable_cache      (pgtable_cache[HUGEPTE_CACHE_NUM])
  46
  47 /* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
  48  * will choke on pointers to hugepte tables, which is handy for
  49  * catching screwups early. */
  50 #define HUGEPD_OK       0x1
  51
  52 typedef struct { unsigned long pd; } hugepd_t;
  53
  54 #define hugepd_none(hpd)        ((hpd).pd == 0)
  55
  56 static inline pte_t *hugepd_page(hugepd_t hpd)
  57 {
  58         BUG_ON(!(hpd.pd & HUGEPD_OK));
  59         return (pte_t *)(hpd.pd & ~HUGEPD_OK);
  60 }
  61
  62 static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr)
  63 {
  64         unsigned long idx = ((addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE-1));
  65         pte_t *dir = hugepd_page(*hpdp);
  66
  67         return dir + idx;
  68 }
  69
  70 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
  71                            unsigned long address)
  72 {
  73         pte_t *new = kmem_cache_alloc(huge_pgtable_cache,
  74                                       GFP_KERNEL|__GFP_REPEAT);
  75
  76         if (! new)
  77                 return -ENOMEM;
  78
  79         spin_lock(&mm->page_table_lock);
  80         if (!hugepd_none(*hpdp))
  81                 kmem_cache_free(huge_pgtable_cache, new);
  82         else
  83                 hpdp->pd = (unsigned long)new | HUGEPD_OK;
  84         spin_unlock(&mm->page_table_lock);
  85         return 0;
  86 }
  87
  88 /* Modelled after find_linux_pte() */
  89 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
  90 {
  91         pgd_t *pg;
  92         pud_t *pu;
  93
  94         BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize);
  95
  96         addr &= HPAGE_MASK;
  97
  98         pg = pgd_offset(mm, addr);
  99         if (!pgd_none(*pg)) {
 100                 pu = pud_offset(pg, addr);
 101                 if (!pud_none(*pu)) {
 102 #ifdef CONFIG_PPC_64K_PAGES
 103                         pmd_t *pm;
 104                         pm = pmd_offset(pu, addr);
 105                         if (!pmd_none(*pm))
 106                                 return hugepte_offset((hugepd_t *)pm, addr);
 107 #else
 108                         return hugepte_offset((hugepd_t *)pu, addr);
 109 #endif
 110                 }
 111         }
 112
 113         return NULL;
 114 }
 115
 116 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 117 {
 118         pgd_t *pg;
 119         pud_t *pu;
 120         hugepd_t *hpdp = NULL;
 121
 122         BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize);
 123
 124         addr &= HPAGE_MASK;
 125
 126         pg = pgd_offset(mm, addr);
 127         pu = pud_alloc(mm, pg, addr);
 128
 129         if (pu) {
 130 #ifdef CONFIG_PPC_64K_PAGES
 131                 pmd_t *pm;
 132                 pm = pmd_alloc(mm, pu, addr);
 133                 if (pm)
 134                         hpdp = (hugepd_t *)pm;
 135 #else
 136                 hpdp = (hugepd_t *)pu;
 137 #endif
 138         }
 139
 140         if (! hpdp)
 141                 return NULL;
 142
 143         if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr))
 144                 return NULL;
 145
 146         return hugepte_offset(hpdp, addr);
 147 }
 148
 149 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
 150 {
 151         return 0;
 152 }
 153
 154 static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp)
 155 {
 156         pte_t *hugepte = hugepd_page(*hpdp);
 157
 158         hpdp->pd = 0;
 159         tlb->need_flush = 1;
 160         pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, HUGEPTE_CACHE_NUM,
 161                                                  PGF_CACHENUM_MASK));
 162 }
 163
 164 #ifdef CONFIG_PPC_64K_PAGES
 165 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 166                                    unsigned long addr, unsigned long end,
 167                                    unsigned long floor, unsigned long ceiling)
 168 {
 169         pmd_t *pmd;
 170         unsigned long next;
 171         unsigned long start;
 172
 173         start = addr;
 174         pmd = pmd_offset(pud, addr);
 175         do {
 176                 next = pmd_addr_end(addr, end);
 177                 if (pmd_none(*pmd))
 178                         continue;
 179                 free_hugepte_range(tlb, (hugepd_t *)pmd);
 180         } while (pmd++, addr = next, addr != end);
 181
 182         start &= PUD_MASK;
 183         if (start < floor)
 184                 return;
 185         if (ceiling) {
 186                 ceiling &= PUD_MASK;
 187                 if (!ceiling)
 188                         return;
 189         }
 190         if (end - 1 > ceiling - 1)
 191                 return;
 192
 193         pmd = pmd_offset(pud, start);
 194         pud_clear(pud);
 195         pmd_free_tlb(tlb, pmd);
 196 }
 197 #endif
 198
 199 static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 200                                    unsigned long addr, unsigned long end,
 201                                    unsigned long floor, unsigned long ceiling)
 202 {
 203         pud_t *pud;
 204         unsigned long next;
 205         unsigned long start;
 206
 207         start = addr;
 208         pud = pud_offset(pgd, addr);
 209         do {
 210                 next = pud_addr_end(addr, end);
 211 #ifdef CONFIG_PPC_64K_PAGES
 212                 if (pud_none_or_clear_bad(pud))
 213                         continue;
 214                 hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
 215 #else
 216                 if (pud_none(*pud))
 217                         continue;
 218                 free_hugepte_range(tlb, (hugepd_t *)pud);
 219 #endif
 220         } while (pud++, addr = next, addr != end);
 221
 222         start &= PGDIR_MASK;
 223         if (start < floor)
 224                 return;
 225         if (ceiling) {
 226                 ceiling &= PGDIR_MASK;
 227                 if (!ceiling)
 228                         return;
 229         }
 230         if (end - 1 > ceiling - 1)
 231                 return;
 232
 233         pud = pud_offset(pgd, start);
 234         pgd_clear(pgd);
 235         pud_free_tlb(tlb, pud);
 236 }
 237
 238 /*
 239  * This function frees user-level page tables of a process.
 240  *
 241  * Must be called with pagetable lock held.
 242  */
 243 void hugetlb_free_pgd_range(struct mmu_gather **tlb,
 244                             unsigned long addr, unsigned long end,
 245                             unsigned long floor, unsigned long ceiling)
 246 {
 247         pgd_t *pgd;
 248         unsigned long next;
 249         unsigned long start;
 250
 251         /*
 252          * Comments below take from the normal free_pgd_range().  They
 253          * apply here too.  The tests against HUGEPD_MASK below are
 254          * essential, because we *don't* test for this at the bottom
 255          * level.  Without them we'll attempt to free a hugepte table
 256          * when we unmap just part of it, even if there are other
 257          * active mappings using it.
 258          *
 259          * The next few lines have given us lots of grief...
 260          *
 261          * Why are we testing HUGEPD* at this top level?  Because
 262          * often there will be no work to do at all, and we'd prefer
 263          * not to go all the way down to the bottom just to discover
 264          * that.
 265          *
 266          * Why all these "- 1"s?  Because 0 represents both the bottom
 267          * of the address space and the top of it (using -1 for the
 268          * top wouldn't help much: the masks would do the wrong thing).
 269          * The rule is that addr 0 and floor 0 refer to the bottom of
 270          * the address space, but end 0 and ceiling 0 refer to the top
 271          * Comparisons need to use "end - 1" and "ceiling - 1" (though
 272          * that end 0 case should be mythical).
 273          *
 274          * Wherever addr is brought up or ceiling brought down, we
 275          * must be careful to reject "the opposite 0" before it
 276          * confuses the subsequent tests.  But what about where end is
 277          * brought down by HUGEPD_SIZE below? no, end can't go down to
 278          * 0 there.
 279          *
 280          * Whereas we round start (addr) and ceiling down, by different
 281          * masks at different levels, in order to test whether a table
 282          * now has no other vmas using it, so can be freed, we don't
 283          * bother to round floor or end up - the tests don't need that.
 284          */
 285
 286         addr &= HUGEPD_MASK;
 287         if (addr < floor) {
 288                 addr += HUGEPD_SIZE;
 289                 if (!addr)
 290                         return;
 291         }
 292         if (ceiling) {
 293                 ceiling &= HUGEPD_MASK;
 294                 if (!ceiling)
 295                         return;
 296         }
 297         if (end - 1 > ceiling - 1)
 298                 end -= HUGEPD_SIZE;
 299         if (addr > end - 1)
 300                 return;
 301
 302         start = addr;
 303         pgd = pgd_offset((*tlb)->mm, addr);
 304         do {
 305                 BUG_ON(get_slice_psize((*tlb)->mm, addr) != mmu_huge_psize);
 306                 next = pgd_addr_end(addr, end);
 307                 if (pgd_none_or_clear_bad(pgd))
 308                         continue;
 309                 hugetlb_free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
 310         } while (pgd++, addr = next, addr != end);
 311 }
 312
 313 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 314                      pte_t *ptep, pte_t pte)
 315 {
 316         if (pte_present(*ptep)) {
 317                 /* We open-code pte_clear because we need to pass the right
 318                  * argument to hpte_need_flush (huge / !huge). Might not be
 319                  * necessary anymore if we make hpte_need_flush() get the
 320                  * page size from the slices
 321                  */
 322                 pte_update(mm, addr & HPAGE_MASK, ptep, ~0UL, 1);
 323         }
 324         *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
 325 }
 326
 327 pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 328                               pte_t *ptep)
 329 {
 330         unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
 331         return __pte(old);
 332 }
 333
 334 struct page *
 335 follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 336 {
 337         pte_t *ptep;
 338         struct page *page;
 339
 340         if (get_slice_psize(mm, address) != mmu_huge_psize)
 341                 return ERR_PTR(-EINVAL);
 342
 343         ptep = huge_pte_offset(mm, address);
 344         page = pte_page(*ptep);
 345         if (page)
 346                 page += (address % HPAGE_SIZE) / PAGE_SIZE;
 347
 348         return page;
 349 }
 350
 351 int pmd_huge(pmd_t pmd)
 352 {
 353         return 0;
 354 }
 355
 356 struct page *
 357 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 358                 pmd_t *pmd, int write)
 359 {
 360         BUG();
 361         return NULL;
 362 }
 363
 364
 365 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 366                                         unsigned long len, unsigned long pgoff,
 367                                         unsigned long flags)
 368 {
 369         return slice_get_unmapped_area(addr, len, flags,
 370                                        mmu_huge_psize, 1, 0);
 371 }
 372
 373 /*
 374  * Called by asm hashtable.S for doing lazy icache flush
 375  */
 376 static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
 377                                                   pte_t pte, int trap)
 378 {
 379         struct page *page;
 380         int i;
 381
 382         if (!pfn_valid(pte_pfn(pte)))
 383                 return rflags;
 384
 385         page = pte_page(pte);
 386
 387         /* page is dirty */
 388         if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
 389                 if (trap == 0x400) {
 390                         for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++)
 391                                 __flush_dcache_icache(page_address(page+i));
 392                         set_bit(PG_arch_1, &page->flags);
 393                 } else {
 394                         rflags |= HPTE_R_N;
 395                 }
 396         }
 397         return rflags;
 398 }
 399
 400 int hash_huge_page(struct mm_struct *mm, unsigned long access,
 401                    unsigned long ea, unsigned long vsid, int local,
 402                    unsigned long trap)
 403 {
 404         pte_t *ptep;
 405         unsigned long old_pte, new_pte;
 406         unsigned long va, rflags, pa;
 407         long slot;
 408         int err = 1;
 409
 410         ptep = huge_pte_offset(mm, ea);
 411
 412         /* Search the Linux page table for a match with va */
 413         va = (vsid << 28) | (ea & 0x0fffffff);
 414
 415         /*
 416          * If no pte found or not present, send the problem up to
 417          * do_page_fault
 418          */
 419         if (unlikely(!ptep || pte_none(*ptep)))
 420                 goto out;
 421
 422         /*
 423          * Check the user's access rights to the page.  If access should be
 424          * prevented then send the problem up to do_page_fault.
 425          */
 426         if (unlikely(access & ~pte_val(*ptep)))
 427                 goto out;
 428         /*
 429          * At this point, we have a pte (old_pte) which can be used to build
 430          * or update an HPTE. There are 2 cases:
 431          *
 432          * 1. There is a valid (present) pte with no associated HPTE (this is
 433          *      the most common case)
 434          * 2. There is a valid (present) pte with an associated HPTE. The
 435          *      current values of the pp bits in the HPTE prevent access
 436          *      because we are doing software DIRTY bit management and the
 437          *      page is currently not DIRTY.
 438          */
 439
 440
 441         do {
 442                 old_pte = pte_val(*ptep);
 443                 if (old_pte & _PAGE_BUSY)
 444                         goto out;
 445                 new_pte = old_pte | _PAGE_BUSY |
 446                         _PAGE_ACCESSED | _PAGE_HASHPTE;
 447         } while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
 448                                          old_pte, new_pte));
 449
 450         rflags = 0x2 | (!(new_pte & _PAGE_RW));
 451         /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
 452         rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
 453         if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
 454                 /* No CPU has hugepages but lacks no execute, so we
 455                  * don't need to worry about that case */
 456                 rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
 457                                                        trap);
 458
 459         /* Check if pte already has an hpte (case 2) */
 460         if (unlikely(old_pte & _PAGE_HASHPTE)) {
 461                 /* There MIGHT be an HPTE for this pte */
 462                 unsigned long hash, slot;
 463
 464                 hash = hpt_hash(va, HPAGE_SHIFT);
 465                 if (old_pte & _PAGE_F_SECOND)
 466                         hash = ~hash;
 467                 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 468                 slot += (old_pte & _PAGE_F_GIX) >> 12;
 469
 470                 if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_huge_psize,
 471                                          local) == -1)
 472                         old_pte &= ~_PAGE_HPTEFLAGS;
 473         }
 474
 475         if (likely(!(old_pte & _PAGE_HASHPTE))) {
 476                 unsigned long hash = hpt_hash(va, HPAGE_SHIFT);
 477                 unsigned long hpte_group;
 478
 479                 pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
 480
 481 repeat:
 482                 hpte_group = ((hash & htab_hash_mask) *
 483                               HPTES_PER_GROUP) & ~0x7UL;
 484
 485                 /* clear HPTE slot informations in new PTE */
 486                 new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
 487
 488                 /* Add in WIMG bits */
 489                 /* XXX We should store these in the pte */
 490                 /* --BenH: I think they are ... */
 491                 rflags |= _PAGE_COHERENT;
 492
 493                 /* Insert into the hash table, primary slot */
 494                 slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
 495                                           mmu_huge_psize);
 496
 497                 /* Primary is full, try the secondary */
 498                 if (unlikely(slot == -1)) {
 499                         hpte_group = ((~hash & htab_hash_mask) *
 500                                       HPTES_PER_GROUP) & ~0x7UL;
 501                         slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
 502                                                   HPTE_V_SECONDARY,
 503                                                   mmu_huge_psize);
 504                         if (slot == -1) {
 505                                 if (mftb() & 0x1)
 506                                         hpte_group = ((hash & htab_hash_mask) *
 507                                                       HPTES_PER_GROUP)&~0x7UL;
 508
 509                                 ppc_md.hpte_remove(hpte_group);
 510                                 goto repeat;
 511                         }
 512                 }
 513
 514                 if (unlikely(slot == -2))
 515                         panic("hash_huge_page: pte_insert failed\n");
 516
 517                 new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
 518         }
 519
 520         /*
 521          * No need to use ldarx/stdcx here
 522          */
 523         *ptep = __pte(new_pte & ~_PAGE_BUSY);
 524
 525         err = 0;
 526
 527  out:
 528         return err;
 529 }
 530
 531 static void zero_ctor(void *addr, struct kmem_cache *cache, unsigned long flags)
 532 {
 533         memset(addr, 0, kmem_cache_size(cache));
 534 }
 535
 536 static int __init hugetlbpage_init(void)
 537 {
 538         if (!cpu_has_feature(CPU_FTR_16M_PAGE))
 539                 return -ENODEV;
 540
 541         huge_pgtable_cache = kmem_cache_create("hugepte_cache",
 542                                                HUGEPTE_TABLE_SIZE,
 543                                                HUGEPTE_TABLE_SIZE,
 544                                                0,
 545                                                zero_ctor);
 546         if (! huge_pgtable_cache)
 547                 panic("hugetlbpage_init(): could not create hugepte cache\n");
 548
 549         return 0;
 550 }
 551
 552 module_init(hugetlbpage_init);