arch/x86/mm/pgtable.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 #include <linux/mm.h>
   3 #include <linux/gfp.h>
   4 #include <linux/hugetlb.h>
   5 #include <asm/pgalloc.h>
   6 #include <asm/pgtable.h>
   7 #include <asm/tlb.h>
   8 #include <asm/fixmap.h>
   9 #include <asm/mtrr.h>
  10
  11 #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
  12 phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
  13 EXPORT_SYMBOL(physical_mask);
  14 #endif
  15
  16 #define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO)
  17
  18 #ifdef CONFIG_HIGHPTE
  19 #define PGALLOC_USER_GFP __GFP_HIGHMEM
  20 #else
  21 #define PGALLOC_USER_GFP 0
  22 #endif
  23
  24 gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
  25
  26 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
  27 {
  28         return (pte_t *)__get_free_page(PGALLOC_GFP & ~__GFP_ACCOUNT);
  29 }
  30
  31 pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
  32 {
  33         struct page *pte;
  34
  35         pte = alloc_pages(__userpte_alloc_gfp, 0);
  36         if (!pte)
  37                 return NULL;
  38         if (!pgtable_page_ctor(pte)) {
  39                 __free_page(pte);
  40                 return NULL;
  41         }
  42         return pte;
  43 }
  44
  45 static int __init setup_userpte(char *arg)
  46 {
  47         if (!arg)
  48                 return -EINVAL;
  49
  50         /*
  51          * "userpte=nohigh" disables allocation of user pagetables in
  52          * high memory.
  53          */
  54         if (strcmp(arg, "nohigh") == 0)
  55                 __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
  56         else
  57                 return -EINVAL;
  58         return 0;
  59 }
  60 early_param("userpte", setup_userpte);
  61
  62 void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
  63 {
  64         pgtable_page_dtor(pte);
  65         paravirt_release_pte(page_to_pfn(pte));
  66         tlb_remove_table(tlb, pte);
  67 }
  68
  69 #if CONFIG_PGTABLE_LEVELS > 2
  70 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
  71 {
  72         struct page *page = virt_to_page(pmd);
  73         paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
  74         /*
  75          * NOTE! For PAE, any changes to the top page-directory-pointer-table
  76          * entries need a full cr3 reload to flush.
  77          */
  78 #ifdef CONFIG_X86_PAE
  79         tlb->need_flush_all = 1;
  80 #endif
  81         pgtable_pmd_page_dtor(page);
  82         tlb_remove_table(tlb, page);
  83 }
  84
  85 #if CONFIG_PGTABLE_LEVELS > 3
  86 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
  87 {
  88         paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
  89         tlb_remove_table(tlb, virt_to_page(pud));
  90 }
  91
  92 #if CONFIG_PGTABLE_LEVELS > 4
  93 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
  94 {
  95         paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
  96         tlb_remove_table(tlb, virt_to_page(p4d));
  97 }
  98 #endif  /* CONFIG_PGTABLE_LEVELS > 4 */
  99 #endif  /* CONFIG_PGTABLE_LEVELS > 3 */
 100 #endif  /* CONFIG_PGTABLE_LEVELS > 2 */
 101
 102 static inline void pgd_list_add(pgd_t *pgd)
 103 {
 104         struct page *page = virt_to_page(pgd);
 105
 106         list_add(&page->lru, &pgd_list);
 107 }
 108
 109 static inline void pgd_list_del(pgd_t *pgd)
 110 {
 111         struct page *page = virt_to_page(pgd);
 112
 113         list_del(&page->lru);
 114 }
 115
 116 #define UNSHARED_PTRS_PER_PGD                           \
 117         (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
 118
 119
 120 static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
 121 {
 122         virt_to_page(pgd)->pt_mm = mm;
 123 }
 124
 125 struct mm_struct *pgd_page_get_mm(struct page *page)
 126 {
 127         return page->pt_mm;
 128 }
 129
 130 static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
 131 {
 132         /* If the pgd points to a shared pagetable level (either the
 133            ptes in non-PAE, or shared PMD in PAE), then just copy the
 134            references from swapper_pg_dir. */
 135         if (CONFIG_PGTABLE_LEVELS == 2 ||
 136             (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
 137             CONFIG_PGTABLE_LEVELS >= 4) {
 138                 clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
 139                                 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
 140                                 KERNEL_PGD_PTRS);
 141         }
 142
 143         /* list required to sync kernel mapping updates */
 144         if (!SHARED_KERNEL_PMD) {
 145                 pgd_set_mm(pgd, mm);
 146                 pgd_list_add(pgd);
 147         }
 148 }
 149
 150 static void pgd_dtor(pgd_t *pgd)
 151 {
 152         if (SHARED_KERNEL_PMD)
 153                 return;
 154
 155         spin_lock(&pgd_lock);
 156         pgd_list_del(pgd);
 157         spin_unlock(&pgd_lock);
 158 }
 159
 160 /*
 161  * List of all pgd's needed for non-PAE so it can invalidate entries
 162  * in both cached and uncached pgd's; not needed for PAE since the
 163  * kernel pmd is shared. If PAE were not to share the pmd a similar
 164  * tactic would be needed. This is essentially codepath-based locking
 165  * against pageattr.c; it is the unique case in which a valid change
 166  * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 167  * vmalloc faults work because attached pagetables are never freed.
 168  * -- nyc
 169  */
 170
 171 #ifdef CONFIG_X86_PAE
 172 /*
 173  * In PAE mode, we need to do a cr3 reload (=tlb flush) when
 174  * updating the top-level pagetable entries to guarantee the
 175  * processor notices the update.  Since this is expensive, and
 176  * all 4 top-level entries are used almost immediately in a
 177  * new process's life, we just pre-populate them here.
 178  *
 * Also, if we're in a paravirt environment where the kernel pmd is
 * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate
 * and initialize the kernel pmds here.
 182  */
 183 #define PREALLOCATED_PMDS       UNSHARED_PTRS_PER_PGD
 184
 185 void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
 186 {
 187         paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
 188
 189         /* Note: almost everything apart from _PAGE_PRESENT is
 190            reserved at the pmd (PDPT) level. */
 191         set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
 192
 193         /*
 194          * According to Intel App note "TLBs, Paging-Structure Caches,
 195          * and Their Invalidation", April 2007, document 317080-001,
 196          * section 8.1: in PAE mode we explicitly have to flush the
 197          * TLB via cr3 if the top-level pgd is changed...
 198          */
 199         flush_tlb_mm(mm);
 200 }
 201 #else  /* !CONFIG_X86_PAE */
 202
 203 /* No need to prepopulate any pagetable entries in non-PAE modes. */
 204 #define PREALLOCATED_PMDS       0
 205
 206 #endif  /* CONFIG_X86_PAE */
 207
 208 static void free_pmds(struct mm_struct *mm, pmd_t *pmds[])
 209 {
 210         int i;
 211
 212         for(i = 0; i < PREALLOCATED_PMDS; i++)
 213                 if (pmds[i]) {
 214                         pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
 215                         free_page((unsigned long)pmds[i]);
 216                         mm_dec_nr_pmds(mm);
 217                 }
 218 }
 219
 220 static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
 221 {
 222         int i;
 223         bool failed = false;
 224         gfp_t gfp = PGALLOC_GFP;
 225
 226         if (mm == &init_mm)
 227                 gfp &= ~__GFP_ACCOUNT;
 228
 229         for(i = 0; i < PREALLOCATED_PMDS; i++) {
 230                 pmd_t *pmd = (pmd_t *)__get_free_page(gfp);
 231                 if (!pmd)
 232                         failed = true;
 233                 if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
 234                         free_page((unsigned long)pmd);
 235                         pmd = NULL;
 236                         failed = true;
 237                 }
 238                 if (pmd)
 239                         mm_inc_nr_pmds(mm);
 240                 pmds[i] = pmd;
 241         }
 242
 243         if (failed) {
 244                 free_pmds(mm, pmds);
 245                 return -ENOMEM;
 246         }
 247
 248         return 0;
 249 }
 250
 251 /*
 252  * Mop up any pmd pages which may still be attached to the pgd.
 253  * Normally they will be freed by munmap/exit_mmap, but any pmd we
 254  * preallocate which never got a corresponding vma will need to be
 255  * freed manually.
 256  */
 257 static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
 258 {
 259         int i;
 260
 261         for(i = 0; i < PREALLOCATED_PMDS; i++) {
 262                 pgd_t pgd = pgdp[i];
 263
 264                 if (pgd_val(pgd) != 0) {
 265                         pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
 266
 267                         pgdp[i] = native_make_pgd(0);
 268
 269                         paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
 270                         pmd_free(mm, pmd);
 271                         mm_dec_nr_pmds(mm);
 272                 }
 273         }
 274 }
 275
 276 static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
 277 {
 278         p4d_t *p4d;
 279         pud_t *pud;
 280         int i;
 281
 282         if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
 283                 return;
 284
 285         p4d = p4d_offset(pgd, 0);
 286         pud = pud_offset(p4d, 0);
 287
 288         for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
 289                 pmd_t *pmd = pmds[i];
 290
 291                 if (i >= KERNEL_PGD_BOUNDARY)
 292                         memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
 293                                sizeof(pmd_t) * PTRS_PER_PMD);
 294
 295                 pud_populate(mm, pud, pmd);
 296         }
 297 }
 298
 299 /*
 300  * Xen paravirt assumes pgd table should be in one page. 64 bit kernel also
 301  * assumes that pgd should be in one page.
 302  *
 303  * But kernel with PAE paging that is not running as a Xen domain
 304  * only needs to allocate 32 bytes for pgd instead of one page.
 305  */
 306 #ifdef CONFIG_X86_PAE
 307
 308 #include <linux/slab.h>
 309
 310 #define PGD_SIZE        (PTRS_PER_PGD * sizeof(pgd_t))
 311 #define PGD_ALIGN       32
 312
 313 static struct kmem_cache *pgd_cache;
 314
 315 static int __init pgd_cache_init(void)
 316 {
 317         /*
 318          * When PAE kernel is running as a Xen domain, it does not use
 319          * shared kernel pmd. And this requires a whole page for pgd.
 320          */
 321         if (!SHARED_KERNEL_PMD)
 322                 return 0;
 323
 324         /*
 325          * when PAE kernel is not running as a Xen domain, it uses
 326          * shared kernel pmd. Shared kernel pmd does not require a whole
 327          * page for pgd. We are able to just allocate a 32-byte for pgd.
 328          * During boot time, we create a 32-byte slab for pgd table allocation.
 329          */
 330         pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
 331                                       SLAB_PANIC, NULL);
 332         if (!pgd_cache)
 333                 return -ENOMEM;
 334
 335         return 0;
 336 }
 337 core_initcall(pgd_cache_init);
 338
 339 static inline pgd_t *_pgd_alloc(void)
 340 {
 341         /*
 342          * If no SHARED_KERNEL_PMD, PAE kernel is running as a Xen domain.
 343          * We allocate one page for pgd.
 344          */
 345         if (!SHARED_KERNEL_PMD)
 346                 return (pgd_t *)__get_free_page(PGALLOC_GFP);
 347
 348         /*
 349          * Now PAE kernel is not running as a Xen domain. We can allocate
 350          * a 32-byte slab for pgd to save memory space.
 351          */
 352         return kmem_cache_alloc(pgd_cache, PGALLOC_GFP);
 353 }
 354
 355 static inline void _pgd_free(pgd_t *pgd)
 356 {
 357         if (!SHARED_KERNEL_PMD)
 358                 free_page((unsigned long)pgd);
 359         else
 360                 kmem_cache_free(pgd_cache, pgd);
 361 }
 362 #else
 363
 364 static inline pgd_t *_pgd_alloc(void)
 365 {
 366         return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
 367 }
 368
 369 static inline void _pgd_free(pgd_t *pgd)
 370 {
 371         free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
 372 }
 373 #endif /* CONFIG_X86_PAE */
 374
 375 pgd_t *pgd_alloc(struct mm_struct *mm)
 376 {
 377         pgd_t *pgd;
 378         pmd_t *pmds[PREALLOCATED_PMDS];
 379
 380         pgd = _pgd_alloc();
 381
 382         if (pgd == NULL)
 383                 goto out;
 384
 385         mm->pgd = pgd;
 386
 387         if (preallocate_pmds(mm, pmds) != 0)
 388                 goto out_free_pgd;
 389
 390         if (paravirt_pgd_alloc(mm) != 0)
 391                 goto out_free_pmds;
 392
 393         /*
 394          * Make sure that pre-populating the pmds is atomic with
 395          * respect to anything walking the pgd_list, so that they
 396          * never see a partially populated pgd.
 397          */
 398         spin_lock(&pgd_lock);
 399
 400         pgd_ctor(mm, pgd);
 401         pgd_prepopulate_pmd(mm, pgd, pmds);
 402
 403         spin_unlock(&pgd_lock);
 404
 405         return pgd;
 406
 407 out_free_pmds:
 408         free_pmds(mm, pmds);
 409 out_free_pgd:
 410         _pgd_free(pgd);
 411 out:
 412         return NULL;
 413 }
 414
 415 void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 416 {
 417         pgd_mop_up_pmds(mm, pgd);
 418         pgd_dtor(pgd);
 419         paravirt_pgd_free(mm, pgd);
 420         _pgd_free(pgd);
 421 }
 422
 423 /*
 424  * Used to set accessed or dirty bits in the page table entries
 425  * on other architectures. On x86, the accessed and dirty bits
 426  * are tracked by hardware. However, do_wp_page calls this function
 427  * to also make the pte writeable at the same time the dirty bit is
 428  * set. In that case we do actually need to write the PTE.
 429  */
 430 int ptep_set_access_flags(struct vm_area_struct *vma,
 431                           unsigned long address, pte_t *ptep,
 432                           pte_t entry, int dirty)
 433 {
 434         int changed = !pte_same(*ptep, entry);
 435
 436         if (changed && dirty)
 437                 *ptep = entry;
 438
 439         return changed;
 440 }
 441
 442 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 443 int pmdp_set_access_flags(struct vm_area_struct *vma,
 444                           unsigned long address, pmd_t *pmdp,
 445                           pmd_t entry, int dirty)
 446 {
 447         int changed = !pmd_same(*pmdp, entry);
 448
 449         VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 450
 451         if (changed && dirty) {
 452                 *pmdp = entry;
 453                 /*
 454                  * We had a write-protection fault here and changed the pmd
 455                  * to to more permissive. No need to flush the TLB for that,
 456                  * #PF is architecturally guaranteed to do that and in the
 457                  * worst-case we'll generate a spurious fault.
 458                  */
 459         }
 460
 461         return changed;
 462 }
 463
 464 int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
 465                           pud_t *pudp, pud_t entry, int dirty)
 466 {
 467         int changed = !pud_same(*pudp, entry);
 468
 469         VM_BUG_ON(address & ~HPAGE_PUD_MASK);
 470
 471         if (changed && dirty) {
 472                 *pudp = entry;
 473                 /*
 474                  * We had a write-protection fault here and changed the pud
 475                  * to to more permissive. No need to flush the TLB for that,
 476                  * #PF is architecturally guaranteed to do that and in the
 477                  * worst-case we'll generate a spurious fault.
 478                  */
 479         }
 480
 481         return changed;
 482 }
 483 #endif
 484
 485 int ptep_test_and_clear_young(struct vm_area_struct *vma,
 486                               unsigned long addr, pte_t *ptep)
 487 {
 488         int ret = 0;
 489
 490         if (pte_young(*ptep))
 491                 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
 492                                          (unsigned long *) &ptep->pte);
 493
 494         return ret;
 495 }
 496
 497 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 498 int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 499                               unsigned long addr, pmd_t *pmdp)
 500 {
 501         int ret = 0;
 502
 503         if (pmd_young(*pmdp))
 504                 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
 505                                          (unsigned long *)pmdp);
 506
 507         return ret;
 508 }
 509 int pudp_test_and_clear_young(struct vm_area_struct *vma,
 510                               unsigned long addr, pud_t *pudp)
 511 {
 512         int ret = 0;
 513
 514         if (pud_young(*pudp))
 515                 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
 516                                          (unsigned long *)pudp);
 517
 518         return ret;
 519 }
 520 #endif
 521
 522 int ptep_clear_flush_young(struct vm_area_struct *vma,
 523                            unsigned long address, pte_t *ptep)
 524 {
 525         /*
 526          * On x86 CPUs, clearing the accessed bit without a TLB flush
 527          * doesn't cause data corruption. [ It could cause incorrect
 528          * page aging and the (mistaken) reclaim of hot pages, but the
 529          * chance of that should be relatively low. ]
 530          *
 531          * So as a performance optimization don't flush the TLB when
 532          * clearing the accessed bit, it will eventually be flushed by
 533          * a context switch or a VM operation anyway. [ In the rare
 534          * event of it not getting flushed for a long time the delay
 535          * shouldn't really matter because there's no real memory
 536          * pressure for swapout to react to. ]
 537          */
 538         return ptep_test_and_clear_young(vma, address, ptep);
 539 }
 540
 541 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 542 int pmdp_clear_flush_young(struct vm_area_struct *vma,
 543                            unsigned long address, pmd_t *pmdp)
 544 {
 545         int young;
 546
 547         VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 548
 549         young = pmdp_test_and_clear_young(vma, address, pmdp);
 550         if (young)
 551                 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 552
 553         return young;
 554 }
 555 #endif
 556
 557 /**
 558  * reserve_top_address - reserves a hole in the top of kernel address space
 559  * @reserve - size of hole to reserve
 560  *
 561  * Can be used to relocate the fixmap area and poke a hole in the top
 562  * of kernel address space to make room for a hypervisor.
 563  */
 564 void __init reserve_top_address(unsigned long reserve)
 565 {
 566 #ifdef CONFIG_X86_32
 567         BUG_ON(fixmaps_set > 0);
 568         __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
 569         printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
 570                -reserve, __FIXADDR_TOP + PAGE_SIZE);
 571 #endif
 572 }
 573
 574 int fixmaps_set;
 575
 576 void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
 577 {
 578         unsigned long address = __fix_to_virt(idx);
 579
 580         if (idx >= __end_of_fixed_addresses) {
 581                 BUG();
 582                 return;
 583         }
 584         set_pte_vaddr(address, pte);
 585         fixmaps_set++;
 586 }
 587
 588 void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
 589                        pgprot_t flags)
 590 {
 591         /* Sanitize 'prot' against any unsupported bits: */
 592         pgprot_val(flags) &= __default_kernel_pte_mask;
 593
 594         __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
 595 }
 596
 597 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
 598 #ifdef CONFIG_X86_5LEVEL
 599 /**
 600  * p4d_set_huge - setup kernel P4D mapping
 601  *
 602  * No 512GB pages yet -- always return 0
 603  */
 604 int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
 605 {
 606         return 0;
 607 }
 608
 609 /**
 610  * p4d_clear_huge - clear kernel P4D mapping when it is set
 611  *
 612  * No 512GB pages yet -- always return 0
 613  */
 614 int p4d_clear_huge(p4d_t *p4d)
 615 {
 616         return 0;
 617 }
 618 #endif
 619
 620 /**
 621  * pud_set_huge - setup kernel PUD mapping
 622  *
 623  * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
 624  * function sets up a huge page only if any of the following conditions are met:
 625  *
 626  * - MTRRs are disabled, or
 627  *
 628  * - MTRRs are enabled and the range is completely covered by a single MTRR, or
 629  *
 630  * - MTRRs are enabled and the corresponding MTRR memory type is WB, which
 631  *   has no effect on the requested PAT memory type.
 632  *
 633  * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
 634  * page mapping attempt fails.
 635  *
 636  * Returns 1 on success and 0 on failure.
 637  */
 638 int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
 639 {
 640         u8 mtrr, uniform;
 641
 642         mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
 643         if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
 644             (mtrr != MTRR_TYPE_WRBACK))
 645                 return 0;
 646
 647         /* Bail out if we are we on a populated non-leaf entry: */
 648         if (pud_present(*pud) && !pud_huge(*pud))
 649                 return 0;
 650
 651         prot = pgprot_4k_2_large(prot);
 652
 653         set_pte((pte_t *)pud, pfn_pte(
 654                 (u64)addr >> PAGE_SHIFT,
 655                 __pgprot(pgprot_val(prot) | _PAGE_PSE)));
 656
 657         return 1;
 658 }
 659
 660 /**
 661  * pmd_set_huge - setup kernel PMD mapping
 662  *
 663  * See text over pud_set_huge() above.
 664  *
 665  * Returns 1 on success and 0 on failure.
 666  */
 667 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
 668 {
 669         u8 mtrr, uniform;
 670
 671         mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
 672         if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
 673             (mtrr != MTRR_TYPE_WRBACK)) {
 674                 pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
 675                              __func__, addr, addr + PMD_SIZE);
 676                 return 0;
 677         }
 678
 679         /* Bail out if we are we on a populated non-leaf entry: */
 680         if (pmd_present(*pmd) && !pmd_huge(*pmd))
 681                 return 0;
 682
 683         prot = pgprot_4k_2_large(prot);
 684
 685         set_pte((pte_t *)pmd, pfn_pte(
 686                 (u64)addr >> PAGE_SHIFT,
 687                 __pgprot(pgprot_val(prot) | _PAGE_PSE)));
 688
 689         return 1;
 690 }
 691
 692 /**
 693  * pud_clear_huge - clear kernel PUD mapping when it is set
 694  *
 695  * Returns 1 on success and 0 on failure (no PUD map is found).
 696  */
 697 int pud_clear_huge(pud_t *pud)
 698 {
 699         if (pud_large(*pud)) {
 700                 pud_clear(pud);
 701                 return 1;
 702         }
 703
 704         return 0;
 705 }
 706
 707 /**
 708  * pmd_clear_huge - clear kernel PMD mapping when it is set
 709  *
 710  * Returns 1 on success and 0 on failure (no PMD map is found).
 711  */
 712 int pmd_clear_huge(pmd_t *pmd)
 713 {
 714         if (pmd_large(*pmd)) {
 715                 pmd_clear(pmd);
 716                 return 1;
 717         }
 718
 719         return 0;
 720 }
 721
 722 #ifdef CONFIG_X86_64
 723 /**
 724  * pud_free_pmd_page - Clear pud entry and free pmd page.
 725  * @pud: Pointer to a PUD.
 726  * @addr: Virtual address associated with pud.
 727  *
 728  * Context: The pud range has been unmapped and TLB purged.
 729  * Return: 1 if clearing the entry succeeded. 0 otherwise.
 730  *
 731  * NOTE: Callers must allow a single page allocation.
 732  */
 733 int pud_free_pmd_page(pud_t *pud, unsigned long addr)
 734 {
 735         pmd_t *pmd, *pmd_sv;
 736         pte_t *pte;
 737         int i;
 738
 739         if (pud_none(*pud))
 740                 return 1;
 741
 742         pmd = (pmd_t *)pud_page_vaddr(*pud);
 743         pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
 744         if (!pmd_sv)
 745                 return 0;
 746
 747         for (i = 0; i < PTRS_PER_PMD; i++) {
 748                 pmd_sv[i] = pmd[i];
 749                 if (!pmd_none(pmd[i]))
 750                         pmd_clear(&pmd[i]);
 751         }
 752
 753         pud_clear(pud);
 754
 755         /* INVLPG to clear all paging-structure caches */
 756         flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
 757
 758         for (i = 0; i < PTRS_PER_PMD; i++) {
 759                 if (!pmd_none(pmd_sv[i])) {
 760                         pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
 761                         free_page((unsigned long)pte);
 762                 }
 763         }
 764
 765         free_page((unsigned long)pmd_sv);
 766         free_page((unsigned long)pmd);
 767
 768         return 1;
 769 }
 770
 771 /**
 772  * pmd_free_pte_page - Clear pmd entry and free pte page.
 773  * @pmd: Pointer to a PMD.
 774  * @addr: Virtual address associated with pmd.
 775  *
 776  * Context: The pmd range has been unmapped and TLB purged.
 777  * Return: 1 if clearing the entry succeeded. 0 otherwise.
 778  */
 779 int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
 780 {
 781         pte_t *pte;
 782
 783         if (pmd_none(*pmd))
 784                 return 1;
 785
 786         pte = (pte_t *)pmd_page_vaddr(*pmd);
 787         pmd_clear(pmd);
 788
 789         /* INVLPG to clear all paging-structure caches */
 790         flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
 791
 792         free_page((unsigned long)pte);
 793
 794         return 1;
 795 }
 796
 797 #else /* !CONFIG_X86_64 */
 798
 799 int pud_free_pmd_page(pud_t *pud, unsigned long addr)
 800 {
 801         return pud_none(*pud);
 802 }
 803
 804 /*
 805  * Disable free page handling on x86-PAE. This assures that ioremap()
 806  * does not update sync'd pmd entries. See vmalloc_sync_one().
 807  */
 808 int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
 809 {
 810         return pmd_none(*pmd);
 811 }
 812
 813 #endif /* CONFIG_X86_64 */
 814 #endif  /* CONFIG_HAVE_ARCH_HUGE_VMAP */