/*
 *    Copyright IBM Corp. 2007, 2011
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/sysctl.h>
#include <linux/ksm.h>
#include <linux/mman.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

#define ALLOC_ORDER	2
#define FRAG_MASK	0x03

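/*
 * The low FRAG_MASK bits of page->_mapcount track which 1K/2K page table
 * fragments of a 4K page are allocated (see page_table_alloc() below);
 * the same bits shifted left by four track fragments that are still
 * pending RCU removal (see page_table_free_rcu()).
 */
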
unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, ALLOC_ORDER);
}

static void __crst_table_upgrade(void *arg)
{
	struct mm_struct *mm = arg;

	if (current->active_mm == mm) {
		clear_user_asce();
		set_user_asce(mm);
	}
	__tlb_flush_local();
}

int crst_table_upgrade(struct mm_struct *mm)
{
	unsigned long *table, *pgd;

	/* upgrade should only happen from 3 to 4 levels */
	BUG_ON(mm->context.asce_limit != (1UL << 42));

	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;

	spin_lock_bh(&mm->page_table_lock);
	pgd = (unsigned long *) mm->pgd;
	crst_table_init(table, _REGION2_ENTRY_EMPTY);
	pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
	mm->pgd = (pgd_t *) table;
	mm->context.asce_limit = 1UL << 53;
	mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
			   _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
	mm->task_size = mm->context.asce_limit;
	spin_unlock_bh(&mm->page_table_lock);

	on_each_cpu(__crst_table_upgrade, mm, 0);
	return 0;
}

void crst_table_downgrade(struct mm_struct *mm)
{
	pgd_t *pgd;

	/* downgrade should only happen from 3 to 2 levels (compat only) */
	BUG_ON(mm->context.asce_limit != (1UL << 42));

	if (current->active_mm == mm) {
		clear_user_asce();
		__tlb_flush_mm(mm);
	}

	pgd = mm->pgd;
	mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
	mm->context.asce_limit = 1UL << 31;
	mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
			   _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
	mm->task_size = mm->context.asce_limit;
	crst_table_free(mm, (unsigned long *) pgd);

	if (current->active_mm == mm)
		set_user_asce(mm);
}

#ifdef CONFIG_PGSTE

/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 * @limit: maximum size of the gmap address space
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
{
	struct gmap *gmap;
	struct page *page;
	unsigned long *table;
	unsigned long etype, atype;

	if (limit < (1UL << 31)) {
		limit = (1UL << 31) - 1;
		atype = _ASCE_TYPE_SEGMENT;
		etype = _SEGMENT_ENTRY_EMPTY;
	} else if (limit < (1UL << 42)) {
		limit = (1UL << 42) - 1;
		atype = _ASCE_TYPE_REGION3;
		etype = _REGION3_ENTRY_EMPTY;
	} else if (limit < (1UL << 53)) {
		limit = (1UL << 53) - 1;
		atype = _ASCE_TYPE_REGION2;
		etype = _REGION2_ENTRY_EMPTY;
	} else {
		limit = -1UL;
		atype = _ASCE_TYPE_REGION1;
		etype = _REGION1_ENTRY_EMPTY;
	}
	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
	if (!gmap)
		goto out;
	INIT_LIST_HEAD(&gmap->crst_list);
	INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
	INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
	spin_lock_init(&gmap->guest_table_lock);
	gmap->mm = mm;
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		goto out_free;
	page->index = 0;
	list_add(&page->lru, &gmap->crst_list);
	table = (unsigned long *) page_to_phys(page);
	crst_table_init(table, etype);
	gmap->table = table;
	gmap->asce = atype | _ASCE_TABLE_LENGTH |
		_ASCE_USER_BITS | __pa(table);
	gmap->asce_end = limit;
	down_write(&mm->mmap_sem);
	list_add(&gmap->list, &mm->context.gmap_list);
	up_write(&mm->mmap_sem);
	return gmap;

out_free:
	kfree(gmap);
out:
	return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);

static void gmap_flush_tlb(struct gmap *gmap)
{
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte(gmap->asce);
	else
		__tlb_flush_global();
}

static void gmap_radix_tree_free(struct radix_tree_root *root)
{
	struct radix_tree_iter iter;
	unsigned long indices[16];
	unsigned long index;
	void **slot;
	int i, nr;

	/* A radix tree is freed by deleting all of its entries */
	index = 0;
	do {
		nr = 0;
		radix_tree_for_each_slot(slot, root, &iter, index) {
			indices[nr] = iter.index;
			if (++nr == 16)
				break;
		}
		for (i = 0; i < nr; i++) {
			index = indices[i];
			radix_tree_delete(root, index);
		}
	} while (nr > 0);
}

/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_free(struct gmap *gmap)
{
	struct page *page, *next;

	/* Flush tlb. */
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte(gmap->asce);
	else
		__tlb_flush_global();

	/* Free all segment & region tables. */
	list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
		__free_pages(page, ALLOC_ORDER);
	gmap_radix_tree_free(&gmap->guest_to_host);
	gmap_radix_tree_free(&gmap->host_to_guest);
	down_write(&gmap->mm->mmap_sem);
	list_del(&gmap->list);
	up_write(&gmap->mm->mmap_sem);
	kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);

/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
	S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);

/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
	S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);

/*
 * gmap_alloc_table is assumed to be called with mmap_sem held
 */
static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
			    unsigned long init, unsigned long gaddr)
{
	struct page *page;
	unsigned long *new;

	/* since we don't free the gmap table until gmap_free we can unlock */
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		return -ENOMEM;
	new = (unsigned long *) page_to_phys(page);
	crst_table_init(new, init);
	spin_lock(&gmap->mm->page_table_lock);
	if (*table & _REGION_ENTRY_INVALID) {
		list_add(&page->lru, &gmap->crst_list);
		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
			(*table & _REGION_ENTRY_TYPE_MASK);
		page->index = gaddr;
		page = NULL;
	}
	spin_unlock(&gmap->mm->page_table_lock);
	if (page)
		__free_pages(page, ALLOC_ORDER);
	return 0;
}

/**
 * __gmap_segment_gaddr - find virtual address from segment pointer
 * @entry: pointer to a segment table entry in the guest address space
 *
 * Returns the virtual address in the guest address space for the segment
 */
static unsigned long __gmap_segment_gaddr(unsigned long *entry)
{
	struct page *page;
	unsigned long offset, mask;

	offset = (unsigned long) entry / sizeof(unsigned long);
	offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
	mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
	page = virt_to_page((void *)((unsigned long) entry & mask));
	return page->index + offset;
}

/**
 * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
 * @gmap: pointer to the guest address space structure
 * @vmaddr: address in the host process address space
 *
 * Returns 1 if a TLB flush is required
 */
static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
{
	unsigned long *entry;
	int flush = 0;

	spin_lock(&gmap->guest_table_lock);
	entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
	if (entry) {
		flush = (*entry != _SEGMENT_ENTRY_INVALID);
		*entry = _SEGMENT_ENTRY_INVALID;
	}
	spin_unlock(&gmap->guest_table_lock);
	return flush;
}

/**
 * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
 * @gmap: pointer to the guest address space structure
 * @gaddr: address in the guest address space
 *
 * Returns 1 if a TLB flush is required
 */
static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr;

	vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
						   gaddr >> PMD_SHIFT);
	return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
}

/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
	unsigned long off;
	int flush;

	if ((to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || to + len < to)
		return -EINVAL;

	flush = 0;
	down_write(&gmap->mm->mmap_sem);
	for (off = 0; off < len; off += PMD_SIZE)
		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
	up_write(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);

/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
		     unsigned long to, unsigned long len)
{
	unsigned long off;
	int flush;

	if ((from | to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || from + len < from || to + len < to ||
	    from + len > TASK_MAX_SIZE || to + len > gmap->asce_end)
		return -EINVAL;

	flush = 0;
	down_write(&gmap->mm->mmap_sem);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Remove old translation */
		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
		/* Store new translation */
		if (radix_tree_insert(&gmap->guest_to_host,
				      (to + off) >> PMD_SHIFT,
				      (void *) from + off))
			break;
	}
	up_write(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	if (off >= len)
		return 0;
	gmap_unmap_segment(gmap, to, len);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);

/**
 * __gmap_translate - translate a guest address to a user space address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr;

	vmaddr = (unsigned long)
		radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
	return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
}
EXPORT_SYMBOL_GPL(__gmap_translate);

/**
 * gmap_translate - translate a guest address to a user space address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 */
unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_translate(gmap, gaddr);
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_translate);

/*
 * gmap_unlink - disconnect a page table from the gmap shadow tables
 * @mm: pointer to the parent mm_struct
 * @table: pointer to the host page table
 * @vmaddr: vm address associated with the host page table
 */
static void gmap_unlink(struct mm_struct *mm, unsigned long *table,
			unsigned long vmaddr)
{
	struct gmap *gmap;
	int flush;

	list_for_each_entry(gmap, &mm->context.gmap_list, list) {
		flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
		if (flush)
			gmap_flush_tlb(gmap);
	}
}

/**
 * __gmap_link - set up shadow page tables to connect a host to a guest address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 * @vmaddr: vm address
 *
 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 * if the vm address is already mapped to a different guest segment.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
{
	struct mm_struct *mm;
	unsigned long *table;
	spinlock_t *ptl;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	int rc;

	/* Create higher level tables in the gmap page table */
	table = gmap->table;
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
		table += (gaddr >> 53) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
				     gaddr & 0xffe0000000000000UL))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
		table += (gaddr >> 42) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
				     gaddr & 0xfffffc0000000000UL))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
		table += (gaddr >> 31) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
				     gaddr & 0xffffffff80000000UL))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	table += (gaddr >> 20) & 0x7ff;
	/* Walk the parent mm page table */
	mm = gmap->mm;
	pgd = pgd_offset(mm, vmaddr);
	VM_BUG_ON(pgd_none(*pgd));
	pud = pud_offset(pgd, vmaddr);
	VM_BUG_ON(pud_none(*pud));
	pmd = pmd_offset(pud, vmaddr);
	VM_BUG_ON(pmd_none(*pmd));
	/* large pmds cannot yet be handled */
	if (pmd_large(*pmd))
		return -EFAULT;
	/* Link gmap segment table entry location to page table. */
	rc = radix_tree_preload(GFP_KERNEL);
	if (rc)
		return rc;
	ptl = pmd_lock(mm, pmd);
	spin_lock(&gmap->guest_table_lock);
	if (*table == _SEGMENT_ENTRY_INVALID) {
		rc = radix_tree_insert(&gmap->host_to_guest,
				       vmaddr >> PMD_SHIFT, table);
		if (!rc)
			*table = pmd_val(*pmd);
	} else
		rc = 0;
	spin_unlock(&gmap->guest_table_lock);
	spin_unlock(ptl);
	radix_tree_preload_end();
	return rc;
}

/**
 * gmap_fault - resolve a fault on a guest address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 * @fault_flags: flags to pass down to handle_mm_fault()
 *
 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 * if the vm address is already mapped to a different guest segment.
 */
int gmap_fault(struct gmap *gmap, unsigned long gaddr,
	       unsigned int fault_flags)
{
	unsigned long vmaddr;
	int rc;

	down_read(&gmap->mm->mmap_sem);
	vmaddr = __gmap_translate(gmap, gaddr);
	if (IS_ERR_VALUE(vmaddr)) {
		rc = vmaddr;
		goto out_up;
	}
	if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags)) {
		rc = -EFAULT;
		goto out_up;
	}
	rc = __gmap_link(gmap, gaddr, vmaddr);
out_up:
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);

static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm)
{
	if (!non_swap_entry(entry))
		dec_mm_counter(mm, MM_SWAPENTS);
	else if (is_migration_entry(entry)) {
		struct page *page = migration_entry_to_page(entry);

		if (PageAnon(page))
			dec_mm_counter(mm, MM_ANONPAGES);
		else
			dec_mm_counter(mm, MM_FILEPAGES);
	}
	free_swap_and_cache(entry);
}

/*
 * this function is assumed to be called with mmap_sem held
 */
void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr, ptev, pgstev;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	pgste_t pgste;

	/* Find the vm address for the guest address */
	vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
						   gaddr >> PMD_SHIFT);
	if (!vmaddr)
		return;
	vmaddr |= gaddr & ~PMD_MASK;
	/* Get pointer to the page table entry */
	ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
	if (unlikely(!ptep))
		return;
	pte = *ptep;
	/* Zap unused and logically-zero pages */
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	ptev = pte_val(pte);
	if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
	    ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID))) {
		gmap_zap_swap_entry(pte_to_swp_entry(pte), gmap->mm);
		pte_clear(gmap->mm, vmaddr, ptep);
	}
	pgste_set_unlock(ptep, pgste);
	pte_unmap_unlock(ptep, ptl);
}
EXPORT_SYMBOL_GPL(__gmap_zap);

void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
{
	unsigned long gaddr, vmaddr, size;
	struct vm_area_struct *vma;

	down_read(&gmap->mm->mmap_sem);
	for (gaddr = from; gaddr < to;
	     gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
		/* Find the vm address for the guest address */
		vmaddr = (unsigned long)
			radix_tree_lookup(&gmap->guest_to_host,
					  gaddr >> PMD_SHIFT);
		if (!vmaddr)
			continue;
		vmaddr |= gaddr & ~PMD_MASK;
		/* Find vma in the parent mm */
		vma = find_vma(gmap->mm, vmaddr);
		size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
		zap_page_range(vma, vmaddr, size, NULL);
	}
	up_read(&gmap->mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(gmap_discard);

static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);

/**
 * gmap_register_ipte_notifier - register a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_register_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_add(&nb->list, &gmap_notifier_list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);

/**
 * gmap_unregister_ipte_notifier - remove a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_del_init(&nb->list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);

/**
 * gmap_ipte_notify - mark a range of ptes for invalidation notification
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @len: size of area
 *
 * Returns 0 if for each page in the given range a gmap mapping exists and
 * the invalidation notification could be set. If the gmap mapping is missing
 * for one or more pages -EFAULT is returned. If no memory could be allocated
 * -ENOMEM is returned. This function establishes missing page table entries.
 */
int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
{
	unsigned long addr;
	spinlock_t *ptl;
	pte_t *ptep, entry;
	pgste_t pgste;
	int rc = 0;

	if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
		return -EINVAL;
	down_read(&gmap->mm->mmap_sem);
	while (len) {
		/* Convert gmap address and connect the page tables */
		addr = __gmap_translate(gmap, gaddr);
		if (IS_ERR_VALUE(addr)) {
			rc = addr;
			break;
		}
		/* Get the page mapped */
		if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) {
			rc = -EFAULT;
			break;
		}
		rc = __gmap_link(gmap, gaddr, addr);
		if (rc)
			break;
		/* Walk the process page table, lock and get pte pointer */
		ptep = get_locked_pte(gmap->mm, addr, &ptl);
		VM_BUG_ON(!ptep);
		/* Set notification bit in the pgste of the pte */
		entry = *ptep;
		if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
			pgste = pgste_get_lock(ptep);
			pgste_val(pgste) |= PGSTE_IN_BIT;
			pgste_set_unlock(ptep, pgste);
			gaddr += PAGE_SIZE;
			len -= PAGE_SIZE;
		}
		pte_unmap_unlock(ptep, ptl);
	}
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_ipte_notify);

/**
 * gmap_do_ipte_notify - call all invalidation callbacks for a specific pte.
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 * @pte: pointer to the page table entry
 *
 * This function is assumed to be called with the page table lock held
 * for the pte to notify.
 */
void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
{
	unsigned long offset, gaddr;
	unsigned long *table;
	struct gmap_notifier *nb;
	struct gmap *gmap;

	offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
	offset = offset * (4096 / sizeof(pte_t));
	spin_lock(&gmap_notifier_lock);
	list_for_each_entry(gmap, &mm->context.gmap_list, list) {
		table = radix_tree_lookup(&gmap->host_to_guest,
					  vmaddr >> PMD_SHIFT);
		if (!table)
			continue;
		gaddr = __gmap_segment_gaddr(table) + offset;
		list_for_each_entry(nb, &gmap_notifier_list, list)
			nb->notifier_call(gmap, gaddr);
	}
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_do_ipte_notify);

static inline int page_table_with_pgste(struct page *page)
{
	return atomic_read(&page->_mapcount) == 0;
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm)
{
	struct page *page;
	unsigned long *table;

	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
	if (!pgtable_page_ctor(page)) {
		__free_page(page);
		return NULL;
	}
	atomic_set(&page->_mapcount, 0);
	table = (unsigned long *) page_to_phys(page);
	clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
	clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
	return table;
}

static inline void page_table_free_pgste(unsigned long *table)
{
	struct page *page;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	__free_page(page);
}

int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned long key, bool nq)
{
	spinlock_t *ptl;
	pgste_t old, new;
	pte_t *ptep;

	down_read(&mm->mmap_sem);
retry:
	ptep = get_locked_pte(mm, addr, &ptl);
	if (unlikely(!ptep)) {
		up_read(&mm->mmap_sem);
		return -EFAULT;
	}
	if (!(pte_val(*ptep) & _PAGE_INVALID) &&
	    (pte_val(*ptep) & _PAGE_PROTECT)) {
		pte_unmap_unlock(ptep, ptl);
		if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE)) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		goto retry;
	}

	new = old = pgste_get_lock(ptep);
	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
			    PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
	pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		unsigned long address, bits, skey;

		address = pte_val(*ptep) & PAGE_MASK;
		skey = (unsigned long) page_get_storage_key(address);
		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
		/* Set storage key ACC and FP */
		page_set_storage_key(address, skey, !nq);
		/* Merge host changed & referenced into pgste */
		pgste_val(new) |= bits << 52;
	}
	/* changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) &
	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
		pgste_val(new) |= PGSTE_UC_BIT;

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	up_read(&mm->mmap_sem);
	return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);

unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
{
	spinlock_t *ptl;
	pgste_t pgste;
	pte_t *ptep;
	uint64_t physaddr;
	unsigned long key = 0;

	down_read(&mm->mmap_sem);
	ptep = get_locked_pte(mm, addr, &ptl);
	if (unlikely(!ptep)) {
		up_read(&mm->mmap_sem);
		return -EFAULT;
	}
	pgste = pgste_get_lock(ptep);

	if (pte_val(*ptep) & _PAGE_INVALID) {
		key |= (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56;
		key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56;
		key |= (pgste_val(pgste) & PGSTE_GR_BIT) >> 48;
		key |= (pgste_val(pgste) & PGSTE_GC_BIT) >> 48;
	} else {
		physaddr = pte_val(*ptep) & PAGE_MASK;
		key = page_get_storage_key(physaddr);

		/* Reflect guest's logical view, not physical */
		if (pgste_val(pgste) & PGSTE_GR_BIT)
			key |= _PAGE_REFERENCED;
		if (pgste_val(pgste) & PGSTE_GC_BIT)
			key |= _PAGE_CHANGED;
	}

	pgste_set_unlock(ptep, pgste);
	pte_unmap_unlock(ptep, ptl);
	up_read(&mm->mmap_sem);
	return key;
}
EXPORT_SYMBOL(get_guest_storage_key);

static int page_table_allocate_pgste_min = 0;
static int page_table_allocate_pgste_max = 1;
int page_table_allocate_pgste = 0;
EXPORT_SYMBOL(page_table_allocate_pgste);

static struct ctl_table page_table_sysctl[] = {
	{
		.procname	= "allocate_pgste",
		.data		= &page_table_allocate_pgste,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO | S_IWUSR,
		.proc_handler	= proc_dointvec,
		.extra1		= &page_table_allocate_pgste_min,
		.extra2		= &page_table_allocate_pgste_max,
	},
	{ }
};

static struct ctl_table page_table_sysctl_dir[] = {
	{
		.procname	= "vm",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= page_table_sysctl,
	},
	{ }
};

static int __init page_table_register_sysctl(void)
{
	return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM;
}
__initcall(page_table_register_sysctl);

#else /* CONFIG_PGSTE */

static inline int page_table_with_pgste(struct page *page)
{
	return 0;
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm)
{
	return NULL;
}

static inline void page_table_free_pgste(unsigned long *table)
{
}

static inline void gmap_unlink(struct mm_struct *mm, unsigned long *table,
			       unsigned long vmaddr)
{
}

#endif /* CONFIG_PGSTE */

static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}

/*
 * page table entry allocation/free routines.
 */
unsigned long *page_table_alloc(struct mm_struct *mm)
{
	unsigned long *uninitialized_var(table);
	struct page *uninitialized_var(page);
	unsigned int mask, bit;

	if (mm_alloc_pgste(mm))
		return page_table_alloc_pgste(mm);
	/* Allocate fragments of a 4K page as 1K/2K page table */
	spin_lock_bh(&mm->context.list_lock);
	mask = FRAG_MASK;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		table = (unsigned long *) page_to_phys(page);
		mask = atomic_read(&page->_mapcount);
		mask = mask | (mask >> 4);
	}
	if ((mask & FRAG_MASK) == FRAG_MASK) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		if (!pgtable_page_ctor(page)) {
			__free_page(page);
			return NULL;
		}
		atomic_set(&page->_mapcount, 1);
		table = (unsigned long *) page_to_phys(page);
		clear_table(table, _PAGE_INVALID, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	} else {
		for (bit = 1; mask & bit; bit <<= 1)
			table += PTRS_PER_PTE;
		mask = atomic_xor_bits(&page->_mapcount, bit);
		if ((mask & FRAG_MASK) == FRAG_MASK)
			list_del(&page->lru);
	}
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (page_table_with_pgste(page))
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit);
	if (mask & FRAG_MASK)
		list_add(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	if (mask == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

static void __page_table_free_rcu(void *table, unsigned bit)
{
	struct page *page;

	if (bit == FRAG_MASK)
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
			 unsigned long vmaddr)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (page_table_with_pgste(page)) {
		gmap_unlink(mm, table, vmaddr);
		table = (unsigned long *) (__pa(table) | FRAG_MASK);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
	if (mask & FRAG_MASK)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *) (__pa(table) | (bit << 4));
	tlb_remove_table(tlb, table);
}

static void __tlb_remove_table(void *_table)
{
	const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK;
	void *table = (void *)((unsigned long) _table & ~mask);
	unsigned type = (unsigned long) _table & mask;

	if (type)
		__page_table_free_rcu(table, type);
	else
		free_pages((unsigned long) table, ALLOC_ORDER);
}

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely
	 * on IRQ disabling. See the comment near struct mmu_table_batch.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	tlb->mm->context.flush_mm = 1;
	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)
			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			__tlb_flush_mm_lazy(tlb->mm);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_flush_mmu(tlb);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline void thp_split_vma(struct vm_area_struct *vma)
{
	unsigned long addr;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
		follow_page(vma, addr, FOLL_SPLIT);
}

static inline void thp_split_mm(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
		thp_split_vma(vma);
		vma->vm_flags &= ~VM_HUGEPAGE;
		vma->vm_flags |= VM_NOHUGEPAGE;
	}
	mm->def_flags |= VM_NOHUGEPAGE;
}
#else
static inline void thp_split_mm(struct mm_struct *mm)
{
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * switch on pgstes for its userspace process (for kvm)
 */
int s390_enable_sie(void)
{
	struct mm_struct *mm = current->mm;

	/* Do we have pgstes? if yes, we are done */
	if (mm_has_pgste(mm))
		return 0;
	/* Fail if the page tables are 2K */
	if (!mm_alloc_pgste(mm))
		return -EINVAL;
	down_write(&mm->mmap_sem);
	mm->context.has_pgste = 1;
	/* split thp mappings and disable thp for future mappings */
	thp_split_mm(mm);
	up_write(&mm->mmap_sem);
	return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

/*
 * Enable storage key handling from now on and initialize the storage
 * keys with the default key.
 */
static int __s390_enable_skey(pte_t *pte, unsigned long addr,
			      unsigned long next, struct mm_walk *walk)
{
	unsigned long ptev;
	pgste_t pgste;

	pgste = pgste_get_lock(pte);
	/*
	 * Remove all zero page mappings,
	 * after establishing a policy to forbid zero page mappings
	 * following faults for that page will get fresh anonymous pages
	 */
	if (is_zero_pfn(pte_pfn(*pte))) {
		ptep_flush_direct(walk->mm, addr, pte);
		pte_val(*pte) = _PAGE_INVALID;
	}
	/* Clear storage key */
	pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
			      PGSTE_GR_BIT | PGSTE_GC_BIT);
	ptev = pte_val(*pte);
	if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
		page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1);
	pgste_set_unlock(pte, pgste);
	return 0;
}

int s390_enable_skey(void)
{
	struct mm_walk walk = { .pte_entry = __s390_enable_skey };
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	int rc = 0;

	down_write(&mm->mmap_sem);
	if (mm_use_skey(mm))
		goto out_up;

	mm->context.use_skey = 1;
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (ksm_madvise(vma, vma->vm_start, vma->vm_end,
				MADV_UNMERGEABLE, &vma->vm_flags)) {
			mm->context.use_skey = 0;
			rc = -ENOMEM;
			goto out_up;
		}
	}
	mm->def_flags &= ~VM_MERGEABLE;

	walk.mm = mm;
	walk_page_range(0, TASK_SIZE, &walk);

out_up:
	up_write(&mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(s390_enable_skey);

/*
 * Reset CMMA state, make all pages stable again.
 */
static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	pgste_t pgste;

	pgste = pgste_get_lock(pte);
	pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
	pgste_set_unlock(pte, pgste);
	return 0;
}

void s390_reset_cmma(struct mm_struct *mm)
{
	struct mm_walk walk = { .pte_entry = __s390_reset_cmma };

	down_write(&mm->mmap_sem);
	walk.mm = mm;
	walk_page_range(0, TASK_SIZE, &walk);
	up_write(&mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(s390_reset_cmma);

/*
 * Test and reset if a guest page is dirty
 */
bool gmap_test_and_clear_dirty(unsigned long address, struct gmap *gmap)
{
	pte_t *pte;
	spinlock_t *ptl;
	bool dirty = false;

	pte = get_locked_pte(gmap->mm, address, &ptl);
	if (unlikely(!pte))
		return false;

	if (ptep_test_and_clear_user_dirty(gmap->mm, address, pte))
		dirty = true;

	spin_unlock(ptl);
	return dirty;
}
EXPORT_SYMBOL_GPL(gmap_test_and_clear_dirty);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
			   pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	/* No need to flush TLB
	 * On s390 reference bits are in storage key and never in TLB */
	return pmdp_test_and_clear_young(vma, address, pmdp);
}

int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	entry = pmd_mkyoung(entry);
	if (dirty)
		entry = pmd_mkdirty(entry);
	if (pmd_same(*pmdp, entry))
		return 0;
	pmdp_invalidate(vma, address, pmdp);
	set_pmd_at(vma->vm_mm, address, pmdp, entry);
	return 1;
}

static void pmdp_splitting_flush_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
			      (unsigned long *) pmdp)) {
		/* need to serialize against gup-fast (IRQ disabled) */
		smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
	}
}

void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	struct list_head *lh;
	pgtable_t pgtable;
	pte_t *ptep;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	pte_val(*ptep) = _PAGE_INVALID;
	ptep++;
	pte_val(*ptep) = _PAGE_INVALID;
	return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */