/*
 * Copyright IBM Corp. 2007, 2011
 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define FRAG_MASK	0x0f
#else
#define ALLOC_ORDER	2
#define FRAG_MASK	0x03
#endif
unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}
void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, ALLOC_ORDER);
}
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;

	BUG_ON(limit > (1UL << 53));
repeat:
	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	return 0;
}
void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
}
#ifdef CONFIG_PGSTE

/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_alloc(struct mm_struct *mm)
{
	struct gmap *gmap;
	struct page *page;
	unsigned long *table;

	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
	if (!gmap)
		goto out;
	INIT_LIST_HEAD(&gmap->crst_list);
	gmap->mm = mm;
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		goto out_free;
	list_add(&page->lru, &gmap->crst_list);
	table = (unsigned long *) page_to_phys(page);
	crst_table_init(table, _REGION1_ENTRY_EMPTY);
	gmap->table = table;
	gmap->asce = _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH |
		     _ASCE_USER_BITS | __pa(table);
	list_add(&gmap->list, &mm->context.gmap_list);
	return gmap;

out_free:
	kfree(gmap);
out:
	return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);
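
/*
 * Illustrative sketch (not part of the original file): a typical caller,
 * e.g. a KVM-style hypervisor, would wire up a guest address space roughly
 * as follows. The names userspace_addr, guest_addr and size are
 * hypothetical placeholders.
 *
 *	struct gmap *gmap;
 *
 *	gmap = gmap_alloc(current->mm);
 *	if (!gmap)
 *		return -ENOMEM;
 *	rc = gmap_map_segment(gmap, userspace_addr, guest_addr, size);
 *	...
 *	gmap_free(gmap);
 */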
static int gmap_unlink_segment(struct gmap *gmap, unsigned long *table)
{
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct page *page;
	int flush = 0;

	if (*table & _SEGMENT_ENTRY_INVALID)
		return 0;
	page = pfn_to_page(*table >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry(rmap, &mp->mapper, list) {
		if (rmap->entry != table)
			continue;
		list_del(&rmap->list);
		kfree(rmap);
		flush = 1;
	}
	*table = mp->vmaddr | _SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_PROTECT;
	return flush;
}
static void gmap_flush_tlb(struct gmap *gmap)
{
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();
}
/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_free(struct gmap *gmap)
{
	struct page *page, *next;
	unsigned long *table;
	int i;

	/* Flush tlb. */
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();

	/* Free all segment & region tables. */
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	list_for_each_entry_safe(page, next, &gmap->crst_list, lru) {
		table = (unsigned long *) page_to_phys(page);
		if ((*table & _REGION_ENTRY_TYPE_MASK) == 0)
			/* Remove gmap rmap structures for segment table. */
			for (i = 0; i < PTRS_PER_PMD; i++, table++)
				gmap_unlink_segment(gmap, table);
		__free_pages(page, ALLOC_ORDER);
	}
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	list_del(&gmap->list);
	kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);
/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
	S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);
/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
	S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);
/*
 * gmap_alloc_table is assumed to be called with mmap_sem held
 */
static int gmap_alloc_table(struct gmap *gmap,
			    unsigned long *table, unsigned long init)
	__releases(&gmap->mm->page_table_lock)
	__acquires(&gmap->mm->page_table_lock)
{
	struct page *page;
	unsigned long *new;

	/* since we don't free the gmap table until gmap_free we can unlock */
	spin_unlock(&gmap->mm->page_table_lock);
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	spin_lock(&gmap->mm->page_table_lock);
	if (!page)
		return -ENOMEM;
	new = (unsigned long *) page_to_phys(page);
	crst_table_init(new, init);
	if (*table & _REGION_ENTRY_INVALID) {
		list_add(&page->lru, &gmap->crst_list);
		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
			(*table & _REGION_ENTRY_TYPE_MASK);
	} else
		__free_pages(page, ALLOC_ORDER);
	return 0;
}
/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the guest addr space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if (*table & _REGION_ENTRY_INVALID)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if (*table & _REGION_ENTRY_INVALID)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if (*table & _REGION_ENTRY_INVALID)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Clear segment table entry in guest address space. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = _SEGMENT_ENTRY_INVALID;
	}
out:
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);
/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
		     unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((from | to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || from + len > TASK_MAX_SIZE ||
	    from + len < from || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the gmap address space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Store 'from' address in an invalid segment table entry. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = (from + off) | (_SEGMENT_ENTRY_INVALID |
					 _SEGMENT_ENTRY_PROTECT);
	}
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;

out_unmap:
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	gmap_unmap_segment(gmap, to, len);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);
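
/*
 * Usage note (sketch, not from the original source): gmap_map_segment() and
 * gmap_unmap_segment() operate on 1 MB segment granularity, so from, to and
 * len must all be PMD_SIZE aligned. The mapping is established lazily: the
 * segment entries written above only record the parent address and stay
 * marked invalid; the real page table is hooked in on the first
 * __gmap_fault() for that segment. The values below are hypothetical.
 *
 *	rc = gmap_map_segment(gmap, 0x10000000UL, 0x00000000UL, 0x100000UL);
 */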
static unsigned long *gmap_table_walk(unsigned long address, struct gmap *gmap)
{
	unsigned long *table;

	/* Walk the gmap address space page table */
	table = gmap->table + ((address >> 53) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INVALID))
		return ERR_PTR(-EFAULT);
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 42) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INVALID))
		return ERR_PTR(-EFAULT);
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 31) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INVALID))
		return ERR_PTR(-EFAULT);
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 20) & 0x7ff);
	return table;
}
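
/*
 * Note (added explanation): the shifts used above pick out the s390 index
 * fields of a 64-bit address - bits 63-53 index the region-1 table, 52-42
 * the region-2 table, 41-31 the region-3 table and 30-20 the segment table.
 * Each table has 2048 entries, hence the 0x7ff masks.
 */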
/**
 * __gmap_translate - translate a guest address to a user space address
 * @address: guest address
 * @gmap: pointer to guest mapping meta data structure
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
unsigned long __gmap_translate(unsigned long address, struct gmap *gmap)
{
	unsigned long *segment_ptr, vmaddr, segment;
	struct gmap_pgtable *mp;
	struct page *page;

	current->thread.gmap_addr = address;
	segment_ptr = gmap_table_walk(address, gmap);
	if (IS_ERR(segment_ptr))
		return PTR_ERR(segment_ptr);
	/* Convert the gmap address to an mm address. */
	segment = *segment_ptr;
	if (!(segment & _SEGMENT_ENTRY_INVALID)) {
		page = pfn_to_page(segment >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		return mp->vmaddr | (address & ~PMD_MASK);
	} else if (segment & _SEGMENT_ENTRY_PROTECT) {
		vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
		return vmaddr | (address & ~PMD_MASK);
	}
	return -EFAULT;
}
EXPORT_SYMBOL_GPL(__gmap_translate);
/**
 * gmap_translate - translate a guest address to a user space address
 * @address: guest address
 * @gmap: pointer to guest mapping meta data structure
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 */
unsigned long gmap_translate(unsigned long address, struct gmap *gmap)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_translate(address, gmap);
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_translate);
static int gmap_connect_pgtable(unsigned long address, unsigned long segment,
				unsigned long *segment_ptr, struct gmap *gmap)
{
	unsigned long vmaddr;
	struct vm_area_struct *vma;
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct mm_struct *mm;
	struct page *page;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	mm = gmap->mm;
	vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
	vma = find_vma(mm, vmaddr);
	if (!vma || vma->vm_start > vmaddr)
		return -EFAULT;
	/* Walk the parent mm page table */
	pgd = pgd_offset(mm, vmaddr);
	pud = pud_alloc(mm, pgd, vmaddr);
	if (!pud)
		return -ENOMEM;
	pmd = pmd_alloc(mm, pud, vmaddr);
	if (!pmd)
		return -ENOMEM;
	if (!pmd_present(*pmd) &&
	    __pte_alloc(mm, vma, pmd, vmaddr))
		return -ENOMEM;
	/* pmd now points to a valid segment table entry. */
	rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT);
	if (!rmap)
		return -ENOMEM;
	/* Link gmap segment table entry location to page table. */
	page = pmd_page(*pmd);
	mp = (struct gmap_pgtable *) page->index;
	rmap->gmap = gmap;
	rmap->entry = segment_ptr;
	rmap->vmaddr = address & PMD_MASK;
	spin_lock(&mm->page_table_lock);
	if (*segment_ptr == segment) {
		list_add(&rmap->list, &mp->mapper);
		/* Set gmap segment table entry to page table. */
		*segment_ptr = pmd_val(*pmd) & PAGE_MASK;
		rmap = NULL;
	}
	spin_unlock(&mm->page_table_lock);
	kfree(rmap);
	return 0;
}
static void gmap_disconnect_pgtable(struct mm_struct *mm, unsigned long *table)
{
	struct gmap_rmap *rmap, *next;
	struct gmap_pgtable *mp;
	struct page *page;
	int flush = 0;

	spin_lock(&mm->page_table_lock);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry_safe(rmap, next, &mp->mapper, list) {
		*rmap->entry = mp->vmaddr | (_SEGMENT_ENTRY_INVALID |
					     _SEGMENT_ENTRY_PROTECT);
		list_del(&rmap->list);
		kfree(rmap);
		flush = 1;
	}
	spin_unlock(&mm->page_table_lock);
	if (flush)
		__tlb_flush_global();
}
/*
 * this function is assumed to be called with mmap_sem held
 */
unsigned long __gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long *segment_ptr, segment;
	struct gmap_pgtable *mp;
	struct page *page;
	int rc;

	current->thread.gmap_addr = address;
	segment_ptr = gmap_table_walk(address, gmap);
	if (IS_ERR(segment_ptr))
		return -EFAULT;
	/* Convert the gmap address to an mm address. */
	while (1) {
		segment = *segment_ptr;
		if (!(segment & _SEGMENT_ENTRY_INVALID)) {
			/* Page table is present */
			page = pfn_to_page(segment >> PAGE_SHIFT);
			mp = (struct gmap_pgtable *) page->index;
			return mp->vmaddr | (address & ~PMD_MASK);
		}
		if (!(segment & _SEGMENT_ENTRY_PROTECT))
			/* Nothing mapped in the gmap address space. */
			break;
		rc = gmap_connect_pgtable(address, segment, segment_ptr, gmap);
		if (rc)
			return rc;
	}
	return -EFAULT;
}
unsigned long gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_fault(address, gmap);
	up_read(&gmap->mm->mmap_sem);

	return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);
void gmap_discard(unsigned long from, unsigned long to, struct gmap *gmap)
{
	unsigned long *table, address, size;
	struct vm_area_struct *vma;
	struct gmap_pgtable *mp;
	struct page *page;

	down_read(&gmap->mm->mmap_sem);
	address = from;
	while (address < to) {
		/* Walk the gmap address space page table */
		table = gmap->table + ((address >> 53) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INVALID)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 42) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INVALID)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 31) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INVALID)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 20) & 0x7ff);
		if (unlikely(*table & _SEGMENT_ENTRY_INVALID)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		page = pfn_to_page(*table >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		vma = find_vma(gmap->mm, mp->vmaddr);
		size = min(to - address, PMD_SIZE - (address & ~PMD_MASK));
		zap_page_range(vma, mp->vmaddr | (address & ~PMD_MASK),
			       size, NULL);
		address = (address + PMD_SIZE) & PMD_MASK;
	}
	up_read(&gmap->mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(gmap_discard);
static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);
/**
 * gmap_register_ipte_notifier - register a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_register_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_add(&nb->list, &gmap_notifier_list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);
/**
 * gmap_unregister_ipte_notifier - remove a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_del_init(&nb->list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);
/**
 * gmap_ipte_notify - mark a range of ptes for invalidation notification
 * @gmap: pointer to guest mapping meta data structure
 * @start: virtual address in the guest address space
 * @len: size of area
 *
 * Returns 0 if for each page in the given range a gmap mapping exists and
 * the invalidation notification could be set. If the gmap mapping is missing
 * for one or more pages -EFAULT is returned. If no memory could be allocated
 * -ENOMEM is returned. This function establishes missing page table entries.
 */
int gmap_ipte_notify(struct gmap *gmap, unsigned long start, unsigned long len)
{
	unsigned long addr;
	spinlock_t *ptl;
	pte_t *ptep, entry;
	pgste_t pgste;
	int rc = 0;

	if ((start & ~PAGE_MASK) || (len & ~PAGE_MASK))
		return -EINVAL;
	down_read(&gmap->mm->mmap_sem);
	while (len) {
		/* Convert gmap address and connect the page tables */
		addr = __gmap_fault(start, gmap);
		if (IS_ERR_VALUE(addr)) {
			rc = addr;
			break;
		}
		/* Get the page mapped */
		if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) {
			rc = -EFAULT;
			break;
		}
		/* Walk the process page table, lock and get pte pointer */
		ptep = get_locked_pte(gmap->mm, addr, &ptl);
		if (unlikely(!ptep))
			continue;
		/* Set notification bit in the pgste of the pte */
		entry = *ptep;
		if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
			pgste = pgste_get_lock(ptep);
			pgste_val(pgste) |= PGSTE_IN_BIT;
			pgste_set_unlock(ptep, pgste);
			start += PAGE_SIZE;
			len -= PAGE_SIZE;
		}
		spin_unlock(ptl);
	}
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_ipte_notify);
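
/*
 * Illustrative sketch (not part of the original file): a consumer registers
 * a notifier block and then arms a guest range; the callback fires from
 * gmap_do_ipte_notify() when one of the marked ptes is invalidated. The
 * function my_ipte_callback is a hypothetical example.
 *
 *	static void my_ipte_callback(struct gmap *gmap, unsigned long address)
 *	{
 *		// react to the invalidation, e.g. kick the vcpu
 *	}
 *
 *	static struct gmap_notifier my_notifier = {
 *		.notifier_call = my_ipte_callback,
 *	};
 *
 *	gmap_register_ipte_notifier(&my_notifier);
 *	rc = gmap_ipte_notify(gmap, guest_addr, PAGE_SIZE);
 */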
/**
 * gmap_do_ipte_notify - call all invalidation callbacks for a specific pte.
 * @mm: pointer to the process mm_struct
 * @addr: virtual address in the process address space
 * @pte: pointer to the page table entry
 *
 * This function is assumed to be called with the page table lock held
 * for the pte to notify.
 */
void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long addr, pte_t *pte)
{
	unsigned long segment_offset;
	struct gmap_notifier *nb;
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct page *page;

	segment_offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
	segment_offset = segment_offset * (4096 / sizeof(pte_t));
	page = pfn_to_page(__pa(pte) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	spin_lock(&gmap_notifier_lock);
	list_for_each_entry(rmap, &mp->mapper, list) {
		list_for_each_entry(nb, &gmap_notifier_list, list)
			nb->notifier_call(rmap->gmap,
					  rmap->vmaddr + segment_offset);
	}
	spin_unlock(&gmap_notifier_lock);
}
static inline int page_table_with_pgste(struct page *page)
{
	return atomic_read(&page->_mapcount) == 0;
}
static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	struct page *page;
	unsigned long *table;
	struct gmap_pgtable *mp;

	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
	mp = kmalloc(sizeof(*mp), GFP_KERNEL|__GFP_REPEAT);
	if (!mp) {
		__free_page(page);
		return NULL;
	}
	pgtable_page_ctor(page);
	mp->vmaddr = vmaddr & PMD_MASK;
	INIT_LIST_HEAD(&mp->mapper);
	page->index = (unsigned long) mp;
	atomic_set(&page->_mapcount, 0);
	table = (unsigned long *) page_to_phys(page);
	clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
	clear_table(table + PTRS_PER_PTE, PGSTE_HR_BIT | PGSTE_HC_BIT,
		    PAGE_SIZE/2);
	return table;
}
static inline void page_table_free_pgste(unsigned long *table)
{
	struct page *page;
	struct gmap_pgtable *mp;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	BUG_ON(!list_empty(&mp->mapper));
	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	kfree(mp);
	__free_page(page);
}
int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned long key, bool nq)
{
	spinlock_t *ptl;
	pgste_t old, new;
	pte_t *ptep;

	down_read(&mm->mmap_sem);
	ptep = get_locked_pte(current->mm, addr, &ptl);
	if (unlikely(!ptep)) {
		up_read(&mm->mmap_sem);
		return -EFAULT;
	}

	new = old = pgste_get_lock(ptep);
	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
			    PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
	pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		unsigned long address, bits, skey;

		address = pte_val(*ptep) & PAGE_MASK;
		skey = (unsigned long) page_get_storage_key(address);
		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
		/* Set storage key ACC and FP */
		page_set_storage_key(address, skey, !nq);
		/* Merge host changed & referenced into pgste */
		pgste_val(new) |= bits << 52;
	}
	/* changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) &
	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
		pgste_val(new) |= PGSTE_HC_BIT;

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(*ptep, ptl);
	up_read(&mm->mmap_sem);
	return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);
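
/*
 * Note (added explanation, based on the code above): the storage key is
 * split into its pieces and mirrored into the pgste - the change/reference
 * part via the << 48 shift, the ACC/FP part via the << 56 shift - while the
 * current host change/reference state read back from the real storage key
 * is merged in via the << 52 shift.
 */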
#else /* CONFIG_PGSTE */

static inline int page_table_with_pgste(struct page *page)
{
	return 0;
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	return NULL;
}

static inline void page_table_free_pgste(unsigned long *table)
{
}

static inline void gmap_disconnect_pgtable(struct mm_struct *mm,
					   unsigned long *table)
{
}

#endif /* CONFIG_PGSTE */
static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}
/*
 * page table entry allocation/free routines.
 */
unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr)
{
	unsigned long *uninitialized_var(table);
	struct page *uninitialized_var(page);
	unsigned int mask, bit;

	if (mm_has_pgste(mm))
		return page_table_alloc_pgste(mm, vmaddr);
	/* Allocate fragments of a 4K page as 1K/2K page table */
	spin_lock_bh(&mm->context.list_lock);
	mask = FRAG_MASK;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		table = (unsigned long *) page_to_phys(page);
		mask = atomic_read(&page->_mapcount);
		mask = mask | (mask >> 4);
	}
	if ((mask & FRAG_MASK) == FRAG_MASK) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		pgtable_page_ctor(page);
		atomic_set(&page->_mapcount, 1);
		table = (unsigned long *) page_to_phys(page);
		clear_table(table, _PAGE_INVALID, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	} else {
		for (bit = 1; mask & bit; bit <<= 1)
			table += PTRS_PER_PTE;
		mask = atomic_xor_bits(&page->_mapcount, bit);
		if ((mask & FRAG_MASK) == FRAG_MASK)
			list_del(&page->lru);
	}
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}
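
/*
 * Note (added explanation): a 4K page holds several 1K/2K page table
 * fragments, and page->_mapcount is used as a small bitmap for them - the
 * low FRAG_MASK bits track which fragments are allocated, while the next
 * four bits mark fragments that are still pending an RCU free (see
 * page_table_free_rcu() below), so they are not handed out again too early.
 * A page whose fragments are all in use is removed from
 * mm->context.pgtable_list.
 */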
void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (page_table_with_pgste(page)) {
		gmap_disconnect_pgtable(mm, table);
		return page_table_free_pgste(table);
	}
	/* Free 1K/2K page table fragment of a 4K page */
	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit);
	if (mask & FRAG_MASK)
		list_add(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	if (mask == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}
static void __page_table_free_rcu(void *table, unsigned bit)
{
	struct page *page;

	if (bit == FRAG_MASK)
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}
void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (page_table_with_pgste(page)) {
		gmap_disconnect_pgtable(mm, table);
		table = (unsigned long *) (__pa(table) | FRAG_MASK);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
	if (mask & FRAG_MASK)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *) (__pa(table) | (bit << 4));
	tlb_remove_table(tlb, table);
}
static void __tlb_remove_table(void *_table)
{
	const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK;
	void *table = (void *)((unsigned long) _table & ~mask);
	unsigned type = (unsigned long) _table & mask;

	if (type)
		__page_table_free_rcu(table, type);
	else
		free_pages((unsigned long) table, ALLOC_ORDER);
}
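
/*
 * Note (added explanation): entries handed to tlb_remove_table() are tagged
 * in their low bits - page tables are at least 2K aligned, so the bits
 * covered by (FRAG_MASK << 4) | FRAG_MASK are unused. page_table_free_rcu()
 * stores the fragment bit shifted by four (or FRAG_MASK for pgste tables),
 * and __tlb_remove_table() above decodes that tag to pick the matching free
 * routine; a tag of zero means a full CRST page freed with free_pages().
 */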
static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}
static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely
	 * on IRQ disabling. See the comment near struct mmu_table_batch.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}
static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}
void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}
void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	tlb->mm->context.flush_mm = 1;
	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)
			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			__tlb_flush_mm_lazy(tlb->mm);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_flush_mmu(tlb);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline void thp_split_vma(struct vm_area_struct *vma)
{
	unsigned long addr;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
		follow_page(vma, addr, FOLL_SPLIT);
}

static inline void thp_split_mm(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
		thp_split_vma(vma);
		vma->vm_flags &= ~VM_HUGEPAGE;
		vma->vm_flags |= VM_NOHUGEPAGE;
	}
	mm->def_flags |= VM_NOHUGEPAGE;
}
#else
static inline void thp_split_mm(struct mm_struct *mm)
{
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb,
				struct mm_struct *mm, pud_t *pud,
				unsigned long addr, unsigned long end)
{
	unsigned long next, *table, *new;
	struct page *page;
	pmd_t *pmd;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
again:
		if (pmd_none_or_clear_bad(pmd))
			continue;
		table = (unsigned long *) pmd_deref(*pmd);
		page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
		if (page_table_with_pgste(page))
			continue;
		/* Allocate new page table with pgstes */
		new = page_table_alloc_pgste(mm, addr);
		if (!new) {
			mm->context.has_pgste = 0;
			continue;
		}
		spin_lock(&mm->page_table_lock);
		if (likely((unsigned long *) pmd_deref(*pmd) == table)) {
			/* Nuke pmd entry pointing to the "short" page table */
			pmdp_flush_lazy(mm, addr, pmd);
			pmd_clear(pmd);
			/* Copy ptes from old table to new table */
			memcpy(new, table, PAGE_SIZE/2);
			clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
			/* Establish new table */
			pmd_populate(mm, pmd, (pte_t *) new);
			/* Free old table with rcu, there might be a walker! */
			page_table_free_rcu(tlb, table);
			new = NULL;
		}
		spin_unlock(&mm->page_table_lock);
		if (new) {
			page_table_free_pgste(new);
			goto again;
		}
	} while (pmd++, addr = next, addr != end);

	return addr;
}
static unsigned long page_table_realloc_pud(struct mmu_gather *tlb,
				struct mm_struct *mm, pgd_t *pgd,
				unsigned long addr, unsigned long end)
{
	unsigned long next;
	pud_t *pud;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		next = page_table_realloc_pmd(tlb, mm, pud, addr, next);
	} while (pud++, addr = next, addr != end);

	return addr;
}
static void page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm,
			       unsigned long addr, unsigned long end)
{
	unsigned long next;
	pgd_t *pgd;

	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = page_table_realloc_pud(tlb, mm, pgd, addr, next);
	} while (pgd++, addr = next, addr != end);
}
/*
 * switch on pgstes for its userspace process (for kvm)
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;
	struct mmu_gather tlb;

	/* Do we have switched amode? If no, we cannot do sie */
	if (s390_user_mode == HOME_SPACE_MODE)
		return -EINVAL;

	/* Do we have pgstes? if yes, we are done */
	if (mm_has_pgste(tsk->mm))
		return 0;

	down_write(&mm->mmap_sem);
	/* split thp mappings and disable thp for future mappings */
	thp_split_mm(mm);
	/* Reallocate the page tables with pgstes */
	mm->context.has_pgste = 1;
	tlb_gather_mmu(&tlb, mm, 0, TASK_SIZE);
	page_table_realloc(&tlb, mm, 0, TASK_SIZE);
	tlb_finish_mmu(&tlb, 0, TASK_SIZE);
	up_write(&mm->mmap_sem);
	return mm->context.has_pgste ? 0 : -ENOMEM;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
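
/*
 * Usage note (sketch, not from the original source): s390_enable_sie() is
 * called once per process before it may run guests, e.g. from a KVM-style
 * VM creation path:
 *
 *	rc = s390_enable_sie();
 *	if (rc)
 *		return rc;	// address space cannot be converted
 */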
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
			   pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	/* No need to flush TLB
	 * On s390 reference bits are in storage key and never in TLB */
	return pmdp_test_and_clear_young(vma, address, pmdp);
}
int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	if (pmd_same(*pmdp, entry))
		return 0;
	pmdp_invalidate(vma, address, pmdp);
	set_pmd_at(vma->vm_mm, address, pmdp, entry);
	return 1;
}
static void pmdp_splitting_flush_sync(void *arg)
{
	/* Simply deliver the interrupt */
}
void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
			      (unsigned long *) pmdp)) {
		/* need to serialize against gup-fast (IRQ disabled) */
		smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
	}
}
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(&mm->page_table_lock);

	/* FIFO */
	if (!mm->pmd_huge_pte)
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) mm->pmd_huge_pte);
	mm->pmd_huge_pte = pgtable;
}
pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	struct list_head *lh;
	pgtable_t pgtable;
	pte_t *ptep;

	assert_spin_locked(&mm->page_table_lock);

	/* FIFO */
	pgtable = mm->pmd_huge_pte;
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		mm->pmd_huge_pte = NULL;
	else {
		mm->pmd_huge_pte = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	pte_val(*ptep) = _PAGE_INVALID;
	ptep++;
	pte_val(*ptep) = _PAGE_INVALID;
	return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */