/*
 *    Copyright IBM Corp. 2007, 2011
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define FRAG_MASK	0x0f
#else
#define ALLOC_ORDER	2
#define FRAG_MASK	0x03
#endif
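
/*
 * ALLOC_ORDER is the allocation order of the region/segment (crst) tables
 * allocated below.  FRAG_MASK is the bitmap of page table fragments that
 * fit into one 4K page: four 1K tables on 31 bit, two 2K tables on 64 bit.
 * The per-page fragment state lives in page->_mapcount and is updated with
 * atomic_xor_bits() further down in this file.
 */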

unsigned long *crst_table_alloc(struct mm_struct *mm)
{
        struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

        if (!page)
                return NULL;
        return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
        free_pages((unsigned long) table, ALLOC_ORDER);
}

#ifdef CONFIG_64BIT
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
        unsigned long *table, *pgd;
        unsigned long entry;

        BUG_ON(limit > (1UL << 53));
repeat:
        table = crst_table_alloc(mm);
        if (!table)
                return -ENOMEM;
        spin_lock_bh(&mm->page_table_lock);
        if (mm->context.asce_limit < limit) {
                pgd = (unsigned long *) mm->pgd;
                if (mm->context.asce_limit <= (1UL << 31)) {
                        entry = _REGION3_ENTRY_EMPTY;
                        mm->context.asce_limit = 1UL << 42;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_REGION3;
                } else {
                        entry = _REGION2_ENTRY_EMPTY;
                        mm->context.asce_limit = 1UL << 53;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_REGION2;
                }
                crst_table_init(table, entry);
                pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
                mm->pgd = (pgd_t *) table;
                mm->task_size = mm->context.asce_limit;
                table = NULL;
        }
        spin_unlock_bh(&mm->page_table_lock);
        if (table)
                crst_table_free(mm, table);
        if (mm->context.asce_limit < limit)
                goto repeat;
        return 0;
}
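
/*
 * Illustrative note, not a call site in this file: the mmap code upgrades
 * the address space limit on demand when a mapping does not fit below the
 * current asce_limit, roughly along the lines of
 *
 *	if (len >= TASK_SIZE && TASK_SIZE < (1UL << 53))
 *		return crst_table_upgrade(current->mm, 1UL << 53);
 *
 * crst_table_downgrade() below shrinks the tree again, e.g. when a 31-bit
 * compat binary is exec'd.
 */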

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
        pgd_t *pgd;

        while (mm->context.asce_limit > limit) {
                pgd = mm->pgd;
                switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
                case _REGION_ENTRY_TYPE_R2:
                        mm->context.asce_limit = 1UL << 42;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_REGION3;
                        break;
                case _REGION_ENTRY_TYPE_R3:
                        mm->context.asce_limit = 1UL << 31;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_SEGMENT;
                        break;
                default:
                        BUG();
                }
                mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
                mm->task_size = mm->context.asce_limit;
                crst_table_free(mm, (unsigned long *) pgd);
        }
}
#endif /* CONFIG_64BIT */

#ifdef CONFIG_PGSTE

/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_alloc(struct mm_struct *mm)
{
        struct gmap *gmap;
        struct page *page;
        unsigned long *table;

        gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
        if (!gmap)
                goto out;
        INIT_LIST_HEAD(&gmap->crst_list);
        gmap->mm = mm;
        page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
        if (!page)
                goto out_free;
        list_add(&page->lru, &gmap->crst_list);
        table = (unsigned long *) page_to_phys(page);
        crst_table_init(table, _REGION1_ENTRY_EMPTY);
        gmap->table = table;
        gmap->asce = _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH |
                     _ASCE_USER_BITS | __pa(table);
        list_add(&gmap->list, &mm->context.gmap_list);
        return gmap;

out_free:
        kfree(gmap);
out:
        return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);
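
/*
 * Typical life cycle of a gmap, sketched for illustration only (the real
 * callers live in arch/s390/kvm, not in this file):
 *
 *	struct gmap *gmap = gmap_alloc(current->mm);
 *	if (!gmap)
 *		return -ENOMEM;
 *	rc = gmap_map_segment(gmap, userspace_addr, guest_addr, size);
 *	gmap_enable(gmap);	(before entering SIE)
 *	...
 *	gmap_disable(gmap);	(after leaving SIE)
 *	gmap_free(gmap);
 *
 * userspace_addr, guest_addr and size are placeholders; all three must be
 * segment (1 MB) aligned for gmap_map_segment() to accept them.
 */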

static int gmap_unlink_segment(struct gmap *gmap, unsigned long *table)
{
        struct gmap_pgtable *mp;
        struct gmap_rmap *rmap;
        struct page *page;

        if (*table & _SEGMENT_ENTRY_INV)
                return 0;
        page = pfn_to_page(*table >> PAGE_SHIFT);
        mp = (struct gmap_pgtable *) page->index;
        list_for_each_entry(rmap, &mp->mapper, list) {
                if (rmap->entry != table)
                        continue;
                list_del(&rmap->list);
                kfree(rmap);
                break;
        }
        *table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
        return 1;
}

static void gmap_flush_tlb(struct gmap *gmap)
{
        if (MACHINE_HAS_IDTE)
                __tlb_flush_idte((unsigned long) gmap->table |
                                 _ASCE_TYPE_REGION1);
        else
                __tlb_flush_global();
}

/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_free(struct gmap *gmap)
{
        struct page *page, *next;
        unsigned long *table;
        int i;

        /* Flush tlb. */
        if (MACHINE_HAS_IDTE)
                __tlb_flush_idte((unsigned long) gmap->table |
                                 _ASCE_TYPE_REGION1);
        else
                __tlb_flush_global();

        /* Free all segment & region tables. */
        down_read(&gmap->mm->mmap_sem);
        spin_lock(&gmap->mm->page_table_lock);
        list_for_each_entry_safe(page, next, &gmap->crst_list, lru) {
                table = (unsigned long *) page_to_phys(page);
                if ((*table & _REGION_ENTRY_TYPE_MASK) == 0)
                        /* Remove gmap rmap structures for segment table. */
                        for (i = 0; i < PTRS_PER_PMD; i++, table++)
                                gmap_unlink_segment(gmap, table);
                __free_pages(page, ALLOC_ORDER);
        }
        spin_unlock(&gmap->mm->page_table_lock);
        up_read(&gmap->mm->mmap_sem);
        list_del(&gmap->list);
        kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);

/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
        S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);

/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
        S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);

/*
 * gmap_alloc_table is assumed to be called with mmap_sem held
 */
static int gmap_alloc_table(struct gmap *gmap,
                            unsigned long *table, unsigned long init)
{
        struct page *page;
        unsigned long *new;

        /* since we don't free the gmap table until gmap_free we can unlock */
        spin_unlock(&gmap->mm->page_table_lock);
        page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
        spin_lock(&gmap->mm->page_table_lock);
        if (!page)
                return -ENOMEM;
        new = (unsigned long *) page_to_phys(page);
        crst_table_init(new, init);
        if (*table & _REGION_ENTRY_INV) {
                list_add(&page->lru, &gmap->crst_list);
                *table = (unsigned long) new | _REGION_ENTRY_LENGTH |
                        (*table & _REGION_ENTRY_TYPE_MASK);
        } else
                __free_pages(page, ALLOC_ORDER);
        return 0;
}

/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
        unsigned long *table;
        unsigned long off;
        int flush;

        if ((to | len) & (PMD_SIZE - 1))
                return -EINVAL;
        if (len == 0 || to + len < to)
                return -EINVAL;

        flush = 0;
        down_read(&gmap->mm->mmap_sem);
        spin_lock(&gmap->mm->page_table_lock);
        for (off = 0; off < len; off += PMD_SIZE) {
                /* Walk the guest addr space page table */
                table = gmap->table + (((to + off) >> 53) & 0x7ff);
                if (*table & _REGION_ENTRY_INV)
                        goto out;
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                table = table + (((to + off) >> 42) & 0x7ff);
                if (*table & _REGION_ENTRY_INV)
                        goto out;
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                table = table + (((to + off) >> 31) & 0x7ff);
                if (*table & _REGION_ENTRY_INV)
                        goto out;
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                table = table + (((to + off) >> 20) & 0x7ff);

                /* Clear segment table entry in guest address space. */
                flush |= gmap_unlink_segment(gmap, table);
                *table = _SEGMENT_ENTRY_INV;
        }
out:
        spin_unlock(&gmap->mm->page_table_lock);
        up_read(&gmap->mm->mmap_sem);
        if (flush)
                gmap_flush_tlb(gmap);
        return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);
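
/*
 * A note on the gmap table walks in this file: the gmap is a full
 * four-level DAT tree, so a guest address is decoded with the hardware
 * field widths: the region-first index is address bits 53-63 (>> 53,
 * masked with 0x7ff for 2048 entries), the region-second index bits
 * 42-52, the region-third index bits 31-41 and the segment index bits
 * 20-30, while _REGION_ENTRY_ORIGIN extracts the origin of the next
 * lower table from each entry.
 */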

/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
                     unsigned long to, unsigned long len)
{
        unsigned long *table;
        unsigned long off;
        int flush;

        if ((from | to | len) & (PMD_SIZE - 1))
                return -EINVAL;
        if (len == 0 || from + len > PGDIR_SIZE ||
            from + len < from || to + len < to)
                return -EINVAL;

        flush = 0;
        down_read(&gmap->mm->mmap_sem);
        spin_lock(&gmap->mm->page_table_lock);
        for (off = 0; off < len; off += PMD_SIZE) {
                /* Walk the gmap address space page table */
                table = gmap->table + (((to + off) >> 53) & 0x7ff);
                if ((*table & _REGION_ENTRY_INV) &&
                    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY))
                        goto out_unmap;
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                table = table + (((to + off) >> 42) & 0x7ff);
                if ((*table & _REGION_ENTRY_INV) &&
                    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY))
                        goto out_unmap;
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                table = table + (((to + off) >> 31) & 0x7ff);
                if ((*table & _REGION_ENTRY_INV) &&
                    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY))
                        goto out_unmap;
                table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN);
                table = table + (((to + off) >> 20) & 0x7ff);

                /* Store 'from' address in an invalid segment table entry. */
                flush |= gmap_unlink_segment(gmap, table);
                *table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | (from + off);
        }
        spin_unlock(&gmap->mm->page_table_lock);
        up_read(&gmap->mm->mmap_sem);
        if (flush)
                gmap_flush_tlb(gmap);
        return 0;

out_unmap:
        spin_unlock(&gmap->mm->page_table_lock);
        up_read(&gmap->mm->mmap_sem);
        gmap_unmap_segment(gmap, to, len);
        return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);

/*
 * this function is assumed to be called with mmap_sem held
 */
unsigned long __gmap_fault(unsigned long address, struct gmap *gmap)
{
        unsigned long *table, vmaddr, segment;
        struct mm_struct *mm;
        struct gmap_pgtable *mp;
        struct gmap_rmap *rmap;
        struct vm_area_struct *vma;
        struct page *page;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;

        current->thread.gmap_addr = address;
        mm = gmap->mm;
        /* Walk the gmap address space page table */
        table = gmap->table + ((address >> 53) & 0x7ff);
        if (unlikely(*table & _REGION_ENTRY_INV))
                return -EFAULT;
        table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
        table = table + ((address >> 42) & 0x7ff);
        if (unlikely(*table & _REGION_ENTRY_INV))
                return -EFAULT;
        table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
        table = table + ((address >> 31) & 0x7ff);
        if (unlikely(*table & _REGION_ENTRY_INV))
                return -EFAULT;
        table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
        table = table + ((address >> 20) & 0x7ff);

        /* Convert the gmap address to an mm address. */
        segment = *table;
        if (likely(!(segment & _SEGMENT_ENTRY_INV))) {
                page = pfn_to_page(segment >> PAGE_SHIFT);
                mp = (struct gmap_pgtable *) page->index;
                return mp->vmaddr | (address & ~PMD_MASK);
        } else if (segment & _SEGMENT_ENTRY_RO) {
                vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
                vma = find_vma(mm, vmaddr);
                if (!vma || vma->vm_start > vmaddr)
                        return -EFAULT;

                /* Walk the parent mm page table */
                pgd = pgd_offset(mm, vmaddr);
                pud = pud_alloc(mm, pgd, vmaddr);
                if (!pud)
                        return -ENOMEM;
                pmd = pmd_alloc(mm, pud, vmaddr);
                if (!pmd)
                        return -ENOMEM;
                if (!pmd_present(*pmd) &&
                    __pte_alloc(mm, vma, pmd, vmaddr))
                        return -ENOMEM;
                /* pmd now points to a valid segment table entry. */
                rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT);
                if (!rmap)
                        return -ENOMEM;
                /* Link gmap segment table entry location to page table. */
                page = pmd_page(*pmd);
                mp = (struct gmap_pgtable *) page->index;
                rmap->entry = table;
                spin_lock(&mm->page_table_lock);
                list_add(&rmap->list, &mp->mapper);
                spin_unlock(&mm->page_table_lock);
                /* Set gmap segment table entry to page table. */
                *table = pmd_val(*pmd) & PAGE_MASK;
                return vmaddr | (address & ~PMD_MASK);
        }
        return -EFAULT;
}
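
/*
 * The walk above ends in one of two states: a valid segment entry, which
 * already points at a host page table, so the host virtual address can be
 * read straight from the backing gmap_pgtable; or an invalid entry with
 * _SEGMENT_ENTRY_RO set, which still carries the host vmaddr stored by
 * gmap_map_segment().  In the latter case the host page table is
 * instantiated and linked back to the gmap entry via a gmap_rmap before
 * the entry is made valid.
 */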

unsigned long gmap_fault(unsigned long address, struct gmap *gmap)
{
        unsigned long rc;

        down_read(&gmap->mm->mmap_sem);
        rc = __gmap_fault(address, gmap);
        up_read(&gmap->mm->mmap_sem);

        return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);

void gmap_discard(unsigned long from, unsigned long to, struct gmap *gmap)
{
        unsigned long *table, address, size;
        struct vm_area_struct *vma;
        struct gmap_pgtable *mp;
        struct page *page;

        down_read(&gmap->mm->mmap_sem);
        address = from;
        while (address < to) {
                /* Walk the gmap address space page table */
                table = gmap->table + ((address >> 53) & 0x7ff);
                if (unlikely(*table & _REGION_ENTRY_INV)) {
                        address = (address + PMD_SIZE) & PMD_MASK;
                        continue;
                }
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                table = table + ((address >> 42) & 0x7ff);
                if (unlikely(*table & _REGION_ENTRY_INV)) {
                        address = (address + PMD_SIZE) & PMD_MASK;
                        continue;
                }
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                table = table + ((address >> 31) & 0x7ff);
                if (unlikely(*table & _REGION_ENTRY_INV)) {
                        address = (address + PMD_SIZE) & PMD_MASK;
                        continue;
                }
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                table = table + ((address >> 20) & 0x7ff);
                if (unlikely(*table & _SEGMENT_ENTRY_INV)) {
                        address = (address + PMD_SIZE) & PMD_MASK;
                        continue;
                }
                page = pfn_to_page(*table >> PAGE_SHIFT);
                mp = (struct gmap_pgtable *) page->index;
                vma = find_vma(gmap->mm, mp->vmaddr);
                size = min(to - address, PMD_SIZE - (address & ~PMD_MASK));
                zap_page_range(vma, mp->vmaddr | (address & ~PMD_MASK),
                               size, NULL);
                address = (address + PMD_SIZE) & PMD_MASK;
        }
        up_read(&gmap->mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(gmap_discard);
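
/*
 * gmap_discard() tears down the host mappings that back a guest address
 * range (used, for example, when a guest releases storage): for every
 * mapped segment in [from, to) it translates back to the host vmaddr via
 * the gmap_pgtable and zaps the corresponding host pages.
 */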

void gmap_unmap_notifier(struct mm_struct *mm, unsigned long *table)
{
        struct gmap_rmap *rmap, *next;
        struct gmap_pgtable *mp;
        struct page *page;
        int flush;

        flush = 0;
        spin_lock(&mm->page_table_lock);
        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
        mp = (struct gmap_pgtable *) page->index;
        list_for_each_entry_safe(rmap, next, &mp->mapper, list) {
                *rmap->entry =
                        _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
                list_del(&rmap->list);
                kfree(rmap);
                flush = 1;
        }
        spin_unlock(&mm->page_table_lock);
        if (flush)
                __tlb_flush_global();
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
                                                    unsigned long vmaddr)
{
        struct page *page;
        unsigned long *table;
        struct gmap_pgtable *mp;

        page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
        if (!page)
                return NULL;
        mp = kmalloc(sizeof(*mp), GFP_KERNEL|__GFP_REPEAT);
        if (!mp) {
                __free_page(page);
                return NULL;
        }
        pgtable_page_ctor(page);
        mp->vmaddr = vmaddr & PMD_MASK;
        INIT_LIST_HEAD(&mp->mapper);
        page->index = (unsigned long) mp;
        atomic_set(&page->_mapcount, 3);
        table = (unsigned long *) page_to_phys(page);
        clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
        clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
        return table;
}
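
/*
 * Layout of a pgste page table page: the lower 2K hold the 256 page table
 * entries, the upper 2K hold the matching page status table entries
 * (pgstes).  Setting _mapcount to 3 marks both 2K fragments as used, so
 * such a page is never handed out again by the fragment allocator in
 * page_table_alloc().
 */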

static inline void page_table_free_pgste(unsigned long *table)
{
        struct page *page;
        struct gmap_pgtable *mp;

        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
        mp = (struct gmap_pgtable *) page->index;
        BUG_ON(!list_empty(&mp->mapper));
        pgtable_page_dtor(page);
        atomic_set(&page->_mapcount, -1);
        kfree(mp);
        __free_page(page);
}

#else /* CONFIG_PGSTE */

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
                                                    unsigned long vmaddr)
{
        return NULL;
}

static inline void page_table_free_pgste(unsigned long *table)
{
}

static inline void gmap_unmap_notifier(struct mm_struct *mm,
                                       unsigned long *table)
{
}

#endif /* CONFIG_PGSTE */

static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
        unsigned int old, new;

        do {
                old = atomic_read(v);
                new = old ^ bits;
        } while (atomic_cmpxchg(v, old, new) != old);
        return new;
}
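
/*
 * atomic_xor_bits() atomically toggles the given bits in page->_mapcount
 * and returns the new value.  The fragment allocator below uses it both
 * ways: xor-ing in a clear bit allocates that fragment, xor-ing in a set
 * bit frees it.  E.g. with FRAG_MASK 0x03 a _mapcount of 0x01 means the
 * first 2K fragment is in use, and atomic_xor_bits(&page->_mapcount, 2)
 * marks the second one used as well, returning 0x03.
 */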

/*
 * page table entry allocation/free routines.
 */
unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr)
{
        unsigned long *uninitialized_var(table);
        struct page *uninitialized_var(page);
        unsigned int mask, bit;

        if (mm_has_pgste(mm))
                return page_table_alloc_pgste(mm, vmaddr);
        /* Allocate fragments of a 4K page as 1K/2K page table */
        spin_lock_bh(&mm->context.list_lock);
        mask = FRAG_MASK;
        if (!list_empty(&mm->context.pgtable_list)) {
                page = list_first_entry(&mm->context.pgtable_list,
                                        struct page, lru);
                table = (unsigned long *) page_to_phys(page);
                mask = atomic_read(&page->_mapcount);
                mask = mask | (mask >> 4);
        }
        if ((mask & FRAG_MASK) == FRAG_MASK) {
                spin_unlock_bh(&mm->context.list_lock);
                page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
                if (!page)
                        return NULL;
                pgtable_page_ctor(page);
                atomic_set(&page->_mapcount, 1);
                table = (unsigned long *) page_to_phys(page);
                clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
                spin_lock_bh(&mm->context.list_lock);
                list_add(&page->lru, &mm->context.pgtable_list);
        } else {
                for (bit = 1; mask & bit; bit <<= 1)
                        table += PTRS_PER_PTE;
                mask = atomic_xor_bits(&page->_mapcount, bit);
                if ((mask & FRAG_MASK) == FRAG_MASK)
                        list_del(&page->lru);
        }
        spin_unlock_bh(&mm->context.list_lock);
        return table;
}
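
/*
 * mm->context.pgtable_list, protected by context.list_lock, holds the
 * pages that still have at least one free fragment.  A page is taken off
 * the list once all of its fragments are handed out, and is put back (or
 * freed outright when its last fragment comes back) by page_table_free().
 */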

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
        struct page *page;
        unsigned int bit, mask;

        if (mm_has_pgste(mm)) {
                gmap_unmap_notifier(mm, table);
                return page_table_free_pgste(table);
        }
        /* Free 1K/2K page table fragment of a 4K page */
        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
        bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
        spin_lock_bh(&mm->context.list_lock);
        if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
                list_del(&page->lru);
        mask = atomic_xor_bits(&page->_mapcount, bit);
        if (mask & FRAG_MASK)
                list_add(&page->lru, &mm->context.pgtable_list);
        spin_unlock_bh(&mm->context.list_lock);
        if (mask == 0) {
                pgtable_page_dtor(page);
                atomic_set(&page->_mapcount, -1);
                __free_page(page);
        }
}

static void __page_table_free_rcu(void *table, unsigned bit)
{
        struct page *page;

        if (bit == FRAG_MASK)
                return page_table_free_pgste(table);
        /* Free 1K/2K page table fragment of a 4K page */
        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
        if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
                pgtable_page_dtor(page);
                atomic_set(&page->_mapcount, -1);
                __free_page(page);
        }
}

void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
{
        struct mm_struct *mm;
        struct page *page;
        unsigned int bit, mask;

        mm = tlb->mm;
        if (mm_has_pgste(mm)) {
                gmap_unmap_notifier(mm, table);
                table = (unsigned long *) (__pa(table) | FRAG_MASK);
                tlb_remove_table(tlb, table);
                return;
        }
        bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
        spin_lock_bh(&mm->context.list_lock);
        if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
                list_del(&page->lru);
        mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
        if (mask & FRAG_MASK)
                list_add_tail(&page->lru, &mm->context.pgtable_list);
        spin_unlock_bh(&mm->context.list_lock);
        table = (unsigned long *) (__pa(table) | (bit << 4));
        tlb_remove_table(tlb, table);
}
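
/*
 * The low bits of the table address passed to tlb_remove_table() encode
 * what to do once the grace period has passed: FRAG_MASK in the low bits
 * marks a pgste page table, a fragment bit shifted into bits 4-7 marks a
 * regular 1K/2K fragment, and a clean address (no type bits) marks a crst
 * table.  __tlb_remove_table() below decodes this again.
 */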

void __tlb_remove_table(void *_table)
{
        const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK;
        void *table = (void *)((unsigned long) _table & ~mask);
        unsigned type = (unsigned long) _table & mask;

        if (type)
                __page_table_free_rcu(table, type);
        else
                free_pages((unsigned long) table, ALLOC_ORDER);
}

static void tlb_remove_table_smp_sync(void *arg)
{
        /* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
        /*
         * This isn't an RCU grace period and hence the page-tables cannot be
         * assumed to be actually RCU-freed.
         *
         * It is however sufficient for software page-table walkers that rely
         * on IRQ disabling. See the comment near struct mmu_table_batch.
         */
        smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
        __tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
        struct mmu_table_batch *batch;
        int i;

        batch = container_of(head, struct mmu_table_batch, rcu);

        for (i = 0; i < batch->nr; i++)
                __tlb_remove_table(batch->tables[i]);

        free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
        struct mmu_table_batch **batch = &tlb->batch;

        if (*batch) {
                __tlb_flush_mm(tlb->mm);
                call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
                *batch = NULL;
        }
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
        struct mmu_table_batch **batch = &tlb->batch;

        if (*batch == NULL) {
                *batch = (struct mmu_table_batch *)
                        __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
                if (*batch == NULL) {
                        __tlb_flush_mm(tlb->mm);
                        tlb_remove_table_one(table);
                        return;
                }
                (*batch)->nr = 0;
        }
        (*batch)->tables[(*batch)->nr++] = table;
        if ((*batch)->nr == MAX_TABLE_BATCH)
                tlb_table_flush(tlb);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void thp_split_vma(struct vm_area_struct *vma)
{
        unsigned long addr;
        struct page *page;

        for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
                page = follow_page(vma, addr, FOLL_SPLIT);
        }
}

void thp_split_mm(struct mm_struct *mm)
{
        struct vm_area_struct *vma = mm->mmap;

        while (vma != NULL) {
                thp_split_vma(vma);
                vma->vm_flags &= ~VM_HUGEPAGE;
                vma->vm_flags |= VM_NOHUGEPAGE;
                vma = vma->vm_next;
        }
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Switch on pgstes for the current userspace process (needed for KVM).
 */
int s390_enable_sie(void)
{
        struct task_struct *tsk = current;
        struct mm_struct *mm, *old_mm;

        /* Do we have a switched amode? If not, we cannot do SIE */
        if (s390_user_mode == HOME_SPACE_MODE)
                return -EINVAL;

        /* Do we have pgstes? If yes, we are done */
        if (mm_has_pgste(tsk->mm))
                return 0;

        /* Let's check if we are allowed to replace the mm */
        task_lock(tsk);
        if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
            !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
            tsk->mm != tsk->active_mm) {
                task_unlock(tsk);
                return -EINVAL;
        }
        task_unlock(tsk);

        /* We copy the mm and let dup_mm create the page tables with pgstes */
        tsk->mm->context.alloc_pgste = 1;
        /* make sure that both mms have a correct rss state */
        sync_mm_rss(tsk->mm);
        mm = dup_mm(tsk);
        tsk->mm->context.alloc_pgste = 0;
        if (!mm)
                return -ENOMEM;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        /* split thp mappings and disable thp for future mappings */
        thp_split_mm(mm);
        mm->def_flags |= VM_NOHUGEPAGE;
#endif

        /* Now let's check again if something happened */
        task_lock(tsk);
        if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
            !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
            tsk->mm != tsk->active_mm) {
                mmput(mm);
                task_unlock(tsk);
                return -EINVAL;
        }

        /* ok, we are alone. No ptrace, no threads, etc. */
        old_mm = tsk->mm;
        tsk->mm = tsk->active_mm = mm;
        preempt_disable();
        update_mm(mm, tsk);
        atomic_inc(&mm->context.attach_count);
        atomic_dec(&old_mm->context.attach_count);
        cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
        preempt_enable();
        task_unlock(tsk);
        mmput(old_mm);
        return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
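
/*
 * Usage note (the caller is in arch/s390/kvm, not in this file):
 * s390_enable_sie() is invoked when a KVM virtual machine is created,
 * before the first SIE entry.  It swaps the calling process' mm for a copy
 * whose page tables carry pgstes, which is what makes the pgste-aware
 * allocation paths above (page_table_alloc_pgste and friends) take effect.
 */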

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
                           pmd_t *pmdp)
{
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        /* No need to flush the TLB: on s390 the reference bits live in the
         * storage key and never in the TLB. */
        return pmdp_test_and_clear_young(vma, address, pmdp);
}

int pmdp_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pmd_t *pmdp,
                          pmd_t entry, int dirty)
{
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        if (pmd_same(*pmdp, entry))
                return 0;
        pmdp_invalidate(vma, address, pmdp);
        set_pmd_at(vma->vm_mm, address, pmdp, entry);
        return 1;
}

static void pmdp_splitting_flush_sync(void *arg)
{
        /* Simply deliver the interrupt */
}

void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
                          pmd_t *pmdp)
{
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
                              (unsigned long *) pmdp)) {
                /* need to serialize against gup-fast (IRQ disabled) */
                smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
        }
}

void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
{
        struct list_head *lh = (struct list_head *) pgtable;

        assert_spin_locked(&mm->page_table_lock);

        /* FIFO */
        if (!mm->pmd_huge_pte)
                INIT_LIST_HEAD(lh);
        else
                list_add(lh, (struct list_head *) mm->pmd_huge_pte);
        mm->pmd_huge_pte = pgtable;
}
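
/*
 * deposit/withdraw keep a spare page table per huge pmd: the generic THP
 * code deposits one when it installs a huge pmd and withdraws it again
 * when the huge page is split, so the split path never has to allocate.
 * On s390 the deposited tables are simply chained through their first two
 * pte slots, which is why the withdraw path below clears those two slots
 * before handing the table back.
 */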

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
{
        struct list_head *lh;
        pgtable_t pgtable;
        pte_t *ptep;

        assert_spin_locked(&mm->page_table_lock);

        /* FIFO */
        pgtable = mm->pmd_huge_pte;
        lh = (struct list_head *) pgtable;
        if (list_empty(lh))
                mm->pmd_huge_pte = NULL;
        else {
                mm->pmd_huge_pte = (pgtable_t) lh->next;
                list_del(lh);
        }
        ptep = (pte_t *) pgtable;
        pte_val(*ptep) = _PAGE_TYPE_EMPTY;
        ptep++;
        pte_val(*ptep) = _PAGE_TYPE_EMPTY;
        return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */