/*
 * SPDX-License-Identifier: MIT
 *
 * Copyright © 2012-2014 Intel Corporation
 */

#include <linux/mmu_context.h>
#include <linux/mmu_notifier.h>
#include <linux/mempolicy.h>
#include <linux/swap.h>
#include <linux/sched/mm.h>

#include <drm/i915_drm.h>

#include "i915_drv.h"
#include "i915_gem_ioctls.h"
#include "i915_gem_object.h"
#include "i915_scatterlist.h"
struct i915_mm_struct {
	struct mm_struct *mm;
	struct drm_i915_private *i915;
	struct i915_mmu_notifier *mn;
	struct hlist_node node;
	struct kref kref;
	struct work_struct work;
};
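/*
 * One i915_mm_struct is created per client mm_struct and shared by every
 * userptr object from that process. It is looked up through the
 * dev_priv->mm_structs hash, keyed on the mm_struct pointer, and the final
 * put is deferred to a worker (see the comment in
 * i915_gem_userptr_init__mm_struct() below).
 */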
#if defined(CONFIG_MMU_NOTIFIER)
#include <linux/interval_tree.h>
struct i915_mmu_notifier {
	spinlock_t lock;
	struct hlist_node node;
	struct mmu_notifier mn;
	struct rb_root_cached objects;
	struct i915_mm_struct *mm;
};
struct i915_mmu_object {
	struct i915_mmu_notifier *mn;
	struct drm_i915_gem_object *obj;
	struct interval_tree_node it;
};
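/*
 * Each userptr object is tracked as an inclusive interval
 * [userptr.ptr, userptr.ptr + size - 1] in the per-mm interval tree, so
 * that an invalidation of an address range can find every overlapping
 * object via interval_tree_iter_first()/interval_tree_iter_next().
 */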
static void add_object(struct i915_mmu_object *mo)
{
	GEM_BUG_ON(!RB_EMPTY_NODE(&mo->it.rb));
	interval_tree_insert(&mo->it, &mo->mn->objects);
}
static void del_object(struct i915_mmu_object *mo)
{
	if (RB_EMPTY_NODE(&mo->it.rb))
		return;

	interval_tree_remove(&mo->it, &mo->mn->objects);
	RB_CLEAR_NODE(&mo->it.rb);
}
static void
__i915_gem_userptr_set_active(struct drm_i915_gem_object *obj, bool value)
{
	struct i915_mmu_object *mo = obj->userptr.mmu_object;

	/*
	 * During mm_invalidate_range we need to cancel any userptr that
	 * overlaps the range being invalidated. Doing so requires the
	 * struct_mutex, and that risks recursion. In order to cause
	 * recursion, the user must alias the userptr address space with
	 * a GTT mmapping (possible with a MAP_FIXED) - then when we have
	 * to invalidate that mmapping, mm_invalidate_range is called with
	 * the userptr address *and* the struct_mutex held. To prevent that
	 * we set a flag under the i915_mmu_notifier spinlock to indicate
	 * whether this object is valid.
	 */
	if (!mo)
		return;

	spin_lock(&mo->mn->lock);
	if (value)
		add_object(mo);
	else
		del_object(mo);
	spin_unlock(&mo->mn->lock);
}
static int
userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
				  const struct mmu_notifier_range *range)
{
	struct i915_mmu_notifier *mn =
		container_of(_mn, struct i915_mmu_notifier, mn);
	struct interval_tree_node *it;
	unsigned long end;
	int ret = 0;

	if (RB_EMPTY_ROOT(&mn->objects.rb_root))
		return 0;

	/* interval ranges are inclusive, but invalidate range is exclusive */
	end = range->end - 1;

	spin_lock(&mn->lock);
	it = interval_tree_iter_first(&mn->objects, range->start, end);
	while (it) {
		struct drm_i915_gem_object *obj;

		if (!mmu_notifier_range_blockable(range)) {
			ret = -EAGAIN;
			break;
		}

		/*
		 * The mmu_object is released late when destroying the
		 * GEM object so it is entirely possible to gain a
		 * reference on an object in the process of being freed
		 * since our serialisation is via the spinlock and not
		 * the struct_mutex - and consequently use it after it
		 * is freed and then double free it. To prevent that
		 * use-after-free we only acquire a reference on the
		 * object if it is not in the process of being destroyed.
		 */
		obj = container_of(it, struct i915_mmu_object, it)->obj;
		if (!kref_get_unless_zero(&obj->base.refcount)) {
			it = interval_tree_iter_next(it, range->start, end);
			continue;
		}
		spin_unlock(&mn->lock);

		ret = i915_gem_object_unbind(obj,
					     I915_GEM_OBJECT_UNBIND_ACTIVE |
					     I915_GEM_OBJECT_UNBIND_BARRIER);
		if (ret == 0)
			ret = __i915_gem_object_put_pages(obj);
		i915_gem_object_put(obj);
		if (ret)
			return ret;

		spin_lock(&mn->lock);

		/*
		 * As we do not (yet) protect the mmu from concurrent insertion
		 * over this range, there is no guarantee that this search will
		 * terminate given a pathologic workload.
		 */
		it = interval_tree_iter_first(&mn->objects, range->start, end);
	}
	spin_unlock(&mn->lock);

	return ret;
}
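/*
 * Only invalidate_range_start is hooked below: the invalidation above unbinds
 * the object and drops its backing pages, and the next GPU use re-acquires
 * the user pages through i915_gem_userptr_get_pages(), so no
 * invalidate_range_end callback is installed.
 */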
static const struct mmu_notifier_ops i915_gem_userptr_notifier = {
	.invalidate_range_start = userptr_mn_invalidate_range_start,
};
static struct i915_mmu_notifier *
i915_mmu_notifier_create(struct i915_mm_struct *mm)
{
	struct i915_mmu_notifier *mn;

	mn = kmalloc(sizeof(*mn), GFP_KERNEL);
	if (mn == NULL)
		return ERR_PTR(-ENOMEM);

	spin_lock_init(&mn->lock);
	mn->mn.ops = &i915_gem_userptr_notifier;
	mn->objects = RB_ROOT_CACHED;
	mn->mm = mm;

	return mn;
}
static void
i915_gem_userptr_release__mmu_notifier(struct drm_i915_gem_object *obj)
{
	struct i915_mmu_object *mo;

	mo = fetch_and_zero(&obj->userptr.mmu_object);
	if (!mo)
		return;

	spin_lock(&mo->mn->lock);
	del_object(mo);
	spin_unlock(&mo->mn->lock);
	kfree(mo);
}
static struct i915_mmu_notifier *
i915_mmu_notifier_find(struct i915_mm_struct *mm)
{
	struct i915_mmu_notifier *mn;
	int err = 0;

	mn = mm->mn;
	if (mn)
		return mn;

	mn = i915_mmu_notifier_create(mm);
	if (IS_ERR(mn))
		err = PTR_ERR(mn);

	down_write(&mm->mm->mmap_sem);
	mutex_lock(&mm->i915->mm_lock);
	if (mm->mn == NULL && !err) {
		/* Protected by mmap_sem (write-lock) */
		err = __mmu_notifier_register(&mn->mn, mm->mm);
		if (!err) {
			/* Protected by mm_lock */
			mm->mn = fetch_and_zero(&mn);
		}
	} else if (mm->mn) {
		/*
		 * Someone else raced and successfully installed the mmu
		 * notifier, we can cancel our own errors.
		 */
		err = 0;
	}
	mutex_unlock(&mm->i915->mm_lock);
	up_write(&mm->mm->mmap_sem);

	if (mn && !IS_ERR(mn))
		kfree(mn);

	return err ? ERR_PTR(err) : mm->mn;
}
static int
i915_gem_userptr_init__mmu_notifier(struct drm_i915_gem_object *obj,
				    unsigned int flags)
{
	struct i915_mmu_notifier *mn;
	struct i915_mmu_object *mo;

	if (flags & I915_USERPTR_UNSYNCHRONIZED)
		return capable(CAP_SYS_ADMIN) ? 0 : -EPERM;

	if (WARN_ON(obj->userptr.mm == NULL))
		return -EINVAL;

	mn = i915_mmu_notifier_find(obj->userptr.mm);
	if (IS_ERR(mn))
		return PTR_ERR(mn);

	mo = kzalloc(sizeof(*mo), GFP_KERNEL);
	if (!mo)
		return -ENOMEM;

	mo->mn = mn;
	mo->obj = obj;
	mo->it.start = obj->userptr.ptr;
	mo->it.last = obj->userptr.ptr + obj->base.size - 1;
	RB_CLEAR_NODE(&mo->it.rb);

	obj->userptr.mmu_object = mo;
	return 0;
}
static void
i915_mmu_notifier_free(struct i915_mmu_notifier *mn,
		       struct mm_struct *mm)
{
	if (mn == NULL)
		return;

	mmu_notifier_unregister(&mn->mn, mm);
	kfree(mn);
}
#else

static void
__i915_gem_userptr_set_active(struct drm_i915_gem_object *obj, bool value)
{
}

static void
i915_gem_userptr_release__mmu_notifier(struct drm_i915_gem_object *obj)
{
}

static int
i915_gem_userptr_init__mmu_notifier(struct drm_i915_gem_object *obj,
				    unsigned int flags)
{
	if ((flags & I915_USERPTR_UNSYNCHRONIZED) == 0)
		return -ENODEV;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	return 0;
}

static void
i915_mmu_notifier_free(struct i915_mmu_notifier *mn,
		       struct mm_struct *mm)
{
}

#endif
static struct i915_mm_struct *
__i915_mm_struct_find(struct drm_i915_private *dev_priv, struct mm_struct *real)
{
	struct i915_mm_struct *mm;

	/* Protected by dev_priv->mm_lock */
	hash_for_each_possible(dev_priv->mm_structs, mm, node, (unsigned long)real)
		if (mm->mm == real)
			return mm;

	return NULL;
}
static int
i915_gem_userptr_init__mm_struct(struct drm_i915_gem_object *obj)
{
	struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
	struct i915_mm_struct *mm;
	int ret = 0;

	/* During release of the GEM object we hold the struct_mutex. This
	 * precludes us from calling mmput() at that time as that may be
	 * the last reference and so call exit_mmap(). exit_mmap() will
	 * attempt to reap the vma, and if we were holding a GTT mmap
	 * would then call drm_gem_vm_close() and attempt to reacquire
	 * the struct mutex. So in order to avoid that recursion, we have
	 * to defer releasing the mm reference until after we drop the
	 * struct_mutex, i.e. we need to schedule a worker to do the clean
	 * up.
	 */
	mutex_lock(&dev_priv->mm_lock);
	mm = __i915_mm_struct_find(dev_priv, current->mm);
	if (mm == NULL) {
		mm = kmalloc(sizeof(*mm), GFP_KERNEL);
		if (mm == NULL) {
			ret = -ENOMEM;
			goto out;
		}

		kref_init(&mm->kref);
		mm->i915 = to_i915(obj->base.dev);

		mm->mm = current->mm;
		mmgrab(current->mm);

		mm->mn = NULL;

		/* Protected by dev_priv->mm_lock */
		hash_add(dev_priv->mm_structs,
			 &mm->node, (unsigned long)mm->mm);
	} else {
		kref_get(&mm->kref);
	}

	obj->userptr.mm = mm;
out:
	mutex_unlock(&dev_priv->mm_lock);

	return ret;
}
static void
__i915_mm_struct_free__worker(struct work_struct *work)
{
	struct i915_mm_struct *mm = container_of(work, typeof(*mm), work);

	i915_mmu_notifier_free(mm->mn, mm->mm);
	mmdrop(mm->mm);
	kfree(mm);
}
static void
__i915_mm_struct_free(struct kref *kref)
{
	struct i915_mm_struct *mm = container_of(kref, typeof(*mm), kref);

	/* Protected by dev_priv->mm_lock */
	hash_del(&mm->node);
	mutex_unlock(&mm->i915->mm_lock);

	INIT_WORK(&mm->work, __i915_mm_struct_free__worker);
	queue_work(mm->i915->mm.userptr_wq, &mm->work);
}
static void
i915_gem_userptr_release__mm_struct(struct drm_i915_gem_object *obj)
{
	if (obj->userptr.mm == NULL)
		return;

	kref_put_mutex(&obj->userptr.mm->kref,
		       __i915_mm_struct_free,
		       &to_i915(obj->base.dev)->mm_lock);
	obj->userptr.mm = NULL;
}
struct get_pages_work {
	struct work_struct work;
	struct drm_i915_gem_object *obj;
	struct task_struct *task;
};
static struct sg_table *
__i915_gem_userptr_alloc_pages(struct drm_i915_gem_object *obj,
			       struct page **pvec, unsigned long num_pages)
{
	unsigned int max_segment = i915_sg_segment_size();
	struct sg_table *st;
	unsigned int sg_page_sizes;
	int ret;

	st = kmalloc(sizeof(*st), GFP_KERNEL);
	if (!st)
		return ERR_PTR(-ENOMEM);

alloc_table:
	ret = __sg_alloc_table_from_pages(st, pvec, num_pages,
					  0, num_pages << PAGE_SHIFT,
					  max_segment,
					  GFP_KERNEL);
	if (ret) {
		kfree(st);
		return ERR_PTR(ret);
	}

	ret = i915_gem_gtt_prepare_pages(obj, st);
	if (ret) {
		sg_free_table(st);

		/* Retry with smaller segments if the DMA remapper balks */
		if (max_segment > PAGE_SIZE) {
			max_segment = PAGE_SIZE;
			goto alloc_table;
		}

		kfree(st);
		return ERR_PTR(ret);
	}

	sg_page_sizes = i915_sg_page_sizes(st->sgl);

	__i915_gem_object_set_pages(obj, st, sg_page_sizes);

	return st;
}
static void
__i915_gem_userptr_get_pages_worker(struct work_struct *_work)
{
	struct get_pages_work *work = container_of(_work, typeof(*work), work);
	struct drm_i915_gem_object *obj = work->obj;
	const unsigned long npages = obj->base.size >> PAGE_SHIFT;
	unsigned long pinned;
	struct page **pvec;
	int ret;

	ret = -ENOMEM;
	pinned = 0;

	pvec = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
	if (pvec != NULL) {
		struct mm_struct *mm = obj->userptr.mm->mm;
		unsigned int flags = 0;
		int locked = 0;

		if (!i915_gem_object_is_readonly(obj))
			flags |= FOLL_WRITE;

		ret = -EFAULT;
		if (mmget_not_zero(mm)) {
			while (pinned < npages) {
				if (!locked) {
					down_read(&mm->mmap_sem);
					locked = 1;
				}
				ret = get_user_pages_remote
					(work->task, mm,
					 obj->userptr.ptr + pinned * PAGE_SIZE,
					 npages - pinned,
					 flags,
					 pvec + pinned, NULL, &locked);
				if (ret < 0)
					break;

				pinned += ret;
			}
			if (locked)
				up_read(&mm->mmap_sem);
			mmput(mm);
		}
	}

	mutex_lock_nested(&obj->mm.lock, I915_MM_GET_PAGES);
	if (obj->userptr.work == &work->work) {
		struct sg_table *pages = ERR_PTR(ret);

		if (pinned == npages) {
			pages = __i915_gem_userptr_alloc_pages(obj, pvec,
							       npages);
			if (!IS_ERR(pages)) {
				pinned = 0;
				pages = NULL;
			}
		}

		obj->userptr.work = ERR_CAST(pages);
		if (IS_ERR(pages))
			__i915_gem_userptr_set_active(obj, false);
	}
	mutex_unlock(&obj->mm.lock);

	release_pages(pvec, pinned);
	kvfree(pvec);

	i915_gem_object_put(obj);
	put_task_struct(work->task);
	kfree(work);
}
static struct sg_table *
__i915_gem_userptr_get_pages_schedule(struct drm_i915_gem_object *obj)
{
	struct get_pages_work *work;

	/* Spawn a worker so that we can acquire the
	 * user pages without holding our mutex. Access
	 * to the user pages requires mmap_sem, and we have
	 * a strict lock ordering of mmap_sem, struct_mutex -
	 * we already hold struct_mutex here and so cannot
	 * call gup without encountering a lock inversion.
	 *
	 * Userspace will keep on repeating the operation
	 * (thanks to EAGAIN) until either we hit the fast
	 * path or the worker completes. If the worker is
	 * cancelled or superseded, the task is still run
	 * but the results ignored. (This leads to
	 * complications that we may have a stray object
	 * refcount that we need to be wary of when
	 * checking for existing objects during creation.)
	 * If the worker encounters an error, it reports
	 * that error back to this function through
	 * obj->userptr.work = ERR_PTR.
	 */
	work = kmalloc(sizeof(*work), GFP_KERNEL);
	if (work == NULL)
		return ERR_PTR(-ENOMEM);

	obj->userptr.work = &work->work;

	work->obj = i915_gem_object_get(obj);

	work->task = current;
	get_task_struct(work->task);

	INIT_WORK(&work->work, __i915_gem_userptr_get_pages_worker);
	queue_work(to_i915(obj->base.dev)->mm.userptr_wq, &work->work);

	return ERR_PTR(-EAGAIN);
}
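/*
 * A minimal sketch of the retry contract described above, from the point of
 * view of a hypothetical caller pinning the backing store (illustrative
 * only):
 *
 *	int err;
 *
 *	do {
 *		err = i915_gem_object_pin_pages(obj);
 *	} while (err == -EAGAIN);
 *
 * i915_gem_object_pin_pages() is the usual entry point that ends up in
 * i915_gem_userptr_get_pages(); in practice the repetition comes from
 * userspace re-issuing the operation rather than from a busy loop in the
 * kernel.
 */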
static int i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj)
{
	const unsigned long num_pages = obj->base.size >> PAGE_SHIFT;
	struct mm_struct *mm = obj->userptr.mm->mm;
	struct page **pvec;
	struct sg_table *pages;
	bool active;
	int pinned;

	/* If userspace should engineer that these pages are replaced in
	 * the vma between us binding this page into the GTT and completion
	 * of rendering... Their loss. If they change the mapping of their
	 * pages they need to create a new bo to point to the new vma.
	 *
	 * However, that still leaves open the possibility of the vma
	 * being copied upon fork. Which falls under the same userspace
	 * synchronisation issue as a regular bo, except that this time
	 * the process may not be expecting that a particular piece of
	 * memory is tied to the GPU.
	 *
	 * Fortunately, we can hook into the mmu_notifier in order to
	 * discard the page references prior to anything nasty happening
	 * to the vma (discard or cloning) which should prevent the more
	 * egregious cases from causing harm.
	 */

	if (obj->userptr.work) {
		/* active flag should still be held for the pending work */
		if (IS_ERR(obj->userptr.work))
			return PTR_ERR(obj->userptr.work);
		else
			return -EAGAIN;
	}

	pvec = NULL;
	pinned = 0;

	if (mm == current->mm) {
		pvec = kvmalloc_array(num_pages, sizeof(struct page *),
				      GFP_KERNEL |
				      __GFP_NORETRY |
				      __GFP_NOWARN);
		if (pvec) /* defer to worker if malloc fails */
			pinned = __get_user_pages_fast(obj->userptr.ptr,
						       num_pages,
						       !i915_gem_object_is_readonly(obj),
						       pvec);
	}

	active = false;
	if (pinned < 0) {
		pages = ERR_PTR(pinned);
		pinned = 0;
	} else if (pinned < num_pages) {
		pages = __i915_gem_userptr_get_pages_schedule(obj);
		active = pages == ERR_PTR(-EAGAIN);
	} else {
		pages = __i915_gem_userptr_alloc_pages(obj, pvec, num_pages);
		active = !IS_ERR(pages);
	}
	if (active)
		__i915_gem_userptr_set_active(obj, true);

	if (IS_ERR(pages))
		release_pages(pvec, pinned);
	kvfree(pvec);

	return PTR_ERR_OR_ZERO(pages);
}
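/*
 * To summarise the paths above: when the caller's mm matches the userptr's
 * mm we attempt a fast __get_user_pages_fast() pin inline; anything short of
 * a full pin falls back to the worker (returning -EAGAIN while it runs); and
 * only a fully pinned page array is converted into an sg_table immediately.
 */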
static void
i915_gem_userptr_put_pages(struct drm_i915_gem_object *obj,
			   struct sg_table *pages)
{
	struct sgt_iter sgt_iter;
	struct page *page;

	/* Cancel any inflight work and force them to restart their gup */
	obj->userptr.work = NULL;
	__i915_gem_userptr_set_active(obj, false);
	if (!pages)
		return;

	__i915_gem_object_release_shmem(obj, pages, true);
	i915_gem_gtt_finish_pages(obj, pages);

	/*
	 * We always mark objects as dirty when they are used by the GPU,
	 * just in case. However, if we set the vma as being read-only we know
	 * that the object will never have been written to.
	 */
	if (i915_gem_object_is_readonly(obj))
		obj->mm.dirty = false;

	for_each_sgt_page(page, sgt_iter, pages) {
		if (obj->mm.dirty && trylock_page(page)) {
			/*
			 * As this may not be anonymous memory (e.g. shmem)
			 * but exist on a real mapping, we have to lock
			 * the page in order to dirty it -- holding
			 * the page reference is not sufficient to
			 * prevent the inode from being truncated.
			 * Play safe and take the lock.
			 *
			 * The mmu-notifier can be invalidated for a
			 * migrate_page, that is already holding the lock
			 * on the page. Such a try_to_unmap() will result
			 * in us calling put_pages() and so recursively try
			 * to lock the page. We avoid that deadlock with
			 * a trylock_page() and in exchange we risk missing
			 * some page dirtying.
			 */
			set_page_dirty(page);
			unlock_page(page);
		}

		mark_page_accessed(page);
		put_page(page);
	}
	obj->mm.dirty = false;

	sg_free_table(pages);
	kfree(pages);
}
static void
i915_gem_userptr_release(struct drm_i915_gem_object *obj)
{
	i915_gem_userptr_release__mmu_notifier(obj);
	i915_gem_userptr_release__mm_struct(obj);
}
static int
i915_gem_userptr_dmabuf_export(struct drm_i915_gem_object *obj)
{
	if (obj->userptr.mmu_object)
		return 0;

	return i915_gem_userptr_init__mmu_notifier(obj, 0);
}
static const struct drm_i915_gem_object_ops i915_gem_userptr_ops = {
	.flags = I915_GEM_OBJECT_HAS_STRUCT_PAGE |
		 I915_GEM_OBJECT_IS_SHRINKABLE |
		 I915_GEM_OBJECT_NO_GGTT |
		 I915_GEM_OBJECT_ASYNC_CANCEL,
	.get_pages = i915_gem_userptr_get_pages,
	.put_pages = i915_gem_userptr_put_pages,
	.dmabuf_export = i915_gem_userptr_dmabuf_export,
	.release = i915_gem_userptr_release,
};
/*
 * Creates a new mm object that wraps some normal memory from the process
 * context - user memory.
 *
 * We impose several restrictions upon the memory being mapped
 * into the GPU.
 * 1. It must be page aligned (both start/end addresses, i.e. ptr and size).
 * 2. It must be normal system memory, not a pointer into another map of IO
 *    space (e.g. it must not be a GTT mmapping of another object).
 * 3. We only allow a bo as large as we could in theory map into the GTT,
 *    that is we limit the size to the total size of the GTT.
 * 4. The bo is marked as being snoopable. The backing pages are left
 *    accessible directly by the CPU, but reads and writes by the GPU may
 *    incur the cost of a snoop (unless you have an LLC architecture).
 *
 * Synchronisation between multiple users and the GPU is left to userspace
 * through the normal set-domain-ioctl. The kernel will enforce that the
 * GPU relinquishes the VMA before it is returned back to the system
 * i.e. upon free(), munmap() or process termination. However, the userspace
 * malloc() library may not immediately relinquish the VMA after free() and
 * instead reuse it whilst the GPU is still reading and writing to the VMA.
 * Caveat emptor.
 *
 * Also note, that the object created here is not currently a "first class"
 * object, in that several ioctls are banned. These are the CPU access
 * ioctls: mmap(), pwrite and pread. In practice, you are expected to use
 * direct access via your pointer rather than use those ioctls. Another
 * restriction is that we do not allow userptr surfaces to be pinned to the
 * hardware and so we reject any attempt to create a framebuffer out of a
 * userptr.
 *
 * If you think this is a good interface to use to pass GPU memory between
 * drivers, please use dma-buf instead. In fact, wherever possible use
 * dma-buf instead. (An illustrative userspace sketch follows the function
 * below.)
 */
int
i915_gem_userptr_ioctl(struct drm_device *dev,
		       void *data,
		       struct drm_file *file)
{
	static struct lock_class_key lock_class;
	struct drm_i915_private *dev_priv = to_i915(dev);
	struct drm_i915_gem_userptr *args = data;
	struct drm_i915_gem_object *obj;
	u32 handle;
	int ret;

	if (!HAS_LLC(dev_priv) && !HAS_SNOOP(dev_priv)) {
		/* We cannot support coherent userptr objects on hw without
		 * LLC and broken snooping.
		 */
		return -ENODEV;
	}

	if (args->flags & ~(I915_USERPTR_READ_ONLY |
			    I915_USERPTR_UNSYNCHRONIZED))
		return -EINVAL;

	if (!args->user_size)
		return -EINVAL;

	if (offset_in_page(args->user_ptr | args->user_size))
		return -EINVAL;

	if (!access_ok((char __user *)(unsigned long)args->user_ptr, args->user_size))
		return -EFAULT;

	if (args->flags & I915_USERPTR_READ_ONLY) {
		/*
		 * On almost all of the older hw, we cannot tell the GPU that
		 * a page is readonly.
		 */
		if (!dev_priv->gt.vm->has_read_only)
			return -ENODEV;
	}

	obj = i915_gem_object_alloc();
	if (obj == NULL)
		return -ENOMEM;

	drm_gem_private_object_init(dev, &obj->base, args->user_size);
	i915_gem_object_init(obj, &i915_gem_userptr_ops, &lock_class);
	obj->read_domains = I915_GEM_DOMAIN_CPU;
	obj->write_domain = I915_GEM_DOMAIN_CPU;
	i915_gem_object_set_cache_coherency(obj, I915_CACHE_LLC);

	obj->userptr.ptr = args->user_ptr;
	if (args->flags & I915_USERPTR_READ_ONLY)
		i915_gem_object_set_readonly(obj);

	/* And keep a pointer to the current->mm for resolving the user pages
	 * at binding. This means that we need to hook into the mmu_notifier
	 * in order to detect if the mmu is destroyed.
	 */
	ret = i915_gem_userptr_init__mm_struct(obj);
	if (ret == 0)
		ret = i915_gem_userptr_init__mmu_notifier(obj, args->flags);
	if (ret == 0)
		ret = drm_gem_handle_create(file, &obj->base, &handle);

	/* drop reference from allocate - handle holds it now */
	i915_gem_object_put(obj);
	if (ret)
		return ret;

	args->handle = handle;
	return 0;
}
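/*
 * A minimal userspace sketch of the ioctl above (illustrative only: error
 * handling is trimmed and opening the DRM render node is assumed). The
 * structure layout and ioctl number come from the uapi header
 * include/uapi/drm/i915_drm.h.
 *
 *	#include <stdint.h>
 *	#include <stdlib.h>
 *	#include <string.h>
 *	#include <sys/ioctl.h>
 *	#include <drm/i915_drm.h>
 *
 *	static uint32_t create_userptr_bo(int drm_fd, size_t size)
 *	{
 *		struct drm_i915_gem_userptr arg;
 *		void *ptr;
 *
 *		// Restriction 1: both ptr and size must be page aligned.
 *		if (posix_memalign(&ptr, 4096, size))
 *			return 0;
 *
 *		memset(&arg, 0, sizeof(arg));
 *		arg.user_ptr = (uintptr_t)ptr;
 *		arg.user_size = size;
 *		arg.flags = 0; // optionally I915_USERPTR_READ_ONLY
 *
 *		if (ioctl(drm_fd, DRM_IOCTL_I915_GEM_USERPTR, &arg))
 *			return 0;
 *
 *		return arg.handle; // GEM handle wrapping the user memory
 *	}
 */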
int i915_gem_init_userptr(struct drm_i915_private *dev_priv)
{
	mutex_init(&dev_priv->mm_lock);
	hash_init(dev_priv->mm_structs);

	dev_priv->mm.userptr_wq =
		alloc_workqueue("i915-userptr-acquire",
				WQ_HIGHPRI | WQ_UNBOUND,
				0);
	if (!dev_priv->mm.userptr_wq)
		return -ENOMEM;

	return 0;
}
void i915_gem_cleanup_userptr(struct drm_i915_private *dev_priv)
{
	destroy_workqueue(dev_priv->mm.userptr_wq);
}