/*
 * SPDX-License-Identifier: MIT
 *
 * Copyright © 2012-2014 Intel Corporation
 */
#include <linux/mmu_context.h>
#include <linux/mmu_notifier.h>
#include <linux/mempolicy.h>
#include <linux/swap.h>
#include <linux/sched/mm.h>

#include "i915_gem_ioctls.h"
#include "i915_gem_object.h"
#include "i915_scatterlist.h"
struct i915_mm_struct {
	struct mm_struct *mm;
	struct drm_i915_private *i915;
	struct i915_mmu_notifier *mn;
	struct hlist_node node;
	struct kref kref;
	struct rcu_work work;
};
#if defined(CONFIG_MMU_NOTIFIER)
#include <linux/interval_tree.h>
struct i915_mmu_notifier {
	spinlock_t lock;
	struct hlist_node node;
	struct mmu_notifier mn;
	struct rb_root_cached objects;
	struct i915_mm_struct *mm;
};
struct i915_mmu_object {
	struct i915_mmu_notifier *mn;
	struct drm_i915_gem_object *obj;
	struct interval_tree_node it;
};
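/*
 * How these pieces fit together: each userptr object owns one
 * i915_mmu_object whose interval_tree_node covers the object's user
 * address range. The node is kept in the rb_root_cached of the
 * per-process i915_mmu_notifier, so the invalidation callback can find
 * every object overlapping an invalidated range.
 */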
static void add_object(struct i915_mmu_object *mo)
{
	GEM_BUG_ON(!RB_EMPTY_NODE(&mo->it.rb));
	interval_tree_insert(&mo->it, &mo->mn->objects);
}
static void del_object(struct i915_mmu_object *mo)
{
	if (RB_EMPTY_NODE(&mo->it.rb))
		return;

	interval_tree_remove(&mo->it, &mo->mn->objects);
	RB_CLEAR_NODE(&mo->it.rb);
}
static void
__i915_gem_userptr_set_active(struct drm_i915_gem_object *obj, bool value)
{
	struct i915_mmu_object *mo = obj->userptr.mmu_object;

	/*
	 * During mm_invalidate_range we need to cancel any userptr that
	 * overlaps the range being invalidated. Doing so requires the
	 * struct_mutex, and that risks recursion. In order to cause
	 * recursion, the user must alias the userptr address space with
	 * a GTT mmapping (possible with a MAP_FIXED) - then when we have
	 * to invalidate that mmapping, mm_invalidate_range is called with
	 * the userptr address *and* the struct_mutex held. To prevent that
	 * we set a flag under the i915_mmu_notifier spinlock to indicate
	 * whether this object is valid.
	 */
	if (!mo)
		return;

	spin_lock(&mo->mn->lock);
	if (value)
		add_object(mo);
	else
		del_object(mo);
	spin_unlock(&mo->mn->lock);
}
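/*
 * "Active" here simply means the interval-tree node is linked: get_pages
 * marks the object active once its pages are pinned, while put_pages (and
 * the worker's error path) clears it again, so only objects with live page
 * references are visible to userptr_mn_invalidate_range_start().
 */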
static int
userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
				  const struct mmu_notifier_range *range)
{
	struct i915_mmu_notifier *mn =
		container_of(_mn, struct i915_mmu_notifier, mn);
	struct interval_tree_node *it;
	unsigned long end;
	int ret = 0;

	if (RB_EMPTY_ROOT(&mn->objects.rb_root))
		return 0;

	/* interval ranges are inclusive, but invalidate range is exclusive */
	end = range->end - 1;

	spin_lock(&mn->lock);
	it = interval_tree_iter_first(&mn->objects, range->start, end);
	while (it) {
		struct drm_i915_gem_object *obj;

		if (!mmu_notifier_range_blockable(range)) {
			ret = -EAGAIN;
			break;
		}

		/*
		 * The mmu_object is released late when destroying the
		 * GEM object so it is entirely possible to gain a
		 * reference on an object in the process of being freed
		 * since our serialisation is via the spinlock and not
		 * the struct_mutex - and consequently use it after it
		 * is freed and then double free it. To prevent that
		 * use-after-free we only acquire a reference on the
		 * object if it is not in the process of being destroyed.
		 */
		obj = container_of(it, struct i915_mmu_object, it)->obj;
		if (!kref_get_unless_zero(&obj->base.refcount)) {
			it = interval_tree_iter_next(it, range->start, end);
			continue;
		}
		spin_unlock(&mn->lock);

		ret = i915_gem_object_unbind(obj,
					     I915_GEM_OBJECT_UNBIND_ACTIVE |
					     I915_GEM_OBJECT_UNBIND_BARRIER);
		if (ret == 0)
			ret = __i915_gem_object_put_pages(obj);
		i915_gem_object_put(obj);
		if (ret)
			return ret;

		spin_lock(&mn->lock);

		/*
		 * As we do not (yet) protect the mmu from concurrent insertion
		 * over this range, there is no guarantee that this search will
		 * terminate given a pathologic workload.
		 */
		it = interval_tree_iter_first(&mn->objects, range->start, end);
	}
	spin_unlock(&mn->lock);

	return ret;
}
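/*
 * Note that non-blockable invalidations bail out with -EAGAIN rather than
 * sleeping in unbind, and that after dropping the spinlock to unbind an
 * object the interval-tree walk is restarted from the beginning, since the
 * tree may have changed in the meantime.
 */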
static const struct mmu_notifier_ops i915_gem_userptr_notifier = {
	.invalidate_range_start = userptr_mn_invalidate_range_start,
};
static struct i915_mmu_notifier *
i915_mmu_notifier_create(struct i915_mm_struct *mm)
{
	struct i915_mmu_notifier *mn;

	mn = kmalloc(sizeof(*mn), GFP_KERNEL);
	if (mn == NULL)
		return ERR_PTR(-ENOMEM);

	spin_lock_init(&mn->lock);
	mn->mn.ops = &i915_gem_userptr_notifier;
	mn->objects = RB_ROOT_CACHED;
	mn->mm = mm;

	return mn;
}
static void
i915_gem_userptr_release__mmu_notifier(struct drm_i915_gem_object *obj)
{
	struct i915_mmu_object *mo;

	mo = fetch_and_zero(&obj->userptr.mmu_object);
	if (!mo)
		return;

	spin_lock(&mo->mn->lock);
	del_object(mo);
	spin_unlock(&mo->mn->lock);

	kfree(mo);
}
static struct i915_mmu_notifier *
i915_mmu_notifier_find(struct i915_mm_struct *mm)
{
	struct i915_mmu_notifier *mn, *old;
	int err;

	mn = READ_ONCE(mm->mn);
	if (likely(mn))
		return mn;

	mn = i915_mmu_notifier_create(mm);
	if (IS_ERR(mn))
		return mn;

	err = mmu_notifier_register(&mn->mn, mm->mm);
	if (err) {
		kfree(mn);
		return ERR_PTR(err);
	}

	old = cmpxchg(&mm->mn, NULL, mn);
	if (old) {
		mmu_notifier_unregister(&mn->mn, mm->mm);
		kfree(mn);
		mn = old;
	}

	return mn;
}
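/*
 * The lockless cmpxchg() above means two threads may race to install a
 * notifier for the same mm; the loser unregisters and frees its copy and
 * adopts the winner's instead.
 */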
static int
i915_gem_userptr_init__mmu_notifier(struct drm_i915_gem_object *obj,
				    unsigned flags)
{
	struct i915_mmu_notifier *mn;
	struct i915_mmu_object *mo;

	if (flags & I915_USERPTR_UNSYNCHRONIZED)
		return capable(CAP_SYS_ADMIN) ? 0 : -EPERM;

	if (GEM_WARN_ON(!obj->userptr.mm))
		return -EINVAL;

	mn = i915_mmu_notifier_find(obj->userptr.mm);
	if (IS_ERR(mn))
		return PTR_ERR(mn);

	mo = kzalloc(sizeof(*mo), GFP_KERNEL);
	if (!mo)
		return -ENOMEM;

	mo->mn = mn;
	mo->obj = obj;
	mo->it.start = obj->userptr.ptr;
	mo->it.last = obj->userptr.ptr + obj->base.size - 1;
	RB_CLEAR_NODE(&mo->it.rb);

	obj->userptr.mmu_object = mo;
	return 0;
}
static void
i915_mmu_notifier_free(struct i915_mmu_notifier *mn,
		       struct mm_struct *mm)
{
	if (mn == NULL)
		return;

	mmu_notifier_unregister(&mn->mn, mm);
	kfree(mn);
}
#else

static void
__i915_gem_userptr_set_active(struct drm_i915_gem_object *obj, bool value)
{
}

static void
i915_gem_userptr_release__mmu_notifier(struct drm_i915_gem_object *obj)
{
}

static int
i915_gem_userptr_init__mmu_notifier(struct drm_i915_gem_object *obj,
				    unsigned flags)
{
	if ((flags & I915_USERPTR_UNSYNCHRONIZED) == 0)
		return -ENODEV;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	return 0;
}

static void
i915_mmu_notifier_free(struct i915_mmu_notifier *mn,
		       struct mm_struct *mm)
{
}

#endif /* CONFIG_MMU_NOTIFIER */
static struct i915_mm_struct *
__i915_mm_struct_find(struct drm_i915_private *i915, struct mm_struct *real)
{
	struct i915_mm_struct *it, *mm = NULL;

	rcu_read_lock();
	hash_for_each_possible_rcu(i915->mm_structs,
				   it, node,
				   (unsigned long)real)
		if (it->mm == real && kref_get_unless_zero(&it->kref)) {
			mm = it;
			break;
		}
	rcu_read_unlock();

	return mm;
}
)) {
313 i915_gem_userptr_init__mm_struct(struct drm_i915_gem_object
*obj
)
315 struct drm_i915_private
*i915
= to_i915(obj
->base
.dev
);
316 struct i915_mm_struct
*mm
, *new;
319 /* During release of the GEM object we hold the struct_mutex. This
320 * precludes us from calling mmput() at that time as that may be
321 * the last reference and so call exit_mmap(). exit_mmap() will
322 * attempt to reap the vma, and if we were holding a GTT mmap
323 * would then call drm_gem_vm_close() and attempt to reacquire
324 * the struct mutex. So in order to avoid that recursion, we have
325 * to defer releasing the mm reference until after we drop the
326 * struct_mutex, i.e. we need to schedule a worker to do the clean
329 mm
= __i915_mm_struct_find(i915
, current
->mm
);
333 new = kmalloc(sizeof(*mm
), GFP_KERNEL
);
337 kref_init(&new->kref
);
338 new->i915
= to_i915(obj
->base
.dev
);
339 new->mm
= current
->mm
;
342 spin_lock(&i915
->mm_lock
);
343 mm
= __i915_mm_struct_find(i915
, current
->mm
);
345 hash_add_rcu(i915
->mm_structs
,
347 (unsigned long)new->mm
);
351 spin_unlock(&i915
->mm_lock
);
356 obj
->userptr
.mm
= mm
;
static void
__i915_mm_struct_free__worker(struct work_struct *work)
{
	struct i915_mm_struct *mm = container_of(work, typeof(*mm), work.work);

	i915_mmu_notifier_free(mm->mn, mm->mm);
	mmdrop(mm->mm);
	kfree(mm);
}
static void
__i915_mm_struct_free(struct kref *kref)
{
	struct i915_mm_struct *mm = container_of(kref, typeof(*mm), kref);

	spin_lock(&mm->i915->mm_lock);
	hash_del_rcu(&mm->node);
	spin_unlock(&mm->i915->mm_lock);

	INIT_RCU_WORK(&mm->work, __i915_mm_struct_free__worker);
	queue_rcu_work(system_wq, &mm->work);
}
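/*
 * Freeing is deferred to an rcu_work: queue_rcu_work() waits out an RCU
 * grace period so concurrent lockless hash walkers can no longer observe
 * the entry, and the actual teardown (notifier unregister, mmdrop) then
 * runs in sleepable worker context.
 */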
static void
i915_gem_userptr_release__mm_struct(struct drm_i915_gem_object *obj)
{
	if (obj->userptr.mm == NULL)
		return;

	kref_put(&obj->userptr.mm->kref, __i915_mm_struct_free);
	obj->userptr.mm = NULL;
}
struct get_pages_work {
	struct work_struct work;
	struct drm_i915_gem_object *obj;
	struct task_struct *task;
};
static struct sg_table *
__i915_gem_userptr_alloc_pages(struct drm_i915_gem_object *obj,
			       struct page **pvec, unsigned long num_pages)
{
	unsigned int max_segment = i915_sg_segment_size();
	struct sg_table *st;
	unsigned int sg_page_sizes;
	struct scatterlist *sg;
	int ret;

	st = kmalloc(sizeof(*st), GFP_KERNEL);
	if (!st)
		return ERR_PTR(-ENOMEM);

alloc_table:
	sg = __sg_alloc_table_from_pages(st, pvec, num_pages, 0,
					 num_pages << PAGE_SHIFT, max_segment,
					 NULL, 0, GFP_KERNEL);
	if (IS_ERR(sg)) {
		kfree(st);
		return ERR_CAST(sg);
	}

	ret = i915_gem_gtt_prepare_pages(obj, st);
	if (ret) {
		sg_free_table(st);

		if (max_segment > PAGE_SIZE) {
			max_segment = PAGE_SIZE;
			goto alloc_table;
		}

		kfree(st);
		return ERR_PTR(ret);
	}

	sg_page_sizes = i915_sg_page_sizes(st->sgl);

	__i915_gem_object_set_pages(obj, st, sg_page_sizes);

	return st;
}
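/*
 * The retry above handles DMA remapping failures: if preparing the pages
 * with large coalesced segments fails, the sg_table is rebuilt with
 * PAGE_SIZE segments before giving up for good.
 */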
static void
__i915_gem_userptr_get_pages_worker(struct work_struct *_work)
{
	struct get_pages_work *work = container_of(_work, typeof(*work), work);
	struct drm_i915_gem_object *obj = work->obj;
	const unsigned long npages = obj->base.size >> PAGE_SHIFT;
	unsigned long pinned;
	struct page **pvec;
	int ret;

	ret = -ENOMEM;
	pinned = 0;
	pvec = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
	if (pvec != NULL) {
		struct mm_struct *mm = obj->userptr.mm->mm;
		unsigned int flags = 0;
		int locked = 0;

		if (!i915_gem_object_is_readonly(obj))
			flags |= FOLL_WRITE;

		ret = -EFAULT;
		if (mmget_not_zero(mm)) {
			while (pinned < npages) {
				if (!locked) {
					mmap_read_lock(mm);
					locked = 1;
				}
				ret = pin_user_pages_remote
					(mm,
					 obj->userptr.ptr + pinned * PAGE_SIZE,
					 npages - pinned,
					 flags,
					 pvec + pinned, NULL, &locked);
				if (ret < 0)
					break;

				pinned += ret;
			}
			if (locked)
				mmap_read_unlock(mm);
			mmput(mm);
		}
	}

	mutex_lock_nested(&obj->mm.lock, I915_MM_GET_PAGES);
	if (obj->userptr.work == &work->work) {
		struct sg_table *pages = ERR_PTR(ret);

		if (pinned == npages) {
			pages = __i915_gem_userptr_alloc_pages(obj, pvec,
							       npages);
			if (!IS_ERR(pages)) {
				pinned = 0;
				pages = NULL;
			}
		}

		obj->userptr.work = ERR_CAST(pages);
		if (IS_ERR(pages))
			__i915_gem_userptr_set_active(obj, false);
	}
	mutex_unlock(&obj->mm.lock);

	unpin_user_pages(pvec, pinned);
	kvfree(pvec);

	i915_gem_object_put(obj);
	put_task_struct(work->task);
	kfree(work);
}
static struct sg_table *
__i915_gem_userptr_get_pages_schedule(struct drm_i915_gem_object *obj)
{
	struct get_pages_work *work;

	/* Spawn a worker so that we can acquire the
	 * user pages without holding our mutex. Access
	 * to the user pages requires mmap_lock, and we have
	 * a strict lock ordering of mmap_lock, struct_mutex -
	 * we already hold struct_mutex here and so cannot
	 * call gup without encountering a lock inversion.
	 *
	 * Userspace will keep on repeating the operation
	 * (thanks to EAGAIN) until either we hit the fast
	 * path or the worker completes. If the worker is
	 * cancelled or superseded, the task is still run
	 * but the results ignored. (This leads to
	 * complications that we may have a stray object
	 * refcount that we need to be wary of when
	 * checking for existing objects during creation.)
	 * If the worker encounters an error, it reports
	 * that error back to this function through
	 * obj->userptr.work = ERR_PTR.
	 */
	work = kmalloc(sizeof(*work), GFP_KERNEL);
	if (work == NULL)
		return ERR_PTR(-ENOMEM);

	obj->userptr.work = &work->work;

	work->obj = i915_gem_object_get(obj);

	work->task = current;
	get_task_struct(work->task);

	INIT_WORK(&work->work, __i915_gem_userptr_get_pages_worker);
	queue_work(to_i915(obj->base.dev)->mm.userptr_wq, &work->work);

	return ERR_PTR(-EAGAIN);
}
static int i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj)
{
	const unsigned long num_pages = obj->base.size >> PAGE_SHIFT;
	struct mm_struct *mm = obj->userptr.mm->mm;
	struct page **pvec;
	struct sg_table *pages;
	bool active;
	int pinned;
	unsigned int gup_flags = 0;

	/* If userspace should engineer that these pages are replaced in
	 * the vma between us binding this page into the GTT and completion
	 * of rendering... Their loss. If they change the mapping of their
	 * pages they need to create a new bo to point to the new vma.
	 *
	 * However, that still leaves open the possibility of the vma
	 * being copied upon fork. Which falls under the same userspace
	 * synchronisation issue as a regular bo, except that this time
	 * the process may not be expecting that a particular piece of
	 * memory is tied to the GPU.
	 *
	 * Fortunately, we can hook into the mmu_notifier in order to
	 * discard the page references prior to anything nasty happening
	 * to the vma (discard or cloning) which should prevent the more
	 * egregious cases from causing harm.
	 */

	if (obj->userptr.work) {
		/* active flag should still be held for the pending work */
		if (IS_ERR(obj->userptr.work))
			return PTR_ERR(obj->userptr.work);
		else
			return -EAGAIN;
	}

	pvec = NULL;
	pinned = 0;

	if (mm == current->mm) {
		pvec = kvmalloc_array(num_pages, sizeof(struct page *),
				      GFP_KERNEL |
				      __GFP_NORETRY |
				      __GFP_NOWARN);
		if (pvec) {
			/* defer to worker if malloc fails */
			if (!i915_gem_object_is_readonly(obj))
				gup_flags |= FOLL_WRITE;
			pinned = pin_user_pages_fast_only(obj->userptr.ptr,
							  num_pages, gup_flags,
							  pvec);
		}
	}

	active = false;
	if (pinned < 0) {
		pages = ERR_PTR(pinned);
		pinned = 0;
	} else if (pinned < num_pages) {
		pages = __i915_gem_userptr_get_pages_schedule(obj);
		active = pages == ERR_PTR(-EAGAIN);
	} else {
		pages = __i915_gem_userptr_alloc_pages(obj, pvec, num_pages);
		active = !IS_ERR(pages);
	}
	if (active)
		__i915_gem_userptr_set_active(obj, true);

	if (IS_ERR(pages))
		unpin_user_pages(pvec, pinned);
	kvfree(pvec);

	return PTR_ERR_OR_ZERO(pages);
}
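/*
 * In short: if the pages belong to the calling process they are pinned
 * inline with pin_user_pages_fast_only(); otherwise, or if the fast pin
 * comes up short, the work is handed to the userptr workqueue and -EAGAIN
 * is returned so the caller retries once the worker has finished.
 */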
static void
i915_gem_userptr_put_pages(struct drm_i915_gem_object *obj,
			   struct sg_table *pages)
{
	struct sgt_iter sgt_iter;
	struct page *page;

	/* Cancel any inflight work and force them to restart their gup */
	obj->userptr.work = NULL;
	__i915_gem_userptr_set_active(obj, false);
	if (!pages)
		return;

	__i915_gem_object_release_shmem(obj, pages, true);
	i915_gem_gtt_finish_pages(obj, pages);

	/*
	 * We always mark objects as dirty when they are used by the GPU,
	 * just in case. However, if we set the vma as being read-only we know
	 * that the object will never have been written to.
	 */
	if (i915_gem_object_is_readonly(obj))
		obj->mm.dirty = false;

	for_each_sgt_page(page, sgt_iter, pages) {
		if (obj->mm.dirty && trylock_page(page)) {
			/*
			 * As this may not be anonymous memory (e.g. shmem)
			 * but exist on a real mapping, we have to lock
			 * the page in order to dirty it -- holding
			 * the page reference is not sufficient to
			 * prevent the inode from being truncated.
			 * Play safe and take the lock.
			 *
			 * The mmu-notifier can be invalidated for a
			 * migrate_page, that is already holding the lock
			 * on the page. Such a try_to_unmap() will result
			 * in us calling put_pages() and so recursively try
			 * to lock the page. We avoid that deadlock with
			 * a trylock_page() and in exchange we risk missing
			 * some page dirtying.
			 */
			set_page_dirty(page);
			unlock_page(page);
		}

		mark_page_accessed(page);
		unpin_user_page(page);
	}
	obj->mm.dirty = false;

	sg_free_table(pages);
	kfree(pages);
}
static void
i915_gem_userptr_release(struct drm_i915_gem_object *obj)
{
	i915_gem_userptr_release__mmu_notifier(obj);
	i915_gem_userptr_release__mm_struct(obj);
}
static int
i915_gem_userptr_dmabuf_export(struct drm_i915_gem_object *obj)
{
	if (obj->userptr.mmu_object)
		return 0;

	return i915_gem_userptr_init__mmu_notifier(obj, 0);
}
static const struct drm_i915_gem_object_ops i915_gem_userptr_ops = {
	.name = "i915_gem_object_userptr",
	.flags = I915_GEM_OBJECT_HAS_STRUCT_PAGE |
		 I915_GEM_OBJECT_IS_SHRINKABLE |
		 I915_GEM_OBJECT_NO_MMAP |
		 I915_GEM_OBJECT_ASYNC_CANCEL,
	.get_pages = i915_gem_userptr_get_pages,
	.put_pages = i915_gem_userptr_put_pages,
	.dmabuf_export = i915_gem_userptr_dmabuf_export,
	.release = i915_gem_userptr_release,
};
/*
 * Creates a new mm object that wraps some normal memory from the process
 * context - user memory.
 *
 * We impose several restrictions upon the memory being mapped
 * into the GPU.
 * 1. It must be page aligned (both start/end addresses, i.e. ptr and size).
 * 2. It must be normal system memory, not a pointer into another map of IO
 *    space (e.g. it must not be a GTT mmapping of another object).
 * 3. We only allow a bo as large as we could in theory map into the GTT,
 *    that is we limit the size to the total size of the GTT.
 * 4. The bo is marked as being snoopable. The backing pages are left
 *    accessible directly by the CPU, but reads and writes by the GPU may
 *    incur the cost of a snoop (unless you have an LLC architecture).
 *
 * Synchronisation between multiple users and the GPU is left to userspace
 * through the normal set-domain-ioctl. The kernel will enforce that the
 * GPU relinquishes the VMA before it is returned back to the system
 * i.e. upon free(), munmap() or process termination. However, the userspace
 * malloc() library may not immediately relinquish the VMA after free() and
 * instead reuse it whilst the GPU is still reading and writing to the VMA.
 *
 * Also note, that the object created here is not currently a "first class"
 * object, in that several ioctls are banned. These are the CPU access
 * ioctls: mmap(), pwrite and pread. In practice, you are expected to use
 * direct access via your pointer rather than use those ioctls. Another
 * restriction is that we do not allow userptr surfaces to be pinned to the
 * hardware and so we reject any attempt to create a framebuffer out of a
 * userptr.
 *
 * If you think this is a good interface to use to pass GPU memory between
 * drivers, please use dma-buf instead. In fact, wherever possible use
 * dma-buf instead.
 */
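/*
 * Illustrative userspace usage (a sketch, not part of this driver): the
 * entry point below is reached through DRM_IOCTL_I915_GEM_USERPTR with a
 * struct drm_i915_gem_userptr from the i915 uapi header, roughly:
 *
 *	struct drm_i915_gem_userptr arg = {
 *		.user_ptr = (uintptr_t)ptr,	// page-aligned allocation
 *		.user_size = size,		// multiple of the page size
 *		.flags = 0,			// or I915_USERPTR_READ_ONLY
 *	};
 *	if (ioctl(fd, DRM_IOCTL_I915_GEM_USERPTR, &arg) == 0)
 *		use_gem_handle(arg.handle);	// hypothetical helper
 */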
int
i915_gem_userptr_ioctl(struct drm_device *dev,
		       void *data,
		       struct drm_file *file)
{
	static struct lock_class_key lock_class;
	struct drm_i915_private *dev_priv = to_i915(dev);
	struct drm_i915_gem_userptr *args = data;
	struct drm_i915_gem_object *obj;
	u32 handle;
	int ret;

	if (!HAS_LLC(dev_priv) && !HAS_SNOOP(dev_priv)) {
		/* We cannot support coherent userptr objects on hw without
		 * LLC and broken snooping.
		 */
		return -ENODEV;
	}

	if (args->flags & ~(I915_USERPTR_READ_ONLY |
			    I915_USERPTR_UNSYNCHRONIZED))
		return -EINVAL;

	/*
	 * XXX: There is a prevalence of the assumption that we fit the
	 * object's page count inside a 32bit _signed_ variable. Let's document
	 * this and catch if we ever need to fix it. In the meantime, if you do
	 * spot such a local variable, please consider fixing!
	 *
	 * Aside from our own locals (for which we have no excuse!):
	 * - sg_table embeds unsigned int for num_pages
	 * - get_user_pages*() mixed ints with longs
	 */
	if (args->user_size >> PAGE_SHIFT > INT_MAX)
		return -E2BIG;

	if (overflows_type(args->user_size, obj->base.size))
		return -E2BIG;

	if (!args->user_size)
		return -EINVAL;

	if (offset_in_page(args->user_ptr | args->user_size))
		return -EINVAL;

	if (!access_ok((char __user *)(unsigned long)args->user_ptr, args->user_size))
		return -EFAULT;

	if (args->flags & I915_USERPTR_READ_ONLY) {
		/*
		 * On almost all of the older hw, we cannot tell the GPU that
		 * a page is readonly.
		 */
		if (!dev_priv->gt.vm->has_read_only)
			return -ENODEV;
	}

	obj = i915_gem_object_alloc();
	if (obj == NULL)
		return -ENOMEM;

	drm_gem_private_object_init(dev, &obj->base, args->user_size);
	i915_gem_object_init(obj, &i915_gem_userptr_ops, &lock_class);
	obj->read_domains = I915_GEM_DOMAIN_CPU;
	obj->write_domain = I915_GEM_DOMAIN_CPU;
	i915_gem_object_set_cache_coherency(obj, I915_CACHE_LLC);

	obj->userptr.ptr = args->user_ptr;
	if (args->flags & I915_USERPTR_READ_ONLY)
		i915_gem_object_set_readonly(obj);

	/* And keep a pointer to the current->mm for resolving the user pages
	 * at binding. This means that we need to hook into the mmu_notifier
	 * in order to detect if the mmu is destroyed.
	 */
	ret = i915_gem_userptr_init__mm_struct(obj);
	if (ret == 0)
		ret = i915_gem_userptr_init__mmu_notifier(obj, args->flags);
	if (ret == 0)
		ret = drm_gem_handle_create(file, &obj->base, &handle);

	/* drop reference from allocate - handle holds it now */
	i915_gem_object_put(obj);
	if (ret)
		return ret;

	args->handle = handle;
	return 0;
}
int i915_gem_init_userptr(struct drm_i915_private *dev_priv)
{
	spin_lock_init(&dev_priv->mm_lock);
	hash_init(dev_priv->mm_structs);

	dev_priv->mm.userptr_wq =
		alloc_workqueue("i915-userptr-acquire",
				WQ_HIGHPRI | WQ_UNBOUND,
				0);
	if (!dev_priv->mm.userptr_wq)
		return -ENOMEM;

	return 0;
}
void i915_gem_cleanup_userptr(struct drm_i915_private *dev_priv)
{
	destroy_workqueue(dev_priv->mm.userptr_wq);
}