// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>
#include <asm/shmparam.h>

#include "memmap.h"
#include "kbuf.h"
#include "rsrc.h"
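
/*
 * Try to back the whole allocation with one compound (higher order) page,
 * so the memory is physically contiguous and no vmap() is needed.
 */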
static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
				   size_t size, gfp_t gfp)
{
	struct page *page;
	int i, order;

	order = get_order(size);
	if (order > MAX_PAGE_ORDER)
		return ERR_PTR(-ENOMEM);
	else if (order)
		gfp |= __GFP_COMP;

	page = alloc_pages(gfp, order);
	if (!page)
		return ERR_PTR(-ENOMEM);

	for (i = 0; i < nr_pages; i++)
		pages[i] = page + i;

	return page_address(page);
}
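
/*
 * Fallback when a compound allocation fails: allocate the pages one at a
 * time and stitch them together with vmap(). Already-allocated pages are
 * released if a later allocation or the vmap() itself fails.
 */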
static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size,
				 gfp_t gfp)
{
	void *ret;
	int i;

	for (i = 0; i < nr_pages; i++) {
		pages[i] = alloc_page(gfp);
		if (!pages[i])
			goto err;
	}

	ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
	if (ret)
		return ret;
err:
	while (i--)
		put_page(pages[i]);
	return ERR_PTR(-ENOMEM);
}
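
/*
 * Allocate and map @size bytes of kernel memory, preferring a compound page
 * and falling back to individual pages. On success, the backing pages and
 * their count are returned through @out_pages and @npages for later mmap.
 */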
void *io_pages_map(struct page ***out_pages, unsigned short *npages,
		   size_t size)
{
	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
	struct page **pages;
	int nr_pages;
	void *ret;

	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp);
	if (!pages)
		return ERR_PTR(-ENOMEM);

	ret = io_mem_alloc_compound(pages, nr_pages, size, gfp);
	if (!IS_ERR(ret))
		goto done;
	if (nr_pages == 1)
		goto fail;

	ret = io_mem_alloc_single(pages, nr_pages, size, gfp);
	if (!IS_ERR(ret)) {
done:
		*out_pages = pages;
		*npages = nr_pages;
		return ret;
	}
fail:
	kvfree(pages);
	*out_pages = NULL;
	*npages = 0;
	return ret;
}
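
/*
 * Undo io_pages_map(): drop the page references and, if the pages were
 * stitched together with vmap(), remove that mapping as well.
 */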
void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
		    bool put_pages)
{
	bool do_vunmap = false;

	if (!ptr)
		return;

	if (put_pages && *npages) {
		struct page **to_free = *pages;
		int i;

		/*
		 * Only did vmap for the non-compound multiple page case.
		 * For the compound page, we just need to put the head.
		 */
		if (PageCompound(to_free[0]))
			*npages = 1;
		else if (*npages > 1)
			do_vunmap = true;
		for (i = 0; i < *npages; i++)
			put_page(to_free[i]);
	}
	if (do_vunmap)
		vunmap(ptr);
	kvfree(*pages);
	*pages = NULL;
	*npages = 0;
}
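
/*
 * Release pages previously pinned with io_pin_pages() and free the page
 * array itself.
 */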
void io_pages_free(struct page ***pages, int npages)
{
	struct page **page_array = *pages;

	if (!page_array)
		return;

	unpin_user_pages(page_array, npages);
	kvfree(page_array);
	*pages = NULL;
}
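
/*
 * Pin @len bytes of user memory starting at @uaddr for long-term use,
 * returning the page array and storing the page count in @npages. A partial
 * pin is unwound and treated as failure. Callers typically pair this with
 * io_pages_free(), e.g.:
 *
 *	pages = io_pin_pages(uaddr, size, &nr_pages);
 *	if (IS_ERR(pages))
 *		return PTR_ERR(pages);
 *	...
 *	io_pages_free(&pages, nr_pages);
 */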
struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
{
	unsigned long start, end, nr_pages;
	struct page **pages;
	int ret;

	if (check_add_overflow(uaddr, len, &end))
		return ERR_PTR(-EOVERFLOW);
	if (check_add_overflow(end, PAGE_SIZE - 1, &end))
		return ERR_PTR(-EOVERFLOW);

	end = end >> PAGE_SHIFT;
	start = uaddr >> PAGE_SHIFT;
	nr_pages = end - start;
	if (WARN_ON_ONCE(!nr_pages))
		return ERR_PTR(-EINVAL);
	if (WARN_ON_ONCE(nr_pages > INT_MAX))
		return ERR_PTR(-EOVERFLOW);

	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		return ERR_PTR(-ENOMEM);

	ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
				  pages);
	/* success, mapped all pages */
	if (ret == nr_pages) {
		*npages = nr_pages;
		return pages;
	}

	/* partial map, or didn't map anything */
	if (ret >= 0) {
		/* if we did partial map, release any pages we did get */
		if (ret)
			unpin_user_pages(pages, ret);
		ret = -EFAULT;
	}
	kvfree(pages);
	return ERR_PTR(ret);
}
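
/*
 * Pin user-provided memory and give the kernel a contiguous view of it via
 * vmap().
 */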
void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
		     unsigned long uaddr, size_t size)
{
	struct page **page_array;
	unsigned int nr_pages;
	void *page_addr;

	*npages = 0;

	if (uaddr & (PAGE_SIZE - 1) || !size)
		return ERR_PTR(-EINVAL);

	nr_pages = 0;
	page_array = io_pin_pages(uaddr, size, &nr_pages);
	if (IS_ERR(page_array))
		return page_array;

	page_addr = vmap(page_array, nr_pages, VM_MAP, PAGE_KERNEL);
	if (page_addr) {
		*pages = page_array;
		*npages = nr_pages;
		return page_addr;
	}

	io_pages_free(&page_array, nr_pages);
	return ERR_PTR(-ENOMEM);
}
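
/*
 * Tear down a mapped region: unpin the user pages, drop the vmap, and
 * unaccount the memory if it was charged to a user.
 */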
void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr)
{
	if (mr->pages) {
		unpin_user_pages(mr->pages, mr->nr_pages);
		kvfree(mr->pages);
	}
	if (mr->vmap_ptr)
		vunmap(mr->vmap_ptr);
	if (mr->nr_pages && ctx->user)
		__io_unaccount_mem(ctx->user, mr->nr_pages);

	memset(mr, 0, sizeof(*mr));
}
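
/*
 * Create a region from a userspace-supplied io_uring_region_desc: validate
 * the descriptor, pin and account the user pages, and vmap them so the
 * kernel has a linear mapping of the region.
 */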
int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
		     struct io_uring_region_desc *reg)
{
	int pages_accounted = 0;
	struct page **pages;
	int nr_pages, ret;
	void *vptr;
	u64 end;

	if (WARN_ON_ONCE(mr->pages || mr->vmap_ptr || mr->nr_pages))
		return -EFAULT;
	if (memchr_inv(&reg->__resv, 0, sizeof(reg->__resv)))
		return -EINVAL;
	if (reg->flags != IORING_MEM_REGION_TYPE_USER)
		return -EINVAL;
	if (!reg->user_addr)
		return -EFAULT;
	if (!reg->size || reg->mmap_offset || reg->id)
		return -EINVAL;
	if ((reg->size >> PAGE_SHIFT) > INT_MAX)
		return -E2BIG;
	if ((reg->user_addr | reg->size) & ~PAGE_MASK)
		return -EINVAL;
	if (check_add_overflow(reg->user_addr, reg->size, &end))
		return -EOVERFLOW;

	pages = io_pin_pages(reg->user_addr, reg->size, &nr_pages);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			goto out_free;
		pages_accounted = nr_pages;
	}

	vptr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
	if (!vptr) {
		ret = -ENOMEM;
		goto out_free;
	}

	mr->pages = pages;
	mr->vmap_ptr = vptr;
	mr->nr_pages = nr_pages;
	return 0;
out_free:
	if (pages_accounted)
		__io_unaccount_mem(ctx->user, pages_accounted);
	io_pages_free(&pages, nr_pages);
	return ret;
}
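
/*
 * Translate an mmap offset into the kernel address backing it, checking
 * that the requested ring, SQE array, or provided buffer ring may actually
 * be mapped. Returns an ERR_PTR() if the request is invalid.
 */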
static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff,
					    size_t sz)
{
	struct io_ring_ctx *ctx = file->private_data;
	loff_t offset = pgoff << PAGE_SHIFT;

	switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) {
	case IORING_OFF_SQ_RING:
	case IORING_OFF_CQ_RING:
		/* Don't allow mmap if the ring was setup without it */
		if (ctx->flags & IORING_SETUP_NO_MMAP)
			return ERR_PTR(-EINVAL);
		if (!ctx->rings)
			return ERR_PTR(-EFAULT);
		return ctx->rings;
	case IORING_OFF_SQES:
		/* Don't allow mmap if the ring was setup without it */
		if (ctx->flags & IORING_SETUP_NO_MMAP)
			return ERR_PTR(-EINVAL);
		if (!ctx->sq_sqes)
			return ERR_PTR(-EFAULT);
		return ctx->sq_sqes;
	case IORING_OFF_PBUF_RING: {
		struct io_buffer_list *bl;
		unsigned int bgid;
		void *ptr;

		bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
		bl = io_pbuf_get_bl(ctx, bgid);
		if (IS_ERR(bl))
			return bl;
		ptr = bl->buf_ring;
		io_put_bl(ctx, bl);
		return ptr;
	}
	}

	return ERR_PTR(-EINVAL);
}
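
/*
 * Insert an already-allocated page array into a VMA. VM_DONTEXPAND prevents
 * userspace from growing the mapping beyond what was allocated.
 */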
int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
			struct page **pages, int npages)
{
	unsigned long nr_pages = npages;

	vm_flags_set(vma, VM_DONTEXPAND);
	return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
}

#ifdef CONFIG_MMU
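
/*
 * mmap() handler for the io_uring fd (MMU case): map the SQ/CQ rings, the
 * SQE array, or a provided buffer ring into the caller's address space.
 */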
__cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct io_ring_ctx *ctx = file->private_data;
	size_t sz = vma->vm_end - vma->vm_start;
	long offset = vma->vm_pgoff << PAGE_SHIFT;
	unsigned int npages;
	void *ptr;

	guard(mutex)(&ctx->resize_lock);

	ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	switch (offset & IORING_OFF_MMAP_MASK) {
	case IORING_OFF_SQ_RING:
	case IORING_OFF_CQ_RING:
		npages = min(ctx->n_ring_pages, (sz + PAGE_SIZE - 1) >> PAGE_SHIFT);
		return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, npages);
	case IORING_OFF_SQES:
		return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages,
					   ctx->n_sqe_pages);
	case IORING_OFF_PBUF_RING:
		return io_pbuf_mmap(file, vma);
	}

	return -EINVAL;
}
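
/*
 * Pick an unmapped area for an io_uring mapping. On architectures with
 * cache aliasing constraints (SHM_COLOUR), base the colouring on the kernel
 * address backing the mapping so the user and kernel views stay coherent.
 */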
unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
					 unsigned long len, unsigned long pgoff,
					 unsigned long flags)
{
	struct io_ring_ctx *ctx = filp->private_data;
	void *ptr;

	/*
	 * Do not allow to map to user-provided address to avoid breaking the
	 * aliasing rules. Userspace is not able to guess the offset address of
	 * kernel kmalloc()ed memory area.
	 */
	if (addr)
		return -EINVAL;

	guard(mutex)(&ctx->resize_lock);

	ptr = io_uring_validate_mmap_request(filp, pgoff, len);
	if (IS_ERR(ptr))
		return -ENOMEM;

	/*
	 * Some architectures have strong cache aliasing requirements.
	 * For such architectures we need a coherent mapping which aliases
	 * kernel memory *and* userspace memory. To achieve that:
	 * - use a NULL file pointer to reference physical memory, and
	 * - use the kernel virtual address of the shared io_uring context
	 *   (instead of the userspace-provided address, which has to be 0UL
	 *   anyway).
	 * - use the same pgoff which the get_unmapped_area() uses to
	 *   calculate the page colouring.
	 * For architectures without such aliasing requirements, the
	 * architecture will return any suitable mapping because addr is 0.
	 */
	filp = NULL;
	flags |= MAP_SHARED;
	pgoff = 0;	/* has been translated to ptr above */
#ifdef SHM_COLOUR
	addr = (uintptr_t) ptr;
	pgoff = addr >> PAGE_SHIFT;
#else
	addr = 0UL;
#endif
	return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags);
}

#else /* !CONFIG_MMU */

int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
	return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL;
}

unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
{
	return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
}

unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
					 unsigned long len, unsigned long pgoff,
					 unsigned long flags)
{
	struct io_ring_ctx *ctx = file->private_data;
	void *ptr;

	guard(mutex)(&ctx->resize_lock);

	ptr = io_uring_validate_mmap_request(file, pgoff, len);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	return (unsigned long) ptr;
}

#endif /* !CONFIG_MMU */