// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "alloc_cache.h"
#include "openclose.h"
struct io_rsrc_update {

static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage);

#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)
static const struct io_mapped_ubuf dummy_ubuf = {
	/* set an invalid range, so io_import_fixed() fails when it hits this entry */
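/*
 * Charge pinned pages against the user's RLIMIT_MEMLOCK budget. The limit
 * check and the locked_vm update are folded into a single
 * atomic_long_try_cmpxchg() loop, so concurrent registrations cannot race
 * past the limit.
 */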
int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
	unsigned long page_limit, cur_pages, new_pages;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
		__io_unaccount_mem(ctx->user, nr_pages);

		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);

static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
		ret = __io_account_mem(ctx->user, nr_pages);

		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
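/*
 * Sanity-check a user iovec before registration: the base/length pair must
 * not wrap when rounded up to page granularity, and the length is capped at
 * an arbitrary 1GB per buffer. Finer-grained checks are deferred until the
 * buffer is actually used for I/O.
 */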
static int io_buffer_validate(struct iovec *iov)
	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here; we'll return -EINVAL later, when the I/O is
	 * submitted, if they are wrong.
	 */
		return iov->iov_len ? -EFAULT : 0;

	/* arbitrary limit, but we need something */
	if (iov->iov_len > SZ_1G)

	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
	struct io_mapped_ubuf *imu = *slot;

	if (imu != &dummy_ubuf) {
		if (!refcount_dec_and_test(&imu->refs))
		for (i = 0; i < imu->nr_bvecs; i++)
			unpin_user_page(imu->bvec[i].bv_page);
			io_unaccount_mem(ctx, imu->acct_pages);
static void io_rsrc_put_work(struct io_rsrc_node *node)
	struct io_rsrc_put *prsrc = &node->item;

		io_post_aux_cqe(node->ctx, prsrc->tag, 0, 0);

	switch (node->type) {
	case IORING_RSRC_FILE:
	case IORING_RSRC_BUFFER:
		io_rsrc_buf_put(node->ctx, prsrc);

void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
	if (!io_alloc_cache_put(&ctx->rsrc_node_cache, node))
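/*
 * Runs when a resource node's reference count hits zero. Nodes sit on
 * ->rsrc_ref_list in the order they were retired, so completed nodes are
 * processed and recycled strictly in order; a quiescing task waiting on
 * ->rsrc_quiesce_wq is woken once the list drains.
 */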
void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
	__must_hold(&node->ctx->uring_lock)
	struct io_ring_ctx *ctx = node->ctx;

	while (!list_empty(&ctx->rsrc_ref_list)) {
		node = list_first_entry(&ctx->rsrc_ref_list,
					struct io_rsrc_node, node);
		/* recycle ref nodes in order */
		list_del(&node->node);

		if (likely(!node->empty))
			io_rsrc_put_work(node);
		io_rsrc_node_destroy(ctx, node);
	if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce))
		wake_up_all(&ctx->rsrc_quiesce_wq);

struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
	struct io_rsrc_node *ref_node;

	ref_node = io_alloc_cache_get(&ctx->rsrc_node_cache);
		ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
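/*
 * Wait for all outstanding resource nodes to be released before an
 * unregister proceeds. The current node is parked on ->rsrc_ref_list as an
 * empty entry and replaced with a freshly allocated backup; ->uring_lock is
 * dropped while waiting on ->rsrc_quiesce_wq so task work can make progress.
 */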
__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
				      struct io_ring_ctx *ctx)
	struct io_rsrc_node *backup;
	/* As we may drop ->uring_lock, another task may have started quiesce */
	backup = io_rsrc_node_alloc(ctx);
	ctx->rsrc_node->empty = true;
	ctx->rsrc_node->type = -1;
	list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list);
	io_put_rsrc_node(ctx, ctx->rsrc_node);
	ctx->rsrc_node = backup;

	if (list_empty(&ctx->rsrc_ref_list))

	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		atomic_set(&ctx->cq_wait_nr, 1);

	data->quiesce = true;
		prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE);
		mutex_unlock(&ctx->uring_lock);

		ret = io_run_task_work_sig(ctx);
			finish_wait(&ctx->rsrc_quiesce_wq, &we);
			mutex_lock(&ctx->uring_lock);
			if (list_empty(&ctx->rsrc_ref_list))

		mutex_lock(&ctx->uring_lock);
	} while (!list_empty(&ctx->rsrc_ref_list));

	finish_wait(&ctx->rsrc_quiesce_wq, &we);
	data->quiesce = false;

	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		atomic_set(&ctx->cq_wait_nr, 0);
static void io_free_page_table(void **table, size_t size)
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);

	for (i = 0; i < nr_tables; i++)

static void io_rsrc_data_free(struct io_rsrc_data *data)
	size_t size = data->nr * sizeof(data->tags[0][0]);

		io_free_page_table((void **)data->tags, size);
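/*
 * The tag table is allocated as an array of PAGE_SIZE chunks rather than one
 * large allocation, so registering a big resource table does not require a
 * high-order allocation; io_free_page_table() above tears it down chunk by
 * chunk.
 */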
static __cold void **io_alloc_page_table(size_t size)
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
	size_t init_size = size;

	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);

	for (i = 0; i < nr_tables; i++) {
		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);

		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
			io_free_page_table(table, init_size);

__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, int type,
				     unsigned nr, struct io_rsrc_data **pdata)
	struct io_rsrc_data *data;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));

	data->rsrc_type = type;
		for (i = 0; i < nr; i++) {
			u64 *tag_slot = io_get_tag_slot(data, i);

			if (copy_from_user(tag_slot, &utags[i],
	io_rsrc_data_free(data);
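/*
 * Apply an IORING_REGISTER_FILES_UPDATE2 request: for each entry, an
 * occupied slot is queued for deferred removal (so in-flight requests keep
 * their reference), and the new fd, unless it is -1 or
 * IORING_REGISTER_FILES_SKIP, is installed together with its tag. Returns
 * the number of entries processed, or an error if nothing was updated.
 */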
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	struct io_rsrc_data *data = ctx->file_data;
	struct io_fixed_file *file_slot;

	if (up->offset + nr_args > ctx->nr_user_files)

	for (done = 0; done < nr_args; done++) {
		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
		if (fd == IORING_REGISTER_FILES_SKIP)

		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
		file_slot = io_fixed_file_slot(&ctx->file_table, i);

		if (file_slot->file_ptr) {
			err = io_queue_rsrc_removal(data, i,
						    io_slot_file(file_slot));
			file_slot->file_ptr = 0;
			io_file_bitmap_clear(&ctx->file_table, i);
			struct file *file = fget(fd);

			/*
			 * Don't allow io_uring instances to be registered.
			 */
			if (io_is_uring_fops(file)) {
			*io_get_tag_slot(data, i) = tag;
			io_fixed_file_set(file_slot, file);
			io_file_bitmap_set(&ctx->file_table, i);
	return done ? done : err;
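/*
 * Buffer-table counterpart of the file update above: each replaced slot is
 * queued for deferred unmapping and the incoming iovec is pinned via
 * io_sqe_buffer_register(). The stride through the user array honours the
 * compat layout, advancing by sizeof(struct compat_iovec) for 32-bit tasks.
 */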
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec fast_iov, *iov;
	struct page *last_hpage = NULL;
	struct iovec __user *uvec;
	u64 user_data = up->data;

	if (up->offset + nr_args > ctx->nr_user_bufs)

	for (done = 0; done < nr_args; done++) {
		struct io_mapped_ubuf *imu;

		uvec = u64_to_user_ptr(user_data);
		iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
		err = io_buffer_validate(iov);
		if (!iov->iov_base && tag) {
		err = io_sqe_buffer_register(ctx, iov, &imu, &last_hpage);

		i = array_index_nospec(up->offset + done, ctx->nr_user_bufs);
		if (ctx->user_bufs[i] != &dummy_ubuf) {
			err = io_queue_rsrc_removal(ctx->buf_data, i,
				io_buffer_unmap(ctx, &imu);
			ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf;

		ctx->user_bufs[i] = imu;
		*io_get_tag_slot(ctx->buf_data, i) = tag;
			user_data += sizeof(struct compat_iovec);
			user_data += sizeof(struct iovec);
	return done ? done : err;

static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
	lockdep_assert_held(&ctx->uring_lock);

	if (check_add_overflow(up->offset, nr_args, &tmp))

	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
	struct io_uring_rsrc_update2 up;

	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
	if (up.resv || up.resv2)
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
	if (copy_from_user(&up, arg, sizeof(up)))
	if (!up.nr || up.resv || up.resv2)
	return __io_register_rsrc_update(ctx, type, &up, up.nr);

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
	if (!rr.nr || rr.resv2)
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)

	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
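/*
 * IORING_OP_FILES_UPDATE: the SQE encodes the destination offset in ->off,
 * the number of entries in ->len, and the user pointer to the fd array in
 * ->addr. It cannot be used together with fixed files or selected buffers.
 */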
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
	if (sqe->rw_flags || sqe->splice_fd_in)

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	up->arg = READ_ONCE(sqe->addr);

static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);

	if (!req->ctx->file_data)

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;

	up2.offset = up->offset;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
		io_ring_submit_unlock(ctx, issue_flags);

	io_req_set_res(req, ret, 0);
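/*
 * Defer dropping a file or buffer reference: the resource and its tag are
 * parked on the current rsrc node, which is retired onto ->rsrc_ref_list and
 * replaced by a fresh node. The actual put (and the tag CQE, see
 * io_rsrc_put_work()) happens once every request referencing the old node
 * has completed.
 */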
int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc)
	struct io_ring_ctx *ctx = data->ctx;
	struct io_rsrc_node *node = ctx->rsrc_node;
	u64 *tag_slot = io_get_tag_slot(data, idx);

	ctx->rsrc_node = io_rsrc_node_alloc(ctx);
	if (unlikely(!ctx->rsrc_node)) {
		ctx->rsrc_node = node;

	node->item.rsrc = rsrc;
	node->type = data->rsrc_type;
	node->item.tag = *tag_slot;

	list_add_tail(&node->node, &ctx->rsrc_ref_list);
	io_put_rsrc_node(ctx, node);

void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
	for (i = 0; i < ctx->nr_user_files; i++) {
		struct file *file = io_file_from_index(&ctx->file_table, i);

		io_file_bitmap_clear(&ctx->file_table, i);

	io_free_file_tables(&ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	io_rsrc_data_free(ctx->file_data);
	ctx->file_data = NULL;
	ctx->nr_user_files = 0;

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
	unsigned nr = ctx->nr_user_files;
	/*
	 * Quiesce may unlock ->uring_lock, so while it is not held, prevent
	 * new requests from using the table.
	 */
	ctx->nr_user_files = 0;
	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
	ctx->nr_user_files = nr;
		__io_sqe_files_unregister(ctx);

int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
	__s32 __user *fds = (__s32 __user *) arg;

	if (nr_args > IORING_MAX_FIXED_FILES)
	if (nr_args > rlimit(RLIMIT_NOFILE))
	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_FILE, tags, nr_args,

	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
		io_rsrc_data_free(ctx->file_data);
		ctx->file_data = NULL;

	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
		struct io_fixed_file *file_slot;

		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
		/* allow sparse sets */
		if (!fds || fd == -1) {
			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))

		/*
		 * Don't allow io_uring instances to be registered.
		 */
		if (io_is_uring_fops(file)) {
		file_slot = io_fixed_file_slot(&ctx->file_table, i);
		io_fixed_file_set(file_slot, file);
		io_file_bitmap_set(&ctx->file_table, i);

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
	__io_sqe_files_unregister(ctx);

static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
	io_buffer_unmap(ctx, &prsrc->buf);

void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
	for (i = 0; i < ctx->nr_user_bufs; i++)
		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
	kfree(ctx->user_bufs);
	io_rsrc_data_free(ctx->buf_data);
	ctx->user_bufs = NULL;
	ctx->buf_data = NULL;
	ctx->nr_user_bufs = 0;

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
	unsigned nr = ctx->nr_user_bufs;
	/*
	 * Quiesce may unlock ->uring_lock, so while it is not held, prevent
	 * new requests from using the table.
	 */
	ctx->nr_user_bufs = 0;
	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
	ctx->nr_user_bufs = nr;
		__io_sqe_buffers_unregister(ctx);
/*
 * Not super efficient, but this only runs at registration time. And we do
 * cache the last compound head, so generally we'll only do a full search if
 * we don't match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
		if (compound_head(pages[i]) == hpage)

	/* check previously registered pages */
	for (i = 0; i < ctx->nr_user_bufs; i++) {
		struct io_mapped_ubuf *imu = ctx->user_bufs[i];

		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
			if (compound_head(imu->bvec[j].bv_page) == hpage)

static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
			if (headpage_already_acct(ctx, pages, i, hpage))
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;

	if (!imu->acct_pages)

	ret = io_account_mem(ctx, imu->acct_pages);

static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages,
				  struct io_imu_folio_data *data, int nr_folios)
	struct page **page_array = *pages, **new_array = NULL;
	int nr_pages_left = *nr_pages, i, j;

	/* Store head pages only */
	new_array = kvmalloc_array(nr_folios, sizeof(struct page *),

	new_array[0] = compound_head(page_array[0]);
	/*
	 * The pages are bound to the folio, so unpin_user_pages() below
	 * doesn't actually unpin them but drops all but one reference,
	 * which is usually put down by io_buffer_unmap().
	 * Note: this needs a better helper.
	 */
	if (data->nr_pages_head > 1)
		unpin_user_pages(&page_array[1], data->nr_pages_head - 1);

	j = data->nr_pages_head;
	nr_pages_left -= data->nr_pages_head;
	for (i = 1; i < nr_folios; i++) {
		unsigned int nr_unpin;

		new_array[i] = page_array[j];
		nr_unpin = min_t(unsigned int, nr_pages_left - 1,
				 data->nr_pages_mid - 1);
			unpin_user_pages(&page_array[j+1], nr_unpin);
		j += data->nr_pages_mid;
		nr_pages_left -= data->nr_pages_mid;

	*nr_pages = nr_folios;
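/*
 * Decide whether the pinned pages can be collapsed to one bvec per folio.
 * For that, the pages must be contiguous within each folio and every folio
 * except the first and last must be fully covered. As an illustration, a
 * buffer backed by a single 2MB THP pinned as 512 PAGE_SIZE pages (e.g. on
 * x86-64) would be coalesced down to a single entry.
 */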
static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
				   struct io_imu_folio_data *data)
	struct page **page_array = *pages;
	struct folio *folio = page_folio(page_array[0]);
	unsigned int count = 1, nr_folios = 1;

	data->nr_pages_mid = folio_nr_pages(folio);
	if (data->nr_pages_mid == 1)

	data->folio_shift = folio_shift(folio);
	/*
	 * Check if pages are contiguous inside a folio, and all folios have
	 * the same page count except for the head and tail.
	 */
	for (i = 1; i < *nr_pages; i++) {
		if (page_folio(page_array[i]) == folio &&
		    page_array[i] == page_array[i-1] + 1) {

		if (nr_folios == 1) {
			if (folio_page_idx(folio, page_array[i-1]) !=
			    data->nr_pages_mid - 1)

			data->nr_pages_head = count;
		} else if (count != data->nr_pages_mid) {

		folio = page_folio(page_array[i]);
		if (folio_size(folio) != (1UL << data->folio_shift) ||
		    folio_page_idx(folio, page_array[i]) != 0)

		data->nr_pages_head = count;

	return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios);
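/*
 * Register one user buffer: pin its pages, optionally coalesce huge-page
 * backed ranges into per-folio bvecs, charge the pinned memory, and fill in
 * the io_mapped_ubuf that fixed-buffer I/O will import from later. The slot
 * starts out pointing at the dummy_ubuf sentinel, which is what an empty
 * (sparse) registration leaves in place.
 */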
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage)
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	int ret, nr_pages, i;
	struct io_imu_folio_data data;

	*pimu = (struct io_mapped_ubuf *)&dummy_ubuf;

	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
		ret = PTR_ERR(pages);

	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
	coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data);

	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);

	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
		unpin_user_pages(pages, nr_pages);

	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->len = iov->iov_len;
	imu->nr_bvecs = nr_pages;
	imu->folio_shift = PAGE_SHIFT;
		imu->folio_shift = data.folio_shift;
	refcount_set(&imu->refs, 1);
	off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);

	for (i = 0; i < nr_pages; i++) {
		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);

static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
	return ctx->user_bufs ? 0 : -ENOMEM;

int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
	struct page *last_hpage = NULL;
	struct io_rsrc_data *data;
	struct iovec fast_iov, *iov = &fast_iov;
	const struct iovec __user *uvec;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, tags, nr_args, &data);
	ret = io_buffers_map_alloc(ctx, nr_args);
		io_rsrc_data_free(data);

		memset(iov, 0, sizeof(*iov));

	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
			uvec = (struct iovec __user *) arg;
			iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
			ret = io_buffer_validate(iov);
				arg += sizeof(struct compat_iovec);
				arg += sizeof(struct iovec);

		if (!iov->iov_base && *io_get_tag_slot(data, i)) {

		ret = io_sqe_buffer_register(ctx, iov, &ctx->user_bufs[i],

	WARN_ON_ONCE(ctx->buf_data);

	ctx->buf_data = data;
		__io_sqe_buffers_unregister(ctx);
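/*
 * Build a bvec iterator over a registered buffer for the requested
 * [buf_addr, buf_addr + len) range. Every bvec except possibly the first
 * and last covers a full (1 << folio_shift) chunk, so the starting segment
 * can be computed directly rather than walking the iterator. For example,
 * with 4KB folios and a first bvec covering a full page, an offset of 9KB
 * gives seg_skip = 1 + ((9KB - 4KB) >> 12) = 2 and an in-segment offset of
 * 1KB, i.e. the iterator starts 1KB into bvec[2].
 */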
int io_import_fixed(int ddir, struct iov_iter *iter,
		    struct io_mapped_ubuf *imu,
		    u64 buf_addr, size_t len)
	if (WARN_ON_ONCE(!imu))
	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))

	/*
	 * Might not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are the same in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or is the whole
		 * first bvec), just use iov_iter_advance(). This makes it
		 * easier since we can just skip the first segment, which may
		 * not be folio_size aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		if (offset < bvec->bv_len) {
			iter->count -= offset;
			iter->iov_offset = offset;
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			seg_skip = 1 + (offset >> imu->folio_shift);

			iter->bvec = bvec + seg_skip;
			iter->nr_segs -= seg_skip;
			iter->count -= bvec->bv_len + offset;
			iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
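/*
 * Share another ring's registered buffers. To respect lock ordering, our own
 * ->uring_lock is dropped first, the source ring is locked while a reference
 * is taken on each of its buffers, and only then is our lock re-acquired to
 * check that nobody registered buffers for us in the meantime before the
 * table is installed.
 */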
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
	struct io_mapped_ubuf **user_bufs;
	struct io_rsrc_data *data;
	/*
	 * Drop our own lock here. We'll set up the data we need and reference
	 * the source buffers, then re-grab, check, and assign at the end.
	 */
	mutex_unlock(&ctx->uring_lock);

	mutex_lock(&src_ctx->uring_lock);
	nbufs = src_ctx->nr_user_bufs;
	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, NULL, nbufs, &data);

	user_bufs = kcalloc(nbufs, sizeof(*ctx->user_bufs), GFP_KERNEL);

	for (i = 0; i < nbufs; i++) {
		struct io_mapped_ubuf *src = src_ctx->user_bufs[i];

		refcount_inc(&src->refs);

	/* Have a ref on the bufs now, drop src lock and re-grab our own lock */
	mutex_unlock(&src_ctx->uring_lock);
	mutex_lock(&ctx->uring_lock);
	if (!ctx->user_bufs) {
		ctx->user_bufs = user_bufs;
		ctx->buf_data = data;
		ctx->nr_user_bufs = nbufs;

	/* someone raced setting up buffers, dump ours */
	for (i = 0; i < nbufs; i++)
		io_buffer_unmap(ctx, &user_bufs[i]);
	io_rsrc_data_free(data);

	io_rsrc_data_free(data);
	mutex_unlock(&src_ctx->uring_lock);
	mutex_lock(&ctx->uring_lock);
/*
 * Copy the registered buffers from the source ring whose file descriptor
 * is given in the src_fd to the current ring. This is identical to registering
 * the buffers with ctx, except faster as mappings already exist.
 *
 * Since the memory is already accounted once, don't account it again.
 */
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
	struct io_uring_clone_buffers buf;
	bool registered_src;

	if (ctx->user_bufs || ctx->nr_user_bufs)
	if (copy_from_user(&buf, arg, sizeof(buf)))
	if (buf.flags & ~IORING_REGISTER_SRC_REGISTERED)
	if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))

	registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
	file = io_uring_register_get_file(buf.src_fd, registered_src);
		return PTR_ERR(file);
	ret = io_clone_buffers(ctx, file->private_data);
	if (!registered_src)