// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "alloc_cache.h"
#include "openclose.h"

struct io_rsrc_update {

static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage);

#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

static const struct io_mapped_ubuf dummy_ubuf = {
	/* set invalid range, so io_import_fixed() fails meeting it */

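/*
 * Charge @nr_pages of pinned memory to @user's RLIMIT_MEMLOCK accounting.
 * ->locked_vm is updated with an atomic try_cmpxchg() loop so concurrent
 * callers cannot push the total past the limit.
 */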
int __io_account_mem(struct user_struct *user, unsigned long nr_pages)

	unsigned long page_limit, cur_pages, new_pages;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));

static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)

		__io_unaccount_mem(ctx->user, nr_pages);

		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);

static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)

		ret = __io_account_mem(ctx->user, nr_pages);

		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

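/*
 * Sanity-check a single iovec before it is registered as a fixed buffer:
 * the length is capped at 1G (an arbitrary but necessary limit) and
 * base + page-rounded length must not overflow an unsigned long. Finer
 * size and alignment constraints are left to actual IO submission.
 */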
static int io_buffer_validate(struct iovec *iov)

	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);

	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
		return iov->iov_len ? -EFAULT : 0;

	/* arbitrary limit, but we need something */
	if (iov->iov_len > SZ_1G)

	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)

	struct io_mapped_ubuf *imu = *slot;

	if (imu != &dummy_ubuf) {
		for (i = 0; i < imu->nr_bvecs; i++)
			unpin_user_page(imu->bvec[i].bv_page);
			io_unaccount_mem(ctx, imu->acct_pages);

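/*
 * Process one deferred resource put: post the user-visible tag as an
 * auxiliary CQE if one was attached, then drop the underlying file or
 * buffer depending on the node type.
 */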
static void io_rsrc_put_work(struct io_rsrc_node *node)

	struct io_rsrc_put *prsrc = &node->item;

		io_post_aux_cqe(node->ctx, prsrc->tag, 0, 0);

	switch (node->type) {
	case IORING_RSRC_FILE:
	case IORING_RSRC_BUFFER:
		io_rsrc_buf_put(node->ctx, prsrc);

void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)

	if (!io_alloc_cache_put(&ctx->rsrc_node_cache, node))

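/*
 * Called once a node's references drop to zero. Walk rsrc_ref_list from the
 * head so nodes complete in order, run the pending put for each non-empty
 * node and recycle it, then wake any task waiting in io_rsrc_ref_quiesce()
 * once the list drains.
 */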
void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
	__must_hold(&node->ctx->uring_lock)

	struct io_ring_ctx *ctx = node->ctx;

	while (!list_empty(&ctx->rsrc_ref_list)) {
		node = list_first_entry(&ctx->rsrc_ref_list,
					struct io_rsrc_node, node);
		/* recycle ref nodes in order */
		list_del(&node->node);

		if (likely(!node->empty))
			io_rsrc_put_work(node);
		io_rsrc_node_destroy(ctx, node);

	if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce))
		wake_up_all(&ctx->rsrc_quiesce_wq);

struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)

	struct io_rsrc_node *ref_node;

	ref_node = io_alloc_cache_get(&ctx->rsrc_node_cache);
		ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);

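/*
 * Wait until every resource node that still references the old table has
 * been flushed. The current rsrc_node is retired as an "empty" marker onto
 * rsrc_ref_list and replaced with a freshly allocated backup; the caller
 * then sleeps on rsrc_quiesce_wq, dropping ->uring_lock and running task
 * work while waiting, until the list is empty.
 */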
__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
				      struct io_ring_ctx *ctx)

	struct io_rsrc_node *backup;

	/* As we may drop ->uring_lock, another task may have started quiesce */

	backup = io_rsrc_node_alloc(ctx);

	ctx->rsrc_node->empty = true;
	ctx->rsrc_node->type = -1;
	list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list);
	io_put_rsrc_node(ctx, ctx->rsrc_node);
	ctx->rsrc_node = backup;

	if (list_empty(&ctx->rsrc_ref_list))

	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		atomic_set(&ctx->cq_wait_nr, 1);

	data->quiesce = true;

		prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE);
		mutex_unlock(&ctx->uring_lock);

		ret = io_run_task_work_sig(ctx);

			finish_wait(&ctx->rsrc_quiesce_wq, &we);
			mutex_lock(&ctx->uring_lock);
			if (list_empty(&ctx->rsrc_ref_list))

		mutex_lock(&ctx->uring_lock);
	} while (!list_empty(&ctx->rsrc_ref_list));

	finish_wait(&ctx->rsrc_quiesce_wq, &we);
	data->quiesce = false;

	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		atomic_set(&ctx->cq_wait_nr, 0);

static void io_free_page_table(void **table, size_t size)

	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);

	for (i = 0; i < nr_tables; i++)

static void io_rsrc_data_free(struct io_rsrc_data *data)

	size_t size = data->nr * sizeof(data->tags[0][0]);

		io_free_page_table((void **)data->tags, size);

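/*
 * Allocate a two-level table for @size bytes of entries: an array of
 * page-sized chunks, so a large tag table never needs one huge allocation.
 * io_free_page_table() above is its counterpart.
 */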
static __cold void **io_alloc_page_table(size_t size)

	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
	size_t init_size = size;

	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);

	for (i = 0; i < nr_tables; i++) {
		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);

		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
			io_free_page_table(table, init_size);

__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, int type,
				     unsigned nr, struct io_rsrc_data **pdata)

	struct io_rsrc_data *data;

	data = kzalloc(sizeof(*data), GFP_KERNEL);

	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));

	data->rsrc_type = type;

		for (i = 0; i < nr; i++) {
			u64 *tag_slot = io_get_tag_slot(data, i);

			if (copy_from_user(tag_slot, &utags[i],

	io_rsrc_data_free(data);

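/*
 * Apply a registered-file table update: for each entry, copy in the tag and
 * fd, queue removal of the old file (if any) through the rsrc node machinery,
 * then install the new fd, its tag and the alloc bitmap bit. Returns the
 * number of entries updated, or an error if nothing was updated.
 */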
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,

	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	struct io_rsrc_data *data = ctx->file_data;
	struct io_fixed_file *file_slot;

	if (up->offset + nr_args > ctx->nr_user_files)

	for (done = 0; done < nr_args; done++) {

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {

		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {

		if (fd == IORING_REGISTER_FILES_SKIP)

		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
		file_slot = io_fixed_file_slot(&ctx->file_table, i);

		if (file_slot->file_ptr) {
			err = io_queue_rsrc_removal(data, i,
						    io_slot_file(file_slot));

			file_slot->file_ptr = 0;
			io_file_bitmap_clear(&ctx->file_table, i);

			struct file *file = fget(fd);

			/* Don't allow io_uring instances to be registered. */
			if (io_is_uring_fops(file)) {

			*io_get_tag_slot(data, i) = tag;
			io_fixed_file_set(file_slot, file);
			io_file_bitmap_set(&ctx->file_table, i);

	return done ? done : err;

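/*
 * Buffer-table counterpart of the update above: each replacement iovec is
 * copied in (honouring compat layout), validated, pinned and registered,
 * and the slot's previous buffer is queued for removal before the new
 * io_mapped_ubuf and its tag are installed.
 */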
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)

	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec fast_iov, *iov;
	struct page *last_hpage = NULL;
	struct iovec __user *uvec;
	u64 user_data = up->data;

	if (up->offset + nr_args > ctx->nr_user_bufs)

	for (done = 0; done < nr_args; done++) {
		struct io_mapped_ubuf *imu;

		uvec = u64_to_user_ptr(user_data);
		iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);

		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {

		err = io_buffer_validate(iov);

		if (!iov->iov_base && tag) {

		err = io_sqe_buffer_register(ctx, iov, &imu, &last_hpage);

		i = array_index_nospec(up->offset + done, ctx->nr_user_bufs);
		if (ctx->user_bufs[i] != &dummy_ubuf) {
			err = io_queue_rsrc_removal(ctx->buf_data, i,

				io_buffer_unmap(ctx, &imu);

			ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf;

		ctx->user_bufs[i] = imu;
		*io_get_tag_slot(ctx->buf_data, i) = tag;

			user_data += sizeof(struct compat_iovec);

			user_data += sizeof(struct iovec);

	return done ? done : err;

static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,

	lockdep_assert_held(&ctx->uring_lock);

	if (check_add_overflow(up->offset, nr_args, &tmp))

	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,

	struct io_uring_rsrc_update2 up;

	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))

	if (up.resv || up.resv2)

	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)

	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))

	if (copy_from_user(&up, arg, sizeof(up)))

	if (!up.nr || up.resv || up.resv2)

	return __io_register_rsrc_update(ctx, type, &up, up.nr);

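/*
 * Entry point for the FILES2/BUFFERS2 style registration: copy in the
 * io_uring_rsrc_register descriptor, reject reserved fields and unknown
 * flags, and dispatch to the file or buffer registration path. A sparse
 * registration must not also pass a data array.
 */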
__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)

	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))

	if (!rr.nr || rr.resv2)

	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)

	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)

		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)

		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));

int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)

	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))

	if (sqe->rw_flags || sqe->splice_fd_in)

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);

	up->arg = READ_ONCE(sqe->addr);

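/*
 * Files update with offset == IORING_FILE_INDEX_ALLOC: instead of updating
 * fixed slots in place, install each fd into a ring-allocated slot and write
 * the chosen index back to the user's array. If that write-back fails, the
 * just-installed slot is closed again.
 */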
static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)

	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);

	if (!req->ctx->file_data)

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {

		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);

		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)

	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;

	up2.offset = up->offset;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);

		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,

		io_ring_submit_unlock(ctx, issue_flags);

	io_req_set_res(req, ret, 0);

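/*
 * Hand an about-to-be-replaced resource to the current rsrc node so it is
 * only released once every request referencing the old table has finished.
 * The node inherits the slot's tag and is parked on rsrc_ref_list; a fresh
 * node is allocated to take over for subsequent removals.
 */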
int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc)

	struct io_ring_ctx *ctx = data->ctx;
	struct io_rsrc_node *node = ctx->rsrc_node;
	u64 *tag_slot = io_get_tag_slot(data, idx);

	ctx->rsrc_node = io_rsrc_node_alloc(ctx);
	if (unlikely(!ctx->rsrc_node)) {
		ctx->rsrc_node = node;

	node->item.rsrc = rsrc;
	node->type = data->rsrc_type;
	node->item.tag = *tag_slot;

	list_add_tail(&node->node, &ctx->rsrc_ref_list);
	io_put_rsrc_node(ctx, node);

void __io_sqe_files_unregister(struct io_ring_ctx *ctx)

	for (i = 0; i < ctx->nr_user_files; i++) {
		struct file *file = io_file_from_index(&ctx->file_table, i);

		io_file_bitmap_clear(&ctx->file_table, i);

	io_free_file_tables(&ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	io_rsrc_data_free(ctx->file_data);
	ctx->file_data = NULL;
	ctx->nr_user_files = 0;

int io_sqe_files_unregister(struct io_ring_ctx *ctx)

	unsigned nr = ctx->nr_user_files;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_files = 0;
	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
	ctx->nr_user_files = nr;

		__io_sqe_files_unregister(ctx);

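/*
 * Register an array of file descriptors as the ring's fixed file table.
 * The count is bounded by IORING_MAX_FIXED_FILES and RLIMIT_NOFILE, the
 * per-slot tag table and lookup tables are allocated up front, and a NULL
 * array or an fd of -1 leaves a sparse (empty) slot. io_uring fds
 * themselves are refused.
 */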
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)

	__s32 __user *fds = (__s32 __user *) arg;

	if (nr_args > IORING_MAX_FIXED_FILES)

	if (nr_args > rlimit(RLIMIT_NOFILE))

	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_FILE, tags, nr_args,

	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
		io_rsrc_data_free(ctx->file_data);
		ctx->file_data = NULL;

	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
		struct io_fixed_file *file_slot;

		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {

		/* allow sparse sets */
		if (!fds || fd == -1) {

			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))

		/* Don't allow io_uring instances to be registered. */
		if (io_is_uring_fops(file)) {

		file_slot = io_fixed_file_slot(&ctx->file_table, i);
		io_fixed_file_set(file_slot, file);
		io_file_bitmap_set(&ctx->file_table, i);

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);

	__io_sqe_files_unregister(ctx);

static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)

	io_buffer_unmap(ctx, &prsrc->buf);

void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)

	for (i = 0; i < ctx->nr_user_bufs; i++)
		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
	kfree(ctx->user_bufs);
	io_rsrc_data_free(ctx->buf_data);
	ctx->user_bufs = NULL;
	ctx->buf_data = NULL;
	ctx->nr_user_bufs = 0;

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)

	unsigned nr = ctx->nr_user_bufs;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_bufs = 0;
	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
	ctx->nr_user_bufs = nr;

		__io_sqe_buffers_unregister(ctx);

/*
 * Not super efficient, but this is just a registration time. And we do cache
 * the last compound head, so generally we'll only do a full search if we don't
 * match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))

		if (compound_head(pages[i]) == hpage)

	/* check previously registered pages */
	for (i = 0; i < ctx->nr_user_bufs; i++) {
		struct io_mapped_ubuf *imu = ctx->user_bufs[i];

		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))

			if (compound_head(imu->bvec[j].bv_page) == hpage)

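/*
 * Work out how many pages to charge for a freshly pinned buffer. Normal
 * pages are counted individually; for a compound (huge) page the full head
 * page is charged once, with headpage_already_acct() and the cached
 * *last_hpage preventing the same head from being counted twice.
 */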
static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)

	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {

		hpage = compound_head(pages[i]);
		if (hpage == *last_hpage)

		if (headpage_already_acct(ctx, pages, i, hpage))

		imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;

	if (!imu->acct_pages)

	ret = io_account_mem(ctx, imu->acct_pages);

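/*
 * Pin one user buffer and build its io_mapped_ubuf: pin the pages, coalesce
 * pages that all sit in one folio into a single bvec entry where possible,
 * account the pinned memory, and record the original address range so
 * io_import_fixed() can validate and map future requests against it.
 */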
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage)

	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	int ret, nr_pages, i;
	struct folio *folio = NULL;

	*pimu = (struct io_mapped_ubuf *)&dummy_ubuf;

	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,

		ret = PTR_ERR(pages);

	/* If it's a huge page, try to coalesce them into a single bvec entry */
		folio = page_folio(pages[0]);
		for (i = 1; i < nr_pages; i++) {
			/*
			 * Pages must be consecutive and on the same folio for
			 */
			if (page_folio(pages[i]) != folio ||
			    pages[i] != pages[i - 1] + 1) {

			/*
			 * The pages are bound to the folio, it doesn't
			 * actually unpin them but drops all but one reference,
			 * which is usually put down by io_buffer_unmap().
			 * Note, needs a better helper.
			 */
			unpin_user_pages(&pages[1], nr_pages - 1);

	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);

	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
		unpin_user_pages(pages, nr_pages);

	off = (unsigned long) iov->iov_base & ~PAGE_MASK;

	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->ubuf_end = imu->ubuf + iov->iov_len;
	imu->nr_bvecs = nr_pages;

		bvec_set_page(&imu->bvec[0], pages[0], size, off);

	for (i = 0; i < nr_pages; i++) {

		vec_len = min_t(size_t, size, PAGE_SIZE - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);

static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)

	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
	return ctx->user_bufs ? 0 : -ENOMEM;

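/*
 * Register up to IORING_MAX_REG_BUFFERS user buffers: allocate the tag data
 * and the user_bufs array, then copy in, validate and pin one iovec per slot
 * via io_sqe_buffer_register(). A NULL arg registers a sparse set of empty
 * slots. On failure, everything registered so far is torn down again.
 */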
int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)

	struct page *last_hpage = NULL;
	struct io_rsrc_data *data;
	struct iovec fast_iov, *iov = &fast_iov;
	const struct iovec __user *uvec;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)

	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, tags, nr_args, &data);

	ret = io_buffers_map_alloc(ctx, nr_args);
		io_rsrc_data_free(data);

		memset(iov, 0, sizeof(*iov));

	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
			uvec = (struct iovec __user *) arg;
			iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);

			ret = io_buffer_validate(iov);

				arg += sizeof(struct compat_iovec);

				arg += sizeof(struct iovec);

		if (!iov->iov_base && *io_get_tag_slot(data, i)) {

		ret = io_sqe_buffer_register(ctx, iov, &ctx->user_bufs[i],

	WARN_ON_ONCE(ctx->buf_data);

	ctx->buf_data = data;
		__io_sqe_buffers_unregister(ctx);

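/*
 * Map a request's address range onto a registered buffer: check that
 * [buf_addr, buf_addr + len) lies inside the imu's recorded range, set up a
 * bvec iterator over its pages, and then skip whole bvecs directly instead
 * of paying for iov_iter_advance() on large offsets (see the comment below).
 */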
int io_import_fixed(int ddir, struct iov_iter *iter,
		    struct io_mapped_ubuf *imu,
		    u64 buf_addr, size_t len)

	if (WARN_ON_ONCE(!imu))

	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))

	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))

	/*
	 * Might not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);

		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or the whole first
		 * bvec), just use iov_iter_advance(). This makes it easier
		 * since we can just skip the first segment, which may not
		 * be PAGE_SIZE aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		if (offset < bvec->bv_len) {
			/*
			 * Note, huge page buffers consist of one large
			 * bvec entry and should always go this way. The other
			 * branch doesn't expect non PAGE_SIZE'd chunks.
			 */
			iter->count -= offset;
			iter->iov_offset = offset;

			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			seg_skip = 1 + (offset >> PAGE_SHIFT);

			iter->bvec = bvec + seg_skip;
			iter->nr_segs -= seg_skip;
			iter->count -= bvec->bv_len + offset;
			iter->iov_offset = offset & ~PAGE_MASK;