// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "openclose.h"

/* fields mirror what io_files_update_prep() reads from the sqe */
struct io_rsrc_update {
        u64                     arg;
        u32                     nr_args;
        u32                     offset;
};

static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
                                struct iovec *iov, struct page **last_hpage);

#define IORING_MAX_FIXED_FILES  (1U << 20)
#define IORING_MAX_REG_BUFFERS  (1U << 14)

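/*
 * Charge pinned pages against the RLIMIT_MEMLOCK limit of the given user.
 * user->locked_vm is updated locklessly with a cmpxchg loop, retrying if
 * another task changed the count concurrently.
 */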
int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
        unsigned long page_limit, cur_pages, new_pages;

        /* Don't allow more pages than we can safely lock */
        page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

        cur_pages = atomic_long_read(&user->locked_vm);
        do {
                new_pages = cur_pages + nr_pages;
                if (new_pages > page_limit)
                        return -ENOMEM;
        } while (!atomic_long_try_cmpxchg(&user->locked_vm,
                                          &cur_pages, new_pages));
        return 0;
}

static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
        if (ctx->user)
                __io_unaccount_mem(ctx->user, nr_pages);

        if (ctx->mm_account)
                atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
        int ret;

        if (ctx->user) {
                ret = __io_account_mem(ctx->user, nr_pages);
                if (ret)
                        return ret;
        }

        if (ctx->mm_account)
                atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

        return 0;
}

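/*
 * Validate a single iovec before registration: a NULL base is only
 * acceptable for a zero-length (sparse) entry, the length is capped at
 * 1GB, and base + length must not wrap around the address space.
 */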
static int io_buffer_validate(struct iovec *iov)
{
        unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);

        /*
         * Don't impose further limits on the size and buffer
         * constraints here, we'll -EINVAL later when IO is
         * submitted if they are wrong.
         */
        if (!iov->iov_base)
                return iov->iov_len ? -EFAULT : 0;

        /* arbitrary limit, but we need something */
        if (iov->iov_len > SZ_1G)
                return -EFAULT;

        if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
                return -EOVERFLOW;

        return 0;
}

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
        struct io_mapped_ubuf *imu = node->buf;
        unsigned int i;

        if (!refcount_dec_and_test(&imu->refs))
                return;

        for (i = 0; i < imu->nr_bvecs; i++)
                unpin_user_page(imu->bvec[i].bv_page);
        if (imu->acct_pages)
                io_unaccount_mem(ctx, imu->acct_pages);
        kvfree(imu);
}

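/*
 * Allocate a resource node of the given type (file or buffer). Nodes are
 * reference counted and released through io_put_rsrc_node() /
 * io_free_rsrc_node() once the last reference is dropped.
 */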
struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
{
        struct io_rsrc_node *node;

        node = kzalloc(sizeof(*node), GFP_KERNEL);
        if (node) {
                node->type = type;
                node->refs = 1;
        }
        return node;
}

__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data)
{
        if (!data->nr)
                return;
        while (data->nr--) {
                if (data->nodes[data->nr])
                        io_put_rsrc_node(ctx, data->nodes[data->nr]);
        }
        kvfree(data->nodes);
        data->nodes = NULL;
        data->nr = 0;
}

__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
{
        data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *),
                                     GFP_KERNEL_ACCOUNT | __GFP_ZERO);
        if (data->nodes) {
                data->nr = nr;
                return 0;
        }
        return -ENOMEM;
}

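/*
 * Update registered files in place. IORING_REGISTER_FILES_SKIP leaves a
 * slot untouched, fd == -1 clears it, and any other fd replaces whatever
 * is currently installed at that slot. Tags may not be attached to
 * skipped or cleared slots.
 */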
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
                                 struct io_uring_rsrc_update2 *up,
                                 unsigned nr_args)
{
        u64 __user *tags = u64_to_user_ptr(up->tags);
        __s32 __user *fds = u64_to_user_ptr(up->data);
        unsigned int done;
        int fd, i, err = 0;

        if (!ctx->file_table.data.nr)
                return -ENXIO;
        if (up->offset + nr_args > ctx->file_table.data.nr)
                return -EINVAL;

        for (done = 0; done < nr_args; done++) {
                u64 tag = 0;

                if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
                    copy_from_user(&fd, &fds[done], sizeof(fd))) {
                        err = -EFAULT;
                        break;
                }
                if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
                        err = -EINVAL;
                        break;
                }
                if (fd == IORING_REGISTER_FILES_SKIP)
                        continue;

                i = up->offset + done;
                if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i))
                        io_file_bitmap_clear(&ctx->file_table, i);

                if (fd != -1) {
                        struct file *file = fget(fd);
                        struct io_rsrc_node *node;

                        if (!file) {
                                err = -EBADF;
                                break;
                        }
                        /*
                         * Don't allow io_uring instances to be registered.
                         */
                        if (io_is_uring_fops(file)) {
                                fput(file);
                                err = -EBADF;
                                break;
                        }
                        node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
                        if (!node) {
                                err = -ENOMEM;
                                fput(file);
                                break;
                        }
                        ctx->file_table.data.nodes[i] = node;
                        if (tag)
                                node->tag = tag;
                        io_fixed_file_set(node, file);
                        io_file_bitmap_set(&ctx->file_table, i);
                }
        }
        return done ? done : err;
}

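/*
 * Update registered buffers in place: each source iovec is validated and
 * registered as a fresh node, then swapped into the table slot, dropping
 * whatever was registered there before. The stride of the user iovec
 * array depends on whether the task is in compat mode.
 */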
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
                                   struct io_uring_rsrc_update2 *up,
                                   unsigned int nr_args)
{
        u64 __user *tags = u64_to_user_ptr(up->tags);
        struct iovec fast_iov, *iov;
        struct page *last_hpage = NULL;
        struct iovec __user *uvec;
        u64 user_data = up->data;
        unsigned int done;
        int i, err = 0;

        if (!ctx->buf_table.nr)
                return -ENXIO;
        if (up->offset + nr_args > ctx->buf_table.nr)
                return -EINVAL;

        for (done = 0; done < nr_args; done++) {
                struct io_rsrc_node *node;
                u64 tag = 0;

                uvec = u64_to_user_ptr(user_data);
                iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
                if (IS_ERR(iov)) {
                        err = PTR_ERR(iov);
                        break;
                }
                if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
                        err = -EFAULT;
                        break;
                }
                err = io_buffer_validate(iov);
                if (err)
                        break;
                node = io_sqe_buffer_register(ctx, iov, &last_hpage);
                if (IS_ERR(node)) {
                        err = PTR_ERR(node);
                        break;
                }
                if (tag) {
                        if (!node) {
                                err = -EINVAL;
                                break;
                        }
                        node->tag = tag;
                }
                i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
                io_reset_rsrc_node(ctx, &ctx->buf_table, i);
                ctx->buf_table.nodes[i] = node;
                if (ctx->compat)
                        user_data += sizeof(struct compat_iovec);
                else
                        user_data += sizeof(struct iovec);
        }
        return done ? done : err;
}

static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
                                     struct io_uring_rsrc_update2 *up,
                                     unsigned nr_args)
{
        __u32 tmp;

        lockdep_assert_held(&ctx->uring_lock);

        if (check_add_overflow(up->offset, nr_args, &tmp))
                return -EOVERFLOW;

        switch (type) {
        case IORING_RSRC_FILE:
                return __io_sqe_files_update(ctx, up, nr_args);
        case IORING_RSRC_BUFFER:
                return __io_sqe_buffers_update(ctx, up, nr_args);
        }
        return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
                             unsigned nr_args)
{
        struct io_uring_rsrc_update2 up;

        if (!nr_args)
                return -EINVAL;
        memset(&up, 0, sizeof(up));
        if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
                return -EFAULT;
        if (up.resv || up.resv2)
                return -EINVAL;
        return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
                            unsigned size, unsigned type)
{
        struct io_uring_rsrc_update2 up;

        if (size != sizeof(up))
                return -EINVAL;
        if (copy_from_user(&up, arg, sizeof(up)))
                return -EFAULT;
        if (!up.nr || up.resv || up.resv2)
                return -EINVAL;
        return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

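/*
 * Register a file or buffer table described by a struct
 * io_uring_rsrc_register. With IORING_RSRC_REGISTER_SPARSE the table is
 * created empty, so no data pointer may be supplied alongside the flag.
 */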
__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
                            unsigned int size, unsigned int type)
{
        struct io_uring_rsrc_register rr;

        /* keep it extendible */
        if (size != sizeof(rr))
                return -EINVAL;

        memset(&rr, 0, sizeof(rr));
        if (copy_from_user(&rr, arg, size))
                return -EFAULT;
        if (!rr.nr || rr.resv2)
                return -EINVAL;
        if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
                return -EINVAL;

        switch (type) {
        case IORING_RSRC_FILE:
                if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
                        break;
                return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
                                             rr.nr, u64_to_user_ptr(rr.tags));
        case IORING_RSRC_BUFFER:
                if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
                        break;
                return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
                                               rr.nr, u64_to_user_ptr(rr.tags));
        }
        return -EINVAL;
}

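/* Prepare an IORING_OP_FILES_UPDATE request from the submitted sqe. */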
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

        if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
                return -EINVAL;
        if (sqe->rw_flags || sqe->splice_fd_in)
                return -EINVAL;

        up->offset = READ_ONCE(sqe->off);
        up->nr_args = READ_ONCE(sqe->len);
        if (!up->nr_args)
                return -EINVAL;
        up->arg = READ_ONCE(sqe->addr);
        return 0;
}

static int io_files_update_with_index_alloc(struct io_kiocb *req,
                                            unsigned int issue_flags)
{
        struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
        __s32 __user *fds = u64_to_user_ptr(up->arg);
        unsigned int done;
        struct file *file;
        int ret, fd;

        if (!req->ctx->file_table.data.nr)
                return -ENXIO;

        for (done = 0; done < up->nr_args; done++) {
                if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
                        ret = -EFAULT;
                        break;
                }

                file = fget(fd);
                if (!file) {
                        ret = -EBADF;
                        break;
                }
                ret = io_fixed_fd_install(req, issue_flags, file,
                                          IORING_FILE_INDEX_ALLOC);
                if (ret < 0)
                        break;
                if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
                        __io_close_fixed(req->ctx, issue_flags, ret);
                        ret = -EFAULT;
                        break;
                }
        }

        if (done)
                return done;
        return ret;
}

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
        struct io_ring_ctx *ctx = req->ctx;
        struct io_uring_rsrc_update2 up2;
        int ret;

        up2.offset = up->offset;
        up2.data = up->arg;
        up2.nr = 0;
        up2.tags = 0;
        up2.resv = 0;
        up2.resv2 = 0;

        if (up->offset == IORING_FILE_INDEX_ALLOC) {
                ret = io_files_update_with_index_alloc(req, issue_flags);
        } else {
                io_ring_submit_lock(ctx, issue_flags);
                ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
                                                &up2, up->nr_args);
                io_ring_submit_unlock(ctx, issue_flags);
        }

        if (ret < 0)
                req_set_fail(req);
        io_req_set_res(req, ret, 0);
        return IOU_OK;
}

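/*
 * Final teardown of a resource node: post the user-supplied tag as a CQE
 * if one was attached, then release the underlying file or buffer.
 */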
void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
        lockdep_assert_held(&ctx->uring_lock);

        if (node->tag)
                io_post_aux_cqe(ctx, node->tag, 0, 0);

        switch (node->type) {
        case IORING_RSRC_FILE:
                if (io_slot_file(node))
                        fput(io_slot_file(node));
                break;
        case IORING_RSRC_BUFFER:
                if (node->buf)
                        io_buffer_unmap(ctx, node);
                break;
        default:
                WARN_ON_ONCE(1);
                break;
        }

        kfree(node);
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
        if (!ctx->file_table.data.nr)
                return -ENXIO;

        io_free_file_tables(ctx, &ctx->file_table);
        io_file_table_set_alloc_range(ctx, 0, 0);
        return 0;
}

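/*
 * Register an initial table of files. A NULL fd array or an fd of -1
 * produces a sparse slot that can be filled later; everything else is
 * pinned with fget() and installed into the table.
 */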
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
                          unsigned nr_args, u64 __user *tags)
{
        __s32 __user *fds = (__s32 __user *) arg;
        struct file *file;
        int fd, ret;
        unsigned i;

        if (ctx->file_table.data.nr)
                return -EBUSY;
        if (!nr_args)
                return -EINVAL;
        if (nr_args > IORING_MAX_FIXED_FILES)
                return -EMFILE;
        if (nr_args > rlimit(RLIMIT_NOFILE))
                return -EMFILE;
        if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args))
                return -ENOMEM;

        for (i = 0; i < nr_args; i++) {
                struct io_rsrc_node *node;
                u64 tag = 0;

                ret = -EFAULT;
                if (tags && copy_from_user(&tag, &tags[i], sizeof(tag)))
                        goto fail;
                if (fds && copy_from_user(&fd, &fds[i], sizeof(fd)))
                        goto fail;
                /* allow sparse sets */
                if (!fds || fd == -1) {
                        ret = -EINVAL;
                        if (tag)
                                goto fail;
                        continue;
                }

                file = fget(fd);
                ret = -EBADF;
                if (unlikely(!file))
                        goto fail;
                /*
                 * Don't allow io_uring instances to be registered.
                 */
                if (io_is_uring_fops(file)) {
                        fput(file);
                        goto fail;
                }
                node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
                if (!node) {
                        ret = -ENOMEM;
                        fput(file);
                        goto fail;
                }
                if (tag)
                        node->tag = tag;
                ctx->file_table.data.nodes[i] = node;
                io_fixed_file_set(node, file);
                io_file_bitmap_set(&ctx->file_table, i);
        }

        /* default it to the whole table */
        io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
        return 0;
fail:
        io_sqe_files_unregister(ctx);
        return ret;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
        if (!ctx->buf_table.nr)
                return -ENXIO;
        io_rsrc_data_free(ctx, &ctx->buf_table);
        return 0;
}

/*
 * Not super efficient, but this is just registration time. And we do cache
 * the last compound head, so generally we'll only do a full search if we
 * don't match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
                                  int nr_pages, struct page *hpage)
{
        int i, j;

        /* check current page array */
        for (i = 0; i < nr_pages; i++) {
                if (!PageCompound(pages[i]))
                        continue;
                if (compound_head(pages[i]) == hpage)
                        return true;
        }

        /* check previously registered pages */
        for (i = 0; i < ctx->buf_table.nr; i++) {
                struct io_rsrc_node *node = ctx->buf_table.nodes[i];
                struct io_mapped_ubuf *imu;

                if (!node)
                        continue;
                imu = node->buf;
                for (j = 0; j < imu->nr_bvecs; j++) {
                        if (!PageCompound(imu->bvec[j].bv_page))
                                continue;
                        if (compound_head(imu->bvec[j].bv_page) == hpage)
                                return true;
                }
        }

        return false;
}

static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
                                 int nr_pages, struct io_mapped_ubuf *imu,
                                 struct page **last_hpage)
{
        int i, ret;

        imu->acct_pages = 0;
        for (i = 0; i < nr_pages; i++) {
                if (!PageCompound(pages[i])) {
                        imu->acct_pages++;
                } else {
                        struct page *hpage;

                        hpage = compound_head(pages[i]);
                        if (hpage == *last_hpage)
                                continue;
                        *last_hpage = hpage;
                        if (headpage_already_acct(ctx, pages, i, hpage))
                                continue;
                        imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
                }
        }

        if (!imu->acct_pages)
                return 0;

        ret = io_account_mem(ctx, imu->acct_pages);
        if (ret)
                imu->acct_pages = 0;
        return ret;
}

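/*
 * Collapse a pinned page array that is backed by large folios down to one
 * entry per folio, dropping the extra page references that are no longer
 * needed once only the head pages are kept.
 */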
static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages,
                                  struct io_imu_folio_data *data, int nr_folios)
{
        struct page **page_array = *pages, **new_array = NULL;
        int nr_pages_left = *nr_pages, i, j;

        /* Store head pages only */
        new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
                                   GFP_KERNEL);
        if (!new_array)
                return false;

        new_array[0] = compound_head(page_array[0]);
        /*
         * The pages are bound to the folio, so this doesn't actually unpin
         * them but drops all but one reference, which is usually put down
         * by io_buffer_unmap(). Note: this could use a better helper.
         */
        if (data->nr_pages_head > 1)
                unpin_user_pages(&page_array[1], data->nr_pages_head - 1);

        j = data->nr_pages_head;
        nr_pages_left -= data->nr_pages_head;
        for (i = 1; i < nr_folios; i++) {
                unsigned int nr_unpin;

                new_array[i] = page_array[j];
                nr_unpin = min_t(unsigned int, nr_pages_left - 1,
                                 data->nr_pages_mid - 1);
                if (nr_unpin)
                        unpin_user_pages(&page_array[j + 1], nr_unpin);
                j += data->nr_pages_mid;
                nr_pages_left -= data->nr_pages_mid;
        }
        kvfree(page_array);
        *pages = new_array;
        *nr_pages = nr_folios;
        return true;
}

static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
                                   struct io_imu_folio_data *data)
{
        struct page **page_array = *pages;
        struct folio *folio = page_folio(page_array[0]);
        unsigned int count = 1, nr_folios = 1;
        int i;

        if (*nr_pages <= 1)
                return false;

        data->nr_pages_mid = folio_nr_pages(folio);
        if (data->nr_pages_mid == 1)
                return false;

        data->folio_shift = folio_shift(folio);
        /*
         * Check if pages are contiguous inside a folio, and all folios have
         * the same page count except for the head and tail.
         */
        for (i = 1; i < *nr_pages; i++) {
                if (page_folio(page_array[i]) == folio &&
                    page_array[i] == page_array[i - 1] + 1) {
                        count++;
                        continue;
                }

                if (nr_folios == 1) {
                        if (folio_page_idx(folio, page_array[i - 1]) !=
                            data->nr_pages_mid - 1)
                                return false;

                        data->nr_pages_head = count;
                } else if (count != data->nr_pages_mid) {
                        return false;
                }

                folio = page_folio(page_array[i]);
                if (folio_size(folio) != (1UL << data->folio_shift) ||
                    folio_page_idx(folio, page_array[i]) != 0)
                        return false;

                count = 1;
                nr_folios++;
        }
        if (nr_folios == 1)
                data->nr_pages_head = count;

        return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios);
}

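/*
 * Pin the user pages backing one iovec, optionally coalesce them into
 * folio-sized bvec entries, charge them against the memlock limit and
 * build the io_mapped_ubuf that fixed-buffer I/O will import from.
 */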
static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
                                                   struct iovec *iov,
                                                   struct page **last_hpage)
{
        struct io_mapped_ubuf *imu = NULL;
        struct page **pages = NULL;
        struct io_rsrc_node *node;
        unsigned long off;
        size_t size;
        bool coalesced = false;
        int ret, nr_pages, i;
        struct io_imu_folio_data data;

        if (!iov->iov_base)
                return NULL;

        node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
        if (!node)
                return ERR_PTR(-ENOMEM);

        ret = -ENOMEM;
        pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
                             &nr_pages);
        if (IS_ERR(pages)) {
                ret = PTR_ERR(pages);
                pages = NULL;
                goto done;
        }

        /* If it's huge page(s), try to coalesce them into fewer bvec entries */
        coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data);

        imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
        if (!imu)
                goto done;

        ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
        if (ret) {
                unpin_user_pages(pages, nr_pages);
                goto done;
        }

        size = iov->iov_len;
        /* store original address for later verification */
        imu->ubuf = (unsigned long) iov->iov_base;
        imu->len = iov->iov_len;
        imu->nr_bvecs = nr_pages;
        imu->folio_shift = PAGE_SHIFT;
        if (coalesced)
                imu->folio_shift = data.folio_shift;
        refcount_set(&imu->refs, 1);
        off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);
        node->buf = imu;

        for (i = 0; i < nr_pages; i++) {
                size_t vec_len;

                vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
                bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
                off = 0;
                size -= vec_len;
        }
done:
        if (ret) {
                kvfree(imu);
                io_put_rsrc_node(ctx, node);
                node = ERR_PTR(ret);
        }
        kvfree(pages);
        return node;
}

int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
                            unsigned int nr_args, u64 __user *tags)
{
        struct page *last_hpage = NULL;
        struct io_rsrc_data data;
        struct iovec fast_iov, *iov = &fast_iov;
        const struct iovec __user *uvec;
        int i, ret;

        BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

        if (ctx->buf_table.nr)
                return -EBUSY;
        if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
                return -EINVAL;
        ret = io_rsrc_data_alloc(&data, nr_args);
        if (ret)
                return ret;

        if (!arg)
                memset(iov, 0, sizeof(*iov));

        for (i = 0; i < nr_args; i++) {
                struct io_rsrc_node *node;
                u64 tag = 0;

                if (arg) {
                        uvec = (struct iovec __user *) arg;
                        iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
                        if (IS_ERR(iov)) {
                                ret = PTR_ERR(iov);
                                break;
                        }
                        ret = io_buffer_validate(iov);
                        if (ret)
                                break;
                        if (ctx->compat)
                                arg += sizeof(struct compat_iovec);
                        else
                                arg += sizeof(struct iovec);
                }

                if (tags) {
                        if (copy_from_user(&tag, &tags[i], sizeof(tag))) {
                                ret = -EFAULT;
                                break;
                        }
                }

                node = io_sqe_buffer_register(ctx, iov, &last_hpage);
                if (IS_ERR(node)) {
                        ret = PTR_ERR(node);
                        break;
                }
                if (tag) {
                        if (!node) {
                                ret = -EINVAL;
                                break;
                        }
                        node->tag = tag;
                }
                data.nodes[i] = node;
        }

        ctx->buf_table = data;
        if (ret)
                io_sqe_buffers_unregister(ctx);
        return ret;
}

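/*
 * Map a fixed-buffer request onto the registered buffer: verify that the
 * requested range lies inside the registration, then point the iterator
 * at the right bvec and offset without walking every segment.
 */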
int io_import_fixed(int ddir, struct iov_iter *iter,
                    struct io_mapped_ubuf *imu,
                    u64 buf_addr, size_t len)
{
        u64 buf_end;
        size_t offset;

        if (WARN_ON_ONCE(!imu))
                return -EFAULT;
        if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
                return -EFAULT;
        /* not inside the mapped region */
        if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
                return -EFAULT;

        /*
         * Might not be the start of the buffer, so set the size appropriately
         * and advance to the beginning.
         */
        offset = buf_addr - imu->ubuf;
        iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);

        if (offset) {
                /*
                 * Don't use iov_iter_advance() here, as it's really slow for
                 * using the latter parts of a big fixed buffer - it iterates
                 * over each segment manually. We can cheat a bit here, because
                 * we know that:
                 *
                 * 1) it's a BVEC iter, we set it up
                 * 2) all bvecs are the same in size, except potentially the
                 *    first and last bvec
                 *
                 * So just find our index, and adjust the iterator afterwards.
                 * If the offset is within the first bvec (or the whole first
                 * bvec), just use iov_iter_advance(). This makes it easier
                 * since we can just skip the first segment, which may not
                 * be folio_size aligned.
                 */
                const struct bio_vec *bvec = imu->bvec;

                if (offset < bvec->bv_len) {
                        iter->count -= offset;
                        iter->iov_offset = offset;
                } else {
                        unsigned long seg_skip;

                        /* skip first vec */
                        offset -= bvec->bv_len;
                        seg_skip = 1 + (offset >> imu->folio_shift);

                        iter->bvec += seg_skip;
                        iter->nr_segs -= seg_skip;
                        iter->count -= bvec->bv_len + offset;
                        iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
                }
        }

        return 0;
}

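/*
 * Build a new buffer table that shares the source ring's io_mapped_ubuf
 * entries. Both rings' uring_locks are needed, but they are never held at
 * the same time here: the destination lock is dropped while the source
 * table is walked, and the result is only installed if nobody registered
 * buffers on the destination in the meantime.
 */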
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
                            struct io_uring_clone_buffers *arg)
{
        struct io_rsrc_data data;
        int i, ret, off, nr;
        unsigned int nbufs;

        /* if offsets are given, must have nr specified too */
        if (!arg->nr && (arg->dst_off || arg->src_off))
                return -EINVAL;
        /* not allowed unless REPLACE is set */
        if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
                return -EBUSY;

        nbufs = READ_ONCE(src_ctx->buf_table.nr);
        if (!arg->nr)
                arg->nr = nbufs;
        else if (arg->nr > nbufs)
                return -EINVAL;
        else if (arg->nr > IORING_MAX_REG_BUFFERS)
                return -EINVAL;
        if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
                return -EOVERFLOW;

        ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
        if (ret)
                return ret;

        /* Fill entries in data from dst that won't overlap with src */
        for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
                struct io_rsrc_node *src_node = ctx->buf_table.nodes[i];

                if (src_node) {
                        data.nodes[i] = src_node;
                        src_node->refs++;
                }
        }

        /*
         * Drop our own lock here. We'll set up the data we need and reference
         * the source buffers, then re-grab, check, and assign at the end.
         */
        mutex_unlock(&ctx->uring_lock);

        mutex_lock(&src_ctx->uring_lock);
        ret = -ENXIO;
        nbufs = src_ctx->buf_table.nr;
        if (!nbufs)
                goto out_unlock;
        ret = -EINVAL;
        if (!arg->nr)
                arg->nr = nbufs;
        else if (arg->nr > nbufs)
                goto out_unlock;
        ret = -EOVERFLOW;
        if (check_add_overflow(arg->nr, arg->src_off, &off))
                goto out_unlock;
        ret = -EINVAL;
        if (off > nbufs)
                goto out_unlock;

        off = arg->dst_off;
        i = arg->src_off;
        nr = arg->nr;
        while (nr--) {
                struct io_rsrc_node *dst_node, *src_node;

                src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i);
                if (!src_node) {
                        dst_node = NULL;
                } else {
                        dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
                        if (!dst_node) {
                                ret = -ENOMEM;
                                goto out_put_free;
                        }

                        refcount_inc(&src_node->buf->refs);
                        dst_node->buf = src_node->buf;
                }
                data.nodes[off++] = dst_node;
                i++;
        }

        /* Have a ref on the bufs now, drop src lock and re-grab our own lock */
        mutex_unlock(&src_ctx->uring_lock);
        mutex_lock(&ctx->uring_lock);

        /*
         * If asked for replace, put the old table. data->nodes[] holds both
         * old and new nodes at this point.
         */
        if (arg->flags & IORING_REGISTER_DST_REPLACE)
                io_rsrc_data_free(ctx, &ctx->buf_table);

        /*
         * ctx->buf_table should be empty now - either the contents are being
         * replaced and we just freed the table, or someone raced setting up
         * a buffer table while the clone was happening. If not empty, fall
         * through to failure handling.
         */
        if (!ctx->buf_table.nr) {
                ctx->buf_table = data;
                return 0;
        }
        mutex_unlock(&ctx->uring_lock);
        mutex_lock(&src_ctx->uring_lock);
        /* someone raced setting up buffers, dump ours */
        ret = -EBUSY;
out_put_free:
        /* drop the cloned nodes; they were never visible to anybody else */
        while (off > arg->dst_off) {
                struct io_rsrc_node *node = data.nodes[--off];

                if (!node)
                        continue;
                io_buffer_unmap(src_ctx, node);
                kfree(node);
                data.nodes[off] = NULL;
        }
out_unlock:
        io_rsrc_data_free(ctx, &data);
        mutex_unlock(&src_ctx->uring_lock);
        mutex_lock(&ctx->uring_lock);
        return ret;
}

/*
 * Copy the registered buffers from the source ring whose file descriptor
 * is given in src_fd to the current ring. This is identical to registering
 * the buffers with ctx, except faster as the mappings already exist.
 *
 * Since the memory is already accounted once, don't account it again.
 */
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
{
        struct io_uring_clone_buffers buf;
        bool registered_src;
        struct file *file;
        int ret;

        if (copy_from_user(&buf, arg, sizeof(buf)))
                return -EFAULT;
        if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE))
                return -EINVAL;
        if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr)
                return -EBUSY;
        if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
                return -EINVAL;

        registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
        file = io_uring_register_get_file(buf.src_fd, registered_src);
        if (IS_ERR(file))
                return PTR_ERR(file);
        ret = io_clone_buffers(ctx, file->private_data, &buf);
        if (!registered_src)
                fput(file);
        return ret;
}