// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "alloc_cache.h"
#include "openclose.h"
#include "rsrc.h"
#include "memmap.h"
#include "register.h"

struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};

static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage);

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

static const struct io_mapped_ubuf dummy_ubuf = {
	/* set invalid range, so io_import_fixed() fails meeting it */
	.ubuf = -1UL,
	.len = UINT_MAX,
};

int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}

static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->user)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}

	if (ctx->mm_account)
		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

	return 0;
}
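
/*
 * Basic sanity checks on an iovec before registration: a NULL base is only
 * accepted for a zero-length (sparse) entry, the length must otherwise be
 * non-zero and at most 1GB, and base plus the page-rounded length must not
 * overflow.
 */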
static int io_buffer_validate(struct iovec *iov)
{
	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);

	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;
	if (!iov->iov_len)
		return -EFAULT;

	/* arbitrary limit, but we need something */
	if (iov->iov_len > SZ_1G)
		return -EFAULT;

	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
		return -EOVERFLOW;

	return 0;
}

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
{
	struct io_mapped_ubuf *imu = *slot;
	unsigned int i;

	*slot = NULL;
	if (imu != &dummy_ubuf) {
		if (!refcount_dec_and_test(&imu->refs))
			return;
		for (i = 0; i < imu->nr_bvecs; i++)
			unpin_user_page(imu->bvec[i].bv_page);
		if (imu->acct_pages)
			io_unaccount_mem(ctx, imu->acct_pages);
		kvfree(imu);
	}
}

static void io_rsrc_put_work(struct io_rsrc_node *node)
{
	struct io_rsrc_put *prsrc = &node->item;

	if (prsrc->tag)
		io_post_aux_cqe(node->ctx, prsrc->tag, 0, 0);

	switch (node->type) {
	case IORING_RSRC_FILE:
		fput(prsrc->file);
		break;
	case IORING_RSRC_BUFFER:
		io_rsrc_buf_put(node->ctx, prsrc);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}
}

void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
	if (!io_alloc_cache_put(&ctx->rsrc_node_cache, node))
		kfree(node);
}
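
/*
 * Called when a node's reference count drops to zero. Nodes are retired
 * strictly in the order they were queued on ->rsrc_ref_list, so walk from
 * the head and stop at the first node that still holds references. Once
 * the list drains, wake up any pending quiesce.
 */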
void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
	__must_hold(&node->ctx->uring_lock)
{
	struct io_ring_ctx *ctx = node->ctx;

	while (!list_empty(&ctx->rsrc_ref_list)) {
		node = list_first_entry(&ctx->rsrc_ref_list,
					struct io_rsrc_node, node);
		/* recycle ref nodes in order */
		if (node->refs)
			break;
		list_del(&node->node);

		if (likely(!node->empty))
			io_rsrc_put_work(node);
		io_rsrc_node_destroy(ctx, node);
	}
	if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce))
		wake_up_all(&ctx->rsrc_quiesce_wq);
}

struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
{
	struct io_rsrc_node *ref_node;

	ref_node = io_alloc_cache_get(&ctx->rsrc_node_cache);
	if (!ref_node) {
		ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
		if (!ref_node)
			return NULL;
	}

	ref_node->ctx = ctx;
	ref_node->empty = 0;
	ref_node->refs = 1;
	return ref_node;
}
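
/*
 * Wait for all outstanding rsrc nodes to complete. The current rsrc_node is
 * marked empty, queued and put, and replaced with a fresh one; we then sleep
 * on rsrc_quiesce_wq, dropping ->uring_lock while waiting, until
 * rsrc_ref_list has drained or a signal interrupts the wait.
 */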
__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
				      struct io_ring_ctx *ctx)
{
	struct io_rsrc_node *backup;
	DEFINE_WAIT(we);
	int ret;

	/* As we may drop ->uring_lock, other task may have started quiesce */
	if (data->quiesce)
		return -ENXIO;

	backup = io_rsrc_node_alloc(ctx);
	if (!backup)
		return -ENOMEM;
	ctx->rsrc_node->empty = true;
	ctx->rsrc_node->type = -1;
	list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list);
	io_put_rsrc_node(ctx, ctx->rsrc_node);
	ctx->rsrc_node = backup;

	if (list_empty(&ctx->rsrc_ref_list))
		return 0;

	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		atomic_set(&ctx->cq_wait_nr, 1);
		smp_mb();
	}

	ctx->rsrc_quiesce++;
	data->quiesce = true;
	do {
		prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE);
		mutex_unlock(&ctx->uring_lock);

		ret = io_run_task_work_sig(ctx);
		if (ret < 0) {
			finish_wait(&ctx->rsrc_quiesce_wq, &we);
			mutex_lock(&ctx->uring_lock);
			if (list_empty(&ctx->rsrc_ref_list))
				ret = 0;
			break;
		}

		schedule();
		mutex_lock(&ctx->uring_lock);
		ret = 0;
	} while (!list_empty(&ctx->rsrc_ref_list));

	finish_wait(&ctx->rsrc_quiesce_wq, &we);
	data->quiesce = false;
	ctx->rsrc_quiesce--;

	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		atomic_set(&ctx->cq_wait_nr, 0);
		smp_mb();
	}
	return ret;
}

static void io_free_page_table(void **table, size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);

	for (i = 0; i < nr_tables; i++)
		kfree(table[i]);
	kfree(table);
}

static void io_rsrc_data_free(struct io_rsrc_data *data)
{
	size_t size = data->nr * sizeof(data->tags[0][0]);

	if (data->tags)
		io_free_page_table((void **)data->tags, size);
	kfree(data);
}
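
/*
 * Allocate a page-chunked table of 'size' bytes, used to hold the
 * per-resource tag values.
 */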
static __cold void **io_alloc_page_table(size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
	size_t init_size = size;
	void **table;

	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
	if (!table)
		return NULL;

	for (i = 0; i < nr_tables; i++) {
		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);

		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
		if (!table[i]) {
			io_free_page_table(table, init_size);
			return NULL;
		}
		size -= this_size;
	}
	return table;
}

__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, int type,
				     u64 __user *utags,
				     unsigned nr, struct io_rsrc_data **pdata)
{
	struct io_rsrc_data *data;
	int ret = 0;
	unsigned i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;
	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
	if (!data->tags) {
		kfree(data);
		return -ENOMEM;
	}

	data->nr = nr;
	data->ctx = ctx;
	data->rsrc_type = type;
	if (utags) {
		ret = -EFAULT;
		for (i = 0; i < nr; i++) {
			u64 *tag_slot = io_get_tag_slot(data, i);

			if (copy_from_user(tag_slot, &utags[i],
					   sizeof(*tag_slot)))
				goto fail;
		}
	}
	*pdata = data;
	return 0;
fail:
	io_rsrc_data_free(data);
	return ret;
}
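
/*
 * Update registered file slots in the range [up->offset, up->offset +
 * nr_args). A new fd replaces whatever occupies a slot, -1 clears it, and
 * IORING_REGISTER_FILES_SKIP leaves it untouched. Replaced files are queued
 * for deferred put via io_queue_rsrc_removal().
 */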
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	struct io_rsrc_data *data = ctx->file_data;
	struct io_fixed_file *file_slot;
	int fd, i, err = 0;
	unsigned int done;

	if (!ctx->file_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_files)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
		file_slot = io_fixed_file_slot(&ctx->file_table, i);

		if (file_slot->file_ptr) {
			err = io_queue_rsrc_removal(data, i,
						    io_slot_file(file_slot));
			if (err)
				break;
			file_slot->file_ptr = 0;
			io_file_bitmap_clear(&ctx->file_table, i);
		}
		if (fd != -1) {
			struct file *file = fget(fd);

			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			*io_get_tag_slot(data, i) = tag;
			io_fixed_file_set(file_slot, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}
	return done ? done : err;
}
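
/*
 * Update registered buffers in the range [up->offset, up->offset +
 * nr_args). Each new iovec is validated and pinned before the buffer
 * previously occupying the slot is queued for deferred unmapping.
 */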
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec fast_iov, *iov;
	struct page *last_hpage = NULL;
	struct iovec __user *uvec;
	u64 user_data = up->data;
	__u32 done;
	int i, err;

	if (!ctx->buf_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_bufs)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_mapped_ubuf *imu;
		u64 tag = 0;

		uvec = u64_to_user_ptr(user_data);
		iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
		if (IS_ERR(iov)) {
			err = PTR_ERR(iov);
			break;
		}
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(iov);
		if (err)
			break;
		if (!iov->iov_base && tag) {
			err = -EINVAL;
			break;
		}
		err = io_sqe_buffer_register(ctx, iov, &imu, &last_hpage);
		if (err)
			break;

		i = array_index_nospec(up->offset + done, ctx->nr_user_bufs);
		if (ctx->user_bufs[i] != &dummy_ubuf) {
			err = io_queue_rsrc_removal(ctx->buf_data, i,
						    ctx->user_bufs[i]);
			if (unlikely(err)) {
				io_buffer_unmap(ctx, &imu);
				break;
			}
			ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf;
		}

		ctx->user_bufs[i] = imu;
		*io_get_tag_slot(ctx->buf_data, i) = tag;
		if (ctx->compat)
			user_data += sizeof(struct compat_iovec);
		else
			user_data += sizeof(struct iovec);
	}
	return done ? done : err;
}

static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;

	lockdep_assert_held(&ctx->uring_lock);

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}
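
/*
 * IORING_OP_FILES_UPDATE: the prep handler below pulls the offset, count and
 * fd array pointer out of the SQE; io_files_update() then applies the update,
 * allocating free slots when the offset is IORING_FILE_INDEX_ALLOC.
 */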
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}

static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_data)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}
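
/*
 * Queue a resource (file or buffer) for deferred release on the current
 * rsrc node, then retire that node and install a fresh one. The actual put
 * happens in io_rsrc_put_work() once the node's references drop to zero.
 */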
int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc)
{
	struct io_ring_ctx *ctx = data->ctx;
	struct io_rsrc_node *node = ctx->rsrc_node;
	u64 *tag_slot = io_get_tag_slot(data, idx);

	ctx->rsrc_node = io_rsrc_node_alloc(ctx);
	if (unlikely(!ctx->rsrc_node)) {
		ctx->rsrc_node = node;
		return -ENOMEM;
	}

	node->item.rsrc = rsrc;
	node->type = data->rsrc_type;
	node->item.tag = *tag_slot;
	*tag_slot = 0;
	list_add_tail(&node->node, &ctx->rsrc_ref_list);
	io_put_rsrc_node(ctx, node);
	return 0;
}

void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	int i;

	for (i = 0; i < ctx->nr_user_files; i++) {
		struct file *file = io_file_from_index(&ctx->file_table, i);

		if (!file)
			continue;
		io_file_bitmap_clear(&ctx->file_table, i);
		fput(file);
	}

	io_free_file_tables(&ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	io_rsrc_data_free(ctx->file_data);
	ctx->file_data = NULL;
	ctx->nr_user_files = 0;
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_files;
	int ret;

	if (!ctx->file_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_files = 0;
	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
	ctx->nr_user_files = nr;
	if (!ret)
		__io_sqe_files_unregister(ctx);
	return ret;
}
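
/*
 * Register an array of nr_args file descriptors as fixed files. An fd of -1,
 * or a NULL fd array, leaves the slot empty to allow sparse sets; io_uring
 * files themselves are rejected.
 */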
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_data)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_FILE, tags, nr_args,
				 &ctx->file_data);
	if (ret)
		return ret;

	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
		io_rsrc_data_free(ctx->file_data);
		ctx->file_data = NULL;
		return -ENOMEM;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
		struct io_fixed_file *file_slot;

		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
			ret = -EFAULT;
			goto fail;
		}
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		file_slot = io_fixed_file_slot(&ctx->file_table, i);
		io_fixed_file_set(file_slot, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
	return 0;
fail:
	__io_sqe_files_unregister(ctx);
	return ret;
}

static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	io_buffer_unmap(ctx, &prsrc->buf);
	prsrc->buf = NULL;
}

void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned int i;

	for (i = 0; i < ctx->nr_user_bufs; i++)
		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
	kfree(ctx->user_bufs);
	io_rsrc_data_free(ctx->buf_data);
	ctx->user_bufs = NULL;
	ctx->buf_data = NULL;
	ctx->nr_user_bufs = 0;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_bufs;
	int ret;

	if (!ctx->buf_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_bufs = 0;
	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
	ctx->nr_user_bufs = nr;
	if (!ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
}

/*
 * Not super efficient, but this is just a registration time. And we do cache
 * the last compound head, so generally we'll only do a full search if we don't
 * match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->nr_user_bufs; i++) {
		struct io_mapped_ubuf *imu = ctx->user_bufs[i];

		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}
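
/*
 * Account the pinned pages against RLIMIT_MEMLOCK. Compound (huge) pages are
 * accounted once for their full size, using *last_hpage and
 * headpage_already_acct() to avoid counting the same head page twice.
 */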
static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}

static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages,
				struct io_imu_folio_data *data, int nr_folios)
{
	struct page **page_array = *pages, **new_array = NULL;
	int nr_pages_left = *nr_pages, i, j;

	/* Store head pages only */
	new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
					GFP_KERNEL);
	if (!new_array)
		return false;

	new_array[0] = compound_head(page_array[0]);
	/*
	 * The pages are bound to the folio, it doesn't
	 * actually unpin them but drops all but one reference,
	 * which is usually put down by io_buffer_unmap().
	 * Note, needs a better helper.
	 */
	if (data->nr_pages_head > 1)
		unpin_user_pages(&page_array[1], data->nr_pages_head - 1);

	j = data->nr_pages_head;
	nr_pages_left -= data->nr_pages_head;
	for (i = 1; i < nr_folios; i++) {
		unsigned int nr_unpin;

		new_array[i] = page_array[j];
		nr_unpin = min_t(unsigned int, nr_pages_left - 1,
					data->nr_pages_mid - 1);
		if (nr_unpin)
			unpin_user_pages(&page_array[j+1], nr_unpin);
		j += data->nr_pages_mid;
		nr_pages_left -= data->nr_pages_mid;
	}
	kvfree(page_array);
	*pages = new_array;
	*nr_pages = nr_folios;
	return true;
}
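
/*
 * If the pinned pages form contiguous runs inside one or more folios,
 * collapse the page array down to one entry per folio so that fewer bvecs
 * are needed to describe the buffer.
 */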
static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
				   struct io_imu_folio_data *data)
{
	struct page **page_array = *pages;
	struct folio *folio = page_folio(page_array[0]);
	unsigned int count = 1, nr_folios = 1;
	int i;

	if (*nr_pages <= 1)
		return false;

	data->nr_pages_mid = folio_nr_pages(folio);
	if (data->nr_pages_mid == 1)
		return false;

	data->folio_shift = folio_shift(folio);
	/*
	 * Check if pages are contiguous inside a folio, and all folios have
	 * the same page count except for the head and tail.
	 */
	for (i = 1; i < *nr_pages; i++) {
		if (page_folio(page_array[i]) == folio &&
		    page_array[i] == page_array[i-1] + 1) {
			count++;
			continue;
		}

		if (nr_folios == 1) {
			if (folio_page_idx(folio, page_array[i-1]) !=
			    data->nr_pages_mid - 1)
				return false;

			data->nr_pages_head = count;
		} else if (count != data->nr_pages_mid) {
			return false;
		}

		folio = page_folio(page_array[i]);
		if (folio_size(folio) != (1UL << data->folio_shift) ||
		    folio_page_idx(folio, page_array[i]) != 0)
			return false;

		count = 1;
		nr_folios++;
	}
	if (nr_folios == 1)
		data->nr_pages_head = count;

	return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios);
}
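
/*
 * Pin the user memory described by @iov and build an io_mapped_ubuf with one
 * bvec per (possibly coalesced) page. A NULL iov_base maps the slot to the
 * reserved dummy_ubuf, i.e. leaves it sparse.
 */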
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
	struct io_imu_folio_data data;
	bool coalesced;

	*pimu = (struct io_mapped_ubuf *)&dummy_ubuf;
	if (!iov->iov_base)
		return 0;

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
				&nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
	coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data);

	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
	if (!imu)
		goto done;

	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret) {
		unpin_user_pages(pages, nr_pages);
		goto done;
	}

	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->len = iov->iov_len;
	imu->nr_bvecs = nr_pages;
	imu->folio_shift = PAGE_SHIFT;
	if (coalesced)
		imu->folio_shift = data.folio_shift;
	refcount_set(&imu->refs, 1);
	off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);
	*pimu = imu;
	ret = 0;

	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
	}
done:
	if (ret)
		kvfree(imu);
	kvfree(pages);
	return ret;
}

static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
{
	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
	return ctx->user_bufs ? 0 : -ENOMEM;
}
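
/*
 * Register up to IORING_MAX_REG_BUFFERS fixed buffers from a user iovec
 * array. A NULL @arg registers nr_args sparse (empty) slots instead.
 */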
int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data *data;
	struct iovec fast_iov, *iov = &fast_iov;
	const struct iovec __user *uvec;
	int i, ret;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->user_bufs)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, tags, nr_args, &data);
	if (ret)
		return ret;
	ret = io_buffers_map_alloc(ctx, nr_args);
	if (ret) {
		io_rsrc_data_free(data);
		return ret;
	}

	if (!arg)
		memset(iov, 0, sizeof(*iov));

	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
		if (arg) {
			uvec = (struct iovec __user *) arg;
			iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
			if (IS_ERR(iov)) {
				ret = PTR_ERR(iov);
				break;
			}
			ret = io_buffer_validate(iov);
			if (ret)
				break;
			if (ctx->compat)
				arg += sizeof(struct compat_iovec);
			else
				arg += sizeof(struct iovec);
		}

		if (!iov->iov_base && *io_get_tag_slot(data, i)) {
			ret = -EINVAL;
			break;
		}

		ret = io_sqe_buffer_register(ctx, iov, &ctx->user_bufs[i],
					     &last_hpage);
		if (ret)
			break;
	}

	WARN_ON_ONCE(ctx->buf_data);

	ctx->buf_data = data;
	if (ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
}
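
/*
 * Set up @iter to read from or write to the registered buffer @imu at
 * [buf_addr, buf_addr + len). Every bvec except the first and last covers a
 * whole folio, so a non-zero offset is resolved by skipping the first bvec
 * and then dividing the remainder by the folio size. For example, with 4K
 * folios, a page-aligned registration and buf_addr 9K into the mapping: 9K
 * minus the first bvec's 4K leaves 5K, seg_skip = 1 + (5K >> 12) = 2, and
 * the remaining in-folio offset is 5K & 4095 = 1K.
 */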
int io_import_fixed(int ddir, struct iov_iter *iter,
			   struct io_mapped_ubuf *imu,
			   u64 buf_addr, size_t len)
{
	u64 buf_end;
	size_t offset;

	if (WARN_ON_ONCE(!imu))
		return -EFAULT;
	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
		return -EFAULT;

	/*
	 * Might not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are the same in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or the whole first
		 * bvec), just use iov_iter_advance(). This makes it easier
		 * since we can just skip the first segment, which may not
		 * be folio_size aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		if (offset < bvec->bv_len) {
			iter->bvec = bvec;
			iter->count -= offset;
			iter->iov_offset = offset;
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			seg_skip = 1 + (offset >> imu->folio_shift);

			iter->bvec = bvec + seg_skip;
			iter->nr_segs -= seg_skip;
			iter->count -= bvec->bv_len + offset;
			iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
		}
	}

	return 0;
}
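
/*
 * Grab a reference on every registered buffer of the source ring and install
 * the same set in @ctx. Both rings' uring_locks are needed but never held at
 * the same time: our own lock is dropped first, the source is locked while
 * references are taken, and only then do we re-take our lock and check that
 * nobody raced with us registering buffers.
 */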
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
{
	struct io_mapped_ubuf **user_bufs;
	struct io_rsrc_data *data;
	int i, ret, nbufs;

	/*
	 * Drop our own lock here. We'll setup the data we need and reference
	 * the source buffers, then re-grab, check, and assign at the end.
	 */
	mutex_unlock(&ctx->uring_lock);

	mutex_lock(&src_ctx->uring_lock);
	ret = -ENXIO;
	nbufs = src_ctx->nr_user_bufs;
	if (!nbufs)
		goto out_unlock;
	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, NULL, nbufs, &data);
	if (ret)
		goto out_unlock;

	ret = -ENOMEM;
	user_bufs = kcalloc(nbufs, sizeof(*ctx->user_bufs), GFP_KERNEL);
	if (!user_bufs)
		goto out_free_data;

	for (i = 0; i < nbufs; i++) {
		struct io_mapped_ubuf *src = src_ctx->user_bufs[i];

		refcount_inc(&src->refs);
		user_bufs[i] = src;
	}

	/* Have a ref on the bufs now, drop src lock and re-grab our own lock */
	mutex_unlock(&src_ctx->uring_lock);
	mutex_lock(&ctx->uring_lock);
	if (!ctx->user_bufs) {
		ctx->user_bufs = user_bufs;
		ctx->buf_data = data;
		ctx->nr_user_bufs = nbufs;
		return 0;
	}

	/* someone raced setting up buffers, dump ours */
	for (i = 0; i < nbufs; i++)
		io_buffer_unmap(ctx, &user_bufs[i]);
	io_rsrc_data_free(data);
	kfree(user_bufs);
	return -EBUSY;
out_free_data:
	io_rsrc_data_free(data);
out_unlock:
	mutex_unlock(&src_ctx->uring_lock);
	mutex_lock(&ctx->uring_lock);
	return ret;
}

/*
 * Copy the registered buffers from the source ring whose file descriptor
 * is given in the src_fd to the current ring. This is identical to registering
 * the buffers with ctx, except faster as mappings already exist.
 *
 * Since the memory is already accounted once, don't account it again.
 */
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_clone_buffers buf;
	bool registered_src;
	struct file *file;
	int ret;

	if (ctx->user_bufs || ctx->nr_user_bufs)
		return -EBUSY;
	if (copy_from_user(&buf, arg, sizeof(buf)))
		return -EFAULT;
	if (buf.flags & ~IORING_REGISTER_SRC_REGISTERED)
		return -EINVAL;
	if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
		return -EINVAL;

	registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
	file = io_uring_register_get_file(buf.src_fd, registered_src);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ret = io_clone_buffers(ctx, file->private_data);
	if (!registered_src)
		fput(file);
	return ret;
}