Merge tag 'trace-printf-v6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/trace...
[drm/drm-misc.git] / io_uring / net.c
blobdf1f7dc6f1c8f3059ae0737f743e9cdc93442e30
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/kernel.h>
3 #include <linux/errno.h>
4 #include <linux/file.h>
5 #include <linux/slab.h>
6 #include <linux/net.h>
7 #include <linux/compat.h>
8 #include <net/compat.h>
9 #include <linux/io_uring.h>
11 #include <uapi/linux/io_uring.h>
13 #include "io_uring.h"
14 #include "kbuf.h"
15 #include "alloc_cache.h"
16 #include "net.h"
17 #include "notif.h"
18 #include "rsrc.h"
20 #if defined(CONFIG_NET)
/* Per-request command data for a socket shutdown request. */
struct io_shutdown {
	struct file			*file;
	/* shutdown mode, taken from sqe->len at prep time */
	int				how;
};
26 struct io_accept {
27 struct file *file;
28 struct sockaddr __user *addr;
29 int __user *addr_len;
30 int flags;
31 int iou_flags;
32 u32 file_slot;
33 unsigned long nofile;
36 struct io_socket {
37 struct file *file;
38 int domain;
39 int type;
40 int protocol;
41 int flags;
42 u32 file_slot;
43 unsigned long nofile;
46 struct io_connect {
47 struct file *file;
48 struct sockaddr __user *addr;
49 int addr_len;
50 bool in_progress;
51 bool seen_econnaborted;
/*
 * Per-request command data for a bind request.
 * NOTE(review): users of this struct are outside this part of the file.
 */
struct io_bind {
	struct file			*file;
	int				addr_len;
};
/*
 * Per-request command data for a listen request.
 * NOTE(review): users of this struct are outside this part of the file.
 */
struct io_listen {
	struct file			*file;
	int				backlog;
};
64 struct io_sr_msg {
65 struct file *file;
66 union {
67 struct compat_msghdr __user *umsg_compat;
68 struct user_msghdr __user *umsg;
69 void __user *buf;
71 int len;
72 unsigned done_io;
73 unsigned msg_flags;
74 unsigned nr_multishot_loops;
75 u16 flags;
76 /* initialised and used only by !msg send variants */
77 u16 buf_group;
78 u16 buf_index;
79 void __user *msg_control;
80 /* used only for send zerocopy */
81 struct io_kiocb *notif;
/*
 * Upper bound on back-to-back multishot receive retries while the socket
 * still has data pending. Once the bound is exceeded the request is put at
 * the back of the queue and retried from there, which keeps one flooding
 * client from starving the others.
 */
#define MULTISHOT_MAX_RETRY	32
91 int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
93 struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
95 if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
96 sqe->buf_index || sqe->splice_fd_in))
97 return -EINVAL;
99 shutdown->how = READ_ONCE(sqe->len);
100 req->flags |= REQ_F_FORCE_ASYNC;
101 return 0;
104 int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
106 struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
107 struct socket *sock;
108 int ret;
110 WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
112 sock = sock_from_file(req->file);
113 if (unlikely(!sock))
114 return -ENOTSOCK;
116 ret = __sys_shutdown_sock(sock, shutdown->how);
117 io_req_set_res(req, ret, 0);
118 return IOU_OK;
121 static bool io_net_retry(struct socket *sock, int flags)
123 if (!(flags & MSG_WAITALL))
124 return false;
125 return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
128 static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg)
130 if (kmsg->free_iov) {
131 kfree(kmsg->free_iov);
132 kmsg->free_iov_nr = 0;
133 kmsg->free_iov = NULL;
137 static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
139 struct io_async_msghdr *hdr = req->async_data;
140 struct iovec *iov;
142 /* can't recycle, ensure we free the iovec if we have one */
143 if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
144 io_netmsg_iovec_free(hdr);
145 return;
148 /* Let normal cleanup path reap it if we fail adding to the cache */
149 iov = hdr->free_iov;
150 if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) {
151 if (iov)
152 kasan_mempool_poison_object(iov);
153 req->async_data = NULL;
154 req->flags &= ~REQ_F_ASYNC_DATA;
158 static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)
160 struct io_ring_ctx *ctx = req->ctx;
161 struct io_async_msghdr *hdr;
163 hdr = io_alloc_cache_get(&ctx->netmsg_cache);
164 if (hdr) {
165 if (hdr->free_iov) {
166 kasan_mempool_unpoison_object(hdr->free_iov,
167 hdr->free_iov_nr * sizeof(struct iovec));
168 req->flags |= REQ_F_NEED_CLEANUP;
170 req->flags |= REQ_F_ASYNC_DATA;
171 req->async_data = hdr;
172 return hdr;
175 if (!io_alloc_async_data(req)) {
176 hdr = req->async_data;
177 hdr->free_iov_nr = 0;
178 hdr->free_iov = NULL;
179 return hdr;
181 return NULL;
184 /* assign new iovec to kmsg, if we need to */
185 static int io_net_vec_assign(struct io_kiocb *req, struct io_async_msghdr *kmsg,
186 struct iovec *iov)
188 if (iov) {
189 req->flags |= REQ_F_NEED_CLEANUP;
190 kmsg->free_iov_nr = kmsg->msg.msg_iter.nr_segs;
191 if (kmsg->free_iov)
192 kfree(kmsg->free_iov);
193 kmsg->free_iov = iov;
195 return 0;
198 static inline void io_mshot_prep_retry(struct io_kiocb *req,
199 struct io_async_msghdr *kmsg)
201 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
203 req->flags &= ~REQ_F_BL_EMPTY;
204 sr->done_io = 0;
205 sr->len = 0; /* get from the provided buffer */
206 req->buf_index = sr->buf_group;
#ifdef CONFIG_COMPAT
/*
 * Copy a compat msghdr from user space into @msg and set up the iovec.
 *
 * With provided buffers (REQ_F_BUFFER_SELECT) at most one iovec entry is
 * accepted and only its length is read into sr->len; the data pointer comes
 * from the selected buffer later. Otherwise the user iovec is imported into
 * the msghdr iterator, reusing a previously allocated iovec if there is one.
 *
 * Returns 0 on success or a negative errno.
 */
static int io_compat_msg_copy_hdr(struct io_kiocb *req,
				  struct io_async_msghdr *iomsg,
				  struct compat_msghdr *msg, int ddir)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct compat_iovec __user *uiov;
	struct iovec *iov = &iomsg->fast_iov;
	int nr_segs = 1;
	int ret;

	if (iomsg->free_iov) {
		nr_segs = iomsg->free_iov_nr;
		iov = iomsg->free_iov;
	}

	if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg)))
		return -EFAULT;

	uiov = compat_ptr(msg->msg_iov);
	if (!(req->flags & REQ_F_BUFFER_SELECT)) {
		ret = __import_iovec(ddir, (struct iovec __user *)uiov,
				     msg->msg_iovlen, nr_segs, &iov,
				     &iomsg->msg.msg_iter, true);
		if (unlikely(ret < 0))
			return ret;

		return io_net_vec_assign(req, iomsg, iov);
	}

	/* provided buffers: zero or one iovec entry only */
	if (msg->msg_iovlen > 1)
		return -EINVAL;

	if (msg->msg_iovlen == 0) {
		sr->len = iov->iov_len = 0;
		iov->iov_base = NULL;
	} else {
		compat_ssize_t clen;

		if (!access_ok(uiov, sizeof(*uiov)))
			return -EFAULT;
		if (__get_user(clen, &uiov->iov_len))
			return -EFAULT;
		if (clen < 0)
			return -EINVAL;
		sr->len = clen;
	}

	return 0;
}
#endif
261 static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
262 struct user_msghdr *msg, int ddir)
264 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
265 struct user_msghdr __user *umsg = sr->umsg;
266 struct iovec *iov;
267 int ret, nr_segs;
269 if (iomsg->free_iov) {
270 nr_segs = iomsg->free_iov_nr;
271 iov = iomsg->free_iov;
272 } else {
273 iov = &iomsg->fast_iov;
274 nr_segs = 1;
277 if (!user_access_begin(umsg, sizeof(*umsg)))
278 return -EFAULT;
280 ret = -EFAULT;
281 unsafe_get_user(msg->msg_name, &umsg->msg_name, ua_end);
282 unsafe_get_user(msg->msg_namelen, &umsg->msg_namelen, ua_end);
283 unsafe_get_user(msg->msg_iov, &umsg->msg_iov, ua_end);
284 unsafe_get_user(msg->msg_iovlen, &umsg->msg_iovlen, ua_end);
285 unsafe_get_user(msg->msg_control, &umsg->msg_control, ua_end);
286 unsafe_get_user(msg->msg_controllen, &umsg->msg_controllen, ua_end);
287 msg->msg_flags = 0;
289 if (req->flags & REQ_F_BUFFER_SELECT) {
290 if (msg->msg_iovlen == 0) {
291 sr->len = iov->iov_len = 0;
292 iov->iov_base = NULL;
293 } else if (msg->msg_iovlen > 1) {
294 ret = -EINVAL;
295 goto ua_end;
296 } else {
297 /* we only need the length for provided buffers */
298 if (!access_ok(&msg->msg_iov[0].iov_len, sizeof(__kernel_size_t)))
299 goto ua_end;
300 unsafe_get_user(iov->iov_len, &msg->msg_iov[0].iov_len,
301 ua_end);
302 sr->len = iov->iov_len;
304 ret = 0;
305 ua_end:
306 user_access_end();
307 return ret;
310 user_access_end();
311 ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, nr_segs,
312 &iov, &iomsg->msg.msg_iter, false);
313 if (unlikely(ret < 0))
314 return ret;
316 return io_net_vec_assign(req, iomsg, iov);
319 static int io_sendmsg_copy_hdr(struct io_kiocb *req,
320 struct io_async_msghdr *iomsg)
322 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
323 struct user_msghdr msg;
324 int ret;
326 iomsg->msg.msg_name = &iomsg->addr;
327 iomsg->msg.msg_iter.nr_segs = 0;
329 #ifdef CONFIG_COMPAT
330 if (unlikely(req->ctx->compat)) {
331 struct compat_msghdr cmsg;
333 ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_SOURCE);
334 if (unlikely(ret))
335 return ret;
337 return __get_compat_msghdr(&iomsg->msg, &cmsg, NULL);
339 #endif
341 ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE);
342 if (unlikely(ret))
343 return ret;
345 ret = __copy_msghdr(&iomsg->msg, &msg, NULL);
347 /* save msg_control as sys_sendmsg() overwrites it */
348 sr->msg_control = iomsg->msg.msg_control_user;
349 return ret;
352 void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
354 struct io_async_msghdr *io = req->async_data;
356 io_netmsg_iovec_free(io);
359 static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
361 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
362 struct io_async_msghdr *kmsg = req->async_data;
363 void __user *addr;
364 u16 addr_len;
365 int ret;
367 sr->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
369 if (READ_ONCE(sqe->__pad3[0]))
370 return -EINVAL;
372 kmsg->msg.msg_name = NULL;
373 kmsg->msg.msg_namelen = 0;
374 kmsg->msg.msg_control = NULL;
375 kmsg->msg.msg_controllen = 0;
376 kmsg->msg.msg_ubuf = NULL;
378 addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
379 addr_len = READ_ONCE(sqe->addr_len);
380 if (addr) {
381 ret = move_addr_to_kernel(addr, addr_len, &kmsg->addr);
382 if (unlikely(ret < 0))
383 return ret;
384 kmsg->msg.msg_name = &kmsg->addr;
385 kmsg->msg.msg_namelen = addr_len;
387 if (!io_do_buffer_select(req)) {
388 ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
389 &kmsg->msg.msg_iter);
390 if (unlikely(ret < 0))
391 return ret;
393 return 0;
396 static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
398 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
399 struct io_async_msghdr *kmsg = req->async_data;
400 int ret;
402 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
404 ret = io_sendmsg_copy_hdr(req, kmsg);
405 if (!ret)
406 req->flags |= REQ_F_NEED_CLEANUP;
407 return ret;
410 #define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE)
412 int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
414 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
416 sr->done_io = 0;
418 if (req->opcode != IORING_OP_SEND) {
419 if (sqe->addr2 || sqe->file_index)
420 return -EINVAL;
423 sr->len = READ_ONCE(sqe->len);
424 sr->flags = READ_ONCE(sqe->ioprio);
425 if (sr->flags & ~SENDMSG_FLAGS)
426 return -EINVAL;
427 sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
428 if (sr->msg_flags & MSG_DONTWAIT)
429 req->flags |= REQ_F_NOWAIT;
430 if (sr->flags & IORING_RECVSEND_BUNDLE) {
431 if (req->opcode == IORING_OP_SENDMSG)
432 return -EINVAL;
433 if (!(req->flags & REQ_F_BUFFER_SELECT))
434 return -EINVAL;
435 sr->msg_flags |= MSG_WAITALL;
436 sr->buf_group = req->buf_index;
437 req->buf_list = NULL;
440 #ifdef CONFIG_COMPAT
441 if (req->ctx->compat)
442 sr->msg_flags |= MSG_CMSG_COMPAT;
443 #endif
444 if (unlikely(!io_msg_alloc_async(req)))
445 return -ENOMEM;
446 if (req->opcode != IORING_OP_SENDMSG)
447 return io_send_setup(req, sqe);
448 return io_sendmsg_setup(req, sqe);
451 static void io_req_msg_cleanup(struct io_kiocb *req,
452 unsigned int issue_flags)
454 req->flags &= ~REQ_F_NEED_CLEANUP;
455 io_netmsg_recycle(req, issue_flags);
459 * For bundle completions, we need to figure out how many segments we consumed.
460 * A bundle could be using a single ITER_UBUF if that's all we mapped, or it
461 * could be using an ITER_IOVEC. If the latter, then if we consumed all of
462 * the segments, then it's a trivial questiont o answer. If we have residual
463 * data in the iter, then loop the segments to figure out how much we
464 * transferred.
466 static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret)
468 struct iovec *iov;
469 int nbufs;
471 /* no data is always zero segments, and a ubuf is always 1 segment */
472 if (ret <= 0)
473 return 0;
474 if (iter_is_ubuf(&kmsg->msg.msg_iter))
475 return 1;
477 iov = kmsg->free_iov;
478 if (!iov)
479 iov = &kmsg->fast_iov;
481 /* if all data was transferred, it's basic pointer math */
482 if (!iov_iter_count(&kmsg->msg.msg_iter))
483 return iter_iov(&kmsg->msg.msg_iter) - iov;
485 /* short transfer, count segments */
486 nbufs = 0;
487 do {
488 int this_len = min_t(int, iov[nbufs].iov_len, ret);
490 nbufs++;
491 ret -= this_len;
492 } while (ret);
494 return nbufs;
497 static inline bool io_send_finish(struct io_kiocb *req, int *ret,
498 struct io_async_msghdr *kmsg,
499 unsigned issue_flags)
501 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
502 bool bundle_finished = *ret <= 0;
503 unsigned int cflags;
505 if (!(sr->flags & IORING_RECVSEND_BUNDLE)) {
506 cflags = io_put_kbuf(req, *ret, issue_flags);
507 goto finish;
510 cflags = io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), issue_flags);
512 if (bundle_finished || req->flags & REQ_F_BL_EMPTY)
513 goto finish;
516 * Fill CQE for this receive and see if we should keep trying to
517 * receive from this socket.
519 if (io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) {
520 io_mshot_prep_retry(req, kmsg);
521 return false;
524 /* Otherwise stop bundle and use the current result. */
525 finish:
526 io_req_set_res(req, *ret, cflags);
527 *ret = IOU_OK;
528 return true;
531 int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
533 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
534 struct io_async_msghdr *kmsg = req->async_data;
535 struct socket *sock;
536 unsigned flags;
537 int min_ret = 0;
538 int ret;
540 sock = sock_from_file(req->file);
541 if (unlikely(!sock))
542 return -ENOTSOCK;
544 if (!(req->flags & REQ_F_POLLED) &&
545 (sr->flags & IORING_RECVSEND_POLL_FIRST))
546 return -EAGAIN;
548 flags = sr->msg_flags;
549 if (issue_flags & IO_URING_F_NONBLOCK)
550 flags |= MSG_DONTWAIT;
551 if (flags & MSG_WAITALL)
552 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
554 kmsg->msg.msg_control_user = sr->msg_control;
556 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
558 if (ret < min_ret) {
559 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
560 return -EAGAIN;
561 if (ret > 0 && io_net_retry(sock, flags)) {
562 kmsg->msg.msg_controllen = 0;
563 kmsg->msg.msg_control = NULL;
564 sr->done_io += ret;
565 req->flags |= REQ_F_BL_NO_RECYCLE;
566 return -EAGAIN;
568 if (ret == -ERESTARTSYS)
569 ret = -EINTR;
570 req_set_fail(req);
572 io_req_msg_cleanup(req, issue_flags);
573 if (ret >= 0)
574 ret += sr->done_io;
575 else if (sr->done_io)
576 ret = sr->done_io;
577 io_req_set_res(req, ret, 0);
578 return IOU_OK;
581 int io_send(struct io_kiocb *req, unsigned int issue_flags)
583 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
584 struct io_async_msghdr *kmsg = req->async_data;
585 struct socket *sock;
586 unsigned flags;
587 int min_ret = 0;
588 int ret;
590 sock = sock_from_file(req->file);
591 if (unlikely(!sock))
592 return -ENOTSOCK;
594 if (!(req->flags & REQ_F_POLLED) &&
595 (sr->flags & IORING_RECVSEND_POLL_FIRST))
596 return -EAGAIN;
598 flags = sr->msg_flags;
599 if (issue_flags & IO_URING_F_NONBLOCK)
600 flags |= MSG_DONTWAIT;
602 retry_bundle:
603 if (io_do_buffer_select(req)) {
604 struct buf_sel_arg arg = {
605 .iovs = &kmsg->fast_iov,
606 .max_len = min_not_zero(sr->len, INT_MAX),
607 .nr_iovs = 1,
610 if (kmsg->free_iov) {
611 arg.nr_iovs = kmsg->free_iov_nr;
612 arg.iovs = kmsg->free_iov;
613 arg.mode = KBUF_MODE_FREE;
616 if (!(sr->flags & IORING_RECVSEND_BUNDLE))
617 arg.nr_iovs = 1;
618 else
619 arg.mode |= KBUF_MODE_EXPAND;
621 ret = io_buffers_select(req, &arg, issue_flags);
622 if (unlikely(ret < 0))
623 return ret;
625 if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
626 kmsg->free_iov_nr = ret;
627 kmsg->free_iov = arg.iovs;
628 req->flags |= REQ_F_NEED_CLEANUP;
630 sr->len = arg.out_len;
632 if (ret == 1) {
633 sr->buf = arg.iovs[0].iov_base;
634 ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
635 &kmsg->msg.msg_iter);
636 if (unlikely(ret))
637 return ret;
638 } else {
639 iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE,
640 arg.iovs, ret, arg.out_len);
645 * If MSG_WAITALL is set, or this is a bundle send, then we need
646 * the full amount. If just bundle is set, if we do a short send
647 * then we complete the bundle sequence rather than continue on.
649 if (flags & MSG_WAITALL || sr->flags & IORING_RECVSEND_BUNDLE)
650 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
652 flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
653 kmsg->msg.msg_flags = flags;
654 ret = sock_sendmsg(sock, &kmsg->msg);
655 if (ret < min_ret) {
656 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
657 return -EAGAIN;
659 if (ret > 0 && io_net_retry(sock, flags)) {
660 sr->len -= ret;
661 sr->buf += ret;
662 sr->done_io += ret;
663 req->flags |= REQ_F_BL_NO_RECYCLE;
664 return -EAGAIN;
666 if (ret == -ERESTARTSYS)
667 ret = -EINTR;
668 req_set_fail(req);
670 if (ret >= 0)
671 ret += sr->done_io;
672 else if (sr->done_io)
673 ret = sr->done_io;
675 if (!io_send_finish(req, &ret, kmsg, issue_flags))
676 goto retry_bundle;
678 io_req_msg_cleanup(req, issue_flags);
679 return ret;
682 static int io_recvmsg_mshot_prep(struct io_kiocb *req,
683 struct io_async_msghdr *iomsg,
684 int namelen, size_t controllen)
686 if ((req->flags & (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) ==
687 (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) {
688 int hdr;
690 if (unlikely(namelen < 0))
691 return -EOVERFLOW;
692 if (check_add_overflow(sizeof(struct io_uring_recvmsg_out),
693 namelen, &hdr))
694 return -EOVERFLOW;
695 if (check_add_overflow(hdr, controllen, &hdr))
696 return -EOVERFLOW;
698 iomsg->namelen = namelen;
699 iomsg->controllen = controllen;
700 return 0;
703 return 0;
706 static int io_recvmsg_copy_hdr(struct io_kiocb *req,
707 struct io_async_msghdr *iomsg)
709 struct user_msghdr msg;
710 int ret;
712 iomsg->msg.msg_name = &iomsg->addr;
713 iomsg->msg.msg_iter.nr_segs = 0;
715 #ifdef CONFIG_COMPAT
716 if (unlikely(req->ctx->compat)) {
717 struct compat_msghdr cmsg;
719 ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST);
720 if (unlikely(ret))
721 return ret;
723 ret = __get_compat_msghdr(&iomsg->msg, &cmsg, &iomsg->uaddr);
724 if (unlikely(ret))
725 return ret;
727 return io_recvmsg_mshot_prep(req, iomsg, cmsg.msg_namelen,
728 cmsg.msg_controllen);
730 #endif
732 ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST);
733 if (unlikely(ret))
734 return ret;
736 ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr);
737 if (unlikely(ret))
738 return ret;
740 return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen,
741 msg.msg_controllen);
744 static int io_recvmsg_prep_setup(struct io_kiocb *req)
746 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
747 struct io_async_msghdr *kmsg;
748 int ret;
750 kmsg = io_msg_alloc_async(req);
751 if (unlikely(!kmsg))
752 return -ENOMEM;
754 if (req->opcode == IORING_OP_RECV) {
755 kmsg->msg.msg_name = NULL;
756 kmsg->msg.msg_namelen = 0;
757 kmsg->msg.msg_control = NULL;
758 kmsg->msg.msg_get_inq = 1;
759 kmsg->msg.msg_controllen = 0;
760 kmsg->msg.msg_iocb = NULL;
761 kmsg->msg.msg_ubuf = NULL;
763 if (!io_do_buffer_select(req)) {
764 ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
765 &kmsg->msg.msg_iter);
766 if (unlikely(ret))
767 return ret;
769 return 0;
772 ret = io_recvmsg_copy_hdr(req, kmsg);
773 if (!ret)
774 req->flags |= REQ_F_NEED_CLEANUP;
775 return ret;
778 #define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT | \
779 IORING_RECVSEND_BUNDLE)
781 int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
783 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
785 sr->done_io = 0;
787 if (unlikely(sqe->file_index || sqe->addr2))
788 return -EINVAL;
790 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
791 sr->len = READ_ONCE(sqe->len);
792 sr->flags = READ_ONCE(sqe->ioprio);
793 if (sr->flags & ~RECVMSG_FLAGS)
794 return -EINVAL;
795 sr->msg_flags = READ_ONCE(sqe->msg_flags);
796 if (sr->msg_flags & MSG_DONTWAIT)
797 req->flags |= REQ_F_NOWAIT;
798 if (sr->msg_flags & MSG_ERRQUEUE)
799 req->flags |= REQ_F_CLEAR_POLLIN;
800 if (req->flags & REQ_F_BUFFER_SELECT) {
802 * Store the buffer group for this multishot receive separately,
803 * as if we end up doing an io-wq based issue that selects a
804 * buffer, it has to be committed immediately and that will
805 * clear ->buf_list. This means we lose the link to the buffer
806 * list, and the eventual buffer put on completion then cannot
807 * restore it.
809 sr->buf_group = req->buf_index;
810 req->buf_list = NULL;
812 if (sr->flags & IORING_RECV_MULTISHOT) {
813 if (!(req->flags & REQ_F_BUFFER_SELECT))
814 return -EINVAL;
815 if (sr->msg_flags & MSG_WAITALL)
816 return -EINVAL;
817 if (req->opcode == IORING_OP_RECV && sr->len)
818 return -EINVAL;
819 req->flags |= REQ_F_APOLL_MULTISHOT;
821 if (sr->flags & IORING_RECVSEND_BUNDLE) {
822 if (req->opcode == IORING_OP_RECVMSG)
823 return -EINVAL;
826 #ifdef CONFIG_COMPAT
827 if (req->ctx->compat)
828 sr->msg_flags |= MSG_CMSG_COMPAT;
829 #endif
830 sr->nr_multishot_loops = 0;
831 return io_recvmsg_prep_setup(req);
835 * Finishes io_recv and io_recvmsg.
837 * Returns true if it is actually finished, or false if it should run
838 * again (for multishot).
840 static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
841 struct io_async_msghdr *kmsg,
842 bool mshot_finished, unsigned issue_flags)
844 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
845 unsigned int cflags = 0;
847 if (kmsg->msg.msg_inq > 0)
848 cflags |= IORING_CQE_F_SOCK_NONEMPTY;
850 if (sr->flags & IORING_RECVSEND_BUNDLE) {
851 cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret),
852 issue_flags);
853 /* bundle with no more immediate buffers, we're done */
854 if (req->flags & REQ_F_BL_EMPTY)
855 goto finish;
856 } else {
857 cflags |= io_put_kbuf(req, *ret, issue_flags);
861 * Fill CQE for this receive and see if we should keep trying to
862 * receive from this socket.
864 if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished &&
865 io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) {
866 int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE;
868 io_mshot_prep_retry(req, kmsg);
869 /* Known not-empty or unknown state, retry */
870 if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) {
871 if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY)
872 return false;
873 /* mshot retries exceeded, force a requeue */
874 sr->nr_multishot_loops = 0;
875 mshot_retry_ret = IOU_REQUEUE;
877 if (issue_flags & IO_URING_F_MULTISHOT)
878 *ret = mshot_retry_ret;
879 else
880 *ret = -EAGAIN;
881 return true;
884 /* Finish the request / stop multishot. */
885 finish:
886 io_req_set_res(req, *ret, cflags);
888 if (issue_flags & IO_URING_F_MULTISHOT)
889 *ret = IOU_STOP_MULTISHOT;
890 else
891 *ret = IOU_OK;
892 io_req_msg_cleanup(req, issue_flags);
893 return true;
896 static int io_recvmsg_prep_multishot(struct io_async_msghdr *kmsg,
897 struct io_sr_msg *sr, void __user **buf,
898 size_t *len)
900 unsigned long ubuf = (unsigned long) *buf;
901 unsigned long hdr;
903 hdr = sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
904 kmsg->controllen;
905 if (*len < hdr)
906 return -EFAULT;
908 if (kmsg->controllen) {
909 unsigned long control = ubuf + hdr - kmsg->controllen;
911 kmsg->msg.msg_control_user = (void __user *) control;
912 kmsg->msg.msg_controllen = kmsg->controllen;
915 sr->buf = *buf; /* stash for later copy */
916 *buf = (void __user *) (ubuf + hdr);
917 kmsg->payloadlen = *len = *len - hdr;
918 return 0;
921 struct io_recvmsg_multishot_hdr {
922 struct io_uring_recvmsg_out msg;
923 struct sockaddr_storage addr;
926 static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io,
927 struct io_async_msghdr *kmsg,
928 unsigned int flags, bool *finished)
930 int err;
931 int copy_len;
932 struct io_recvmsg_multishot_hdr hdr;
934 if (kmsg->namelen)
935 kmsg->msg.msg_name = &hdr.addr;
936 kmsg->msg.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
937 kmsg->msg.msg_namelen = 0;
939 if (sock->file->f_flags & O_NONBLOCK)
940 flags |= MSG_DONTWAIT;
942 err = sock_recvmsg(sock, &kmsg->msg, flags);
943 *finished = err <= 0;
944 if (err < 0)
945 return err;
947 hdr.msg = (struct io_uring_recvmsg_out) {
948 .controllen = kmsg->controllen - kmsg->msg.msg_controllen,
949 .flags = kmsg->msg.msg_flags & ~MSG_CMSG_COMPAT
952 hdr.msg.payloadlen = err;
953 if (err > kmsg->payloadlen)
954 err = kmsg->payloadlen;
956 copy_len = sizeof(struct io_uring_recvmsg_out);
957 if (kmsg->msg.msg_namelen > kmsg->namelen)
958 copy_len += kmsg->namelen;
959 else
960 copy_len += kmsg->msg.msg_namelen;
963 * "fromlen shall refer to the value before truncation.."
964 * 1003.1g
966 hdr.msg.namelen = kmsg->msg.msg_namelen;
968 /* ensure that there is no gap between hdr and sockaddr_storage */
969 BUILD_BUG_ON(offsetof(struct io_recvmsg_multishot_hdr, addr) !=
970 sizeof(struct io_uring_recvmsg_out));
971 if (copy_to_user(io->buf, &hdr, copy_len)) {
972 *finished = true;
973 return -EFAULT;
976 return sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
977 kmsg->controllen + err;
980 int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
982 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
983 struct io_async_msghdr *kmsg = req->async_data;
984 struct socket *sock;
985 unsigned flags;
986 int ret, min_ret = 0;
987 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
988 bool mshot_finished = true;
990 sock = sock_from_file(req->file);
991 if (unlikely(!sock))
992 return -ENOTSOCK;
994 if (!(req->flags & REQ_F_POLLED) &&
995 (sr->flags & IORING_RECVSEND_POLL_FIRST))
996 return -EAGAIN;
998 flags = sr->msg_flags;
999 if (force_nonblock)
1000 flags |= MSG_DONTWAIT;
1002 retry_multishot:
1003 if (io_do_buffer_select(req)) {
1004 void __user *buf;
1005 size_t len = sr->len;
1007 buf = io_buffer_select(req, &len, issue_flags);
1008 if (!buf)
1009 return -ENOBUFS;
1011 if (req->flags & REQ_F_APOLL_MULTISHOT) {
1012 ret = io_recvmsg_prep_multishot(kmsg, sr, &buf, &len);
1013 if (ret) {
1014 io_kbuf_recycle(req, issue_flags);
1015 return ret;
1019 iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, buf, len);
1022 kmsg->msg.msg_get_inq = 1;
1023 kmsg->msg.msg_inq = -1;
1024 if (req->flags & REQ_F_APOLL_MULTISHOT) {
1025 ret = io_recvmsg_multishot(sock, sr, kmsg, flags,
1026 &mshot_finished);
1027 } else {
1028 /* disable partial retry for recvmsg with cmsg attached */
1029 if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen)
1030 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1032 ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg,
1033 kmsg->uaddr, flags);
1036 if (ret < min_ret) {
1037 if (ret == -EAGAIN && force_nonblock) {
1038 if (issue_flags & IO_URING_F_MULTISHOT) {
1039 io_kbuf_recycle(req, issue_flags);
1040 return IOU_ISSUE_SKIP_COMPLETE;
1042 return -EAGAIN;
1044 if (ret > 0 && io_net_retry(sock, flags)) {
1045 sr->done_io += ret;
1046 req->flags |= REQ_F_BL_NO_RECYCLE;
1047 return -EAGAIN;
1049 if (ret == -ERESTARTSYS)
1050 ret = -EINTR;
1051 req_set_fail(req);
1052 } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
1053 req_set_fail(req);
1056 if (ret > 0)
1057 ret += sr->done_io;
1058 else if (sr->done_io)
1059 ret = sr->done_io;
1060 else
1061 io_kbuf_recycle(req, issue_flags);
1063 if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags))
1064 goto retry_multishot;
1066 return ret;
1069 static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg,
1070 size_t *len, unsigned int issue_flags)
1072 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1073 int ret;
1076 * If the ring isn't locked, then don't use the peek interface
1077 * to grab multiple buffers as we will lock/unlock between
1078 * this selection and posting the buffers.
1080 if (!(issue_flags & IO_URING_F_UNLOCKED) &&
1081 sr->flags & IORING_RECVSEND_BUNDLE) {
1082 struct buf_sel_arg arg = {
1083 .iovs = &kmsg->fast_iov,
1084 .nr_iovs = 1,
1085 .mode = KBUF_MODE_EXPAND,
1088 if (kmsg->free_iov) {
1089 arg.nr_iovs = kmsg->free_iov_nr;
1090 arg.iovs = kmsg->free_iov;
1091 arg.mode |= KBUF_MODE_FREE;
1094 if (kmsg->msg.msg_inq > 0)
1095 arg.max_len = min_not_zero(sr->len, kmsg->msg.msg_inq);
1097 ret = io_buffers_peek(req, &arg);
1098 if (unlikely(ret < 0))
1099 return ret;
1101 /* special case 1 vec, can be a fast path */
1102 if (ret == 1) {
1103 sr->buf = arg.iovs[0].iov_base;
1104 sr->len = arg.iovs[0].iov_len;
1105 goto map_ubuf;
1107 iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
1108 arg.out_len);
1109 if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
1110 kmsg->free_iov_nr = ret;
1111 kmsg->free_iov = arg.iovs;
1112 req->flags |= REQ_F_NEED_CLEANUP;
1114 } else {
1115 void __user *buf;
1117 *len = sr->len;
1118 buf = io_buffer_select(req, len, issue_flags);
1119 if (!buf)
1120 return -ENOBUFS;
1121 sr->buf = buf;
1122 sr->len = *len;
1123 map_ubuf:
1124 ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
1125 &kmsg->msg.msg_iter);
1126 if (unlikely(ret))
1127 return ret;
1130 return 0;
1133 int io_recv(struct io_kiocb *req, unsigned int issue_flags)
1135 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1136 struct io_async_msghdr *kmsg = req->async_data;
1137 struct socket *sock;
1138 unsigned flags;
1139 int ret, min_ret = 0;
1140 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1141 size_t len = sr->len;
1142 bool mshot_finished;
1144 if (!(req->flags & REQ_F_POLLED) &&
1145 (sr->flags & IORING_RECVSEND_POLL_FIRST))
1146 return -EAGAIN;
1148 sock = sock_from_file(req->file);
1149 if (unlikely(!sock))
1150 return -ENOTSOCK;
1152 flags = sr->msg_flags;
1153 if (force_nonblock)
1154 flags |= MSG_DONTWAIT;
1156 retry_multishot:
1157 if (io_do_buffer_select(req)) {
1158 ret = io_recv_buf_select(req, kmsg, &len, issue_flags);
1159 if (unlikely(ret)) {
1160 kmsg->msg.msg_inq = -1;
1161 goto out_free;
1163 sr->buf = NULL;
1166 kmsg->msg.msg_flags = 0;
1167 kmsg->msg.msg_inq = -1;
1169 if (flags & MSG_WAITALL)
1170 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1172 ret = sock_recvmsg(sock, &kmsg->msg, flags);
1173 if (ret < min_ret) {
1174 if (ret == -EAGAIN && force_nonblock) {
1175 if (issue_flags & IO_URING_F_MULTISHOT) {
1176 io_kbuf_recycle(req, issue_flags);
1177 return IOU_ISSUE_SKIP_COMPLETE;
1180 return -EAGAIN;
1182 if (ret > 0 && io_net_retry(sock, flags)) {
1183 sr->len -= ret;
1184 sr->buf += ret;
1185 sr->done_io += ret;
1186 req->flags |= REQ_F_BL_NO_RECYCLE;
1187 return -EAGAIN;
1189 if (ret == -ERESTARTSYS)
1190 ret = -EINTR;
1191 req_set_fail(req);
1192 } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
1193 out_free:
1194 req_set_fail(req);
1197 mshot_finished = ret <= 0;
1198 if (ret > 0)
1199 ret += sr->done_io;
1200 else if (sr->done_io)
1201 ret = sr->done_io;
1202 else
1203 io_kbuf_recycle(req, issue_flags);
1205 if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags))
1206 goto retry_multishot;
1208 return ret;
1211 void io_send_zc_cleanup(struct io_kiocb *req)
1213 struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
1214 struct io_async_msghdr *io = req->async_data;
1216 if (req_has_async_data(req))
1217 io_netmsg_iovec_free(io);
1218 if (zc->notif) {
1219 io_notif_flush(zc->notif);
1220 zc->notif = NULL;
1224 #define IO_ZC_FLAGS_COMMON (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF)
1225 #define IO_ZC_FLAGS_VALID (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE)
1227 int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1229 struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
1230 struct io_ring_ctx *ctx = req->ctx;
1231 struct io_kiocb *notif;
1233 zc->done_io = 0;
1234 req->flags |= REQ_F_POLL_NO_LAZY;
1236 if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
1237 return -EINVAL;
1238 /* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */
1239 if (req->flags & REQ_F_CQE_SKIP)
1240 return -EINVAL;
1242 notif = zc->notif = io_alloc_notif(ctx);
1243 if (!notif)
1244 return -ENOMEM;
1245 notif->cqe.user_data = req->cqe.user_data;
1246 notif->cqe.res = 0;
1247 notif->cqe.flags = IORING_CQE_F_NOTIF;
1248 req->flags |= REQ_F_NEED_CLEANUP;
1250 zc->flags = READ_ONCE(sqe->ioprio);
1251 if (unlikely(zc->flags & ~IO_ZC_FLAGS_COMMON)) {
1252 if (zc->flags & ~IO_ZC_FLAGS_VALID)
1253 return -EINVAL;
1254 if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) {
1255 struct io_notif_data *nd = io_notif_to_data(notif);
1257 nd->zc_report = true;
1258 nd->zc_used = false;
1259 nd->zc_copied = false;
1263 if (req->opcode != IORING_OP_SEND_ZC) {
1264 if (unlikely(sqe->addr2 || sqe->file_index))
1265 return -EINVAL;
1266 if (unlikely(zc->flags & IORING_RECVSEND_FIXED_BUF))
1267 return -EINVAL;
1270 zc->len = READ_ONCE(sqe->len);
1271 zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY;
1272 zc->buf_index = READ_ONCE(sqe->buf_index);
1273 if (zc->msg_flags & MSG_DONTWAIT)
1274 req->flags |= REQ_F_NOWAIT;
1276 #ifdef CONFIG_COMPAT
1277 if (req->ctx->compat)
1278 zc->msg_flags |= MSG_CMSG_COMPAT;
1279 #endif
1280 if (unlikely(!io_msg_alloc_async(req)))
1281 return -ENOMEM;
1282 if (req->opcode != IORING_OP_SENDMSG_ZC)
1283 return io_send_setup(req, sqe);
1284 return io_sendmsg_setup(req, sqe);
1287 static int io_sg_from_iter_iovec(struct sk_buff *skb,
1288 struct iov_iter *from, size_t length)
1290 skb_zcopy_downgrade_managed(skb);
1291 return zerocopy_fill_skb_from_iter(skb, from, length);
1294 static int io_sg_from_iter(struct sk_buff *skb,
1295 struct iov_iter *from, size_t length)
1297 struct skb_shared_info *shinfo = skb_shinfo(skb);
1298 int frag = shinfo->nr_frags;
1299 int ret = 0;
1300 struct bvec_iter bi;
1301 ssize_t copied = 0;
1302 unsigned long truesize = 0;
1304 if (!frag)
1305 shinfo->flags |= SKBFL_MANAGED_FRAG_REFS;
1306 else if (unlikely(!skb_zcopy_managed(skb)))
1307 return zerocopy_fill_skb_from_iter(skb, from, length);
1309 bi.bi_size = min(from->count, length);
1310 bi.bi_bvec_done = from->iov_offset;
1311 bi.bi_idx = 0;
1313 while (bi.bi_size && frag < MAX_SKB_FRAGS) {
1314 struct bio_vec v = mp_bvec_iter_bvec(from->bvec, bi);
1316 copied += v.bv_len;
1317 truesize += PAGE_ALIGN(v.bv_len + v.bv_offset);
1318 __skb_fill_page_desc_noacc(shinfo, frag++, v.bv_page,
1319 v.bv_offset, v.bv_len);
1320 bvec_iter_advance_single(from->bvec, &bi, v.bv_len);
1322 if (bi.bi_size)
1323 ret = -EMSGSIZE;
1325 shinfo->nr_frags = frag;
1326 from->bvec += bi.bi_idx;
1327 from->nr_segs -= bi.bi_idx;
1328 from->count -= copied;
1329 from->iov_offset = bi.bi_bvec_done;
1331 skb->data_len += copied;
1332 skb->len += copied;
1333 skb->truesize += truesize;
1334 return ret;
1337 static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
1339 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1340 struct io_async_msghdr *kmsg = req->async_data;
1341 int ret;
1343 if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
1344 struct io_ring_ctx *ctx = req->ctx;
1345 struct io_rsrc_node *node;
1347 ret = -EFAULT;
1348 io_ring_submit_lock(ctx, issue_flags);
1349 node = io_rsrc_node_lookup(&ctx->buf_table, sr->buf_index);
1350 if (node) {
1351 io_req_assign_buf_node(sr->notif, node);
1352 ret = 0;
1354 io_ring_submit_unlock(ctx, issue_flags);
1356 if (unlikely(ret))
1357 return ret;
1359 ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter,
1360 node->buf, (u64)(uintptr_t)sr->buf,
1361 sr->len);
1362 if (unlikely(ret))
1363 return ret;
1364 kmsg->msg.sg_from_iter = io_sg_from_iter;
1365 } else {
1366 ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter);
1367 if (unlikely(ret))
1368 return ret;
1369 ret = io_notif_account_mem(sr->notif, sr->len);
1370 if (unlikely(ret))
1371 return ret;
1372 kmsg->msg.sg_from_iter = io_sg_from_iter_iovec;
1375 return ret;
1378 int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
1380 struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
1381 struct io_async_msghdr *kmsg = req->async_data;
1382 struct socket *sock;
1383 unsigned msg_flags;
1384 int ret, min_ret = 0;
1386 sock = sock_from_file(req->file);
1387 if (unlikely(!sock))
1388 return -ENOTSOCK;
1389 if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
1390 return -EOPNOTSUPP;
1392 if (!(req->flags & REQ_F_POLLED) &&
1393 (zc->flags & IORING_RECVSEND_POLL_FIRST))
1394 return -EAGAIN;
1396 if (!zc->done_io) {
1397 ret = io_send_zc_import(req, issue_flags);
1398 if (unlikely(ret))
1399 return ret;
1402 msg_flags = zc->msg_flags;
1403 if (issue_flags & IO_URING_F_NONBLOCK)
1404 msg_flags |= MSG_DONTWAIT;
1405 if (msg_flags & MSG_WAITALL)
1406 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1407 msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
1409 kmsg->msg.msg_flags = msg_flags;
1410 kmsg->msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg;
1411 ret = sock_sendmsg(sock, &kmsg->msg);
1413 if (unlikely(ret < min_ret)) {
1414 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
1415 return -EAGAIN;
1417 if (ret > 0 && io_net_retry(sock, kmsg->msg.msg_flags)) {
1418 zc->len -= ret;
1419 zc->buf += ret;
1420 zc->done_io += ret;
1421 req->flags |= REQ_F_BL_NO_RECYCLE;
1422 return -EAGAIN;
1424 if (ret == -ERESTARTSYS)
1425 ret = -EINTR;
1426 req_set_fail(req);
1429 if (ret >= 0)
1430 ret += zc->done_io;
1431 else if (zc->done_io)
1432 ret = zc->done_io;
1435 * If we're in io-wq we can't rely on tw ordering guarantees, defer
1436 * flushing notif to io_send_zc_cleanup()
1438 if (!(issue_flags & IO_URING_F_UNLOCKED)) {
1439 io_notif_flush(zc->notif);
1440 io_req_msg_cleanup(req, 0);
1442 io_req_set_res(req, ret, IORING_CQE_F_MORE);
1443 return IOU_OK;
1446 int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
1448 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1449 struct io_async_msghdr *kmsg = req->async_data;
1450 struct socket *sock;
1451 unsigned flags;
1452 int ret, min_ret = 0;
1454 sock = sock_from_file(req->file);
1455 if (unlikely(!sock))
1456 return -ENOTSOCK;
1457 if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
1458 return -EOPNOTSUPP;
1460 if (!(req->flags & REQ_F_POLLED) &&
1461 (sr->flags & IORING_RECVSEND_POLL_FIRST))
1462 return -EAGAIN;
1464 flags = sr->msg_flags;
1465 if (issue_flags & IO_URING_F_NONBLOCK)
1466 flags |= MSG_DONTWAIT;
1467 if (flags & MSG_WAITALL)
1468 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1470 kmsg->msg.msg_control_user = sr->msg_control;
1471 kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
1472 kmsg->msg.sg_from_iter = io_sg_from_iter_iovec;
1473 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
1475 if (unlikely(ret < min_ret)) {
1476 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
1477 return -EAGAIN;
1479 if (ret > 0 && io_net_retry(sock, flags)) {
1480 sr->done_io += ret;
1481 req->flags |= REQ_F_BL_NO_RECYCLE;
1482 return -EAGAIN;
1484 if (ret == -ERESTARTSYS)
1485 ret = -EINTR;
1486 req_set_fail(req);
1489 if (ret >= 0)
1490 ret += sr->done_io;
1491 else if (sr->done_io)
1492 ret = sr->done_io;
1495 * If we're in io-wq we can't rely on tw ordering guarantees, defer
1496 * flushing notif to io_send_zc_cleanup()
1498 if (!(issue_flags & IO_URING_F_UNLOCKED)) {
1499 io_notif_flush(sr->notif);
1500 io_req_msg_cleanup(req, 0);
1502 io_req_set_res(req, ret, IORING_CQE_F_MORE);
1503 return IOU_OK;
1506 void io_sendrecv_fail(struct io_kiocb *req)
1508 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1510 if (sr->done_io)
1511 req->cqe.res = sr->done_io;
1513 if ((req->flags & REQ_F_NEED_CLEANUP) &&
1514 (req->opcode == IORING_OP_SEND_ZC || req->opcode == IORING_OP_SENDMSG_ZC))
1515 req->cqe.flags |= IORING_CQE_F_MORE;
1518 #define ACCEPT_FLAGS (IORING_ACCEPT_MULTISHOT | IORING_ACCEPT_DONTWAIT | \
1519 IORING_ACCEPT_POLL_FIRST)
1521 int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1523 struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
1525 if (sqe->len || sqe->buf_index)
1526 return -EINVAL;
1528 accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1529 accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
1530 accept->flags = READ_ONCE(sqe->accept_flags);
1531 accept->nofile = rlimit(RLIMIT_NOFILE);
1532 accept->iou_flags = READ_ONCE(sqe->ioprio);
1533 if (accept->iou_flags & ~ACCEPT_FLAGS)
1534 return -EINVAL;
1536 accept->file_slot = READ_ONCE(sqe->file_index);
1537 if (accept->file_slot) {
1538 if (accept->flags & SOCK_CLOEXEC)
1539 return -EINVAL;
1540 if (accept->iou_flags & IORING_ACCEPT_MULTISHOT &&
1541 accept->file_slot != IORING_FILE_INDEX_ALLOC)
1542 return -EINVAL;
1544 if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1545 return -EINVAL;
1546 if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
1547 accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
1548 if (accept->iou_flags & IORING_ACCEPT_MULTISHOT)
1549 req->flags |= REQ_F_APOLL_MULTISHOT;
1550 if (accept->iou_flags & IORING_ACCEPT_DONTWAIT)
1551 req->flags |= REQ_F_NOWAIT;
1552 return 0;
1555 int io_accept(struct io_kiocb *req, unsigned int issue_flags)
1557 struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
1558 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1559 bool fixed = !!accept->file_slot;
1560 struct proto_accept_arg arg = {
1561 .flags = force_nonblock ? O_NONBLOCK : 0,
1563 struct file *file;
1564 unsigned cflags;
1565 int ret, fd;
1567 if (!(req->flags & REQ_F_POLLED) &&
1568 accept->iou_flags & IORING_ACCEPT_POLL_FIRST)
1569 return -EAGAIN;
1571 retry:
1572 if (!fixed) {
1573 fd = __get_unused_fd_flags(accept->flags, accept->nofile);
1574 if (unlikely(fd < 0))
1575 return fd;
1577 arg.err = 0;
1578 arg.is_empty = -1;
1579 file = do_accept(req->file, &arg, accept->addr, accept->addr_len,
1580 accept->flags);
1581 if (IS_ERR(file)) {
1582 if (!fixed)
1583 put_unused_fd(fd);
1584 ret = PTR_ERR(file);
1585 if (ret == -EAGAIN && force_nonblock &&
1586 !(accept->iou_flags & IORING_ACCEPT_DONTWAIT)) {
1588 * if it's multishot and polled, we don't need to
1589 * return EAGAIN to arm the poll infra since it
1590 * has already been done
1592 if (issue_flags & IO_URING_F_MULTISHOT)
1593 return IOU_ISSUE_SKIP_COMPLETE;
1594 return ret;
1596 if (ret == -ERESTARTSYS)
1597 ret = -EINTR;
1598 req_set_fail(req);
1599 } else if (!fixed) {
1600 fd_install(fd, file);
1601 ret = fd;
1602 } else {
1603 ret = io_fixed_fd_install(req, issue_flags, file,
1604 accept->file_slot);
1607 cflags = 0;
1608 if (!arg.is_empty)
1609 cflags |= IORING_CQE_F_SOCK_NONEMPTY;
1611 if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
1612 io_req_set_res(req, ret, cflags);
1613 return IOU_OK;
1616 if (ret < 0)
1617 return ret;
1618 if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
1619 if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1)
1620 goto retry;
1621 if (issue_flags & IO_URING_F_MULTISHOT)
1622 return IOU_ISSUE_SKIP_COMPLETE;
1623 return -EAGAIN;
1626 io_req_set_res(req, ret, cflags);
1627 return IOU_STOP_MULTISHOT;
1630 int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1632 struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
1634 if (sqe->addr || sqe->rw_flags || sqe->buf_index)
1635 return -EINVAL;
1637 sock->domain = READ_ONCE(sqe->fd);
1638 sock->type = READ_ONCE(sqe->off);
1639 sock->protocol = READ_ONCE(sqe->len);
1640 sock->file_slot = READ_ONCE(sqe->file_index);
1641 sock->nofile = rlimit(RLIMIT_NOFILE);
1643 sock->flags = sock->type & ~SOCK_TYPE_MASK;
1644 if (sock->file_slot && (sock->flags & SOCK_CLOEXEC))
1645 return -EINVAL;
1646 if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1647 return -EINVAL;
1648 return 0;
1651 int io_socket(struct io_kiocb *req, unsigned int issue_flags)
1653 struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
1654 bool fixed = !!sock->file_slot;
1655 struct file *file;
1656 int ret, fd;
1658 if (!fixed) {
1659 fd = __get_unused_fd_flags(sock->flags, sock->nofile);
1660 if (unlikely(fd < 0))
1661 return fd;
1663 file = __sys_socket_file(sock->domain, sock->type, sock->protocol);
1664 if (IS_ERR(file)) {
1665 if (!fixed)
1666 put_unused_fd(fd);
1667 ret = PTR_ERR(file);
1668 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
1669 return -EAGAIN;
1670 if (ret == -ERESTARTSYS)
1671 ret = -EINTR;
1672 req_set_fail(req);
1673 } else if (!fixed) {
1674 fd_install(fd, file);
1675 ret = fd;
1676 } else {
1677 ret = io_fixed_fd_install(req, issue_flags, file,
1678 sock->file_slot);
1680 io_req_set_res(req, ret, 0);
1681 return IOU_OK;
1684 int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1686 struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);
1687 struct io_async_msghdr *io;
1689 if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
1690 return -EINVAL;
1692 conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1693 conn->addr_len = READ_ONCE(sqe->addr2);
1694 conn->in_progress = conn->seen_econnaborted = false;
1696 io = io_msg_alloc_async(req);
1697 if (unlikely(!io))
1698 return -ENOMEM;
1700 return move_addr_to_kernel(conn->addr, conn->addr_len, &io->addr);
1703 int io_connect(struct io_kiocb *req, unsigned int issue_flags)
1705 struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect);
1706 struct io_async_msghdr *io = req->async_data;
1707 unsigned file_flags;
1708 int ret;
1709 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1711 file_flags = force_nonblock ? O_NONBLOCK : 0;
1713 ret = __sys_connect_file(req->file, &io->addr, connect->addr_len,
1714 file_flags);
1715 if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED)
1716 && force_nonblock) {
1717 if (ret == -EINPROGRESS) {
1718 connect->in_progress = true;
1719 } else if (ret == -ECONNABORTED) {
1720 if (connect->seen_econnaborted)
1721 goto out;
1722 connect->seen_econnaborted = true;
1724 return -EAGAIN;
1726 if (connect->in_progress) {
1728 * At least bluetooth will return -EBADFD on a re-connect
1729 * attempt, and it's (supposedly) also valid to get -EISCONN
1730 * which means the previous result is good. For both of these,
1731 * grab the sock_error() and use that for the completion.
1733 if (ret == -EBADFD || ret == -EISCONN)
1734 ret = sock_error(sock_from_file(req->file)->sk);
1736 if (ret == -ERESTARTSYS)
1737 ret = -EINTR;
1738 out:
1739 if (ret < 0)
1740 req_set_fail(req);
1741 io_req_msg_cleanup(req, issue_flags);
1742 io_req_set_res(req, ret, 0);
1743 return IOU_OK;
1746 int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1748 struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
1749 struct sockaddr __user *uaddr;
1750 struct io_async_msghdr *io;
1752 if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
1753 return -EINVAL;
1755 uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1756 bind->addr_len = READ_ONCE(sqe->addr2);
1758 io = io_msg_alloc_async(req);
1759 if (unlikely(!io))
1760 return -ENOMEM;
1761 return move_addr_to_kernel(uaddr, bind->addr_len, &io->addr);
1764 int io_bind(struct io_kiocb *req, unsigned int issue_flags)
1766 struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
1767 struct io_async_msghdr *io = req->async_data;
1768 struct socket *sock;
1769 int ret;
1771 sock = sock_from_file(req->file);
1772 if (unlikely(!sock))
1773 return -ENOTSOCK;
1775 ret = __sys_bind_socket(sock, &io->addr, bind->addr_len);
1776 if (ret < 0)
1777 req_set_fail(req);
1778 io_req_set_res(req, ret, 0);
1779 return 0;
1782 int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1784 struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);
1786 if (sqe->addr || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in || sqe->addr2)
1787 return -EINVAL;
1789 listen->backlog = READ_ONCE(sqe->len);
1790 return 0;
1793 int io_listen(struct io_kiocb *req, unsigned int issue_flags)
1795 struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);
1796 struct socket *sock;
1797 int ret;
1799 sock = sock_from_file(req->file);
1800 if (unlikely(!sock))
1801 return -ENOTSOCK;
1803 ret = __sys_listen_socket(sock, listen->backlog);
1804 if (ret < 0)
1805 req_set_fail(req);
1806 io_req_set_res(req, ret, 0);
1807 return 0;
1810 void io_netmsg_cache_free(const void *entry)
1812 struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry;
1814 if (kmsg->free_iov) {
1815 kasan_mempool_unpoison_object(kmsg->free_iov,
1816 kmsg->free_iov_nr * sizeof(struct iovec));
1817 io_netmsg_iovec_free(kmsg);
1819 kfree(kmsg);
1821 #endif