fs/io_uring.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Shared application/kernel submission and completion ring pairs, for
   4  * supporting fast/efficient IO.
   5  *
   6  * A note on the read/write ordering memory barriers that are matched between
   7  * the application and kernel side.
   8  *
   9  * After the application reads the CQ ring tail, it must use an
  10  * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
  11  * before writing the tail (using smp_load_acquire to read the tail will
  12  * do). It also needs a smp_mb() before updating CQ head (ordering the
  13  * entry load(s) with the head store), pairing with an implicit barrier
  14  * through a control-dependency in io_get_cqring (smp_store_release to
  15  * store head will do). Failure to do so could lead to reading invalid
  16  * CQ entries.
  17  *
  18  * Likewise, the application must use an appropriate smp_wmb() before
  19  * writing the SQ tail (ordering SQ entry stores with the tail store),
  20  * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
  21  * to store the tail will do). And it needs a barrier ordering the SQ
  22  * head load before writing new SQ entries (smp_load_acquire to read
  23  * head will do).
  24  *
  25  * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
  26  * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
  27  * updating the SQ tail; a full memory barrier smp_mb() is needed
  28  * between.
  29  *
  30  * Also see the examples in the liburing library:
  31  *
  32  *      git://git.kernel.dk/liburing
  33  *
  34  * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
  35  * from data shared between the kernel and application. This is done both
  36  * for ordering purposes, but also to ensure that once a value is loaded from
  37  * data that the application could potentially modify, it remains stable.
  38  *
  39  * Copyright (C) 2018-2019 Jens Axboe
  40  * Copyright (c) 2018-2019 Christoph Hellwig
  41  */
  42 #include <linux/kernel.h>
  43 #include <linux/init.h>
  44 #include <linux/errno.h>
  45 #include <linux/syscalls.h>
  46 #include <linux/compat.h>
  47 #include <net/compat.h>
  48 #include <linux/refcount.h>
  49 #include <linux/uio.h>
  50 #include <linux/bits.h>
  51
  52 #include <linux/sched/signal.h>
  53 #include <linux/fs.h>
  54 #include <linux/file.h>
  55 #include <linux/fdtable.h>
  56 #include <linux/mm.h>
  57 #include <linux/mman.h>
  58 #include <linux/mmu_context.h>
  59 #include <linux/percpu.h>
  60 #include <linux/slab.h>
  61 #include <linux/kthread.h>
  62 #include <linux/blkdev.h>
  63 #include <linux/bvec.h>
  64 #include <linux/net.h>
  65 #include <net/sock.h>
  66 #include <net/af_unix.h>
  67 #include <net/scm.h>
  68 #include <linux/anon_inodes.h>
  69 #include <linux/sched/mm.h>
  70 #include <linux/uaccess.h>
  71 #include <linux/nospec.h>
  72 #include <linux/sizes.h>
  73 #include <linux/hugetlb.h>
  74 #include <linux/highmem.h>
  75 #include <linux/namei.h>
  76 #include <linux/fsnotify.h>
  77 #include <linux/fadvise.h>
  78 #include <linux/eventpoll.h>
  79 #include <linux/fs_struct.h>
  80 #include <linux/splice.h>
  81 #include <linux/task_work.h>
  82
  83 #define CREATE_TRACE_POINTS
  84 #include <trace/events/io_uring.h>
  85
  86 #include <uapi/linux/io_uring.h>
  87
  88 #include "internal.h"
  89 #include "io-wq.h"
  90
  91 #define IORING_MAX_ENTRIES      32768
  92 #define IORING_MAX_CQ_ENTRIES   (2 * IORING_MAX_ENTRIES)
  93
  94 /*
  95  * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
  96  */
  97 #define IORING_FILE_TABLE_SHIFT 9
  98 #define IORING_MAX_FILES_TABLE  (1U << IORING_FILE_TABLE_SHIFT)
  99 #define IORING_FILE_TABLE_MASK  (IORING_MAX_FILES_TABLE - 1)
 100 #define IORING_MAX_FIXED_FILES  (64 * IORING_MAX_FILES_TABLE)
 101
 102 struct io_uring {
 103         u32 head ____cacheline_aligned_in_smp;
 104         u32 tail ____cacheline_aligned_in_smp;
 105 };
 106
 107 /*
 108  * This data is shared with the application through the mmap at offsets
 109  * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 110  *
 111  * The offsets to the member fields are published through struct
 112  * io_sqring_offsets when calling io_uring_setup.
 113  */
 114 struct io_rings {
 115         /*
 116          * Head and tail offsets into the ring; the offsets need to be
 117          * masked to get valid indices.
 118          *
 119          * The kernel controls head of the sq ring and the tail of the cq ring,
 120          * and the application controls tail of the sq ring and the head of the
 121          * cq ring.
 122          */
 123         struct io_uring         sq, cq;
 124         /*
 125          * Bitmasks to apply to head and tail offsets (constant, equals
 126          * ring_entries - 1)
 127          */
 128         u32                     sq_ring_mask, cq_ring_mask;
 129         /* Ring sizes (constant, power of 2) */
 130         u32                     sq_ring_entries, cq_ring_entries;
 131         /*
 132          * Number of invalid entries dropped by the kernel due to
 133          * invalid index stored in array
 134          *
 135          * Written by the kernel, shouldn't be modified by the
 136          * application (i.e. get number of "new events" by comparing to
 137          * cached value).
 138          *
 139          * After a new SQ head value was read by the application this
 140          * counter includes all submissions that were dropped reaching
 141          * the new SQ head (and possibly more).
 142          */
 143         u32                     sq_dropped;
 144         /*
 145          * Runtime flags
 146          *
 147          * Written by the kernel, shouldn't be modified by the
 148          * application.
 149          *
 150          * The application needs a full memory barrier before checking
 151          * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
 152          */
 153         u32                     sq_flags;
 154         /*
 155          * Number of completion events lost because the queue was full;
 156          * this should be avoided by the application by making sure
 157          * there are not more requests pending than there is space in
 158          * the completion queue.
 159          *
 160          * Written by the kernel, shouldn't be modified by the
 161          * application (i.e. get number of "new events" by comparing to
 162          * cached value).
 163          *
 164          * As completion events come in out of order this counter is not
 165          * ordered with any other data.
 166          */
 167         u32                     cq_overflow;
 168         /*
 169          * Ring buffer of completion events.
 170          *
 171          * The kernel writes completion events fresh every time they are
 172          * produced, so the application is allowed to modify pending
 173          * entries.
 174          */
 175         struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp;
 176 };
 177
 178 struct io_mapped_ubuf {
 179         u64             ubuf;
 180         size_t          len;
 181         struct          bio_vec *bvec;
 182         unsigned int    nr_bvecs;
 183 };
 184
 185 struct fixed_file_table {
 186         struct file             **files;
 187 };
 188
 189 struct fixed_file_ref_node {
 190         struct percpu_ref               refs;
 191         struct list_head                node;
 192         struct list_head                file_list;
 193         struct fixed_file_data          *file_data;
 194         struct work_struct              work;
 195 };
 196
 197 struct fixed_file_data {
 198         struct fixed_file_table         *table;
 199         struct io_ring_ctx              *ctx;
 200
 201         struct percpu_ref               *cur_refs;
 202         struct percpu_ref               refs;
 203         struct completion               done;
 204         struct list_head                ref_list;
 205         spinlock_t                      lock;
 206 };
 207
 208 struct io_buffer {
 209         struct list_head list;
 210         __u64 addr;
 211         __s32 len;
 212         __u16 bid;
 213 };
 214
 215 struct io_ring_ctx {
 216         struct {
 217                 struct percpu_ref       refs;
 218         } ____cacheline_aligned_in_smp;
 219
 220         struct {
 221                 unsigned int            flags;
 222                 unsigned int            compat: 1;
 223                 unsigned int            account_mem: 1;
 224                 unsigned int            cq_overflow_flushed: 1;
 225                 unsigned int            drain_next: 1;
 226                 unsigned int            eventfd_async: 1;
 227
 228                 /*
 229                  * Ring buffer of indices into array of io_uring_sqe, which is
 230                  * mmapped by the application using the IORING_OFF_SQES offset.
 231                  *
 232                  * This indirection could e.g. be used to assign fixed
 233                  * io_uring_sqe entries to operations and only submit them to
 234                  * the queue when needed.
 235                  *
 236                  * The kernel modifies neither the indices array nor the entries
 237                  * array.
 238                  */
 239                 u32                     *sq_array;
 240                 unsigned                cached_sq_head;
 241                 unsigned                sq_entries;
 242                 unsigned                sq_mask;
 243                 unsigned                sq_thread_idle;
 244                 unsigned                cached_sq_dropped;
 245                 atomic_t                cached_cq_overflow;
 246                 unsigned long           sq_check_overflow;
 247
 248                 struct list_head        defer_list;
 249                 struct list_head        timeout_list;
 250                 struct list_head        cq_overflow_list;
 251
 252                 wait_queue_head_t       inflight_wait;
 253                 struct io_uring_sqe     *sq_sqes;
 254         } ____cacheline_aligned_in_smp;
 255
 256         struct io_rings *rings;
 257
 258         /* IO offload */
 259         struct io_wq            *io_wq;
 260         struct task_struct      *sqo_thread;    /* if using sq thread polling */
 261         struct mm_struct        *sqo_mm;
 262         wait_queue_head_t       sqo_wait;
 263
 264         /*
 265          * If used, fixed file set. Writers must ensure that ->refs is dead,
 266          * readers must ensure that ->refs is alive as long as the file* is
 267          * used. Only updated through io_uring_register(2).
 268          */
 269         struct fixed_file_data  *file_data;
 270         unsigned                nr_user_files;
 271         int                     ring_fd;
 272         struct file             *ring_file;
 273
 274         /* if used, fixed mapped user buffers */
 275         unsigned                nr_user_bufs;
 276         struct io_mapped_ubuf   *user_bufs;
 277
 278         struct user_struct      *user;
 279
 280         const struct cred       *creds;
 281
 282         /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */
 283         struct completion       *completions;
 284
 285         /* if all else fails... */
 286         struct io_kiocb         *fallback_req;
 287
 288 #if defined(CONFIG_UNIX)
 289         struct socket           *ring_sock;
 290 #endif
 291
 292         struct idr              io_buffer_idr;
 293
 294         struct idr              personality_idr;
 295
 296         struct {
 297                 unsigned                cached_cq_tail;
 298                 unsigned                cq_entries;
 299                 unsigned                cq_mask;
 300                 atomic_t                cq_timeouts;
 301                 unsigned long           cq_check_overflow;
 302                 struct wait_queue_head  cq_wait;
 303                 struct fasync_struct    *cq_fasync;
 304                 struct eventfd_ctx      *cq_ev_fd;
 305         } ____cacheline_aligned_in_smp;
 306
 307         struct {
 308                 struct mutex            uring_lock;
 309                 wait_queue_head_t       wait;
 310         } ____cacheline_aligned_in_smp;
 311
 312         struct {
 313                 spinlock_t              completion_lock;
 314
 315                 /*
 316                  * ->poll_list is protected by the ctx->uring_lock for
 317                  * io_uring instances that don't use IORING_SETUP_SQPOLL.
 318                  * For SQPOLL, only the single threaded io_sq_thread() will
 319                  * manipulate the list, hence no extra locking is needed there.
 320                  */
 321                 struct list_head        poll_list;
 322                 struct hlist_head       *cancel_hash;
 323                 unsigned                cancel_hash_bits;
 324                 bool                    poll_multi_file;
 325
 326                 spinlock_t              inflight_lock;
 327                 struct list_head        inflight_list;
 328         } ____cacheline_aligned_in_smp;
 329
 330         struct work_struct              exit_work;
 331 };
 332
 333 /*
 334  * First field must be the file pointer in all the
 335  * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 336  */
 337 struct io_poll_iocb {
 338         struct file                     *file;
 339         union {
 340                 struct wait_queue_head  *head;
 341                 u64                     addr;
 342         };
 343         __poll_t                        events;
 344         bool                            done;
 345         bool                            canceled;
 346         struct wait_queue_entry         wait;
 347 };
 348
 349 struct io_close {
 350         struct file                     *file;
 351         struct file                     *put_file;
 352         int                             fd;
 353 };
 354
 355 struct io_timeout_data {
 356         struct io_kiocb                 *req;
 357         struct hrtimer                  timer;
 358         struct timespec64               ts;
 359         enum hrtimer_mode               mode;
 360 };
 361
 362 struct io_accept {
 363         struct file                     *file;
 364         struct sockaddr __user          *addr;
 365         int __user                      *addr_len;
 366         int                             flags;
 367         unsigned long                   nofile;
 368 };
 369
 370 struct io_sync {
 371         struct file                     *file;
 372         loff_t                          len;
 373         loff_t                          off;
 374         int                             flags;
 375         int                             mode;
 376 };
 377
 378 struct io_cancel {
 379         struct file                     *file;
 380         u64                             addr;
 381 };
 382
 383 struct io_timeout {
 384         struct file                     *file;
 385         u64                             addr;
 386         int                             flags;
 387         u32                             count;
 388 };
 389
 390 struct io_rw {
 391         /* NOTE: kiocb has the file as the first member, so don't do it here */
 392         struct kiocb                    kiocb;
 393         u64                             addr;
 394         u64                             len;
 395 };
 396
 397 struct io_connect {
 398         struct file                     *file;
 399         struct sockaddr __user          *addr;
 400         int                             addr_len;
 401 };
 402
 403 struct io_sr_msg {
 404         struct file                     *file;
 405         union {
 406                 struct user_msghdr __user *msg;
 407                 void __user             *buf;
 408         };
 409         int                             msg_flags;
 410         int                             bgid;
 411         size_t                          len;
 412         struct io_buffer                *kbuf;
 413 };
 414
 415 struct io_open {
 416         struct file                     *file;
 417         int                             dfd;
 418         union {
 419                 unsigned                mask;
 420         };
 421         struct filename                 *filename;
 422         struct statx __user             *buffer;
 423         struct open_how                 how;
 424         unsigned long                   nofile;
 425 };
 426
 427 struct io_files_update {
 428         struct file                     *file;
 429         u64                             arg;
 430         u32                             nr_args;
 431         u32                             offset;
 432 };
 433
 434 struct io_fadvise {
 435         struct file                     *file;
 436         u64                             offset;
 437         u32                             len;
 438         u32                             advice;
 439 };
 440
 441 struct io_madvise {
 442         struct file                     *file;
 443         u64                             addr;
 444         u32                             len;
 445         u32                             advice;
 446 };
 447
 448 struct io_epoll {
 449         struct file                     *file;
 450         int                             epfd;
 451         int                             op;
 452         int                             fd;
 453         struct epoll_event              event;
 454 };
 455
 456 struct io_splice {
 457         struct file                     *file_out;
 458         struct file                     *file_in;
 459         loff_t                          off_out;
 460         loff_t                          off_in;
 461         u64                             len;
 462         unsigned int                    flags;
 463 };
 464
 465 struct io_provide_buf {
 466         struct file                     *file;
 467         __u64                           addr;
 468         __s32                           len;
 469         __u32                           bgid;
 470         __u16                           nbufs;
 471         __u16                           bid;
 472 };
 473
 474 struct io_async_connect {
 475         struct sockaddr_storage         address;
 476 };
 477
 478 struct io_async_msghdr {
 479         struct iovec                    fast_iov[UIO_FASTIOV];
 480         struct iovec                    *iov;
 481         struct sockaddr __user          *uaddr;
 482         struct msghdr                   msg;
 483         struct sockaddr_storage         addr;
 484 };
 485
 486 struct io_async_rw {
 487         struct iovec                    fast_iov[UIO_FASTIOV];
 488         struct iovec                    *iov;
 489         ssize_t                         nr_segs;
 490         ssize_t                         size;
 491 };
 492
 493 struct io_async_ctx {
 494         union {
 495                 struct io_async_rw      rw;
 496                 struct io_async_msghdr  msg;
 497                 struct io_async_connect connect;
 498                 struct io_timeout_data  timeout;
 499         };
 500 };
 501
 502 enum {
 503         REQ_F_FIXED_FILE_BIT    = IOSQE_FIXED_FILE_BIT,
 504         REQ_F_IO_DRAIN_BIT      = IOSQE_IO_DRAIN_BIT,
 505         REQ_F_LINK_BIT          = IOSQE_IO_LINK_BIT,
 506         REQ_F_HARDLINK_BIT      = IOSQE_IO_HARDLINK_BIT,
 507         REQ_F_FORCE_ASYNC_BIT   = IOSQE_ASYNC_BIT,
 508         REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
 509
 510         REQ_F_LINK_HEAD_BIT,
 511         REQ_F_LINK_NEXT_BIT,
 512         REQ_F_FAIL_LINK_BIT,
 513         REQ_F_INFLIGHT_BIT,
 514         REQ_F_CUR_POS_BIT,
 515         REQ_F_NOWAIT_BIT,
 516         REQ_F_LINK_TIMEOUT_BIT,
 517         REQ_F_TIMEOUT_BIT,
 518         REQ_F_ISREG_BIT,
 519         REQ_F_MUST_PUNT_BIT,
 520         REQ_F_TIMEOUT_NOSEQ_BIT,
 521         REQ_F_COMP_LOCKED_BIT,
 522         REQ_F_NEED_CLEANUP_BIT,
 523         REQ_F_OVERFLOW_BIT,
 524         REQ_F_POLLED_BIT,
 525         REQ_F_BUFFER_SELECTED_BIT,
 526         REQ_F_NO_FILE_TABLE_BIT,
 527
 528         /* not a real bit, just to check we're not overflowing the space */
 529         __REQ_F_LAST_BIT,
 530 };
 531
 532 enum {
 533         /* ctx owns file */
 534         REQ_F_FIXED_FILE        = BIT(REQ_F_FIXED_FILE_BIT),
 535         /* drain existing IO first */
 536         REQ_F_IO_DRAIN          = BIT(REQ_F_IO_DRAIN_BIT),
 537         /* linked sqes */
 538         REQ_F_LINK              = BIT(REQ_F_LINK_BIT),
 539         /* doesn't sever on completion < 0 */
 540         REQ_F_HARDLINK          = BIT(REQ_F_HARDLINK_BIT),
 541         /* IOSQE_ASYNC */
 542         REQ_F_FORCE_ASYNC       = BIT(REQ_F_FORCE_ASYNC_BIT),
 543         /* IOSQE_BUFFER_SELECT */
 544         REQ_F_BUFFER_SELECT     = BIT(REQ_F_BUFFER_SELECT_BIT),
 545
 546         /* head of a link */
 547         REQ_F_LINK_HEAD         = BIT(REQ_F_LINK_HEAD_BIT),
 548         /* already grabbed next link */
 549         REQ_F_LINK_NEXT         = BIT(REQ_F_LINK_NEXT_BIT),
 550         /* fail rest of links */
 551         REQ_F_FAIL_LINK         = BIT(REQ_F_FAIL_LINK_BIT),
 552         /* on inflight list */
 553         REQ_F_INFLIGHT          = BIT(REQ_F_INFLIGHT_BIT),
 554         /* read/write uses file position */
 555         REQ_F_CUR_POS           = BIT(REQ_F_CUR_POS_BIT),
 556         /* must not punt to workers */
 557         REQ_F_NOWAIT            = BIT(REQ_F_NOWAIT_BIT),
 558         /* has linked timeout */
 559         REQ_F_LINK_TIMEOUT      = BIT(REQ_F_LINK_TIMEOUT_BIT),
 560         /* timeout request */
 561         REQ_F_TIMEOUT           = BIT(REQ_F_TIMEOUT_BIT),
 562         /* regular file */
 563         REQ_F_ISREG             = BIT(REQ_F_ISREG_BIT),
 564         /* must be punted even for NONBLOCK */
 565         REQ_F_MUST_PUNT         = BIT(REQ_F_MUST_PUNT_BIT),
 566         /* no timeout sequence */
 567         REQ_F_TIMEOUT_NOSEQ     = BIT(REQ_F_TIMEOUT_NOSEQ_BIT),
 568         /* completion under lock */
 569         REQ_F_COMP_LOCKED       = BIT(REQ_F_COMP_LOCKED_BIT),
 570         /* needs cleanup */
 571         REQ_F_NEED_CLEANUP      = BIT(REQ_F_NEED_CLEANUP_BIT),
 572         /* in overflow list */
 573         REQ_F_OVERFLOW          = BIT(REQ_F_OVERFLOW_BIT),
 574         /* already went through poll handler */
 575         REQ_F_POLLED            = BIT(REQ_F_POLLED_BIT),
 576         /* buffer already selected */
 577         REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
 578         /* doesn't need file table for this request */
 579         REQ_F_NO_FILE_TABLE     = BIT(REQ_F_NO_FILE_TABLE_BIT),
 580 };
 581
 582 struct async_poll {
 583         struct io_poll_iocb     poll;
 584         struct io_wq_work       work;
 585 };
 586
 587 /*
 588  * NOTE! Each of the iocb union members has the file pointer
 589  * as the first entry in their struct definition. So you can
 590  * access the file pointer through any of the sub-structs,
 591  * or directly as just 'ki_filp' in this struct.
 592  */
 593 struct io_kiocb {
 594         union {
 595                 struct file             *file;
 596                 struct io_rw            rw;
 597                 struct io_poll_iocb     poll;
 598                 struct io_accept        accept;
 599                 struct io_sync          sync;
 600                 struct io_cancel        cancel;
 601                 struct io_timeout       timeout;
 602                 struct io_connect       connect;
 603                 struct io_sr_msg        sr_msg;
 604                 struct io_open          open;
 605                 struct io_close         close;
 606                 struct io_files_update  files_update;
 607                 struct io_fadvise       fadvise;
 608                 struct io_madvise       madvise;
 609                 struct io_epoll         epoll;
 610                 struct io_splice        splice;
 611                 struct io_provide_buf   pbuf;
 612         };
 613
 614         struct io_async_ctx             *io;
 615         int                             cflags;
 616         bool                            needs_fixed_file;
 617         u8                              opcode;
 618         /* polled IO has completed */
 619         u8                              iopoll_completed;
 620
 621         u16                             buf_index;
 622
 623         struct io_ring_ctx      *ctx;
 624         struct list_head        list;
 625         unsigned int            flags;
 626         refcount_t              refs;
 627         struct task_struct      *task;
 628         unsigned long           fsize;
 629         u64                     user_data;
 630         u32                     result;
 631         u32                     sequence;
 632
 633         struct list_head        link_list;
 634
 635         struct list_head        inflight_entry;
 636
 637         struct percpu_ref       *fixed_file_refs;
 638
 639         union {
 640                 /*
 641                  * Only commands that never go async can use the below fields,
 642                  * obviously. Right now only IORING_OP_POLL_ADD uses them, and
 643                  * async armed poll handlers for regular commands. The latter
 644                  * restore the work, if needed.
 645                  */
 646                 struct {
 647                         struct callback_head    task_work;
 648                         struct hlist_node       hash_node;
 649                         struct async_poll       *apoll;
 650                 };
 651                 struct io_wq_work       work;
 652         };
 653 };
 654
 655 #define IO_PLUG_THRESHOLD               2
 656 #define IO_IOPOLL_BATCH                 8
 657
 658 struct io_submit_state {
 659         struct blk_plug         plug;
 660
 661         /*
 662          * io_kiocb alloc cache
 663          */
 664         void                    *reqs[IO_IOPOLL_BATCH];
 665         unsigned int            free_reqs;
 666
 667         /*
 668          * File reference cache
 669          */
 670         struct file             *file;
 671         unsigned int            fd;
 672         unsigned int            has_refs;
 673         unsigned int            used_refs;
 674         unsigned int            ios_left;
 675 };
 676
 677 struct io_op_def {
 678         /* needs req->io allocated for deferral/async */
 679         unsigned                async_ctx : 1;
 680         /* needs current->mm setup, does mm access */
 681         unsigned                needs_mm : 1;
 682         /* needs req->file assigned */
 683         unsigned                needs_file : 1;
 684         /* hash wq insertion if file is a regular file */
 685         unsigned                hash_reg_file : 1;
 686         /* unbound wq insertion if file is a non-regular file */
 687         unsigned                unbound_nonreg_file : 1;
 688         /* opcode is not supported by this kernel */
 689         unsigned                not_supported : 1;
 690         /* needs file table */
 691         unsigned                file_table : 1;
 692         /* needs ->fs */
 693         unsigned                needs_fs : 1;
 694         /* set if opcode supports polled "wait" */
 695         unsigned                pollin : 1;
 696         unsigned                pollout : 1;
 697         /* op supports buffer selection */
 698         unsigned                buffer_select : 1;
 699 };
 700
 701 static const struct io_op_def io_op_defs[] = {
 702         [IORING_OP_NOP] = {},
 703         [IORING_OP_READV] = {
 704                 .async_ctx              = 1,
 705                 .needs_mm               = 1,
 706                 .needs_file             = 1,
 707                 .unbound_nonreg_file    = 1,
 708                 .pollin                 = 1,
 709                 .buffer_select          = 1,
 710         },
 711         [IORING_OP_WRITEV] = {
 712                 .async_ctx              = 1,
 713                 .needs_mm               = 1,
 714                 .needs_file             = 1,
 715                 .hash_reg_file          = 1,
 716                 .unbound_nonreg_file    = 1,
 717                 .pollout                = 1,
 718         },
 719         [IORING_OP_FSYNC] = {
 720                 .needs_file             = 1,
 721         },
 722         [IORING_OP_READ_FIXED] = {
 723                 .needs_file             = 1,
 724                 .unbound_nonreg_file    = 1,
 725                 .pollin                 = 1,
 726         },
 727         [IORING_OP_WRITE_FIXED] = {
 728                 .needs_file             = 1,
 729                 .hash_reg_file          = 1,
 730                 .unbound_nonreg_file    = 1,
 731                 .pollout                = 1,
 732         },
 733         [IORING_OP_POLL_ADD] = {
 734                 .needs_file             = 1,
 735                 .unbound_nonreg_file    = 1,
 736         },
 737         [IORING_OP_POLL_REMOVE] = {},
 738         [IORING_OP_SYNC_FILE_RANGE] = {
 739                 .needs_file             = 1,
 740         },
 741         [IORING_OP_SENDMSG] = {
 742                 .async_ctx              = 1,
 743                 .needs_mm               = 1,
 744                 .needs_file             = 1,
 745                 .unbound_nonreg_file    = 1,
 746                 .needs_fs               = 1,
 747                 .pollout                = 1,
 748         },
 749         [IORING_OP_RECVMSG] = {
 750                 .async_ctx              = 1,
 751                 .needs_mm               = 1,
 752                 .needs_file             = 1,
 753                 .unbound_nonreg_file    = 1,
 754                 .needs_fs               = 1,
 755                 .pollin                 = 1,
 756                 .buffer_select          = 1,
 757         },
 758         [IORING_OP_TIMEOUT] = {
 759                 .async_ctx              = 1,
 760                 .needs_mm               = 1,
 761         },
 762         [IORING_OP_TIMEOUT_REMOVE] = {},
 763         [IORING_OP_ACCEPT] = {
 764                 .needs_mm               = 1,
 765                 .needs_file             = 1,
 766                 .unbound_nonreg_file    = 1,
 767                 .file_table             = 1,
 768                 .pollin                 = 1,
 769         },
 770         [IORING_OP_ASYNC_CANCEL] = {},
 771         [IORING_OP_LINK_TIMEOUT] = {
 772                 .async_ctx              = 1,
 773                 .needs_mm               = 1,
 774         },
 775         [IORING_OP_CONNECT] = {
 776                 .async_ctx              = 1,
 777                 .needs_mm               = 1,
 778                 .needs_file             = 1,
 779                 .unbound_nonreg_file    = 1,
 780                 .pollout                = 1,
 781         },
 782         [IORING_OP_FALLOCATE] = {
 783                 .needs_file             = 1,
 784         },
 785         [IORING_OP_OPENAT] = {
 786                 .file_table             = 1,
 787                 .needs_fs               = 1,
 788         },
 789         [IORING_OP_CLOSE] = {
 790                 .needs_file             = 1,
 791                 .file_table             = 1,
 792         },
 793         [IORING_OP_FILES_UPDATE] = {
 794                 .needs_mm               = 1,
 795                 .file_table             = 1,
 796         },
 797         [IORING_OP_STATX] = {
 798                 .needs_mm               = 1,
 799                 .needs_fs               = 1,
 800                 .file_table             = 1,
 801         },
 802         [IORING_OP_READ] = {
 803                 .needs_mm               = 1,
 804                 .needs_file             = 1,
 805                 .unbound_nonreg_file    = 1,
 806                 .pollin                 = 1,
 807                 .buffer_select          = 1,
 808         },
 809         [IORING_OP_WRITE] = {
 810                 .needs_mm               = 1,
 811                 .needs_file             = 1,
 812                 .unbound_nonreg_file    = 1,
 813                 .pollout                = 1,
 814         },
 815         [IORING_OP_FADVISE] = {
 816                 .needs_file             = 1,
 817         },
 818         [IORING_OP_MADVISE] = {
 819                 .needs_mm               = 1,
 820         },
 821         [IORING_OP_SEND] = {
 822                 .needs_mm               = 1,
 823                 .needs_file             = 1,
 824                 .unbound_nonreg_file    = 1,
 825                 .pollout                = 1,
 826         },
 827         [IORING_OP_RECV] = {
 828                 .needs_mm               = 1,
 829                 .needs_file             = 1,
 830                 .unbound_nonreg_file    = 1,
 831                 .pollin                 = 1,
 832                 .buffer_select          = 1,
 833         },
 834         [IORING_OP_OPENAT2] = {
 835                 .file_table             = 1,
 836                 .needs_fs               = 1,
 837         },
 838         [IORING_OP_EPOLL_CTL] = {
 839                 .unbound_nonreg_file    = 1,
 840                 .file_table             = 1,
 841         },
 842         [IORING_OP_SPLICE] = {
 843                 .needs_file             = 1,
 844                 .hash_reg_file          = 1,
 845                 .unbound_nonreg_file    = 1,
 846         },
 847         [IORING_OP_PROVIDE_BUFFERS] = {},
 848         [IORING_OP_REMOVE_BUFFERS] = {},
 849 };
 850
 851 static void io_wq_submit_work(struct io_wq_work **workptr);
 852 static void io_cqring_fill_event(struct io_kiocb *req, long res);
 853 static void io_put_req(struct io_kiocb *req);
 854 static void __io_double_put_req(struct io_kiocb *req);
 855 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
 856 static void io_queue_linked_timeout(struct io_kiocb *req);
 857 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 858                                  struct io_uring_files_update *ip,
 859                                  unsigned nr_args);
 860 static int io_grab_files(struct io_kiocb *req);
 861 static void io_cleanup_req(struct io_kiocb *req);
 862 static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
 863                        int fd, struct file **out_file, bool fixed);
 864 static void __io_queue_sqe(struct io_kiocb *req,
 865                            const struct io_uring_sqe *sqe);
 866
 867 static struct kmem_cache *req_cachep;
 868
 869 static const struct file_operations io_uring_fops;
 870
 871 struct sock *io_uring_get_socket(struct file *file)
 872 {
 873 #if defined(CONFIG_UNIX)
 874         if (file->f_op == &io_uring_fops) {
 875                 struct io_ring_ctx *ctx = file->private_data;
 876
 877                 return ctx->ring_sock->sk;
 878         }
 879 #endif
 880         return NULL;
 881 }
 882 EXPORT_SYMBOL(io_uring_get_socket);
 883
 884 static void io_ring_ctx_ref_free(struct percpu_ref *ref)
 885 {
 886         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
 887
 888         complete(&ctx->completions[0]);
 889 }
 890
 891 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 892 {
 893         struct io_ring_ctx *ctx;
 894         int hash_bits;
 895
 896         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 897         if (!ctx)
 898                 return NULL;
 899
 900         ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
 901         if (!ctx->fallback_req)
 902                 goto err;
 903
 904         ctx->completions = kmalloc(2 * sizeof(struct completion), GFP_KERNEL);
 905         if (!ctx->completions)
 906                 goto err;
 907
 908         /*
 909          * Use 5 bits less than the max cq entries, that should give us around
 910          * 32 entries per hash list if totally full and uniformly spread.
 911          */
 912         hash_bits = ilog2(p->cq_entries);
 913         hash_bits -= 5;
 914         if (hash_bits <= 0)
 915                 hash_bits = 1;
 916         ctx->cancel_hash_bits = hash_bits;
 917         ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
 918                                         GFP_KERNEL);
 919         if (!ctx->cancel_hash)
 920                 goto err;
 921         __hash_init(ctx->cancel_hash, 1U << hash_bits);
 922
 923         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
 924                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
 925                 goto err;
 926
 927         ctx->flags = p->flags;
 928         init_waitqueue_head(&ctx->sqo_wait);
 929         init_waitqueue_head(&ctx->cq_wait);
 930         INIT_LIST_HEAD(&ctx->cq_overflow_list);
 931         init_completion(&ctx->completions[0]);
 932         init_completion(&ctx->completions[1]);
 933         idr_init(&ctx->io_buffer_idr);
 934         idr_init(&ctx->personality_idr);
 935         mutex_init(&ctx->uring_lock);
 936         init_waitqueue_head(&ctx->wait);
 937         spin_lock_init(&ctx->completion_lock);
 938         INIT_LIST_HEAD(&ctx->poll_list);
 939         INIT_LIST_HEAD(&ctx->defer_list);
 940         INIT_LIST_HEAD(&ctx->timeout_list);
 941         init_waitqueue_head(&ctx->inflight_wait);
 942         spin_lock_init(&ctx->inflight_lock);
 943         INIT_LIST_HEAD(&ctx->inflight_list);
 944         return ctx;
 945 err:
 946         if (ctx->fallback_req)
 947                 kmem_cache_free(req_cachep, ctx->fallback_req);
 948         kfree(ctx->completions);
 949         kfree(ctx->cancel_hash);
 950         kfree(ctx);
 951         return NULL;
 952 }
 953
 954 static inline bool __req_need_defer(struct io_kiocb *req)
 955 {
 956         struct io_ring_ctx *ctx = req->ctx;
 957
 958         return req->sequence != ctx->cached_cq_tail
 959                                 + atomic_read(&ctx->cached_cq_overflow);
 960 }
 961
 962 static inline bool req_need_defer(struct io_kiocb *req)
 963 {
 964         if (unlikely(req->flags & REQ_F_IO_DRAIN))
 965                 return __req_need_defer(req);
 966
 967         return false;
 968 }
 969
 970 static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
 971 {
 972         struct io_kiocb *req;
 973
 974         req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list);
 975         if (req && !req_need_defer(req)) {
 976                 list_del_init(&req->list);
 977                 return req;
 978         }
 979
 980         return NULL;
 981 }
 982
 983 static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
 984 {
 985         struct io_kiocb *req;
 986
 987         req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
 988         if (req) {
 989                 if (req->flags & REQ_F_TIMEOUT_NOSEQ)
 990                         return NULL;
 991                 if (!__req_need_defer(req)) {
 992                         list_del_init(&req->list);
 993                         return req;
 994                 }
 995         }
 996
 997         return NULL;
 998 }
 999
1000 static void __io_commit_cqring(struct io_ring_ctx *ctx)
1001 {
1002         struct io_rings *rings = ctx->rings;
1003
1004         /* order cqe stores with ring update */
1005         smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
1006
1007         if (wq_has_sleeper(&ctx->cq_wait)) {
1008                 wake_up_interruptible(&ctx->cq_wait);
1009                 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
1010         }
1011 }
1012
1013 static inline void io_req_work_grab_env(struct io_kiocb *req,
1014                                         const struct io_op_def *def)
1015 {
1016         if (!req->work.mm && def->needs_mm) {
1017                 mmgrab(current->mm);
1018                 req->work.mm = current->mm;
1019         }
1020         if (!req->work.creds)
1021                 req->work.creds = get_current_cred();
1022         if (!req->work.fs && def->needs_fs) {
1023                 spin_lock(&current->fs->lock);
1024                 if (!current->fs->in_exec) {
1025                         req->work.fs = current->fs;
1026                         req->work.fs->users++;
1027                 } else {
1028                         req->work.flags |= IO_WQ_WORK_CANCEL;
1029                 }
1030                 spin_unlock(&current->fs->lock);
1031         }
1032         if (!req->work.task_pid)
1033                 req->work.task_pid = task_pid_vnr(current);
1034 }
1035
1036 static inline void io_req_work_drop_env(struct io_kiocb *req)
1037 {
1038         if (req->work.mm) {
1039                 mmdrop(req->work.mm);
1040                 req->work.mm = NULL;
1041         }
1042         if (req->work.creds) {
1043                 put_cred(req->work.creds);
1044                 req->work.creds = NULL;
1045         }
1046         if (req->work.fs) {
1047                 struct fs_struct *fs = req->work.fs;
1048
1049                 spin_lock(&req->work.fs->lock);
1050                 if (--fs->users)
1051                         fs = NULL;
1052                 spin_unlock(&req->work.fs->lock);
1053                 if (fs)
1054                         free_fs_struct(fs);
1055         }
1056 }
1057
1058 static inline void io_prep_async_work(struct io_kiocb *req,
1059                                       struct io_kiocb **link)
1060 {
1061         const struct io_op_def *def = &io_op_defs[req->opcode];
1062
1063         if (req->flags & REQ_F_ISREG) {
1064                 if (def->hash_reg_file)
1065                         io_wq_hash_work(&req->work, file_inode(req->file));
1066         } else {
1067                 if (def->unbound_nonreg_file)
1068                         req->work.flags |= IO_WQ_WORK_UNBOUND;
1069         }
1070
1071         io_req_work_grab_env(req, def);
1072
1073         *link = io_prep_linked_timeout(req);
1074 }
1075
1076 static inline void io_queue_async_work(struct io_kiocb *req)
1077 {
1078         struct io_ring_ctx *ctx = req->ctx;
1079         struct io_kiocb *link;
1080
1081         io_prep_async_work(req, &link);
1082
1083         trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
1084                                         &req->work, req->flags);
1085         io_wq_enqueue(ctx->io_wq, &req->work);
1086
1087         if (link)
1088                 io_queue_linked_timeout(link);
1089 }
1090
1091 static void io_kill_timeout(struct io_kiocb *req)
1092 {
1093         int ret;
1094
1095         ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
1096         if (ret != -1) {
1097                 atomic_inc(&req->ctx->cq_timeouts);
1098                 list_del_init(&req->list);
1099                 req->flags |= REQ_F_COMP_LOCKED;
1100                 io_cqring_fill_event(req, 0);
1101                 io_put_req(req);
1102         }
1103 }
1104
1105 static void io_kill_timeouts(struct io_ring_ctx *ctx)
1106 {
1107         struct io_kiocb *req, *tmp;
1108
1109         spin_lock_irq(&ctx->completion_lock);
1110         list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
1111                 io_kill_timeout(req);
1112         spin_unlock_irq(&ctx->completion_lock);
1113 }
1114
1115 static void io_commit_cqring(struct io_ring_ctx *ctx)
1116 {
1117         struct io_kiocb *req;
1118
1119         while ((req = io_get_timeout_req(ctx)) != NULL)
1120                 io_kill_timeout(req);
1121
1122         __io_commit_cqring(ctx);
1123
1124         while ((req = io_get_deferred_req(ctx)) != NULL)
1125                 io_queue_async_work(req);
1126 }
1127
1128 static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
1129 {
1130         struct io_rings *rings = ctx->rings;
1131         unsigned tail;
1132
1133         tail = ctx->cached_cq_tail;
1134         /*
1135          * writes to the cq entry need to come after reading head; the
1136          * control dependency is enough as we're using WRITE_ONCE to
1137          * fill the cq entry
1138          */
1139         if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
1140                 return NULL;
1141
1142         ctx->cached_cq_tail++;
1143         return &rings->cqes[tail & ctx->cq_mask];
1144 }
1145
1146 static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1147 {
1148         if (!ctx->cq_ev_fd)
1149                 return false;
1150         if (!ctx->eventfd_async)
1151                 return true;
1152         return io_wq_current_is_worker();
1153 }
1154
1155 static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1156 {
1157         if (waitqueue_active(&ctx->wait))
1158                 wake_up(&ctx->wait);
1159         if (waitqueue_active(&ctx->sqo_wait))
1160                 wake_up(&ctx->sqo_wait);
1161         if (io_should_trigger_evfd(ctx))
1162                 eventfd_signal(ctx->cq_ev_fd, 1);
1163 }
1164
1165 /* Returns true if there are no backlogged entries after the flush */
1166 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
1167 {
1168         struct io_rings *rings = ctx->rings;
1169         struct io_uring_cqe *cqe;
1170         struct io_kiocb *req;
1171         unsigned long flags;
1172         LIST_HEAD(list);
1173
1174         if (!force) {
1175                 if (list_empty_careful(&ctx->cq_overflow_list))
1176                         return true;
1177                 if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
1178                     rings->cq_ring_entries))
1179                         return false;
1180         }
1181
1182         spin_lock_irqsave(&ctx->completion_lock, flags);
1183
1184         /* if force is set, the ring is going away. always drop after that */
1185         if (force)
1186                 ctx->cq_overflow_flushed = 1;
1187
1188         cqe = NULL;
1189         while (!list_empty(&ctx->cq_overflow_list)) {
1190                 cqe = io_get_cqring(ctx);
1191                 if (!cqe && !force)
1192                         break;
1193
1194                 req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
1195                                                 list);
1196                 list_move(&req->list, &list);
1197                 req->flags &= ~REQ_F_OVERFLOW;
1198                 if (cqe) {
1199                         WRITE_ONCE(cqe->user_data, req->user_data);
1200                         WRITE_ONCE(cqe->res, req->result);
1201                         WRITE_ONCE(cqe->flags, req->cflags);
1202                 } else {
1203                         WRITE_ONCE(ctx->rings->cq_overflow,
1204                                 atomic_inc_return(&ctx->cached_cq_overflow));
1205                 }
1206         }
1207
1208         io_commit_cqring(ctx);
1209         if (cqe) {
1210                 clear_bit(0, &ctx->sq_check_overflow);
1211                 clear_bit(0, &ctx->cq_check_overflow);
1212         }
1213         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1214         io_cqring_ev_posted(ctx);
1215
1216         while (!list_empty(&list)) {
1217                 req = list_first_entry(&list, struct io_kiocb, list);
1218                 list_del(&req->list);
1219                 io_put_req(req);
1220         }
1221
1222         return cqe != NULL;
1223 }
1224
1225 static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
1226 {
1227         struct io_ring_ctx *ctx = req->ctx;
1228         struct io_uring_cqe *cqe;
1229
1230         trace_io_uring_complete(ctx, req->user_data, res);
1231
1232         /*
1233          * If we can't get a cq entry, userspace overflowed the
1234          * submission (by quite a lot). Increment the overflow count in
1235          * the ring.
1236          */
1237         cqe = io_get_cqring(ctx);
1238         if (likely(cqe)) {
1239                 WRITE_ONCE(cqe->user_data, req->user_data);
1240                 WRITE_ONCE(cqe->res, res);
1241                 WRITE_ONCE(cqe->flags, cflags);
1242         } else if (ctx->cq_overflow_flushed) {
1243                 WRITE_ONCE(ctx->rings->cq_overflow,
1244                                 atomic_inc_return(&ctx->cached_cq_overflow));
1245         } else {
1246                 if (list_empty(&ctx->cq_overflow_list)) {
1247                         set_bit(0, &ctx->sq_check_overflow);
1248                         set_bit(0, &ctx->cq_check_overflow);
1249                 }
1250                 req->flags |= REQ_F_OVERFLOW;
1251                 refcount_inc(&req->refs);
1252                 req->result = res;
1253                 req->cflags = cflags;
1254                 list_add_tail(&req->list, &ctx->cq_overflow_list);
1255         }
1256 }
1257
1258 static void io_cqring_fill_event(struct io_kiocb *req, long res)
1259 {
1260         __io_cqring_fill_event(req, res, 0);
1261 }
1262
1263 static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
1264 {
1265         struct io_ring_ctx *ctx = req->ctx;
1266         unsigned long flags;
1267
1268         spin_lock_irqsave(&ctx->completion_lock, flags);
1269         __io_cqring_fill_event(req, res, cflags);
1270         io_commit_cqring(ctx);
1271         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1272
1273         io_cqring_ev_posted(ctx);
1274 }
1275
1276 static void io_cqring_add_event(struct io_kiocb *req, long res)
1277 {
1278         __io_cqring_add_event(req, res, 0);
1279 }
1280
1281 static inline bool io_is_fallback_req(struct io_kiocb *req)
1282 {
1283         return req == (struct io_kiocb *)
1284                         ((unsigned long) req->ctx->fallback_req & ~1UL);
1285 }
1286
1287 static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
1288 {
1289         struct io_kiocb *req;
1290
1291         req = ctx->fallback_req;
1292         if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req))
1293                 return req;
1294
1295         return NULL;
1296 }
1297
1298 static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
1299                                      struct io_submit_state *state)
1300 {
1301         gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
1302         struct io_kiocb *req;
1303
1304         if (!state) {
1305                 req = kmem_cache_alloc(req_cachep, gfp);
1306                 if (unlikely(!req))
1307                         goto fallback;
1308         } else if (!state->free_reqs) {
1309                 size_t sz;
1310                 int ret;
1311
1312                 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
1313                 ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
1314
1315                 /*
1316                  * Bulk alloc is all-or-nothing. If we fail to get a batch,
1317                  * retry single alloc to be on the safe side.
1318                  */
1319                 if (unlikely(ret <= 0)) {
1320                         state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
1321                         if (!state->reqs[0])
1322                                 goto fallback;
1323                         ret = 1;
1324                 }
1325                 state->free_reqs = ret - 1;
1326                 req = state->reqs[ret - 1];
1327         } else {
1328                 state->free_reqs--;
1329                 req = state->reqs[state->free_reqs];
1330         }
1331
1332         return req;
1333 fallback:
1334         return io_get_fallback_req(ctx);
1335 }
1336
1337 static inline void io_put_file(struct io_kiocb *req, struct file *file,
1338                           bool fixed)
1339 {
1340         if (fixed)
1341                 percpu_ref_put(req->fixed_file_refs);
1342         else
1343                 fput(file);
1344 }
1345
1346 static void __io_req_aux_free(struct io_kiocb *req)
1347 {
1348         if (req->flags & REQ_F_NEED_CLEANUP)
1349                 io_cleanup_req(req);
1350
1351         kfree(req->io);
1352         if (req->file)
1353                 io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
1354         if (req->task)
1355                 put_task_struct(req->task);
1356
1357         io_req_work_drop_env(req);
1358 }
1359
1360 static void __io_free_req(struct io_kiocb *req)
1361 {
1362         __io_req_aux_free(req);
1363
1364         if (req->flags & REQ_F_INFLIGHT) {
1365                 struct io_ring_ctx *ctx = req->ctx;
1366                 unsigned long flags;
1367
1368                 spin_lock_irqsave(&ctx->inflight_lock, flags);
1369                 list_del(&req->inflight_entry);
1370                 if (waitqueue_active(&ctx->inflight_wait))
1371                         wake_up(&ctx->inflight_wait);
1372                 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
1373         }
1374
1375         percpu_ref_put(&req->ctx->refs);
1376         if (likely(!io_is_fallback_req(req)))
1377                 kmem_cache_free(req_cachep, req);
1378         else
1379                 clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req);
1380 }
1381
1382 struct req_batch {
1383         void *reqs[IO_IOPOLL_BATCH];
1384         int to_free;
1385         int need_iter;
1386 };
1387
1388 static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
1389 {
1390         if (!rb->to_free)
1391                 return;
1392         if (rb->need_iter) {
1393                 int i, inflight = 0;
1394                 unsigned long flags;
1395
1396                 for (i = 0; i < rb->to_free; i++) {
1397                         struct io_kiocb *req = rb->reqs[i];
1398
1399                         if (req->flags & REQ_F_INFLIGHT)
1400                                 inflight++;
1401                         __io_req_aux_free(req);
1402                 }
1403                 if (!inflight)
1404                         goto do_free;
1405
1406                 spin_lock_irqsave(&ctx->inflight_lock, flags);
1407                 for (i = 0; i < rb->to_free; i++) {
1408                         struct io_kiocb *req = rb->reqs[i];
1409
1410                         if (req->flags & REQ_F_INFLIGHT) {
1411                                 list_del(&req->inflight_entry);
1412                                 if (!--inflight)
1413                                         break;
1414                         }
1415                 }
1416                 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
1417
1418                 if (waitqueue_active(&ctx->inflight_wait))
1419                         wake_up(&ctx->inflight_wait);
1420         }
1421 do_free:
1422         kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
1423         percpu_ref_put_many(&ctx->refs, rb->to_free);
1424         rb->to_free = rb->need_iter = 0;
1425 }
1426
1427 static bool io_link_cancel_timeout(struct io_kiocb *req)
1428 {
1429         struct io_ring_ctx *ctx = req->ctx;
1430         int ret;
1431
1432         ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
1433         if (ret != -1) {
1434                 io_cqring_fill_event(req, -ECANCELED);
1435                 io_commit_cqring(ctx);
1436                 req->flags &= ~REQ_F_LINK_HEAD;
1437                 io_put_req(req);
1438                 return true;
1439         }
1440
1441         return false;
1442 }
1443
1444 static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
1445 {
1446         struct io_ring_ctx *ctx = req->ctx;
1447         bool wake_ev = false;
1448
1449         /* Already got next link */
1450         if (req->flags & REQ_F_LINK_NEXT)
1451                 return;
1452
1453         /*
1454          * The list should never be empty when we are called here. But could
1455          * potentially happen if the chain is messed up, check to be on the
1456          * safe side.
1457          */
1458         while (!list_empty(&req->link_list)) {
1459                 struct io_kiocb *nxt = list_first_entry(&req->link_list,
1460                                                 struct io_kiocb, link_list);
1461
1462                 if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) &&
1463                              (nxt->flags & REQ_F_TIMEOUT))) {
1464                         list_del_init(&nxt->link_list);
1465                         wake_ev |= io_link_cancel_timeout(nxt);
1466                         req->flags &= ~REQ_F_LINK_TIMEOUT;
1467                         continue;
1468                 }
1469
1470                 list_del_init(&req->link_list);
1471                 if (!list_empty(&nxt->link_list))
1472                         nxt->flags |= REQ_F_LINK_HEAD;
1473                 *nxtptr = nxt;
1474                 break;
1475         }
1476
1477         req->flags |= REQ_F_LINK_NEXT;
1478         if (wake_ev)
1479                 io_cqring_ev_posted(ctx);
1480 }
1481
1482 /*
1483  * Called if REQ_F_LINK_HEAD is set, and we fail the head request
1484  */
1485 static void io_fail_links(struct io_kiocb *req)
1486 {
1487         struct io_ring_ctx *ctx = req->ctx;
1488         unsigned long flags;
1489
1490         spin_lock_irqsave(&ctx->completion_lock, flags);
1491
1492         while (!list_empty(&req->link_list)) {
1493                 struct io_kiocb *link = list_first_entry(&req->link_list,
1494                                                 struct io_kiocb, link_list);
1495
1496                 list_del_init(&link->link_list);
1497                 trace_io_uring_fail_link(req, link);
1498
1499                 if ((req->flags & REQ_F_LINK_TIMEOUT) &&
1500                     link->opcode == IORING_OP_LINK_TIMEOUT) {
1501                         io_link_cancel_timeout(link);
1502                 } else {
1503                         io_cqring_fill_event(link, -ECANCELED);
1504                         __io_double_put_req(link);
1505                 }
1506                 req->flags &= ~REQ_F_LINK_TIMEOUT;
1507         }
1508
1509         io_commit_cqring(ctx);
1510         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1511         io_cqring_ev_posted(ctx);
1512 }
1513
1514 static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
1515 {
1516         if (likely(!(req->flags & REQ_F_LINK_HEAD)))
1517                 return;
1518
1519         /*
1520          * If LINK is set, we have dependent requests in this chain. If we
1521          * didn't fail this request, queue the first one up, moving any other
1522          * dependencies to the next request. In case of failure, fail the rest
1523          * of the chain.
1524          */
1525         if (req->flags & REQ_F_FAIL_LINK) {
1526                 io_fail_links(req);
1527         } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) ==
1528                         REQ_F_LINK_TIMEOUT) {
1529                 struct io_ring_ctx *ctx = req->ctx;
1530                 unsigned long flags;
1531
1532                 /*
1533                  * If this is a timeout link, we could be racing with the
1534                  * timeout timer. Grab the completion lock for this case to
1535                  * protect against that.
1536                  */
1537                 spin_lock_irqsave(&ctx->completion_lock, flags);
1538                 io_req_link_next(req, nxt);
1539                 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1540         } else {
1541                 io_req_link_next(req, nxt);
1542         }
1543 }
1544
1545 static void io_free_req(struct io_kiocb *req)
1546 {
1547         struct io_kiocb *nxt = NULL;
1548
1549         io_req_find_next(req, &nxt);
1550         __io_free_req(req);
1551
1552         if (nxt)
1553                 io_queue_async_work(nxt);
1554 }
1555
1556 static void io_link_work_cb(struct io_wq_work **workptr)
1557 {
1558         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
1559         struct io_kiocb *link;
1560
1561         link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
1562         io_queue_linked_timeout(link);
1563         io_wq_submit_work(workptr);
1564 }
1565
1566 static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
1567 {
1568         struct io_kiocb *link;
1569         const struct io_op_def *def = &io_op_defs[nxt->opcode];
1570
1571         if ((nxt->flags & REQ_F_ISREG) && def->hash_reg_file)
1572                 io_wq_hash_work(&nxt->work, file_inode(nxt->file));
1573
1574         *workptr = &nxt->work;
1575         link = io_prep_linked_timeout(nxt);
1576         if (link)
1577                 nxt->work.func = io_link_work_cb;
1578 }
1579
1580 /*
1581  * Drop reference to request, return next in chain (if there is one) if this
1582  * was the last reference to this request.
1583  */
1584 __attribute__((nonnull))
1585 static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
1586 {
1587         if (refcount_dec_and_test(&req->refs)) {
1588                 io_req_find_next(req, nxtptr);
1589                 __io_free_req(req);
1590         }
1591 }
1592
1593 static void io_put_req(struct io_kiocb *req)
1594 {
1595         if (refcount_dec_and_test(&req->refs))
1596                 io_free_req(req);
1597 }
1598
1599 static void io_steal_work(struct io_kiocb *req,
1600                           struct io_wq_work **workptr)
1601 {
1602         /*
1603          * It's in an io-wq worker, so there always should be at least
1604          * one reference, which will be dropped in io_put_work() just
1605          * after the current handler returns.
1606          *
1607          * It also means, that if the counter dropped to 1, then there is
1608          * no asynchronous users left, so it's safe to steal the next work.
1609          */
1610         if (refcount_read(&req->refs) == 1) {
1611                 struct io_kiocb *nxt = NULL;
1612
1613                 io_req_find_next(req, &nxt);
1614                 if (nxt)
1615                         io_wq_assign_next(workptr, nxt);
1616         }
1617 }
1618
1619 /*
1620  * Must only be used if we don't need to care about links, usually from
1621  * within the completion handling itself.
1622  */
1623 static void __io_double_put_req(struct io_kiocb *req)
1624 {
1625         /* drop both submit and complete references */
1626         if (refcount_sub_and_test(2, &req->refs))
1627                 __io_free_req(req);
1628 }
1629
1630 static void io_double_put_req(struct io_kiocb *req)
1631 {
1632         /* drop both submit and complete references */
1633         if (refcount_sub_and_test(2, &req->refs))
1634                 io_free_req(req);
1635 }
1636
1637 static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
1638 {
1639         struct io_rings *rings = ctx->rings;
1640
1641         if (test_bit(0, &ctx->cq_check_overflow)) {
1642                 /*
1643                  * noflush == true is from the waitqueue handler, just ensure
1644                  * we wake up the task, and the next invocation will flush the
1645                  * entries. We cannot safely to it from here.
1646                  */
1647                 if (noflush && !list_empty(&ctx->cq_overflow_list))
1648                         return -1U;
1649
1650                 io_cqring_overflow_flush(ctx, false);
1651         }
1652
1653         /* See comment at the top of this file */
1654         smp_rmb();
1655         return ctx->cached_cq_tail - READ_ONCE(rings->cq.head);
1656 }
1657
1658 static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
1659 {
1660         struct io_rings *rings = ctx->rings;
1661
1662         /* make sure SQ entry isn't read before tail */
1663         return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
1664 }
1665
1666 static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
1667 {
1668         if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req))
1669                 return false;
1670
1671         if (req->file || req->io)
1672                 rb->need_iter++;
1673
1674         rb->reqs[rb->to_free++] = req;
1675         if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
1676                 io_free_req_many(req->ctx, rb);
1677         return true;
1678 }
1679
1680 static int io_put_kbuf(struct io_kiocb *req)
1681 {
1682         struct io_buffer *kbuf;
1683         int cflags;
1684
1685         kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
1686         cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
1687         cflags |= IORING_CQE_F_BUFFER;
1688         req->rw.addr = 0;
1689         kfree(kbuf);
1690         return cflags;
1691 }
1692
1693 static void io_iopoll_queue(struct list_head *again)
1694 {
1695         struct io_kiocb *req;
1696
1697         do {
1698                 req = list_first_entry(again, struct io_kiocb, list);
1699                 list_del(&req->list);
1700                 refcount_inc(&req->refs);
1701                 io_queue_async_work(req);
1702         } while (!list_empty(again));
1703 }
1704
1705 /*
1706  * Find and free completed poll iocbs
1707  */
1708 static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
1709                                struct list_head *done)
1710 {
1711         struct req_batch rb;
1712         struct io_kiocb *req;
1713         LIST_HEAD(again);
1714
1715         /* order with ->result store in io_complete_rw_iopoll() */
1716         smp_rmb();
1717
1718         rb.to_free = rb.need_iter = 0;
1719         while (!list_empty(done)) {
1720                 int cflags = 0;
1721
1722                 req = list_first_entry(done, struct io_kiocb, list);
1723                 if (READ_ONCE(req->result) == -EAGAIN) {
1724                         req->iopoll_completed = 0;
1725                         list_move_tail(&req->list, &again);
1726                         continue;
1727                 }
1728                 list_del(&req->list);
1729
1730                 if (req->flags & REQ_F_BUFFER_SELECTED)
1731                         cflags = io_put_kbuf(req);
1732
1733                 __io_cqring_fill_event(req, req->result, cflags);
1734                 (*nr_events)++;
1735
1736                 if (refcount_dec_and_test(&req->refs) &&
1737                     !io_req_multi_free(&rb, req))
1738                         io_free_req(req);
1739         }
1740
1741         io_commit_cqring(ctx);
1742         if (ctx->flags & IORING_SETUP_SQPOLL)
1743                 io_cqring_ev_posted(ctx);
1744         io_free_req_many(ctx, &rb);
1745
1746         if (!list_empty(&again))
1747                 io_iopoll_queue(&again);
1748 }
1749
1750 static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
1751                         long min)
1752 {
1753         struct io_kiocb *req, *tmp;
1754         LIST_HEAD(done);
1755         bool spin;
1756         int ret;
1757
1758         /*
1759          * Only spin for completions if we don't have multiple devices hanging
1760          * off our complete list, and we're under the requested amount.
1761          */
1762         spin = !ctx->poll_multi_file && *nr_events < min;
1763
1764         ret = 0;
1765         list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
1766                 struct kiocb *kiocb = &req->rw.kiocb;
1767
1768                 /*
1769                  * Move completed and retryable entries to our local lists.
1770                  * If we find a request that requires polling, break out
1771                  * and complete those lists first, if we have entries there.
1772                  */
1773                 if (READ_ONCE(req->iopoll_completed)) {
1774                         list_move_tail(&req->list, &done);
1775                         continue;
1776                 }
1777                 if (!list_empty(&done))
1778                         break;
1779
1780                 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
1781                 if (ret < 0)
1782                         break;
1783
1784                 if (ret && spin)
1785                         spin = false;
1786                 ret = 0;
1787         }
1788
1789         if (!list_empty(&done))
1790                 io_iopoll_complete(ctx, nr_events, &done);
1791
1792         return ret;
1793 }
1794
1795 /*
1796  * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
1797  * non-spinning poll check - we'll still enter the driver poll loop, but only
1798  * as a non-spinning completion check.
1799  */
1800 static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
1801                                 long min)
1802 {
1803         while (!list_empty(&ctx->poll_list) && !need_resched()) {
1804                 int ret;
1805
1806                 ret = io_do_iopoll(ctx, nr_events, min);
1807                 if (ret < 0)
1808                         return ret;
1809                 if (!min || *nr_events >= min)
1810                         return 0;
1811         }
1812
1813         return 1;
1814 }
1815
1816 /*
1817  * We can't just wait for polled events to come to us, we have to actively
1818  * find and complete them.
1819  */
1820 static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
1821 {
1822         if (!(ctx->flags & IORING_SETUP_IOPOLL))
1823                 return;
1824
1825         mutex_lock(&ctx->uring_lock);
1826         while (!list_empty(&ctx->poll_list)) {
1827                 unsigned int nr_events = 0;
1828
1829                 io_iopoll_getevents(ctx, &nr_events, 1);
1830
1831                 /*
1832                  * Ensure we allow local-to-the-cpu processing to take place,
1833                  * in this case we need to ensure that we reap all events.
1834                  */
1835                 cond_resched();
1836         }
1837         mutex_unlock(&ctx->uring_lock);
1838 }
1839
1840 static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
1841                            long min)
1842 {
1843         int iters = 0, ret = 0;
1844
1845         /*
1846          * We disallow the app entering submit/complete with polling, but we
1847          * still need to lock the ring to prevent racing with polled issue
1848          * that got punted to a workqueue.
1849          */
1850         mutex_lock(&ctx->uring_lock);
1851         do {
1852                 int tmin = 0;
1853
1854                 /*
1855                  * Don't enter poll loop if we already have events pending.
1856                  * If we do, we can potentially be spinning for commands that
1857                  * already triggered a CQE (eg in error).
1858                  */
1859                 if (io_cqring_events(ctx, false))
1860                         break;
1861
1862                 /*
1863                  * If a submit got punted to a workqueue, we can have the
1864                  * application entering polling for a command before it gets
1865                  * issued. That app will hold the uring_lock for the duration
1866                  * of the poll right here, so we need to take a breather every
1867                  * now and then to ensure that the issue has a chance to add
1868                  * the poll to the issued list. Otherwise we can spin here
1869                  * forever, while the workqueue is stuck trying to acquire the
1870                  * very same mutex.
1871                  */
1872                 if (!(++iters & 7)) {
1873                         mutex_unlock(&ctx->uring_lock);
1874                         mutex_lock(&ctx->uring_lock);
1875                 }
1876
1877                 if (*nr_events < min)
1878                         tmin = min - *nr_events;
1879
1880                 ret = io_iopoll_getevents(ctx, nr_events, tmin);
1881                 if (ret <= 0)
1882                         break;
1883                 ret = 0;
1884         } while (min && !*nr_events && !need_resched());
1885
1886         mutex_unlock(&ctx->uring_lock);
1887         return ret;
1888 }
1889
1890 static void kiocb_end_write(struct io_kiocb *req)
1891 {
1892         /*
1893          * Tell lockdep we inherited freeze protection from submission
1894          * thread.
1895          */
1896         if (req->flags & REQ_F_ISREG) {
1897                 struct inode *inode = file_inode(req->file);
1898
1899                 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
1900         }
1901         file_end_write(req->file);
1902 }
1903
1904 static inline void req_set_fail_links(struct io_kiocb *req)
1905 {
1906         if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
1907                 req->flags |= REQ_F_FAIL_LINK;
1908 }
1909
1910 static void io_complete_rw_common(struct kiocb *kiocb, long res)
1911 {
1912         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
1913         int cflags = 0;
1914
1915         if (kiocb->ki_flags & IOCB_WRITE)
1916                 kiocb_end_write(req);
1917
1918         if (res != req->result)
1919                 req_set_fail_links(req);
1920         if (req->flags & REQ_F_BUFFER_SELECTED)
1921                 cflags = io_put_kbuf(req);
1922         __io_cqring_add_event(req, res, cflags);
1923 }
1924
1925 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
1926 {
1927         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
1928
1929         io_complete_rw_common(kiocb, res);
1930         io_put_req(req);
1931 }
1932
1933 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
1934 {
1935         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
1936
1937         if (kiocb->ki_flags & IOCB_WRITE)
1938                 kiocb_end_write(req);
1939
1940         if (res != -EAGAIN && res != req->result)
1941                 req_set_fail_links(req);
1942
1943         WRITE_ONCE(req->result, res);
1944         /* order with io_poll_complete() checking ->result */
1945         smp_wmb();
1946         WRITE_ONCE(req->iopoll_completed, 1);
1947 }
1948
1949 /*
1950  * After the iocb has been issued, it's safe to be found on the poll list.
1951  * Adding the kiocb to the list AFTER submission ensures that we don't
1952  * find it from a io_iopoll_getevents() thread before the issuer is done
1953  * accessing the kiocb cookie.
1954  */
1955 static void io_iopoll_req_issued(struct io_kiocb *req)
1956 {
1957         struct io_ring_ctx *ctx = req->ctx;
1958
1959         /*
1960          * Track whether we have multiple files in our lists. This will impact
1961          * how we do polling eventually, not spinning if we're on potentially
1962          * different devices.
1963          */
1964         if (list_empty(&ctx->poll_list)) {
1965                 ctx->poll_multi_file = false;
1966         } else if (!ctx->poll_multi_file) {
1967                 struct io_kiocb *list_req;
1968
1969                 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
1970                                                 list);
1971                 if (list_req->file != req->file)
1972                         ctx->poll_multi_file = true;
1973         }
1974
1975         /*
1976          * For fast devices, IO may have already completed. If it has, add
1977          * it to the front so we find it first.
1978          */
1979         if (READ_ONCE(req->iopoll_completed))
1980                 list_add(&req->list, &ctx->poll_list);
1981         else
1982                 list_add_tail(&req->list, &ctx->poll_list);
1983
1984         if ((ctx->flags & IORING_SETUP_SQPOLL) &&
1985             wq_has_sleeper(&ctx->sqo_wait))
1986                 wake_up(&ctx->sqo_wait);
1987 }
1988
1989 static void io_file_put(struct io_submit_state *state)
1990 {
1991         if (state->file) {
1992                 int diff = state->has_refs - state->used_refs;
1993
1994                 if (diff)
1995                         fput_many(state->file, diff);
1996                 state->file = NULL;
1997         }
1998 }
1999
2000 /*
2001  * Get as many references to a file as we have IOs left in this submission,
2002  * assuming most submissions are for one file, or at least that each file
2003  * has more than one submission.
2004  */
2005 static struct file *__io_file_get(struct io_submit_state *state, int fd)
2006 {
2007         if (!state)
2008                 return fget(fd);
2009
2010         if (state->file) {
2011                 if (state->fd == fd) {
2012                         state->used_refs++;
2013                         state->ios_left--;
2014                         return state->file;
2015                 }
2016                 io_file_put(state);
2017         }
2018         state->file = fget_many(fd, state->ios_left);
2019         if (!state->file)
2020                 return NULL;
2021
2022         state->fd = fd;
2023         state->has_refs = state->ios_left;
2024         state->used_refs = 1;
2025         state->ios_left--;
2026         return state->file;
2027 }
2028
2029 /*
2030  * If we tracked the file through the SCM inflight mechanism, we could support
2031  * any file. For now, just ensure that anything potentially problematic is done
2032  * inline.
2033  */
2034 static bool io_file_supports_async(struct file *file, int rw)
2035 {
2036         umode_t mode = file_inode(file)->i_mode;
2037
2038         if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode))
2039                 return true;
2040         if (S_ISREG(mode) && file->f_op != &io_uring_fops)
2041                 return true;
2042
2043         /* any ->read/write should understand O_NONBLOCK */
2044         if (file->f_flags & O_NONBLOCK)
2045                 return true;
2046
2047         if (!(file->f_mode & FMODE_NOWAIT))
2048                 return false;
2049
2050         if (rw == READ)
2051                 return file->f_op->read_iter != NULL;
2052
2053         return file->f_op->write_iter != NULL;
2054 }
2055
2056 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2057                       bool force_nonblock)
2058 {
2059         struct io_ring_ctx *ctx = req->ctx;
2060         struct kiocb *kiocb = &req->rw.kiocb;
2061         unsigned ioprio;
2062         int ret;
2063
2064         if (S_ISREG(file_inode(req->file)->i_mode))
2065                 req->flags |= REQ_F_ISREG;
2066
2067         kiocb->ki_pos = READ_ONCE(sqe->off);
2068         if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) {
2069                 req->flags |= REQ_F_CUR_POS;
2070                 kiocb->ki_pos = req->file->f_pos;
2071         }
2072         kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
2073         kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
2074         ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
2075         if (unlikely(ret))
2076                 return ret;
2077
2078         ioprio = READ_ONCE(sqe->ioprio);
2079         if (ioprio) {
2080                 ret = ioprio_check_cap(ioprio);
2081                 if (ret)
2082                         return ret;
2083
2084                 kiocb->ki_ioprio = ioprio;
2085         } else
2086                 kiocb->ki_ioprio = get_current_ioprio();
2087
2088         /* don't allow async punt if RWF_NOWAIT was requested */
2089         if (kiocb->ki_flags & IOCB_NOWAIT)
2090                 req->flags |= REQ_F_NOWAIT;
2091
2092         if (force_nonblock)
2093                 kiocb->ki_flags |= IOCB_NOWAIT;
2094
2095         if (ctx->flags & IORING_SETUP_IOPOLL) {
2096                 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
2097                     !kiocb->ki_filp->f_op->iopoll)
2098                         return -EOPNOTSUPP;
2099
2100                 kiocb->ki_flags |= IOCB_HIPRI;
2101                 kiocb->ki_complete = io_complete_rw_iopoll;
2102                 req->result = 0;
2103                 req->iopoll_completed = 0;
2104         } else {
2105                 if (kiocb->ki_flags & IOCB_HIPRI)
2106                         return -EINVAL;
2107                 kiocb->ki_complete = io_complete_rw;
2108         }
2109
2110         req->rw.addr = READ_ONCE(sqe->addr);
2111         req->rw.len = READ_ONCE(sqe->len);
2112         req->buf_index = READ_ONCE(sqe->buf_index);
2113         return 0;
2114 }
2115
2116 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
2117 {
2118         switch (ret) {
2119         case -EIOCBQUEUED:
2120                 break;
2121         case -ERESTARTSYS:
2122         case -ERESTARTNOINTR:
2123         case -ERESTARTNOHAND:
2124         case -ERESTART_RESTARTBLOCK:
2125                 /*
2126                  * We can't just restart the syscall, since previously
2127                  * submitted sqes may already be in progress. Just fail this
2128                  * IO with EINTR.
2129                  */
2130                 ret = -EINTR;
2131                 /* fall through */
2132         default:
2133                 kiocb->ki_complete(kiocb, ret, 0);
2134         }
2135 }
2136
2137 static void kiocb_done(struct kiocb *kiocb, ssize_t ret)
2138 {
2139         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2140
2141         if (req->flags & REQ_F_CUR_POS)
2142                 req->file->f_pos = kiocb->ki_pos;
2143         if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
2144                 io_complete_rw(kiocb, ret, 0);
2145         else
2146                 io_rw_done(kiocb, ret);
2147 }
2148
2149 static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
2150                                struct iov_iter *iter)
2151 {
2152         struct io_ring_ctx *ctx = req->ctx;
2153         size_t len = req->rw.len;
2154         struct io_mapped_ubuf *imu;
2155         u16 index, buf_index;
2156         size_t offset;
2157         u64 buf_addr;
2158
2159         /* attempt to use fixed buffers without having provided iovecs */
2160         if (unlikely(!ctx->user_bufs))
2161                 return -EFAULT;
2162
2163         buf_index = req->buf_index;
2164         if (unlikely(buf_index >= ctx->nr_user_bufs))
2165                 return -EFAULT;
2166
2167         index = array_index_nospec(buf_index, ctx->nr_user_bufs);
2168         imu = &ctx->user_bufs[index];
2169         buf_addr = req->rw.addr;
2170
2171         /* overflow */
2172         if (buf_addr + len < buf_addr)
2173                 return -EFAULT;
2174         /* not inside the mapped region */
2175         if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
2176                 return -EFAULT;
2177
2178         /*
2179          * May not be a start of buffer, set size appropriately
2180          * and advance us to the beginning.
2181          */
2182         offset = buf_addr - imu->ubuf;
2183         iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
2184
2185         if (offset) {
2186                 /*
2187                  * Don't use iov_iter_advance() here, as it's really slow for
2188                  * using the latter parts of a big fixed buffer - it iterates
2189                  * over each segment manually. We can cheat a bit here, because
2190                  * we know that:
2191                  *
2192                  * 1) it's a BVEC iter, we set it up
2193                  * 2) all bvecs are PAGE_SIZE in size, except potentially the
2194                  *    first and last bvec
2195                  *
2196                  * So just find our index, and adjust the iterator afterwards.
2197                  * If the offset is within the first bvec (or the whole first
2198                  * bvec, just use iov_iter_advance(). This makes it easier
2199                  * since we can just skip the first segment, which may not
2200                  * be PAGE_SIZE aligned.
2201                  */
2202                 const struct bio_vec *bvec = imu->bvec;
2203
2204                 if (offset <= bvec->bv_len) {
2205                         iov_iter_advance(iter, offset);
2206                 } else {
2207                         unsigned long seg_skip;
2208
2209                         /* skip first vec */
2210                         offset -= bvec->bv_len;
2211                         seg_skip = 1 + (offset >> PAGE_SHIFT);
2212
2213                         iter->bvec = bvec + seg_skip;
2214                         iter->nr_segs -= seg_skip;
2215                         iter->count -= bvec->bv_len + offset;
2216                         iter->iov_offset = offset & ~PAGE_MASK;
2217                 }
2218         }
2219
2220         return len;
2221 }
2222
2223 static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
2224 {
2225         if (needs_lock)
2226                 mutex_unlock(&ctx->uring_lock);
2227 }
2228
2229 static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
2230 {
2231         /*
2232          * "Normal" inline submissions always hold the uring_lock, since we
2233          * grab it from the system call. Same is true for the SQPOLL offload.
2234          * The only exception is when we've detached the request and issue it
2235          * from an async worker thread, grab the lock for that case.
2236          */
2237         if (needs_lock)
2238                 mutex_lock(&ctx->uring_lock);
2239 }
2240
2241 static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
2242                                           int bgid, struct io_buffer *kbuf,
2243                                           bool needs_lock)
2244 {
2245         struct io_buffer *head;
2246
2247         if (req->flags & REQ_F_BUFFER_SELECTED)
2248                 return kbuf;
2249
2250         io_ring_submit_lock(req->ctx, needs_lock);
2251
2252         lockdep_assert_held(&req->ctx->uring_lock);
2253
2254         head = idr_find(&req->ctx->io_buffer_idr, bgid);
2255         if (head) {
2256                 if (!list_empty(&head->list)) {
2257                         kbuf = list_last_entry(&head->list, struct io_buffer,
2258                                                         list);
2259                         list_del(&kbuf->list);
2260                 } else {
2261                         kbuf = head;
2262                         idr_remove(&req->ctx->io_buffer_idr, bgid);
2263                 }
2264                 if (*len > kbuf->len)
2265                         *len = kbuf->len;
2266         } else {
2267                 kbuf = ERR_PTR(-ENOBUFS);
2268         }
2269
2270         io_ring_submit_unlock(req->ctx, needs_lock);
2271
2272         return kbuf;
2273 }
2274
2275 static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
2276                                         bool needs_lock)
2277 {
2278         struct io_buffer *kbuf;
2279         u16 bgid;
2280
2281         kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2282         bgid = req->buf_index;
2283         kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
2284         if (IS_ERR(kbuf))
2285                 return kbuf;
2286         req->rw.addr = (u64) (unsigned long) kbuf;
2287         req->flags |= REQ_F_BUFFER_SELECTED;
2288         return u64_to_user_ptr(kbuf->addr);
2289 }
2290
2291 #ifdef CONFIG_COMPAT
2292 static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
2293                                 bool needs_lock)
2294 {
2295         struct compat_iovec __user *uiov;
2296         compat_ssize_t clen;
2297         void __user *buf;
2298         ssize_t len;
2299
2300         uiov = u64_to_user_ptr(req->rw.addr);
2301         if (!access_ok(uiov, sizeof(*uiov)))
2302                 return -EFAULT;
2303         if (__get_user(clen, &uiov->iov_len))
2304                 return -EFAULT;
2305         if (clen < 0)
2306                 return -EINVAL;
2307
2308         len = clen;
2309         buf = io_rw_buffer_select(req, &len, needs_lock);
2310         if (IS_ERR(buf))
2311                 return PTR_ERR(buf);
2312         iov[0].iov_base = buf;
2313         iov[0].iov_len = (compat_size_t) len;
2314         return 0;
2315 }
2316 #endif
2317
2318 static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2319                                       bool needs_lock)
2320 {
2321         struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
2322         void __user *buf;
2323         ssize_t len;
2324
2325         if (copy_from_user(iov, uiov, sizeof(*uiov)))
2326                 return -EFAULT;
2327
2328         len = iov[0].iov_len;
2329         if (len < 0)
2330                 return -EINVAL;
2331         buf = io_rw_buffer_select(req, &len, needs_lock);
2332         if (IS_ERR(buf))
2333                 return PTR_ERR(buf);
2334         iov[0].iov_base = buf;
2335         iov[0].iov_len = len;
2336         return 0;
2337 }
2338
2339 static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2340                                     bool needs_lock)
2341 {
2342         if (req->flags & REQ_F_BUFFER_SELECTED) {
2343                 struct io_buffer *kbuf;
2344
2345                 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2346                 iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
2347                 iov[0].iov_len = kbuf->len;
2348                 return 0;
2349         }
2350         if (!req->rw.len)
2351                 return 0;
2352         else if (req->rw.len > 1)
2353                 return -EINVAL;
2354
2355 #ifdef CONFIG_COMPAT
2356         if (req->ctx->compat)
2357                 return io_compat_import(req, iov, needs_lock);
2358 #endif
2359
2360         return __io_iov_buffer_select(req, iov, needs_lock);
2361 }
2362
2363 static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
2364                                struct iovec **iovec, struct iov_iter *iter,
2365                                bool needs_lock)
2366 {
2367         void __user *buf = u64_to_user_ptr(req->rw.addr);
2368         size_t sqe_len = req->rw.len;
2369         ssize_t ret;
2370         u8 opcode;
2371
2372         opcode = req->opcode;
2373         if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
2374                 *iovec = NULL;
2375                 return io_import_fixed(req, rw, iter);
2376         }
2377
2378         /* buffer index only valid with fixed read/write, or buffer select  */
2379         if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
2380                 return -EINVAL;
2381
2382         if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
2383                 if (req->flags & REQ_F_BUFFER_SELECT) {
2384                         buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
2385                         if (IS_ERR(buf)) {
2386                                 *iovec = NULL;
2387                                 return PTR_ERR(buf);
2388                         }
2389                         req->rw.len = sqe_len;
2390                 }
2391
2392                 ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
2393                 *iovec = NULL;
2394                 return ret < 0 ? ret : sqe_len;
2395         }
2396
2397         if (req->io) {
2398                 struct io_async_rw *iorw = &req->io->rw;
2399
2400                 *iovec = iorw->iov;
2401                 iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size);
2402                 if (iorw->iov == iorw->fast_iov)
2403                         *iovec = NULL;
2404                 return iorw->size;
2405         }
2406
2407         if (req->flags & REQ_F_BUFFER_SELECT) {
2408                 ret = io_iov_buffer_select(req, *iovec, needs_lock);
2409                 if (!ret) {
2410                         ret = (*iovec)->iov_len;
2411                         iov_iter_init(iter, rw, *iovec, 1, ret);
2412                 }
2413                 *iovec = NULL;
2414                 return ret;
2415         }
2416
2417 #ifdef CONFIG_COMPAT
2418         if (req->ctx->compat)
2419                 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
2420                                                 iovec, iter);
2421 #endif
2422
2423         return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
2424 }
2425
2426 /*
2427  * For files that don't have ->read_iter() and ->write_iter(), handle them
2428  * by looping over ->read() or ->write() manually.
2429  */
2430 static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
2431                            struct iov_iter *iter)
2432 {
2433         ssize_t ret = 0;
2434
2435         /*
2436          * Don't support polled IO through this interface, and we can't
2437          * support non-blocking either. For the latter, this just causes
2438          * the kiocb to be handled from an async context.
2439          */
2440         if (kiocb->ki_flags & IOCB_HIPRI)
2441                 return -EOPNOTSUPP;
2442         if (kiocb->ki_flags & IOCB_NOWAIT)
2443                 return -EAGAIN;
2444
2445         while (iov_iter_count(iter)) {
2446                 struct iovec iovec;
2447                 ssize_t nr;
2448
2449                 if (!iov_iter_is_bvec(iter)) {
2450                         iovec = iov_iter_iovec(iter);
2451                 } else {
2452                         /* fixed buffers import bvec */
2453                         iovec.iov_base = kmap(iter->bvec->bv_page)
2454                                                 + iter->iov_offset;
2455                         iovec.iov_len = min(iter->count,
2456                                         iter->bvec->bv_len - iter->iov_offset);
2457                 }
2458
2459                 if (rw == READ) {
2460                         nr = file->f_op->read(file, iovec.iov_base,
2461                                               iovec.iov_len, &kiocb->ki_pos);
2462                 } else {
2463                         nr = file->f_op->write(file, iovec.iov_base,
2464                                                iovec.iov_len, &kiocb->ki_pos);
2465                 }
2466
2467                 if (iov_iter_is_bvec(iter))
2468                         kunmap(iter->bvec->bv_page);
2469
2470                 if (nr < 0) {
2471                         if (!ret)
2472                                 ret = nr;
2473                         break;
2474                 }
2475                 ret += nr;
2476                 if (nr != iovec.iov_len)
2477                         break;
2478                 iov_iter_advance(iter, nr);
2479         }
2480
2481         return ret;
2482 }
2483
2484 static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
2485                           struct iovec *iovec, struct iovec *fast_iov,
2486                           struct iov_iter *iter)
2487 {
2488         req->io->rw.nr_segs = iter->nr_segs;
2489         req->io->rw.size = io_size;
2490         req->io->rw.iov = iovec;
2491         if (!req->io->rw.iov) {
2492                 req->io->rw.iov = req->io->rw.fast_iov;
2493                 if (req->io->rw.iov != fast_iov)
2494                         memcpy(req->io->rw.iov, fast_iov,
2495                                sizeof(struct iovec) * iter->nr_segs);
2496         } else {
2497                 req->flags |= REQ_F_NEED_CLEANUP;
2498         }
2499 }
2500
2501 static inline int __io_alloc_async_ctx(struct io_kiocb *req)
2502 {
2503         req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
2504         return req->io == NULL;
2505 }
2506
2507 static int io_alloc_async_ctx(struct io_kiocb *req)
2508 {
2509         if (!io_op_defs[req->opcode].async_ctx)
2510                 return 0;
2511
2512         return  __io_alloc_async_ctx(req);
2513 }
2514
2515 static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
2516                              struct iovec *iovec, struct iovec *fast_iov,
2517                              struct iov_iter *iter)
2518 {
2519         if (!io_op_defs[req->opcode].async_ctx)
2520                 return 0;
2521         if (!req->io) {
2522                 if (__io_alloc_async_ctx(req))
2523                         return -ENOMEM;
2524
2525                 io_req_map_rw(req, io_size, iovec, fast_iov, iter);
2526         }
2527         return 0;
2528 }
2529
2530 static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2531                         bool force_nonblock)
2532 {
2533         struct io_async_ctx *io;
2534         struct iov_iter iter;
2535         ssize_t ret;
2536
2537         ret = io_prep_rw(req, sqe, force_nonblock);
2538         if (ret)
2539                 return ret;
2540
2541         if (unlikely(!(req->file->f_mode & FMODE_READ)))
2542                 return -EBADF;
2543
2544         /* either don't need iovec imported or already have it */
2545         if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
2546                 return 0;
2547
2548         io = req->io;
2549         io->rw.iov = io->rw.fast_iov;
2550         req->io = NULL;
2551         ret = io_import_iovec(READ, req, &io->rw.iov, &iter, !force_nonblock);
2552         req->io = io;
2553         if (ret < 0)
2554                 return ret;
2555
2556         io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
2557         return 0;
2558 }
2559
2560 static int io_read(struct io_kiocb *req, bool force_nonblock)
2561 {
2562         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
2563         struct kiocb *kiocb = &req->rw.kiocb;
2564         struct iov_iter iter;
2565         size_t iov_count;
2566         ssize_t io_size, ret;
2567
2568         ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock);
2569         if (ret < 0)
2570                 return ret;
2571
2572         /* Ensure we clear previously set non-block flag */
2573         if (!force_nonblock)
2574                 kiocb->ki_flags &= ~IOCB_NOWAIT;
2575
2576         req->result = 0;
2577         io_size = ret;
2578         if (req->flags & REQ_F_LINK_HEAD)
2579                 req->result = io_size;
2580
2581         /*
2582          * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
2583          * we know to async punt it even if it was opened O_NONBLOCK
2584          */
2585         if (force_nonblock && !io_file_supports_async(req->file, READ))
2586                 goto copy_iov;
2587
2588         iov_count = iov_iter_count(&iter);
2589         ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
2590         if (!ret) {
2591                 ssize_t ret2;
2592
2593                 if (req->file->f_op->read_iter)
2594                         ret2 = call_read_iter(req->file, kiocb, &iter);
2595                 else
2596                         ret2 = loop_rw_iter(READ, req->file, kiocb, &iter);
2597
2598                 /* Catch -EAGAIN return for forced non-blocking submission */
2599                 if (!force_nonblock || ret2 != -EAGAIN) {
2600                         kiocb_done(kiocb, ret2);
2601                 } else {
2602 copy_iov:
2603                         ret = io_setup_async_rw(req, io_size, iovec,
2604                                                 inline_vecs, &iter);
2605                         if (ret)
2606                                 goto out_free;
2607                         /* any defer here is final, must blocking retry */
2608                         if (!(req->flags & REQ_F_NOWAIT) &&
2609                             !file_can_poll(req->file))
2610                                 req->flags |= REQ_F_MUST_PUNT;
2611                         return -EAGAIN;
2612                 }
2613         }
2614 out_free:
2615         if (!(req->flags & REQ_F_NEED_CLEANUP))
2616                 kfree(iovec);
2617         return ret;
2618 }
2619
2620 static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2621                          bool force_nonblock)
2622 {
2623         struct io_async_ctx *io;
2624         struct iov_iter iter;
2625         ssize_t ret;
2626
2627         ret = io_prep_rw(req, sqe, force_nonblock);
2628         if (ret)
2629                 return ret;
2630
2631         if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
2632                 return -EBADF;
2633
2634         req->fsize = rlimit(RLIMIT_FSIZE);
2635
2636         /* either don't need iovec imported or already have it */
2637         if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
2638                 return 0;
2639
2640         io = req->io;
2641         io->rw.iov = io->rw.fast_iov;
2642         req->io = NULL;
2643         ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter, !force_nonblock);
2644         req->io = io;
2645         if (ret < 0)
2646                 return ret;
2647
2648         io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
2649         return 0;
2650 }
2651
2652 static int io_write(struct io_kiocb *req, bool force_nonblock)
2653 {
2654         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
2655         struct kiocb *kiocb = &req->rw.kiocb;
2656         struct iov_iter iter;
2657         size_t iov_count;
2658         ssize_t ret, io_size;
2659
2660         ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock);
2661         if (ret < 0)
2662                 return ret;
2663
2664         /* Ensure we clear previously set non-block flag */
2665         if (!force_nonblock)
2666                 req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
2667
2668         req->result = 0;
2669         io_size = ret;
2670         if (req->flags & REQ_F_LINK_HEAD)
2671                 req->result = io_size;
2672
2673         /*
2674          * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
2675          * we know to async punt it even if it was opened O_NONBLOCK
2676          */
2677         if (force_nonblock && !io_file_supports_async(req->file, WRITE))
2678                 goto copy_iov;
2679
2680         /* file path doesn't support NOWAIT for non-direct_IO */
2681         if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
2682             (req->flags & REQ_F_ISREG))
2683                 goto copy_iov;
2684
2685         iov_count = iov_iter_count(&iter);
2686         ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count);
2687         if (!ret) {
2688                 ssize_t ret2;
2689
2690                 /*
2691                  * Open-code file_start_write here to grab freeze protection,
2692                  * which will be released by another thread in
2693                  * io_complete_rw().  Fool lockdep by telling it the lock got
2694                  * released so that it doesn't complain about the held lock when
2695                  * we return to userspace.
2696                  */
2697                 if (req->flags & REQ_F_ISREG) {
2698                         __sb_start_write(file_inode(req->file)->i_sb,
2699                                                 SB_FREEZE_WRITE, true);
2700                         __sb_writers_release(file_inode(req->file)->i_sb,
2701                                                 SB_FREEZE_WRITE);
2702                 }
2703                 kiocb->ki_flags |= IOCB_WRITE;
2704
2705                 if (!force_nonblock)
2706                         current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
2707
2708                 if (req->file->f_op->write_iter)
2709                         ret2 = call_write_iter(req->file, kiocb, &iter);
2710                 else
2711                         ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
2712
2713                 if (!force_nonblock)
2714                         current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
2715
2716                 /*
2717                  * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
2718                  * retry them without IOCB_NOWAIT.
2719                  */
2720                 if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
2721                         ret2 = -EAGAIN;
2722                 if (!force_nonblock || ret2 != -EAGAIN) {
2723                         kiocb_done(kiocb, ret2);
2724                 } else {
2725 copy_iov:
2726                         ret = io_setup_async_rw(req, io_size, iovec,
2727                                                 inline_vecs, &iter);
2728                         if (ret)
2729                                 goto out_free;
2730                         /* any defer here is final, must blocking retry */
2731                         if (!(req->flags & REQ_F_NOWAIT) &&
2732                             !file_can_poll(req->file))
2733                                 req->flags |= REQ_F_MUST_PUNT;
2734                         return -EAGAIN;
2735                 }
2736         }
2737 out_free:
2738         if (!(req->flags & REQ_F_NEED_CLEANUP))
2739                 kfree(iovec);
2740         return ret;
2741 }
2742
2743 static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2744 {
2745         struct io_splice* sp = &req->splice;
2746         unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
2747         int ret;
2748
2749         if (req->flags & REQ_F_NEED_CLEANUP)
2750                 return 0;
2751
2752         sp->file_in = NULL;
2753         sp->off_in = READ_ONCE(sqe->splice_off_in);
2754         sp->off_out = READ_ONCE(sqe->off);
2755         sp->len = READ_ONCE(sqe->len);
2756         sp->flags = READ_ONCE(sqe->splice_flags);
2757
2758         if (unlikely(sp->flags & ~valid_flags))
2759                 return -EINVAL;
2760
2761         ret = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), &sp->file_in,
2762                           (sp->flags & SPLICE_F_FD_IN_FIXED));
2763         if (ret)
2764                 return ret;
2765         req->flags |= REQ_F_NEED_CLEANUP;
2766
2767         if (!S_ISREG(file_inode(sp->file_in)->i_mode))
2768                 req->work.flags |= IO_WQ_WORK_UNBOUND;
2769
2770         return 0;
2771 }
2772
2773 static int io_splice(struct io_kiocb *req, bool force_nonblock)
2774 {
2775         struct io_splice *sp = &req->splice;
2776         struct file *in = sp->file_in;
2777         struct file *out = sp->file_out;
2778         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
2779         loff_t *poff_in, *poff_out;
2780         long ret = 0;
2781
2782         if (force_nonblock)
2783                 return -EAGAIN;
2784
2785         poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
2786         poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
2787
2788         if (sp->len)
2789                 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
2790
2791         io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
2792         req->flags &= ~REQ_F_NEED_CLEANUP;
2793
2794         io_cqring_add_event(req, ret);
2795         if (ret != sp->len)
2796                 req_set_fail_links(req);
2797         io_put_req(req);
2798         return 0;
2799 }
2800
2801 /*
2802  * IORING_OP_NOP just posts a completion event, nothing else.
2803  */
2804 static int io_nop(struct io_kiocb *req)
2805 {
2806         struct io_ring_ctx *ctx = req->ctx;
2807
2808         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
2809                 return -EINVAL;
2810
2811         io_cqring_add_event(req, 0);
2812         io_put_req(req);
2813         return 0;
2814 }
2815
2816 static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2817 {
2818         struct io_ring_ctx *ctx = req->ctx;
2819
2820         if (!req->file)
2821                 return -EBADF;
2822
2823         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
2824                 return -EINVAL;
2825         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
2826                 return -EINVAL;
2827
2828         req->sync.flags = READ_ONCE(sqe->fsync_flags);
2829         if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
2830                 return -EINVAL;
2831
2832         req->sync.off = READ_ONCE(sqe->off);
2833         req->sync.len = READ_ONCE(sqe->len);
2834         return 0;
2835 }
2836
2837 static bool io_req_cancelled(struct io_kiocb *req)
2838 {
2839         if (req->work.flags & IO_WQ_WORK_CANCEL) {
2840                 req_set_fail_links(req);
2841                 io_cqring_add_event(req, -ECANCELED);
2842                 io_put_req(req);
2843                 return true;
2844         }
2845
2846         return false;
2847 }
2848
2849 static void __io_fsync(struct io_kiocb *req)
2850 {
2851         loff_t end = req->sync.off + req->sync.len;
2852         int ret;
2853
2854         ret = vfs_fsync_range(req->file, req->sync.off,
2855                                 end > 0 ? end : LLONG_MAX,
2856                                 req->sync.flags & IORING_FSYNC_DATASYNC);
2857         if (ret < 0)
2858                 req_set_fail_links(req);
2859         io_cqring_add_event(req, ret);
2860         io_put_req(req);
2861 }
2862
2863 static void io_fsync_finish(struct io_wq_work **workptr)
2864 {
2865         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2866
2867         if (io_req_cancelled(req))
2868                 return;
2869         __io_fsync(req);
2870         io_steal_work(req, workptr);
2871 }
2872
2873 static int io_fsync(struct io_kiocb *req, bool force_nonblock)
2874 {
2875         /* fsync always requires a blocking context */
2876         if (force_nonblock) {
2877                 req->work.func = io_fsync_finish;
2878                 return -EAGAIN;
2879         }
2880         __io_fsync(req);
2881         return 0;
2882 }
2883
2884 static void __io_fallocate(struct io_kiocb *req)
2885 {
2886         int ret;
2887
2888         current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
2889         ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
2890                                 req->sync.len);
2891         current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
2892         if (ret < 0)
2893                 req_set_fail_links(req);
2894         io_cqring_add_event(req, ret);
2895         io_put_req(req);
2896 }
2897
2898 static void io_fallocate_finish(struct io_wq_work **workptr)
2899 {
2900         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2901
2902         if (io_req_cancelled(req))
2903                 return;
2904         __io_fallocate(req);
2905         io_steal_work(req, workptr);
2906 }
2907
2908 static int io_fallocate_prep(struct io_kiocb *req,
2909                              const struct io_uring_sqe *sqe)
2910 {
2911         if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
2912                 return -EINVAL;
2913
2914         req->sync.off = READ_ONCE(sqe->off);
2915         req->sync.len = READ_ONCE(sqe->addr);
2916         req->sync.mode = READ_ONCE(sqe->len);
2917         req->fsize = rlimit(RLIMIT_FSIZE);
2918         return 0;
2919 }
2920
2921 static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
2922 {
2923         /* fallocate always requiring blocking context */
2924         if (force_nonblock) {
2925                 req->work.func = io_fallocate_finish;
2926                 return -EAGAIN;
2927         }
2928
2929         __io_fallocate(req);
2930         return 0;
2931 }
2932
2933 static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2934 {
2935         const char __user *fname;
2936         int ret;
2937
2938         if (sqe->ioprio || sqe->buf_index)
2939                 return -EINVAL;
2940         if (req->flags & REQ_F_FIXED_FILE)
2941                 return -EBADF;
2942         if (req->flags & REQ_F_NEED_CLEANUP)
2943                 return 0;
2944
2945         req->open.dfd = READ_ONCE(sqe->fd);
2946         req->open.how.mode = READ_ONCE(sqe->len);
2947         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
2948         req->open.how.flags = READ_ONCE(sqe->open_flags);
2949         if (force_o_largefile())
2950                 req->open.how.flags |= O_LARGEFILE;
2951
2952         req->open.filename = getname(fname);
2953         if (IS_ERR(req->open.filename)) {
2954                 ret = PTR_ERR(req->open.filename);
2955                 req->open.filename = NULL;
2956                 return ret;
2957         }
2958
2959         req->open.nofile = rlimit(RLIMIT_NOFILE);
2960         req->flags |= REQ_F_NEED_CLEANUP;
2961         return 0;
2962 }
2963
2964 static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2965 {
2966         struct open_how __user *how;
2967         const char __user *fname;
2968         size_t len;
2969         int ret;
2970
2971         if (sqe->ioprio || sqe->buf_index)
2972                 return -EINVAL;
2973         if (req->flags & REQ_F_FIXED_FILE)
2974                 return -EBADF;
2975         if (req->flags & REQ_F_NEED_CLEANUP)
2976                 return 0;
2977
2978         req->open.dfd = READ_ONCE(sqe->fd);
2979         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
2980         how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
2981         len = READ_ONCE(sqe->len);
2982
2983         if (len < OPEN_HOW_SIZE_VER0)
2984                 return -EINVAL;
2985
2986         ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
2987                                         len);
2988         if (ret)
2989                 return ret;
2990
2991         if (!(req->open.how.flags & O_PATH) && force_o_largefile())
2992                 req->open.how.flags |= O_LARGEFILE;
2993
2994         req->open.filename = getname(fname);
2995         if (IS_ERR(req->open.filename)) {
2996                 ret = PTR_ERR(req->open.filename);
2997                 req->open.filename = NULL;
2998                 return ret;
2999         }
3000
3001         req->open.nofile = rlimit(RLIMIT_NOFILE);
3002         req->flags |= REQ_F_NEED_CLEANUP;
3003         return 0;
3004 }
3005
3006 static int io_openat2(struct io_kiocb *req, bool force_nonblock)
3007 {
3008         struct open_flags op;
3009         struct file *file;
3010         int ret;
3011
3012         if (force_nonblock)
3013                 return -EAGAIN;
3014
3015         ret = build_open_flags(&req->open.how, &op);
3016         if (ret)
3017                 goto err;
3018
3019         ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
3020         if (ret < 0)
3021                 goto err;
3022
3023         file = do_filp_open(req->open.dfd, req->open.filename, &op);
3024         if (IS_ERR(file)) {
3025                 put_unused_fd(ret);
3026                 ret = PTR_ERR(file);
3027         } else {
3028                 fsnotify_open(file);
3029                 fd_install(ret, file);
3030         }
3031 err:
3032         putname(req->open.filename);
3033         req->flags &= ~REQ_F_NEED_CLEANUP;
3034         if (ret < 0)
3035                 req_set_fail_links(req);
3036         io_cqring_add_event(req, ret);
3037         io_put_req(req);
3038         return 0;
3039 }
3040
3041 static int io_openat(struct io_kiocb *req, bool force_nonblock)
3042 {
3043         req->open.how = build_open_how(req->open.how.flags, req->open.how.mode);
3044         return io_openat2(req, force_nonblock);
3045 }
3046
3047 static int io_remove_buffers_prep(struct io_kiocb *req,
3048                                   const struct io_uring_sqe *sqe)
3049 {
3050         struct io_provide_buf *p = &req->pbuf;
3051         u64 tmp;
3052
3053         if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off)
3054                 return -EINVAL;
3055
3056         tmp = READ_ONCE(sqe->fd);
3057         if (!tmp || tmp > USHRT_MAX)
3058                 return -EINVAL;
3059
3060         memset(p, 0, sizeof(*p));
3061         p->nbufs = tmp;
3062         p->bgid = READ_ONCE(sqe->buf_group);
3063         return 0;
3064 }
3065
3066 static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
3067                                int bgid, unsigned nbufs)
3068 {
3069         unsigned i = 0;
3070
3071         /* shouldn't happen */
3072         if (!nbufs)
3073                 return 0;
3074
3075         /* the head kbuf is the list itself */
3076         while (!list_empty(&buf->list)) {
3077                 struct io_buffer *nxt;
3078
3079                 nxt = list_first_entry(&buf->list, struct io_buffer, list);
3080                 list_del(&nxt->list);
3081                 kfree(nxt);
3082                 if (++i == nbufs)
3083                         return i;
3084         }
3085         i++;
3086         kfree(buf);
3087         idr_remove(&ctx->io_buffer_idr, bgid);
3088
3089         return i;
3090 }
3091
3092 static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock)
3093 {
3094         struct io_provide_buf *p = &req->pbuf;
3095         struct io_ring_ctx *ctx = req->ctx;
3096         struct io_buffer *head;
3097         int ret = 0;
3098
3099         io_ring_submit_lock(ctx, !force_nonblock);
3100
3101         lockdep_assert_held(&ctx->uring_lock);
3102
3103         ret = -ENOENT;
3104         head = idr_find(&ctx->io_buffer_idr, p->bgid);
3105         if (head)
3106                 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
3107
3108         io_ring_submit_lock(ctx, !force_nonblock);
3109         if (ret < 0)
3110                 req_set_fail_links(req);
3111         io_cqring_add_event(req, ret);
3112         io_put_req(req);
3113         return 0;
3114 }
3115
3116 static int io_provide_buffers_prep(struct io_kiocb *req,
3117                                    const struct io_uring_sqe *sqe)
3118 {
3119         struct io_provide_buf *p = &req->pbuf;
3120         u64 tmp;
3121
3122         if (sqe->ioprio || sqe->rw_flags)
3123                 return -EINVAL;
3124
3125         tmp = READ_ONCE(sqe->fd);
3126         if (!tmp || tmp > USHRT_MAX)
3127                 return -E2BIG;
3128         p->nbufs = tmp;
3129         p->addr = READ_ONCE(sqe->addr);
3130         p->len = READ_ONCE(sqe->len);
3131
3132         if (!access_ok(u64_to_user_ptr(p->addr), p->len))
3133                 return -EFAULT;
3134
3135         p->bgid = READ_ONCE(sqe->buf_group);
3136         tmp = READ_ONCE(sqe->off);
3137         if (tmp > USHRT_MAX)
3138                 return -E2BIG;
3139         p->bid = tmp;
3140         return 0;
3141 }
3142
3143 static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
3144 {
3145         struct io_buffer *buf;
3146         u64 addr = pbuf->addr;
3147         int i, bid = pbuf->bid;
3148
3149         for (i = 0; i < pbuf->nbufs; i++) {
3150                 buf = kmalloc(sizeof(*buf), GFP_KERNEL);
3151                 if (!buf)
3152                         break;
3153
3154                 buf->addr = addr;
3155                 buf->len = pbuf->len;
3156                 buf->bid = bid;
3157                 addr += pbuf->len;
3158                 bid++;
3159                 if (!*head) {
3160                         INIT_LIST_HEAD(&buf->list);
3161                         *head = buf;
3162                 } else {
3163                         list_add_tail(&buf->list, &(*head)->list);
3164                 }
3165         }
3166
3167         return i ? i : -ENOMEM;
3168 }
3169
3170 static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock)
3171 {
3172         struct io_provide_buf *p = &req->pbuf;
3173         struct io_ring_ctx *ctx = req->ctx;
3174         struct io_buffer *head, *list;
3175         int ret = 0;
3176
3177         io_ring_submit_lock(ctx, !force_nonblock);
3178
3179         lockdep_assert_held(&ctx->uring_lock);
3180
3181         list = head = idr_find(&ctx->io_buffer_idr, p->bgid);
3182
3183         ret = io_add_buffers(p, &head);
3184         if (ret < 0)
3185                 goto out;
3186
3187         if (!list) {
3188                 ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1,
3189                                         GFP_KERNEL);
3190                 if (ret < 0) {
3191                         __io_remove_buffers(ctx, head, p->bgid, -1U);
3192                         goto out;
3193                 }
3194         }
3195 out:
3196         io_ring_submit_unlock(ctx, !force_nonblock);
3197         if (ret < 0)
3198                 req_set_fail_links(req);
3199         io_cqring_add_event(req, ret);
3200         io_put_req(req);
3201         return 0;
3202 }
3203
3204 static int io_epoll_ctl_prep(struct io_kiocb *req,
3205                              const struct io_uring_sqe *sqe)
3206 {
3207 #if defined(CONFIG_EPOLL)
3208         if (sqe->ioprio || sqe->buf_index)
3209                 return -EINVAL;
3210
3211         req->epoll.epfd = READ_ONCE(sqe->fd);
3212         req->epoll.op = READ_ONCE(sqe->len);
3213         req->epoll.fd = READ_ONCE(sqe->off);
3214
3215         if (ep_op_has_event(req->epoll.op)) {
3216                 struct epoll_event __user *ev;
3217
3218                 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
3219                 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
3220                         return -EFAULT;
3221         }
3222
3223         return 0;
3224 #else
3225         return -EOPNOTSUPP;
3226 #endif
3227 }
3228
3229 static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock)
3230 {
3231 #if defined(CONFIG_EPOLL)
3232         struct io_epoll *ie = &req->epoll;
3233         int ret;
3234
3235         ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
3236         if (force_nonblock && ret == -EAGAIN)
3237                 return -EAGAIN;
3238
3239         if (ret < 0)
3240                 req_set_fail_links(req);
3241         io_cqring_add_event(req, ret);
3242         io_put_req(req);
3243         return 0;
3244 #else
3245         return -EOPNOTSUPP;
3246 #endif
3247 }
3248
3249 static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3250 {
3251 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
3252         if (sqe->ioprio || sqe->buf_index || sqe->off)
3253                 return -EINVAL;
3254
3255         req->madvise.addr = READ_ONCE(sqe->addr);
3256         req->madvise.len = READ_ONCE(sqe->len);
3257         req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
3258         return 0;
3259 #else
3260         return -EOPNOTSUPP;
3261 #endif
3262 }
3263
3264 static int io_madvise(struct io_kiocb *req, bool force_nonblock)
3265 {
3266 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
3267         struct io_madvise *ma = &req->madvise;
3268         int ret;
3269
3270         if (force_nonblock)
3271                 return -EAGAIN;
3272
3273         ret = do_madvise(ma->addr, ma->len, ma->advice);
3274         if (ret < 0)
3275                 req_set_fail_links(req);
3276         io_cqring_add_event(req, ret);
3277         io_put_req(req);
3278         return 0;
3279 #else
3280         return -EOPNOTSUPP;
3281 #endif
3282 }
3283
3284 static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3285 {
3286         if (sqe->ioprio || sqe->buf_index || sqe->addr)
3287                 return -EINVAL;
3288
3289         req->fadvise.offset = READ_ONCE(sqe->off);
3290         req->fadvise.len = READ_ONCE(sqe->len);
3291         req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
3292         return 0;
3293 }
3294
3295 static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
3296 {
3297         struct io_fadvise *fa = &req->fadvise;
3298         int ret;
3299
3300         if (force_nonblock) {
3301                 switch (fa->advice) {
3302                 case POSIX_FADV_NORMAL:
3303                 case POSIX_FADV_RANDOM:
3304                 case POSIX_FADV_SEQUENTIAL:
3305                         break;
3306                 default:
3307                         return -EAGAIN;
3308                 }
3309         }
3310
3311         ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
3312         if (ret < 0)
3313                 req_set_fail_links(req);
3314         io_cqring_add_event(req, ret);
3315         io_put_req(req);
3316         return 0;
3317 }
3318
3319 static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3320 {
3321         const char __user *fname;
3322         unsigned lookup_flags;
3323         int ret;
3324
3325         if (sqe->ioprio || sqe->buf_index)
3326                 return -EINVAL;
3327         if (req->flags & REQ_F_FIXED_FILE)
3328                 return -EBADF;
3329         if (req->flags & REQ_F_NEED_CLEANUP)
3330                 return 0;
3331
3332         req->open.dfd = READ_ONCE(sqe->fd);
3333         req->open.mask = READ_ONCE(sqe->len);
3334         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3335         req->open.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3336         req->open.how.flags = READ_ONCE(sqe->statx_flags);
3337
3338         if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.how.flags))
3339                 return -EINVAL;
3340
3341         req->open.filename = getname_flags(fname, lookup_flags, NULL);
3342         if (IS_ERR(req->open.filename)) {
3343                 ret = PTR_ERR(req->open.filename);
3344                 req->open.filename = NULL;
3345                 return ret;
3346         }
3347
3348         req->flags |= REQ_F_NEED_CLEANUP;
3349         return 0;
3350 }
3351
3352 static int io_statx(struct io_kiocb *req, bool force_nonblock)
3353 {
3354         struct io_open *ctx = &req->open;
3355         unsigned lookup_flags;
3356         struct path path;
3357         struct kstat stat;
3358         int ret;
3359
3360         if (force_nonblock) {
3361                 /* only need file table for an actual valid fd */
3362                 if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD)
3363                         req->flags |= REQ_F_NO_FILE_TABLE;
3364                 return -EAGAIN;
3365         }
3366
3367         if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->how.flags))
3368                 return -EINVAL;
3369
3370 retry:
3371         /* filename_lookup() drops it, keep a reference */
3372         ctx->filename->refcnt++;
3373
3374         ret = filename_lookup(ctx->dfd, ctx->filename, lookup_flags, &path,
3375                                 NULL);
3376         if (ret)
3377                 goto err;
3378
3379         ret = vfs_getattr(&path, &stat, ctx->mask, ctx->how.flags);
3380         path_put(&path);
3381         if (retry_estale(ret, lookup_flags)) {
3382                 lookup_flags |= LOOKUP_REVAL;
3383                 goto retry;
3384         }
3385         if (!ret)
3386                 ret = cp_statx(&stat, ctx->buffer);
3387 err:
3388         putname(ctx->filename);
3389         req->flags &= ~REQ_F_NEED_CLEANUP;
3390         if (ret < 0)
3391                 req_set_fail_links(req);
3392         io_cqring_add_event(req, ret);
3393         io_put_req(req);
3394         return 0;
3395 }
3396
3397 static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3398 {
3399         /*
3400          * If we queue this for async, it must not be cancellable. That would
3401          * leave the 'file' in an undeterminate state.
3402          */
3403         req->work.flags |= IO_WQ_WORK_NO_CANCEL;
3404
3405         if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
3406             sqe->rw_flags || sqe->buf_index)
3407                 return -EINVAL;
3408         if (req->flags & REQ_F_FIXED_FILE)
3409                 return -EBADF;
3410
3411         req->close.fd = READ_ONCE(sqe->fd);
3412         if (req->file->f_op == &io_uring_fops ||
3413             req->close.fd == req->ctx->ring_fd)
3414                 return -EBADF;
3415
3416         return 0;
3417 }
3418
3419 /* only called when __close_fd_get_file() is done */
3420 static void __io_close_finish(struct io_kiocb *req)
3421 {
3422         int ret;
3423
3424         ret = filp_close(req->close.put_file, req->work.files);
3425         if (ret < 0)
3426                 req_set_fail_links(req);
3427         io_cqring_add_event(req, ret);
3428         fput(req->close.put_file);
3429         io_put_req(req);
3430 }
3431
3432 static void io_close_finish(struct io_wq_work **workptr)
3433 {
3434         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
3435
3436         /* not cancellable, don't do io_req_cancelled() */
3437         __io_close_finish(req);
3438         io_steal_work(req, workptr);
3439 }
3440
3441 static int io_close(struct io_kiocb *req, bool force_nonblock)
3442 {
3443         int ret;
3444
3445         req->close.put_file = NULL;
3446         ret = __close_fd_get_file(req->close.fd, &req->close.put_file);
3447         if (ret < 0)
3448                 return ret;
3449
3450         /* if the file has a flush method, be safe and punt to async */
3451         if (req->close.put_file->f_op->flush && force_nonblock) {
3452                 /* submission ref will be dropped, take it for async */
3453                 refcount_inc(&req->refs);
3454
3455                 req->work.func = io_close_finish;
3456                 /*
3457                  * Do manual async queue here to avoid grabbing files - we don't
3458                  * need the files, and it'll cause io_close_finish() to close
3459                  * the file again and cause a double CQE entry for this request
3460                  */
3461                 io_queue_async_work(req);
3462                 return 0;
3463         }
3464
3465         /*
3466          * No ->flush(), safely close from here and just punt the
3467          * fput() to async context.
3468          */
3469         __io_close_finish(req);
3470         return 0;
3471 }
3472
3473 static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3474 {
3475         struct io_ring_ctx *ctx = req->ctx;
3476
3477         if (!req->file)
3478                 return -EBADF;
3479
3480         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3481                 return -EINVAL;
3482         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
3483                 return -EINVAL;
3484
3485         req->sync.off = READ_ONCE(sqe->off);
3486         req->sync.len = READ_ONCE(sqe->len);
3487         req->sync.flags = READ_ONCE(sqe->sync_range_flags);
3488         return 0;
3489 }
3490
3491 static void __io_sync_file_range(struct io_kiocb *req)
3492 {
3493         int ret;
3494
3495         ret = sync_file_range(req->file, req->sync.off, req->sync.len,
3496                                 req->sync.flags);
3497         if (ret < 0)
3498                 req_set_fail_links(req);
3499         io_cqring_add_event(req, ret);
3500         io_put_req(req);
3501 }
3502
3503
3504 static void io_sync_file_range_finish(struct io_wq_work **workptr)
3505 {
3506         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
3507
3508         if (io_req_cancelled(req))
3509                 return;
3510         __io_sync_file_range(req);
3511         io_steal_work(req, workptr);
3512 }
3513
3514 static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
3515 {
3516         /* sync_file_range always requires a blocking context */
3517         if (force_nonblock) {
3518                 req->work.func = io_sync_file_range_finish;
3519                 return -EAGAIN;
3520         }
3521
3522         __io_sync_file_range(req);
3523         return 0;
3524 }
3525
3526 #if defined(CONFIG_NET)
3527 static int io_setup_async_msg(struct io_kiocb *req,
3528                               struct io_async_msghdr *kmsg)
3529 {
3530         if (req->io)
3531                 return -EAGAIN;
3532         if (io_alloc_async_ctx(req)) {
3533                 if (kmsg->iov != kmsg->fast_iov)
3534                         kfree(kmsg->iov);
3535                 return -ENOMEM;
3536         }
3537         req->flags |= REQ_F_NEED_CLEANUP;
3538         memcpy(&req->io->msg, kmsg, sizeof(*kmsg));
3539         return -EAGAIN;
3540 }
3541
3542 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3543 {
3544         struct io_sr_msg *sr = &req->sr_msg;
3545         struct io_async_ctx *io = req->io;
3546         int ret;
3547
3548         sr->msg_flags = READ_ONCE(sqe->msg_flags);
3549         sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
3550         sr->len = READ_ONCE(sqe->len);
3551
3552 #ifdef CONFIG_COMPAT
3553         if (req->ctx->compat)
3554                 sr->msg_flags |= MSG_CMSG_COMPAT;
3555 #endif
3556
3557         if (!io || req->opcode == IORING_OP_SEND)
3558                 return 0;
3559         /* iovec is already imported */
3560         if (req->flags & REQ_F_NEED_CLEANUP)
3561                 return 0;
3562
3563         io->msg.iov = io->msg.fast_iov;
3564         ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
3565                                         &io->msg.iov);
3566         if (!ret)
3567                 req->flags |= REQ_F_NEED_CLEANUP;
3568         return ret;
3569 }
3570
3571 static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
3572 {
3573         struct io_async_msghdr *kmsg = NULL;
3574         struct socket *sock;
3575         int ret;
3576
3577         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3578                 return -EINVAL;
3579
3580         sock = sock_from_file(req->file, &ret);
3581         if (sock) {
3582                 struct io_async_ctx io;
3583                 unsigned flags;
3584
3585                 if (req->io) {
3586                         kmsg = &req->io->msg;
3587                         kmsg->msg.msg_name = &req->io->msg.addr;
3588                         /* if iov is set, it's allocated already */
3589                         if (!kmsg->iov)
3590                                 kmsg->iov = kmsg->fast_iov;
3591                         kmsg->msg.msg_iter.iov = kmsg->iov;
3592                 } else {
3593                         struct io_sr_msg *sr = &req->sr_msg;
3594
3595                         kmsg = &io.msg;
3596                         kmsg->msg.msg_name = &io.msg.addr;
3597
3598                         io.msg.iov = io.msg.fast_iov;
3599                         ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg,
3600                                         sr->msg_flags, &io.msg.iov);
3601                         if (ret)
3602                                 return ret;
3603                 }
3604
3605                 flags = req->sr_msg.msg_flags;
3606                 if (flags & MSG_DONTWAIT)
3607                         req->flags |= REQ_F_NOWAIT;
3608                 else if (force_nonblock)
3609                         flags |= MSG_DONTWAIT;
3610
3611                 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
3612                 if (force_nonblock && ret == -EAGAIN)
3613                         return io_setup_async_msg(req, kmsg);
3614                 if (ret == -ERESTARTSYS)
3615                         ret = -EINTR;
3616         }
3617
3618         if (kmsg && kmsg->iov != kmsg->fast_iov)
3619                 kfree(kmsg->iov);
3620         req->flags &= ~REQ_F_NEED_CLEANUP;
3621         io_cqring_add_event(req, ret);
3622         if (ret < 0)
3623                 req_set_fail_links(req);
3624         io_put_req(req);
3625         return 0;
3626 }
3627
3628 static int io_send(struct io_kiocb *req, bool force_nonblock)
3629 {
3630         struct socket *sock;
3631         int ret;
3632
3633         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3634                 return -EINVAL;
3635
3636         sock = sock_from_file(req->file, &ret);
3637         if (sock) {
3638                 struct io_sr_msg *sr = &req->sr_msg;
3639                 struct msghdr msg;
3640                 struct iovec iov;
3641                 unsigned flags;
3642
3643                 ret = import_single_range(WRITE, sr->buf, sr->len, &iov,
3644                                                 &msg.msg_iter);
3645                 if (ret)
3646                         return ret;
3647
3648                 msg.msg_name = NULL;
3649                 msg.msg_control = NULL;
3650                 msg.msg_controllen = 0;
3651                 msg.msg_namelen = 0;
3652
3653                 flags = req->sr_msg.msg_flags;
3654                 if (flags & MSG_DONTWAIT)
3655                         req->flags |= REQ_F_NOWAIT;
3656                 else if (force_nonblock)
3657                         flags |= MSG_DONTWAIT;
3658
3659                 msg.msg_flags = flags;
3660                 ret = sock_sendmsg(sock, &msg);
3661                 if (force_nonblock && ret == -EAGAIN)
3662                         return -EAGAIN;
3663                 if (ret == -ERESTARTSYS)
3664                         ret = -EINTR;
3665         }
3666
3667         io_cqring_add_event(req, ret);
3668         if (ret < 0)
3669                 req_set_fail_links(req);
3670         io_put_req(req);
3671         return 0;
3672 }
3673
3674 static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
3675 {
3676         struct io_sr_msg *sr = &req->sr_msg;
3677         struct iovec __user *uiov;
3678         size_t iov_len;
3679         int ret;
3680
3681         ret = __copy_msghdr_from_user(&io->msg.msg, sr->msg, &io->msg.uaddr,
3682                                         &uiov, &iov_len);
3683         if (ret)
3684                 return ret;
3685
3686         if (req->flags & REQ_F_BUFFER_SELECT) {
3687                 if (iov_len > 1)
3688                         return -EINVAL;
3689                 if (copy_from_user(io->msg.iov, uiov, sizeof(*uiov)))
3690                         return -EFAULT;
3691                 sr->len = io->msg.iov[0].iov_len;
3692                 iov_iter_init(&io->msg.msg.msg_iter, READ, io->msg.iov, 1,
3693                                 sr->len);
3694                 io->msg.iov = NULL;
3695         } else {
3696                 ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
3697                                         &io->msg.iov, &io->msg.msg.msg_iter);
3698                 if (ret > 0)
3699                         ret = 0;
3700         }
3701
3702         return ret;
3703 }
3704
3705 #ifdef CONFIG_COMPAT
3706 static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
3707                                         struct io_async_ctx *io)
3708 {
3709         struct compat_msghdr __user *msg_compat;
3710         struct io_sr_msg *sr = &req->sr_msg;
3711         struct compat_iovec __user *uiov;
3712         compat_uptr_t ptr;
3713         compat_size_t len;
3714         int ret;
3715
3716         msg_compat = (struct compat_msghdr __user *) sr->msg;
3717         ret = __get_compat_msghdr(&io->msg.msg, msg_compat, &io->msg.uaddr,
3718                                         &ptr, &len);
3719         if (ret)
3720                 return ret;
3721
3722         uiov = compat_ptr(ptr);
3723         if (req->flags & REQ_F_BUFFER_SELECT) {
3724                 compat_ssize_t clen;
3725
3726                 if (len > 1)
3727                         return -EINVAL;
3728                 if (!access_ok(uiov, sizeof(*uiov)))
3729                         return -EFAULT;
3730                 if (__get_user(clen, &uiov->iov_len))
3731                         return -EFAULT;
3732                 if (clen < 0)
3733                         return -EINVAL;
3734                 sr->len = io->msg.iov[0].iov_len;
3735                 io->msg.iov = NULL;
3736         } else {
3737                 ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV,
3738                                                 &io->msg.iov,
3739                                                 &io->msg.msg.msg_iter);
3740                 if (ret < 0)
3741                         return ret;
3742         }
3743
3744         return 0;
3745 }
3746 #endif
3747
3748 static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
3749 {
3750         io->msg.iov = io->msg.fast_iov;
3751
3752 #ifdef CONFIG_COMPAT
3753         if (req->ctx->compat)
3754                 return __io_compat_recvmsg_copy_hdr(req, io);
3755 #endif
3756
3757         return __io_recvmsg_copy_hdr(req, io);
3758 }
3759
3760 static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
3761                                                int *cflags, bool needs_lock)
3762 {
3763         struct io_sr_msg *sr = &req->sr_msg;
3764         struct io_buffer *kbuf;
3765
3766         if (!(req->flags & REQ_F_BUFFER_SELECT))
3767                 return NULL;
3768
3769         kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
3770         if (IS_ERR(kbuf))
3771                 return kbuf;
3772
3773         sr->kbuf = kbuf;
3774         req->flags |= REQ_F_BUFFER_SELECTED;
3775
3776         *cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
3777         *cflags |= IORING_CQE_F_BUFFER;
3778         return kbuf;
3779 }
3780
3781 static int io_recvmsg_prep(struct io_kiocb *req,
3782                            const struct io_uring_sqe *sqe)
3783 {
3784         struct io_sr_msg *sr = &req->sr_msg;
3785         struct io_async_ctx *io = req->io;
3786         int ret;
3787
3788         sr->msg_flags = READ_ONCE(sqe->msg_flags);
3789         sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
3790         sr->len = READ_ONCE(sqe->len);
3791         sr->bgid = READ_ONCE(sqe->buf_group);
3792
3793 #ifdef CONFIG_COMPAT
3794         if (req->ctx->compat)
3795                 sr->msg_flags |= MSG_CMSG_COMPAT;
3796 #endif
3797
3798         if (!io || req->opcode == IORING_OP_RECV)
3799                 return 0;
3800         /* iovec is already imported */
3801         if (req->flags & REQ_F_NEED_CLEANUP)
3802                 return 0;
3803
3804         ret = io_recvmsg_copy_hdr(req, io);
3805         if (!ret)
3806                 req->flags |= REQ_F_NEED_CLEANUP;
3807         return ret;
3808 }
3809
3810 static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
3811 {
3812         struct io_async_msghdr *kmsg = NULL;
3813         struct socket *sock;
3814         int ret, cflags = 0;
3815
3816         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3817                 return -EINVAL;
3818
3819         sock = sock_from_file(req->file, &ret);
3820         if (sock) {
3821                 struct io_buffer *kbuf;
3822                 struct io_async_ctx io;
3823                 unsigned flags;
3824
3825                 if (req->io) {
3826                         kmsg = &req->io->msg;
3827                         kmsg->msg.msg_name = &req->io->msg.addr;
3828                         /* if iov is set, it's allocated already */
3829                         if (!kmsg->iov)
3830                                 kmsg->iov = kmsg->fast_iov;
3831                         kmsg->msg.msg_iter.iov = kmsg->iov;
3832                 } else {
3833                         kmsg = &io.msg;
3834                         kmsg->msg.msg_name = &io.msg.addr;
3835
3836                         ret = io_recvmsg_copy_hdr(req, &io);
3837                         if (ret)
3838                                 return ret;
3839                 }
3840
3841                 kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
3842                 if (IS_ERR(kbuf)) {
3843                         return PTR_ERR(kbuf);
3844                 } else if (kbuf) {
3845                         kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
3846                         iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov,
3847                                         1, req->sr_msg.len);
3848                 }
3849
3850                 flags = req->sr_msg.msg_flags;
3851                 if (flags & MSG_DONTWAIT)
3852                         req->flags |= REQ_F_NOWAIT;
3853                 else if (force_nonblock)
3854                         flags |= MSG_DONTWAIT;
3855
3856                 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg,
3857                                                 kmsg->uaddr, flags);
3858                 if (force_nonblock && ret == -EAGAIN)
3859                         return io_setup_async_msg(req, kmsg);
3860                 if (ret == -ERESTARTSYS)
3861                         ret = -EINTR;
3862         }
3863
3864         if (kmsg && kmsg->iov != kmsg->fast_iov)
3865                 kfree(kmsg->iov);
3866         req->flags &= ~REQ_F_NEED_CLEANUP;
3867         __io_cqring_add_event(req, ret, cflags);
3868         if (ret < 0)
3869                 req_set_fail_links(req);
3870         io_put_req(req);
3871         return 0;
3872 }
3873
3874 static int io_recv(struct io_kiocb *req, bool force_nonblock)
3875 {
3876         struct io_buffer *kbuf = NULL;
3877         struct socket *sock;
3878         int ret, cflags = 0;
3879
3880         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3881                 return -EINVAL;
3882
3883         sock = sock_from_file(req->file, &ret);
3884         if (sock) {
3885                 struct io_sr_msg *sr = &req->sr_msg;
3886                 void __user *buf = sr->buf;
3887                 struct msghdr msg;
3888                 struct iovec iov;
3889                 unsigned flags;
3890
3891                 kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
3892                 if (IS_ERR(kbuf))
3893                         return PTR_ERR(kbuf);
3894                 else if (kbuf)
3895                         buf = u64_to_user_ptr(kbuf->addr);
3896
3897                 ret = import_single_range(READ, buf, sr->len, &iov,
3898                                                 &msg.msg_iter);
3899                 if (ret) {
3900                         kfree(kbuf);
3901                         return ret;
3902                 }
3903
3904                 req->flags |= REQ_F_NEED_CLEANUP;
3905                 msg.msg_name = NULL;
3906                 msg.msg_control = NULL;
3907                 msg.msg_controllen = 0;
3908                 msg.msg_namelen = 0;
3909                 msg.msg_iocb = NULL;
3910                 msg.msg_flags = 0;
3911
3912                 flags = req->sr_msg.msg_flags;
3913                 if (flags & MSG_DONTWAIT)
3914                         req->flags |= REQ_F_NOWAIT;
3915                 else if (force_nonblock)
3916                         flags |= MSG_DONTWAIT;
3917
3918                 ret = sock_recvmsg(sock, &msg, flags);
3919                 if (force_nonblock && ret == -EAGAIN)
3920                         return -EAGAIN;
3921                 if (ret == -ERESTARTSYS)
3922                         ret = -EINTR;
3923         }
3924
3925         kfree(kbuf);
3926         req->flags &= ~REQ_F_NEED_CLEANUP;
3927         __io_cqring_add_event(req, ret, cflags);
3928         if (ret < 0)
3929                 req_set_fail_links(req);
3930         io_put_req(req);
3931         return 0;
3932 }
3933
3934 static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3935 {
3936         struct io_accept *accept = &req->accept;
3937
3938         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3939                 return -EINVAL;
3940         if (sqe->ioprio || sqe->len || sqe->buf_index)
3941                 return -EINVAL;
3942
3943         accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
3944         accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3945         accept->flags = READ_ONCE(sqe->accept_flags);
3946         accept->nofile = rlimit(RLIMIT_NOFILE);
3947         return 0;
3948 }
3949
3950 static int __io_accept(struct io_kiocb *req, bool force_nonblock)
3951 {
3952         struct io_accept *accept = &req->accept;
3953         unsigned file_flags;
3954         int ret;
3955
3956         file_flags = force_nonblock ? O_NONBLOCK : 0;
3957         ret = __sys_accept4_file(req->file, file_flags, accept->addr,
3958                                         accept->addr_len, accept->flags,
3959                                         accept->nofile);
3960         if (ret == -EAGAIN && force_nonblock)
3961                 return -EAGAIN;
3962         if (ret == -ERESTARTSYS)
3963                 ret = -EINTR;
3964         if (ret < 0)
3965                 req_set_fail_links(req);
3966         io_cqring_add_event(req, ret);
3967         io_put_req(req);
3968         return 0;
3969 }
3970
3971 static void io_accept_finish(struct io_wq_work **workptr)
3972 {
3973         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
3974
3975         if (io_req_cancelled(req))
3976                 return;
3977         __io_accept(req, false);
3978         io_steal_work(req, workptr);
3979 }
3980
3981 static int io_accept(struct io_kiocb *req, bool force_nonblock)
3982 {
3983         int ret;
3984
3985         ret = __io_accept(req, force_nonblock);
3986         if (ret == -EAGAIN && force_nonblock) {
3987                 req->work.func = io_accept_finish;
3988                 return -EAGAIN;
3989         }
3990         return 0;
3991 }
3992
3993 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3994 {
3995         struct io_connect *conn = &req->connect;
3996         struct io_async_ctx *io = req->io;
3997
3998         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3999                 return -EINVAL;
4000         if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
4001                 return -EINVAL;
4002
4003         conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4004         conn->addr_len =  READ_ONCE(sqe->addr2);
4005
4006         if (!io)
4007                 return 0;
4008
4009         return move_addr_to_kernel(conn->addr, conn->addr_len,
4010                                         &io->connect.address);
4011 }
4012
4013 static int io_connect(struct io_kiocb *req, bool force_nonblock)
4014 {
4015         struct io_async_ctx __io, *io;
4016         unsigned file_flags;
4017         int ret;
4018
4019         if (req->io) {
4020                 io = req->io;
4021         } else {
4022                 ret = move_addr_to_kernel(req->connect.addr,
4023                                                 req->connect.addr_len,
4024                                                 &__io.connect.address);
4025                 if (ret)
4026                         goto out;
4027                 io = &__io;
4028         }
4029
4030         file_flags = force_nonblock ? O_NONBLOCK : 0;
4031
4032         ret = __sys_connect_file(req->file, &io->connect.address,
4033                                         req->connect.addr_len, file_flags);
4034         if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
4035                 if (req->io)
4036                         return -EAGAIN;
4037                 if (io_alloc_async_ctx(req)) {
4038                         ret = -ENOMEM;
4039                         goto out;
4040                 }
4041                 memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect));
4042                 return -EAGAIN;
4043         }
4044         if (ret == -ERESTARTSYS)
4045                 ret = -EINTR;
4046 out:
4047         if (ret < 0)
4048                 req_set_fail_links(req);
4049         io_cqring_add_event(req, ret);
4050         io_put_req(req);
4051         return 0;
4052 }
4053 #else /* !CONFIG_NET */
4054 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4055 {
4056         return -EOPNOTSUPP;
4057 }
4058
4059 static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
4060 {
4061         return -EOPNOTSUPP;
4062 }
4063
4064 static int io_send(struct io_kiocb *req, bool force_nonblock)
4065 {
4066         return -EOPNOTSUPP;
4067 }
4068
4069 static int io_recvmsg_prep(struct io_kiocb *req,
4070                            const struct io_uring_sqe *sqe)
4071 {
4072         return -EOPNOTSUPP;
4073 }
4074
4075 static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
4076 {
4077         return -EOPNOTSUPP;
4078 }
4079
4080 static int io_recv(struct io_kiocb *req, bool force_nonblock)
4081 {
4082         return -EOPNOTSUPP;
4083 }
4084
4085 static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4086 {
4087         return -EOPNOTSUPP;
4088 }
4089
4090 static int io_accept(struct io_kiocb *req, bool force_nonblock)
4091 {
4092         return -EOPNOTSUPP;
4093 }
4094
4095 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4096 {
4097         return -EOPNOTSUPP;
4098 }
4099
4100 static int io_connect(struct io_kiocb *req, bool force_nonblock)
4101 {
4102         return -EOPNOTSUPP;
4103 }
4104 #endif /* CONFIG_NET */
4105
4106 struct io_poll_table {
4107         struct poll_table_struct pt;
4108         struct io_kiocb *req;
4109         int error;
4110 };
4111
4112 static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
4113                            __poll_t mask, task_work_func_t func)
4114 {
4115         struct task_struct *tsk;
4116         int ret;
4117
4118         /* for instances that support it check for an event match first: */
4119         if (mask && !(mask & poll->events))
4120                 return 0;
4121
4122         trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
4123
4124         list_del_init(&poll->wait.entry);
4125
4126         tsk = req->task;
4127         req->result = mask;
4128         init_task_work(&req->task_work, func);
4129         /*
4130          * If this fails, then the task is exiting. When a task exits, the
4131          * work gets canceled, so just cancel this request as well instead
4132          * of executing it. We can't safely execute it anyway, as we may not
4133          * have the needed state needed for it anyway.
4134          */
4135         ret = task_work_add(tsk, &req->task_work, true);
4136         if (unlikely(ret)) {
4137                 WRITE_ONCE(poll->canceled, true);
4138                 tsk = io_wq_get_task(req->ctx->io_wq);
4139                 task_work_add(tsk, &req->task_work, true);
4140         }
4141         wake_up_process(tsk);
4142         return 1;
4143 }
4144
4145 static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
4146         __acquires(&req->ctx->completion_lock)
4147 {
4148         struct io_ring_ctx *ctx = req->ctx;
4149
4150         if (!req->result && !READ_ONCE(poll->canceled)) {
4151                 struct poll_table_struct pt = { ._key = poll->events };
4152
4153                 req->result = vfs_poll(req->file, &pt) & poll->events;
4154         }
4155
4156         spin_lock_irq(&ctx->completion_lock);
4157         if (!req->result && !READ_ONCE(poll->canceled)) {
4158                 add_wait_queue(poll->head, &poll->wait);
4159                 return true;
4160         }
4161
4162         return false;
4163 }
4164
4165 static void io_poll_remove_double(struct io_kiocb *req)
4166 {
4167         struct io_poll_iocb *poll = (struct io_poll_iocb *) req->io;
4168
4169         lockdep_assert_held(&req->ctx->completion_lock);
4170
4171         if (poll && poll->head) {
4172                 struct wait_queue_head *head = poll->head;
4173
4174                 spin_lock(&head->lock);
4175                 list_del_init(&poll->wait.entry);
4176                 if (poll->wait.private)
4177                         refcount_dec(&req->refs);
4178                 poll->head = NULL;
4179                 spin_unlock(&head->lock);
4180         }
4181 }
4182
4183 static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
4184 {
4185         struct io_ring_ctx *ctx = req->ctx;
4186
4187         io_poll_remove_double(req);
4188         req->poll.done = true;
4189         io_cqring_fill_event(req, error ? error : mangle_poll(mask));
4190         io_commit_cqring(ctx);
4191 }
4192
4193 static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
4194 {
4195         struct io_ring_ctx *ctx = req->ctx;
4196
4197         if (io_poll_rewait(req, &req->poll)) {
4198                 spin_unlock_irq(&ctx->completion_lock);
4199                 return;
4200         }
4201
4202         hash_del(&req->hash_node);
4203         io_poll_complete(req, req->result, 0);
4204         req->flags |= REQ_F_COMP_LOCKED;
4205         io_put_req_find_next(req, nxt);
4206         spin_unlock_irq(&ctx->completion_lock);
4207
4208         io_cqring_ev_posted(ctx);
4209 }
4210
4211 static void io_poll_task_func(struct callback_head *cb)
4212 {
4213         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
4214         struct io_kiocb *nxt = NULL;
4215
4216         io_poll_task_handler(req, &nxt);
4217         if (nxt) {
4218                 struct io_ring_ctx *ctx = nxt->ctx;
4219
4220                 mutex_lock(&ctx->uring_lock);
4221                 __io_queue_sqe(nxt, NULL);
4222                 mutex_unlock(&ctx->uring_lock);
4223         }
4224 }
4225
4226 static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
4227                                int sync, void *key)
4228 {
4229         struct io_kiocb *req = wait->private;
4230         struct io_poll_iocb *poll = (struct io_poll_iocb *) req->io;
4231         __poll_t mask = key_to_poll(key);
4232
4233         /* for instances that support it check for an event match first: */
4234         if (mask && !(mask & poll->events))
4235                 return 0;
4236
4237         if (req->poll.head) {
4238                 bool done;
4239
4240                 spin_lock(&req->poll.head->lock);
4241                 done = list_empty(&req->poll.wait.entry);
4242                 if (!done)
4243                         list_del_init(&req->poll.wait.entry);
4244                 spin_unlock(&req->poll.head->lock);
4245                 if (!done)
4246                         __io_async_wake(req, poll, mask, io_poll_task_func);
4247         }
4248         refcount_dec(&req->refs);
4249         return 1;
4250 }
4251
4252 static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
4253                               wait_queue_func_t wake_func)
4254 {
4255         poll->head = NULL;
4256         poll->done = false;
4257         poll->canceled = false;
4258         poll->events = events;
4259         INIT_LIST_HEAD(&poll->wait.entry);
4260         init_waitqueue_func_entry(&poll->wait, wake_func);
4261 }
4262
4263 static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
4264                             struct wait_queue_head *head)
4265 {
4266         struct io_kiocb *req = pt->req;
4267
4268         /*
4269          * If poll->head is already set, it's because the file being polled
4270          * uses multiple waitqueues for poll handling (eg one for read, one
4271          * for write). Setup a separate io_poll_iocb if this happens.
4272          */
4273         if (unlikely(poll->head)) {
4274                 /* already have a 2nd entry, fail a third attempt */
4275                 if (req->io) {
4276                         pt->error = -EINVAL;
4277                         return;
4278                 }
4279                 poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
4280                 if (!poll) {
4281                         pt->error = -ENOMEM;
4282                         return;
4283                 }
4284                 io_init_poll_iocb(poll, req->poll.events, io_poll_double_wake);
4285                 refcount_inc(&req->refs);
4286                 poll->wait.private = req;
4287                 req->io = (void *) poll;
4288         }
4289
4290         pt->error = 0;
4291         poll->head = head;
4292         add_wait_queue(head, &poll->wait);
4293 }
4294
4295 static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
4296                                struct poll_table_struct *p)
4297 {
4298         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
4299
4300         __io_queue_proc(&pt->req->apoll->poll, pt, head);
4301 }
4302
4303 static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
4304 {
4305         struct mm_struct *mm = current->mm;
4306
4307         if (mm) {
4308                 unuse_mm(mm);
4309                 mmput(mm);
4310         }
4311 }
4312
4313 static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
4314                                    struct io_kiocb *req)
4315 {
4316         if (io_op_defs[req->opcode].needs_mm && !current->mm) {
4317                 if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
4318                         return -EFAULT;
4319                 use_mm(ctx->sqo_mm);
4320         }
4321
4322         return 0;
4323 }
4324
4325 static void io_async_task_func(struct callback_head *cb)
4326 {
4327         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
4328         struct async_poll *apoll = req->apoll;
4329         struct io_ring_ctx *ctx = req->ctx;
4330         bool canceled;
4331
4332         trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
4333
4334         if (io_poll_rewait(req, &apoll->poll)) {
4335                 spin_unlock_irq(&ctx->completion_lock);
4336                 return;
4337         }
4338
4339         if (hash_hashed(&req->hash_node))
4340                 hash_del(&req->hash_node);
4341
4342         canceled = READ_ONCE(apoll->poll.canceled);
4343         if (canceled) {
4344                 io_cqring_fill_event(req, -ECANCELED);
4345                 io_commit_cqring(ctx);
4346         }
4347
4348         spin_unlock_irq(&ctx->completion_lock);
4349
4350         /* restore ->work in case we need to retry again */
4351         memcpy(&req->work, &apoll->work, sizeof(req->work));
4352
4353         if (canceled) {
4354                 kfree(apoll);
4355                 io_cqring_ev_posted(ctx);
4356 end_req:
4357                 req_set_fail_links(req);
4358                 io_double_put_req(req);
4359                 return;
4360         }
4361
4362         __set_current_state(TASK_RUNNING);
4363         if (io_sq_thread_acquire_mm(ctx, req)) {
4364                 io_cqring_add_event(req, -EFAULT);
4365                 goto end_req;
4366         }
4367         mutex_lock(&ctx->uring_lock);
4368         __io_queue_sqe(req, NULL);
4369         mutex_unlock(&ctx->uring_lock);
4370
4371         kfree(apoll);
4372 }
4373
4374 static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
4375                         void *key)
4376 {
4377         struct io_kiocb *req = wait->private;
4378         struct io_poll_iocb *poll = &req->apoll->poll;
4379
4380         trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
4381                                         key_to_poll(key));
4382
4383         return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
4384 }
4385
4386 static void io_poll_req_insert(struct io_kiocb *req)
4387 {
4388         struct io_ring_ctx *ctx = req->ctx;
4389         struct hlist_head *list;
4390
4391         list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
4392         hlist_add_head(&req->hash_node, list);
4393 }
4394
4395 static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
4396                                       struct io_poll_iocb *poll,
4397                                       struct io_poll_table *ipt, __poll_t mask,
4398                                       wait_queue_func_t wake_func)
4399         __acquires(&ctx->completion_lock)
4400 {
4401         struct io_ring_ctx *ctx = req->ctx;
4402         bool cancel = false;
4403
4404         poll->file = req->file;
4405         io_init_poll_iocb(poll, mask, wake_func);
4406         poll->wait.private = req;
4407
4408         ipt->pt._key = mask;
4409         ipt->req = req;
4410         ipt->error = -EINVAL;
4411
4412         mask = vfs_poll(req->file, &ipt->pt) & poll->events;
4413
4414         spin_lock_irq(&ctx->completion_lock);
4415         if (likely(poll->head)) {
4416                 spin_lock(&poll->head->lock);
4417                 if (unlikely(list_empty(&poll->wait.entry))) {
4418                         if (ipt->error)
4419                                 cancel = true;
4420                         ipt->error = 0;
4421                         mask = 0;
4422                 }
4423                 if (mask || ipt->error)
4424                         list_del_init(&poll->wait.entry);
4425                 else if (cancel)
4426                         WRITE_ONCE(poll->canceled, true);
4427                 else if (!poll->done) /* actually waiting for an event */
4428                         io_poll_req_insert(req);
4429                 spin_unlock(&poll->head->lock);
4430         }
4431
4432         return mask;
4433 }
4434
4435 static bool io_arm_poll_handler(struct io_kiocb *req)
4436 {
4437         const struct io_op_def *def = &io_op_defs[req->opcode];
4438         struct io_ring_ctx *ctx = req->ctx;
4439         struct async_poll *apoll;
4440         struct io_poll_table ipt;
4441         __poll_t mask, ret;
4442         bool had_io;
4443
4444         if (!req->file || !file_can_poll(req->file))
4445                 return false;
4446         if (req->flags & (REQ_F_MUST_PUNT | REQ_F_POLLED))
4447                 return false;
4448         if (!def->pollin && !def->pollout)
4449                 return false;
4450
4451         apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
4452         if (unlikely(!apoll))
4453                 return false;
4454
4455         req->flags |= REQ_F_POLLED;
4456         memcpy(&apoll->work, &req->work, sizeof(req->work));
4457         had_io = req->io != NULL;
4458
4459         get_task_struct(current);
4460         req->task = current;
4461         req->apoll = apoll;
4462         INIT_HLIST_NODE(&req->hash_node);
4463
4464         mask = 0;
4465         if (def->pollin)
4466                 mask |= POLLIN | POLLRDNORM;
4467         if (def->pollout)
4468                 mask |= POLLOUT | POLLWRNORM;
4469         mask |= POLLERR | POLLPRI;
4470
4471         ipt.pt._qproc = io_async_queue_proc;
4472
4473         ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
4474                                         io_async_wake);
4475         if (ret) {
4476                 ipt.error = 0;
4477                 /* only remove double add if we did it here */
4478                 if (!had_io)
4479                         io_poll_remove_double(req);
4480                 spin_unlock_irq(&ctx->completion_lock);
4481                 memcpy(&req->work, &apoll->work, sizeof(req->work));
4482                 kfree(apoll);
4483                 return false;
4484         }
4485         spin_unlock_irq(&ctx->completion_lock);
4486         trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask,
4487                                         apoll->poll.events);
4488         return true;
4489 }
4490
4491 static bool __io_poll_remove_one(struct io_kiocb *req,
4492                                  struct io_poll_iocb *poll)
4493 {
4494         bool do_complete = false;
4495
4496         spin_lock(&poll->head->lock);
4497         WRITE_ONCE(poll->canceled, true);
4498         if (!list_empty(&poll->wait.entry)) {
4499                 list_del_init(&poll->wait.entry);
4500                 do_complete = true;
4501         }
4502         spin_unlock(&poll->head->lock);
4503         hash_del(&req->hash_node);
4504         return do_complete;
4505 }
4506
4507 static bool io_poll_remove_one(struct io_kiocb *req)
4508 {
4509         bool do_complete;
4510
4511         if (req->opcode == IORING_OP_POLL_ADD) {
4512                 io_poll_remove_double(req);
4513                 do_complete = __io_poll_remove_one(req, &req->poll);
4514         } else {
4515                 struct async_poll *apoll = req->apoll;
4516
4517                 /* non-poll requests have submit ref still */
4518                 do_complete = __io_poll_remove_one(req, &apoll->poll);
4519                 if (do_complete) {
4520                         io_put_req(req);
4521                         /*
4522                          * restore ->work because we will call
4523                          * io_req_work_drop_env below when dropping the
4524                          * final reference.
4525                          */
4526                         memcpy(&req->work, &apoll->work, sizeof(req->work));
4527                         kfree(apoll);
4528                 }
4529         }
4530
4531         if (do_complete) {
4532                 io_cqring_fill_event(req, -ECANCELED);
4533                 io_commit_cqring(req->ctx);
4534                 req->flags |= REQ_F_COMP_LOCKED;
4535                 io_put_req(req);
4536         }
4537
4538         return do_complete;
4539 }
4540
4541 static void io_poll_remove_all(struct io_ring_ctx *ctx)
4542 {
4543         struct hlist_node *tmp;
4544         struct io_kiocb *req;
4545         int posted = 0, i;
4546
4547         spin_lock_irq(&ctx->completion_lock);
4548         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
4549                 struct hlist_head *list;
4550
4551                 list = &ctx->cancel_hash[i];
4552                 hlist_for_each_entry_safe(req, tmp, list, hash_node)
4553                         posted += io_poll_remove_one(req);
4554         }
4555         spin_unlock_irq(&ctx->completion_lock);
4556
4557         if (posted)
4558                 io_cqring_ev_posted(ctx);
4559 }
4560
4561 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
4562 {
4563         struct hlist_head *list;
4564         struct io_kiocb *req;
4565
4566         list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
4567         hlist_for_each_entry(req, list, hash_node) {
4568                 if (sqe_addr != req->user_data)
4569                         continue;
4570                 if (io_poll_remove_one(req))
4571                         return 0;
4572                 return -EALREADY;
4573         }
4574
4575         return -ENOENT;
4576 }
4577
4578 static int io_poll_remove_prep(struct io_kiocb *req,
4579                                const struct io_uring_sqe *sqe)
4580 {
4581         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4582                 return -EINVAL;
4583         if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
4584             sqe->poll_events)
4585                 return -EINVAL;
4586
4587         req->poll.addr = READ_ONCE(sqe->addr);
4588         return 0;
4589 }
4590
4591 /*
4592  * Find a running poll command that matches one specified in sqe->addr,
4593  * and remove it if found.
4594  */
4595 static int io_poll_remove(struct io_kiocb *req)
4596 {
4597         struct io_ring_ctx *ctx = req->ctx;
4598         u64 addr;
4599         int ret;
4600
4601         addr = req->poll.addr;
4602         spin_lock_irq(&ctx->completion_lock);
4603         ret = io_poll_cancel(ctx, addr);
4604         spin_unlock_irq(&ctx->completion_lock);
4605
4606         io_cqring_add_event(req, ret);
4607         if (ret < 0)
4608                 req_set_fail_links(req);
4609         io_put_req(req);
4610         return 0;
4611 }
4612
4613 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
4614                         void *key)
4615 {
4616         struct io_kiocb *req = wait->private;
4617         struct io_poll_iocb *poll = &req->poll;
4618
4619         return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
4620 }
4621
4622 static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
4623                                struct poll_table_struct *p)
4624 {
4625         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
4626
4627         __io_queue_proc(&pt->req->poll, pt, head);
4628 }
4629
4630 static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4631 {
4632         struct io_poll_iocb *poll = &req->poll;
4633         u16 events;
4634
4635         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4636                 return -EINVAL;
4637         if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
4638                 return -EINVAL;
4639         if (!poll->file)
4640                 return -EBADF;
4641
4642         events = READ_ONCE(sqe->poll_events);
4643         poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
4644
4645         get_task_struct(current);
4646         req->task = current;
4647         return 0;
4648 }
4649
4650 static int io_poll_add(struct io_kiocb *req)
4651 {
4652         struct io_poll_iocb *poll = &req->poll;
4653         struct io_ring_ctx *ctx = req->ctx;
4654         struct io_poll_table ipt;
4655         __poll_t mask;
4656
4657         INIT_HLIST_NODE(&req->hash_node);
4658         INIT_LIST_HEAD(&req->list);
4659         ipt.pt._qproc = io_poll_queue_proc;
4660
4661         mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
4662                                         io_poll_wake);
4663
4664         if (mask) { /* no async, we'd stolen it */
4665                 ipt.error = 0;
4666                 io_poll_complete(req, mask, 0);
4667         }
4668         spin_unlock_irq(&ctx->completion_lock);
4669
4670         if (mask) {
4671                 io_cqring_ev_posted(ctx);
4672                 io_put_req(req);
4673         }
4674         return ipt.error;
4675 }
4676
4677 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
4678 {
4679         struct io_timeout_data *data = container_of(timer,
4680                                                 struct io_timeout_data, timer);
4681         struct io_kiocb *req = data->req;
4682         struct io_ring_ctx *ctx = req->ctx;
4683         unsigned long flags;
4684
4685         atomic_inc(&ctx->cq_timeouts);
4686
4687         spin_lock_irqsave(&ctx->completion_lock, flags);
4688         /*
4689          * We could be racing with timeout deletion. If the list is empty,
4690          * then timeout lookup already found it and will be handling it.
4691          */
4692         if (!list_empty(&req->list)) {
4693                 struct io_kiocb *prev;
4694
4695                 /*
4696                  * Adjust the reqs sequence before the current one because it
4697                  * will consume a slot in the cq_ring and the cq_tail
4698                  * pointer will be increased, otherwise other timeout reqs may
4699                  * return in advance without waiting for enough wait_nr.
4700                  */
4701                 prev = req;
4702                 list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
4703                         prev->sequence++;
4704                 list_del_init(&req->list);
4705         }
4706
4707         io_cqring_fill_event(req, -ETIME);
4708         io_commit_cqring(ctx);
4709         spin_unlock_irqrestore(&ctx->completion_lock, flags);
4710
4711         io_cqring_ev_posted(ctx);
4712         req_set_fail_links(req);
4713         io_put_req(req);
4714         return HRTIMER_NORESTART;
4715 }
4716
4717 static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
4718 {
4719         struct io_kiocb *req;
4720         int ret = -ENOENT;
4721
4722         list_for_each_entry(req, &ctx->timeout_list, list) {
4723                 if (user_data == req->user_data) {
4724                         list_del_init(&req->list);
4725                         ret = 0;
4726                         break;
4727                 }
4728         }
4729
4730         if (ret == -ENOENT)
4731                 return ret;
4732
4733         ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
4734         if (ret == -1)
4735                 return -EALREADY;
4736
4737         req_set_fail_links(req);
4738         io_cqring_fill_event(req, -ECANCELED);
4739         io_put_req(req);
4740         return 0;
4741 }
4742
4743 static int io_timeout_remove_prep(struct io_kiocb *req,
4744                                   const struct io_uring_sqe *sqe)
4745 {
4746         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4747                 return -EINVAL;
4748         if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len)
4749                 return -EINVAL;
4750
4751         req->timeout.addr = READ_ONCE(sqe->addr);
4752         req->timeout.flags = READ_ONCE(sqe->timeout_flags);
4753         if (req->timeout.flags)
4754                 return -EINVAL;
4755
4756         return 0;
4757 }
4758
4759 /*
4760  * Remove or update an existing timeout command
4761  */
4762 static int io_timeout_remove(struct io_kiocb *req)
4763 {
4764         struct io_ring_ctx *ctx = req->ctx;
4765         int ret;
4766
4767         spin_lock_irq(&ctx->completion_lock);
4768         ret = io_timeout_cancel(ctx, req->timeout.addr);
4769
4770         io_cqring_fill_event(req, ret);
4771         io_commit_cqring(ctx);
4772         spin_unlock_irq(&ctx->completion_lock);
4773         io_cqring_ev_posted(ctx);
4774         if (ret < 0)
4775                 req_set_fail_links(req);
4776         io_put_req(req);
4777         return 0;
4778 }
4779
4780 static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
4781                            bool is_timeout_link)
4782 {
4783         struct io_timeout_data *data;
4784         unsigned flags;
4785
4786         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4787                 return -EINVAL;
4788         if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
4789                 return -EINVAL;
4790         if (sqe->off && is_timeout_link)
4791                 return -EINVAL;
4792         flags = READ_ONCE(sqe->timeout_flags);
4793         if (flags & ~IORING_TIMEOUT_ABS)
4794                 return -EINVAL;
4795
4796         req->timeout.count = READ_ONCE(sqe->off);
4797
4798         if (!req->io && io_alloc_async_ctx(req))
4799                 return -ENOMEM;
4800
4801         data = &req->io->timeout;
4802         data->req = req;
4803         req->flags |= REQ_F_TIMEOUT;
4804
4805         if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
4806                 return -EFAULT;
4807
4808         if (flags & IORING_TIMEOUT_ABS)
4809                 data->mode = HRTIMER_MODE_ABS;
4810         else
4811                 data->mode = HRTIMER_MODE_REL;
4812
4813         hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
4814         return 0;
4815 }
4816
4817 static int io_timeout(struct io_kiocb *req)
4818 {
4819         struct io_ring_ctx *ctx = req->ctx;
4820         struct io_timeout_data *data;
4821         struct list_head *entry;
4822         unsigned span = 0;
4823         u32 count = req->timeout.count;
4824         u32 seq = req->sequence;
4825
4826         data = &req->io->timeout;
4827
4828         /*
4829          * sqe->off holds how many events that need to occur for this
4830          * timeout event to be satisfied. If it isn't set, then this is
4831          * a pure timeout request, sequence isn't used.
4832          */
4833         if (!count) {
4834                 req->flags |= REQ_F_TIMEOUT_NOSEQ;
4835                 spin_lock_irq(&ctx->completion_lock);
4836                 entry = ctx->timeout_list.prev;
4837                 goto add;
4838         }
4839
4840         req->sequence = seq + count;
4841
4842         /*
4843          * Insertion sort, ensuring the first entry in the list is always
4844          * the one we need first.
4845          */
4846         spin_lock_irq(&ctx->completion_lock);
4847         list_for_each_prev(entry, &ctx->timeout_list) {
4848                 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
4849                 unsigned nxt_seq;
4850                 long long tmp, tmp_nxt;
4851                 u32 nxt_offset = nxt->timeout.count;
4852
4853                 if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
4854                         continue;
4855
4856                 /*
4857                  * Since seq + count can overflow, use type long
4858                  * long to store it.
4859                  */
4860                 tmp = (long long)seq + count;
4861                 nxt_seq = nxt->sequence - nxt_offset;
4862                 tmp_nxt = (long long)nxt_seq + nxt_offset;
4863
4864                 /*
4865                  * cached_sq_head may overflow, and it will never overflow twice
4866                  * once there is some timeout req still be valid.
4867                  */
4868                 if (seq < nxt_seq)
4869                         tmp += UINT_MAX;
4870
4871                 if (tmp > tmp_nxt)
4872                         break;
4873
4874                 /*
4875                  * Sequence of reqs after the insert one and itself should
4876                  * be adjusted because each timeout req consumes a slot.
4877                  */
4878                 span++;
4879                 nxt->sequence++;
4880         }
4881         req->sequence -= span;
4882 add:
4883         list_add(&req->list, entry);
4884         data->timer.function = io_timeout_fn;
4885         hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
4886         spin_unlock_irq(&ctx->completion_lock);
4887         return 0;
4888 }
4889
4890 static bool io_cancel_cb(struct io_wq_work *work, void *data)
4891 {
4892         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
4893
4894         return req->user_data == (unsigned long) data;
4895 }
4896
4897 static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
4898 {
4899         enum io_wq_cancel cancel_ret;
4900         int ret = 0;
4901
4902         cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr);
4903         switch (cancel_ret) {
4904         case IO_WQ_CANCEL_OK:
4905                 ret = 0;
4906                 break;
4907         case IO_WQ_CANCEL_RUNNING:
4908                 ret = -EALREADY;
4909                 break;
4910         case IO_WQ_CANCEL_NOTFOUND:
4911                 ret = -ENOENT;
4912                 break;
4913         }
4914
4915         return ret;
4916 }
4917
4918 static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
4919                                      struct io_kiocb *req, __u64 sqe_addr,
4920                                      int success_ret)
4921 {
4922         unsigned long flags;
4923         int ret;
4924
4925         ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
4926         if (ret != -ENOENT) {
4927                 spin_lock_irqsave(&ctx->completion_lock, flags);
4928                 goto done;
4929         }
4930
4931         spin_lock_irqsave(&ctx->completion_lock, flags);
4932         ret = io_timeout_cancel(ctx, sqe_addr);
4933         if (ret != -ENOENT)
4934                 goto done;
4935         ret = io_poll_cancel(ctx, sqe_addr);
4936 done:
4937         if (!ret)
4938                 ret = success_ret;
4939         io_cqring_fill_event(req, ret);
4940         io_commit_cqring(ctx);
4941         spin_unlock_irqrestore(&ctx->completion_lock, flags);
4942         io_cqring_ev_posted(ctx);
4943
4944         if (ret < 0)
4945                 req_set_fail_links(req);
4946         io_put_req(req);
4947 }
4948
4949 static int io_async_cancel_prep(struct io_kiocb *req,
4950                                 const struct io_uring_sqe *sqe)
4951 {
4952         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4953                 return -EINVAL;
4954         if (sqe->flags || sqe->ioprio || sqe->off || sqe->len ||
4955             sqe->cancel_flags)
4956                 return -EINVAL;
4957
4958         req->cancel.addr = READ_ONCE(sqe->addr);
4959         return 0;
4960 }
4961
4962 static int io_async_cancel(struct io_kiocb *req)
4963 {
4964         struct io_ring_ctx *ctx = req->ctx;
4965
4966         io_async_find_and_cancel(ctx, req, req->cancel.addr, 0);
4967         return 0;
4968 }
4969
4970 static int io_files_update_prep(struct io_kiocb *req,
4971                                 const struct io_uring_sqe *sqe)
4972 {
4973         if (sqe->flags || sqe->ioprio || sqe->rw_flags)
4974                 return -EINVAL;
4975
4976         req->files_update.offset = READ_ONCE(sqe->off);
4977         req->files_update.nr_args = READ_ONCE(sqe->len);
4978         if (!req->files_update.nr_args)
4979                 return -EINVAL;
4980         req->files_update.arg = READ_ONCE(sqe->addr);
4981         return 0;
4982 }
4983
4984 static int io_files_update(struct io_kiocb *req, bool force_nonblock)
4985 {
4986         struct io_ring_ctx *ctx = req->ctx;
4987         struct io_uring_files_update up;
4988         int ret;
4989
4990         if (force_nonblock)
4991                 return -EAGAIN;
4992
4993         up.offset = req->files_update.offset;
4994         up.fds = req->files_update.arg;
4995
4996         mutex_lock(&ctx->uring_lock);
4997         ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args);
4998         mutex_unlock(&ctx->uring_lock);
4999
5000         if (ret < 0)
5001                 req_set_fail_links(req);
5002         io_cqring_add_event(req, ret);
5003         io_put_req(req);
5004         return 0;
5005 }
5006
5007 static int io_req_defer_prep(struct io_kiocb *req,
5008                              const struct io_uring_sqe *sqe)
5009 {
5010         ssize_t ret = 0;
5011
5012         if (!sqe)
5013                 return 0;
5014
5015         if (io_op_defs[req->opcode].file_table) {
5016                 ret = io_grab_files(req);
5017                 if (unlikely(ret))
5018                         return ret;
5019         }
5020
5021         io_req_work_grab_env(req, &io_op_defs[req->opcode]);
5022
5023         switch (req->opcode) {
5024         case IORING_OP_NOP:
5025                 break;
5026         case IORING_OP_READV:
5027         case IORING_OP_READ_FIXED:
5028         case IORING_OP_READ:
5029                 ret = io_read_prep(req, sqe, true);
5030                 break;
5031         case IORING_OP_WRITEV:
5032         case IORING_OP_WRITE_FIXED:
5033         case IORING_OP_WRITE:
5034                 ret = io_write_prep(req, sqe, true);
5035                 break;
5036         case IORING_OP_POLL_ADD:
5037                 ret = io_poll_add_prep(req, sqe);
5038                 break;
5039         case IORING_OP_POLL_REMOVE:
5040                 ret = io_poll_remove_prep(req, sqe);
5041                 break;
5042         case IORING_OP_FSYNC:
5043                 ret = io_prep_fsync(req, sqe);
5044                 break;
5045         case IORING_OP_SYNC_FILE_RANGE:
5046                 ret = io_prep_sfr(req, sqe);
5047                 break;
5048         case IORING_OP_SENDMSG:
5049         case IORING_OP_SEND:
5050                 ret = io_sendmsg_prep(req, sqe);
5051                 break;
5052         case IORING_OP_RECVMSG:
5053         case IORING_OP_RECV:
5054                 ret = io_recvmsg_prep(req, sqe);
5055                 break;
5056         case IORING_OP_CONNECT:
5057                 ret = io_connect_prep(req, sqe);
5058                 break;
5059         case IORING_OP_TIMEOUT:
5060                 ret = io_timeout_prep(req, sqe, false);
5061                 break;
5062         case IORING_OP_TIMEOUT_REMOVE:
5063                 ret = io_timeout_remove_prep(req, sqe);
5064                 break;
5065         case IORING_OP_ASYNC_CANCEL:
5066                 ret = io_async_cancel_prep(req, sqe);
5067                 break;
5068         case IORING_OP_LINK_TIMEOUT:
5069                 ret = io_timeout_prep(req, sqe, true);
5070                 break;
5071         case IORING_OP_ACCEPT:
5072                 ret = io_accept_prep(req, sqe);
5073                 break;
5074         case IORING_OP_FALLOCATE:
5075                 ret = io_fallocate_prep(req, sqe);
5076                 break;
5077         case IORING_OP_OPENAT:
5078                 ret = io_openat_prep(req, sqe);
5079                 break;
5080         case IORING_OP_CLOSE:
5081                 ret = io_close_prep(req, sqe);
5082                 break;
5083         case IORING_OP_FILES_UPDATE:
5084                 ret = io_files_update_prep(req, sqe);
5085                 break;
5086         case IORING_OP_STATX:
5087                 ret = io_statx_prep(req, sqe);
5088                 break;
5089         case IORING_OP_FADVISE:
5090                 ret = io_fadvise_prep(req, sqe);
5091                 break;
5092         case IORING_OP_MADVISE:
5093                 ret = io_madvise_prep(req, sqe);
5094                 break;
5095         case IORING_OP_OPENAT2:
5096                 ret = io_openat2_prep(req, sqe);
5097                 break;
5098         case IORING_OP_EPOLL_CTL:
5099                 ret = io_epoll_ctl_prep(req, sqe);
5100                 break;
5101         case IORING_OP_SPLICE:
5102                 ret = io_splice_prep(req, sqe);
5103                 break;
5104         case IORING_OP_PROVIDE_BUFFERS:
5105                 ret = io_provide_buffers_prep(req, sqe);
5106                 break;
5107         case IORING_OP_REMOVE_BUFFERS:
5108                 ret = io_remove_buffers_prep(req, sqe);
5109                 break;
5110         default:
5111                 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
5112                                 req->opcode);
5113                 ret = -EINVAL;
5114                 break;
5115         }
5116
5117         return ret;
5118 }
5119
5120 static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5121 {
5122         struct io_ring_ctx *ctx = req->ctx;
5123         int ret;
5124
5125         /* Still need defer if there is pending req in defer list. */
5126         if (!req_need_defer(req) && list_empty_careful(&ctx->defer_list))
5127                 return 0;
5128
5129         if (!req->io) {
5130                 if (io_alloc_async_ctx(req))
5131                         return -EAGAIN;
5132                 ret = io_req_defer_prep(req, sqe);
5133                 if (ret < 0)
5134                         return ret;
5135         }
5136
5137         spin_lock_irq(&ctx->completion_lock);
5138         if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
5139                 spin_unlock_irq(&ctx->completion_lock);
5140                 return 0;
5141         }
5142
5143         trace_io_uring_defer(ctx, req, req->user_data);
5144         list_add_tail(&req->list, &ctx->defer_list);
5145         spin_unlock_irq(&ctx->completion_lock);
5146         return -EIOCBQUEUED;
5147 }
5148
5149 static void io_cleanup_req(struct io_kiocb *req)
5150 {
5151         struct io_async_ctx *io = req->io;
5152
5153         switch (req->opcode) {
5154         case IORING_OP_READV:
5155         case IORING_OP_READ_FIXED:
5156         case IORING_OP_READ:
5157                 if (req->flags & REQ_F_BUFFER_SELECTED)
5158                         kfree((void *)(unsigned long)req->rw.addr);
5159                 /* fallthrough */
5160         case IORING_OP_WRITEV:
5161         case IORING_OP_WRITE_FIXED:
5162         case IORING_OP_WRITE:
5163                 if (io->rw.iov != io->rw.fast_iov)
5164                         kfree(io->rw.iov);
5165                 break;
5166         case IORING_OP_RECVMSG:
5167                 if (req->flags & REQ_F_BUFFER_SELECTED)
5168                         kfree(req->sr_msg.kbuf);
5169                 /* fallthrough */
5170         case IORING_OP_SENDMSG:
5171                 if (io->msg.iov != io->msg.fast_iov)
5172                         kfree(io->msg.iov);
5173                 break;
5174         case IORING_OP_RECV:
5175                 if (req->flags & REQ_F_BUFFER_SELECTED)
5176                         kfree(req->sr_msg.kbuf);
5177                 break;
5178         case IORING_OP_OPENAT:
5179         case IORING_OP_OPENAT2:
5180         case IORING_OP_STATX:
5181                 putname(req->open.filename);
5182                 break;
5183         case IORING_OP_SPLICE:
5184                 io_put_file(req, req->splice.file_in,
5185                             (req->splice.flags & SPLICE_F_FD_IN_FIXED));
5186                 break;
5187         }
5188
5189         req->flags &= ~REQ_F_NEED_CLEANUP;
5190 }
5191
5192 static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
5193                         bool force_nonblock)
5194 {
5195         struct io_ring_ctx *ctx = req->ctx;
5196         int ret;
5197
5198         switch (req->opcode) {
5199         case IORING_OP_NOP:
5200                 ret = io_nop(req);
5201                 break;
5202         case IORING_OP_READV:
5203         case IORING_OP_READ_FIXED:
5204         case IORING_OP_READ:
5205                 if (sqe) {
5206                         ret = io_read_prep(req, sqe, force_nonblock);
5207                         if (ret < 0)
5208                                 break;
5209                 }
5210                 ret = io_read(req, force_nonblock);
5211                 break;
5212         case IORING_OP_WRITEV:
5213         case IORING_OP_WRITE_FIXED:
5214         case IORING_OP_WRITE:
5215                 if (sqe) {
5216                         ret = io_write_prep(req, sqe, force_nonblock);
5217                         if (ret < 0)
5218                                 break;
5219                 }
5220                 ret = io_write(req, force_nonblock);
5221                 break;
5222         case IORING_OP_FSYNC:
5223                 if (sqe) {
5224                         ret = io_prep_fsync(req, sqe);
5225                         if (ret < 0)
5226                                 break;
5227                 }
5228                 ret = io_fsync(req, force_nonblock);
5229                 break;
5230         case IORING_OP_POLL_ADD:
5231                 if (sqe) {
5232                         ret = io_poll_add_prep(req, sqe);
5233                         if (ret)
5234                                 break;
5235                 }
5236                 ret = io_poll_add(req);
5237                 break;
5238         case IORING_OP_POLL_REMOVE:
5239                 if (sqe) {
5240                         ret = io_poll_remove_prep(req, sqe);
5241                         if (ret < 0)
5242                                 break;
5243                 }
5244                 ret = io_poll_remove(req);
5245                 break;
5246         case IORING_OP_SYNC_FILE_RANGE:
5247                 if (sqe) {
5248                         ret = io_prep_sfr(req, sqe);
5249                         if (ret < 0)
5250                                 break;
5251                 }
5252                 ret = io_sync_file_range(req, force_nonblock);
5253                 break;
5254         case IORING_OP_SENDMSG:
5255         case IORING_OP_SEND:
5256                 if (sqe) {
5257                         ret = io_sendmsg_prep(req, sqe);
5258                         if (ret < 0)
5259                                 break;
5260                 }
5261                 if (req->opcode == IORING_OP_SENDMSG)
5262                         ret = io_sendmsg(req, force_nonblock);
5263                 else
5264                         ret = io_send(req, force_nonblock);
5265                 break;
5266         case IORING_OP_RECVMSG:
5267         case IORING_OP_RECV:
5268                 if (sqe) {
5269                         ret = io_recvmsg_prep(req, sqe);
5270                         if (ret)
5271                                 break;
5272                 }
5273                 if (req->opcode == IORING_OP_RECVMSG)
5274                         ret = io_recvmsg(req, force_nonblock);
5275                 else
5276                         ret = io_recv(req, force_nonblock);
5277                 break;
5278         case IORING_OP_TIMEOUT:
5279                 if (sqe) {
5280                         ret = io_timeout_prep(req, sqe, false);
5281                         if (ret)
5282                                 break;
5283                 }
5284                 ret = io_timeout(req);
5285                 break;
5286         case IORING_OP_TIMEOUT_REMOVE:
5287                 if (sqe) {
5288                         ret = io_timeout_remove_prep(req, sqe);
5289                         if (ret)
5290                                 break;
5291                 }
5292                 ret = io_timeout_remove(req);
5293                 break;
5294         case IORING_OP_ACCEPT:
5295                 if (sqe) {
5296                         ret = io_accept_prep(req, sqe);
5297                         if (ret)
5298                                 break;
5299                 }
5300                 ret = io_accept(req, force_nonblock);
5301                 break;
5302         case IORING_OP_CONNECT:
5303                 if (sqe) {
5304                         ret = io_connect_prep(req, sqe);
5305                         if (ret)
5306                                 break;
5307                 }
5308                 ret = io_connect(req, force_nonblock);
5309                 break;
5310         case IORING_OP_ASYNC_CANCEL:
5311                 if (sqe) {
5312                         ret = io_async_cancel_prep(req, sqe);
5313                         if (ret)
5314                                 break;
5315                 }
5316                 ret = io_async_cancel(req);
5317                 break;
5318         case IORING_OP_FALLOCATE:
5319                 if (sqe) {
5320                         ret = io_fallocate_prep(req, sqe);
5321                         if (ret)
5322                                 break;
5323                 }
5324                 ret = io_fallocate(req, force_nonblock);
5325                 break;
5326         case IORING_OP_OPENAT:
5327                 if (sqe) {
5328                         ret = io_openat_prep(req, sqe);
5329                         if (ret)
5330                                 break;
5331                 }
5332                 ret = io_openat(req, force_nonblock);
5333                 break;
5334         case IORING_OP_CLOSE:
5335                 if (sqe) {
5336                         ret = io_close_prep(req, sqe);
5337                         if (ret)
5338                                 break;
5339                 }
5340                 ret = io_close(req, force_nonblock);
5341                 break;
5342         case IORING_OP_FILES_UPDATE:
5343                 if (sqe) {
5344                         ret = io_files_update_prep(req, sqe);
5345                         if (ret)
5346                                 break;
5347                 }
5348                 ret = io_files_update(req, force_nonblock);
5349                 break;
5350         case IORING_OP_STATX:
5351                 if (sqe) {
5352                         ret = io_statx_prep(req, sqe);
5353                         if (ret)
5354                                 break;
5355                 }
5356                 ret = io_statx(req, force_nonblock);
5357                 break;
5358         case IORING_OP_FADVISE:
5359                 if (sqe) {
5360                         ret = io_fadvise_prep(req, sqe);
5361                         if (ret)
5362                                 break;
5363                 }
5364                 ret = io_fadvise(req, force_nonblock);
5365                 break;
5366         case IORING_OP_MADVISE:
5367                 if (sqe) {
5368                         ret = io_madvise_prep(req, sqe);
5369                         if (ret)
5370                                 break;
5371                 }
5372                 ret = io_madvise(req, force_nonblock);
5373                 break;
5374         case IORING_OP_OPENAT2:
5375                 if (sqe) {
5376                         ret = io_openat2_prep(req, sqe);
5377                         if (ret)
5378                                 break;
5379                 }
5380                 ret = io_openat2(req, force_nonblock);
5381                 break;
5382         case IORING_OP_EPOLL_CTL:
5383                 if (sqe) {
5384                         ret = io_epoll_ctl_prep(req, sqe);
5385                         if (ret)
5386                                 break;
5387                 }
5388                 ret = io_epoll_ctl(req, force_nonblock);
5389                 break;
5390         case IORING_OP_SPLICE:
5391                 if (sqe) {
5392                         ret = io_splice_prep(req, sqe);
5393                         if (ret < 0)
5394                                 break;
5395                 }
5396                 ret = io_splice(req, force_nonblock);
5397                 break;
5398         case IORING_OP_PROVIDE_BUFFERS:
5399                 if (sqe) {
5400                         ret = io_provide_buffers_prep(req, sqe);
5401                         if (ret)
5402                                 break;
5403                 }
5404                 ret = io_provide_buffers(req, force_nonblock);
5405                 break;
5406         case IORING_OP_REMOVE_BUFFERS:
5407                 if (sqe) {
5408                         ret = io_remove_buffers_prep(req, sqe);
5409                         if (ret)
5410                                 break;
5411                 }
5412                 ret = io_remove_buffers(req, force_nonblock);
5413                 break;
5414         default:
5415                 ret = -EINVAL;
5416                 break;
5417         }
5418
5419         if (ret)
5420                 return ret;
5421
5422         /* If the op doesn't have a file, we're not polling for it */
5423         if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) {
5424                 const bool in_async = io_wq_current_is_worker();
5425
5426                 /* workqueue context doesn't hold uring_lock, grab it now */
5427                 if (in_async)
5428                         mutex_lock(&ctx->uring_lock);
5429
5430                 io_iopoll_req_issued(req);
5431
5432                 if (in_async)
5433                         mutex_unlock(&ctx->uring_lock);
5434         }
5435
5436         return 0;
5437 }
5438
5439 static void io_wq_submit_work(struct io_wq_work **workptr)
5440 {
5441         struct io_wq_work *work = *workptr;
5442         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
5443         int ret = 0;
5444
5445         /* if NO_CANCEL is set, we must still run the work */
5446         if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
5447                                 IO_WQ_WORK_CANCEL) {
5448                 ret = -ECANCELED;
5449         }
5450
5451         if (!ret) {
5452                 do {
5453                         ret = io_issue_sqe(req, NULL, false);
5454                         /*
5455                          * We can get EAGAIN for polled IO even though we're
5456                          * forcing a sync submission from here, since we can't
5457                          * wait for request slots on the block side.
5458                          */
5459                         if (ret != -EAGAIN)
5460                                 break;
5461                         cond_resched();
5462                 } while (1);
5463         }
5464
5465         if (ret) {
5466                 req_set_fail_links(req);
5467                 io_cqring_add_event(req, ret);
5468                 io_put_req(req);
5469         }
5470
5471         io_steal_work(req, workptr);
5472 }
5473
5474 static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
5475                                               int index)
5476 {
5477         struct fixed_file_table *table;
5478
5479         table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
5480         return table->files[index & IORING_FILE_TABLE_MASK];;
5481 }
5482
5483 static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
5484                         int fd, struct file **out_file, bool fixed)
5485 {
5486         struct io_ring_ctx *ctx = req->ctx;
5487         struct file *file;
5488
5489         if (fixed) {
5490                 if (unlikely(!ctx->file_data ||
5491                     (unsigned) fd >= ctx->nr_user_files))
5492                         return -EBADF;
5493                 fd = array_index_nospec(fd, ctx->nr_user_files);
5494                 file = io_file_from_index(ctx, fd);
5495                 if (!file)
5496                         return -EBADF;
5497                 req->fixed_file_refs = ctx->file_data->cur_refs;
5498                 percpu_ref_get(req->fixed_file_refs);
5499         } else {
5500                 trace_io_uring_file_get(ctx, fd);
5501                 file = __io_file_get(state, fd);
5502                 if (unlikely(!file))
5503                         return -EBADF;
5504         }
5505
5506         *out_file = file;
5507         return 0;
5508 }
5509
5510 static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
5511                            int fd)
5512 {
5513         bool fixed;
5514
5515         fixed = (req->flags & REQ_F_FIXED_FILE) != 0;
5516         if (unlikely(!fixed && req->needs_fixed_file))
5517                 return -EBADF;
5518
5519         return io_file_get(state, req, fd, &req->file, fixed);
5520 }
5521
5522 static int io_grab_files(struct io_kiocb *req)
5523 {
5524         int ret = -EBADF;
5525         struct io_ring_ctx *ctx = req->ctx;
5526
5527         if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE))
5528                 return 0;
5529         if (!ctx->ring_file)
5530                 return -EBADF;
5531
5532         rcu_read_lock();
5533         spin_lock_irq(&ctx->inflight_lock);
5534         /*
5535          * We use the f_ops->flush() handler to ensure that we can flush
5536          * out work accessing these files if the fd is closed. Check if
5537          * the fd has changed since we started down this path, and disallow
5538          * this operation if it has.
5539          */
5540         if (fcheck(ctx->ring_fd) == ctx->ring_file) {
5541                 list_add(&req->inflight_entry, &ctx->inflight_list);
5542                 req->flags |= REQ_F_INFLIGHT;
5543                 req->work.files = current->files;
5544                 ret = 0;
5545         }
5546         spin_unlock_irq(&ctx->inflight_lock);
5547         rcu_read_unlock();
5548
5549         return ret;
5550 }
5551
5552 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
5553 {
5554         struct io_timeout_data *data = container_of(timer,
5555                                                 struct io_timeout_data, timer);
5556         struct io_kiocb *req = data->req;
5557         struct io_ring_ctx *ctx = req->ctx;
5558         struct io_kiocb *prev = NULL;
5559         unsigned long flags;
5560
5561         spin_lock_irqsave(&ctx->completion_lock, flags);
5562
5563         /*
5564          * We don't expect the list to be empty, that will only happen if we
5565          * race with the completion of the linked work.
5566          */
5567         if (!list_empty(&req->link_list)) {
5568                 prev = list_entry(req->link_list.prev, struct io_kiocb,
5569                                   link_list);
5570                 if (refcount_inc_not_zero(&prev->refs)) {
5571                         list_del_init(&req->link_list);
5572                         prev->flags &= ~REQ_F_LINK_TIMEOUT;
5573                 } else
5574                         prev = NULL;
5575         }
5576
5577         spin_unlock_irqrestore(&ctx->completion_lock, flags);
5578
5579         if (prev) {
5580                 req_set_fail_links(prev);
5581                 io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
5582                 io_put_req(prev);
5583         } else {
5584                 io_cqring_add_event(req, -ETIME);
5585                 io_put_req(req);
5586         }
5587         return HRTIMER_NORESTART;
5588 }
5589
5590 static void io_queue_linked_timeout(struct io_kiocb *req)
5591 {
5592         struct io_ring_ctx *ctx = req->ctx;
5593
5594         /*
5595          * If the list is now empty, then our linked request finished before
5596          * we got a chance to setup the timer
5597          */
5598         spin_lock_irq(&ctx->completion_lock);
5599         if (!list_empty(&req->link_list)) {
5600                 struct io_timeout_data *data = &req->io->timeout;
5601
5602                 data->timer.function = io_link_timeout_fn;
5603                 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
5604                                 data->mode);
5605         }
5606         spin_unlock_irq(&ctx->completion_lock);
5607
5608         /* drop submission reference */
5609         io_put_req(req);
5610 }
5611
5612 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
5613 {
5614         struct io_kiocb *nxt;
5615
5616         if (!(req->flags & REQ_F_LINK_HEAD))
5617                 return NULL;
5618         /* for polled retry, if flag is set, we already went through here */
5619         if (req->flags & REQ_F_POLLED)
5620                 return NULL;
5621
5622         nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
5623                                         link_list);
5624         if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
5625                 return NULL;
5626
5627         req->flags |= REQ_F_LINK_TIMEOUT;
5628         return nxt;
5629 }
5630
5631 static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5632 {
5633         struct io_kiocb *linked_timeout;
5634         struct io_kiocb *nxt;
5635         const struct cred *old_creds = NULL;
5636         int ret;
5637
5638 again:
5639         linked_timeout = io_prep_linked_timeout(req);
5640
5641         if (req->work.creds && req->work.creds != current_cred()) {
5642                 if (old_creds)
5643                         revert_creds(old_creds);
5644                 if (old_creds == req->work.creds)
5645                         old_creds = NULL; /* restored original creds */
5646                 else
5647                         old_creds = override_creds(req->work.creds);
5648         }
5649
5650         ret = io_issue_sqe(req, sqe, true);
5651
5652         /*
5653          * We async punt it if the file wasn't marked NOWAIT, or if the file
5654          * doesn't support non-blocking read/write attempts
5655          */
5656         if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
5657             (req->flags & REQ_F_MUST_PUNT))) {
5658                 if (io_arm_poll_handler(req)) {
5659                         if (linked_timeout)
5660                                 io_queue_linked_timeout(linked_timeout);
5661                         goto exit;
5662                 }
5663 punt:
5664                 if (io_op_defs[req->opcode].file_table) {
5665                         ret = io_grab_files(req);
5666                         if (ret)
5667                                 goto err;
5668                 }
5669
5670                 /*
5671                  * Queued up for async execution, worker will release
5672                  * submit reference when the iocb is actually submitted.
5673                  */
5674                 io_queue_async_work(req);
5675                 goto exit;
5676         }
5677
5678 err:
5679         nxt = NULL;
5680         /* drop submission reference */
5681         io_put_req_find_next(req, &nxt);
5682
5683         if (linked_timeout) {
5684                 if (!ret)
5685                         io_queue_linked_timeout(linked_timeout);
5686                 else
5687                         io_put_req(linked_timeout);
5688         }
5689
5690         /* and drop final reference, if we failed */
5691         if (ret) {
5692                 io_cqring_add_event(req, ret);
5693                 req_set_fail_links(req);
5694                 io_put_req(req);
5695         }
5696         if (nxt) {
5697                 req = nxt;
5698
5699                 if (req->flags & REQ_F_FORCE_ASYNC)
5700                         goto punt;
5701                 goto again;
5702         }
5703 exit:
5704         if (old_creds)
5705                 revert_creds(old_creds);
5706 }
5707
5708 static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5709 {
5710         int ret;
5711
5712         ret = io_req_defer(req, sqe);
5713         if (ret) {
5714                 if (ret != -EIOCBQUEUED) {
5715 fail_req:
5716                         io_cqring_add_event(req, ret);
5717                         req_set_fail_links(req);
5718                         io_double_put_req(req);
5719                 }
5720         } else if (req->flags & REQ_F_FORCE_ASYNC) {
5721                 if (!req->io) {
5722                         ret = -EAGAIN;
5723                         if (io_alloc_async_ctx(req))
5724                                 goto fail_req;
5725                         ret = io_req_defer_prep(req, sqe);
5726                         if (unlikely(ret < 0))
5727                                 goto fail_req;
5728                 }
5729
5730                 /*
5731                  * Never try inline submit of IOSQE_ASYNC is set, go straight
5732                  * to async execution.
5733                  */
5734                 req->work.flags |= IO_WQ_WORK_CONCURRENT;
5735                 io_queue_async_work(req);
5736         } else {
5737                 __io_queue_sqe(req, sqe);
5738         }
5739 }
5740
5741 static inline void io_queue_link_head(struct io_kiocb *req)
5742 {
5743         if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
5744                 io_cqring_add_event(req, -ECANCELED);
5745                 io_double_put_req(req);
5746         } else
5747                 io_queue_sqe(req, NULL);
5748 }
5749
5750 static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
5751                           struct io_submit_state *state, struct io_kiocb **link)
5752 {
5753         struct io_ring_ctx *ctx = req->ctx;
5754         int ret;
5755
5756         /*
5757          * If we already have a head request, queue this one for async
5758          * submittal once the head completes. If we don't have a head but
5759          * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
5760          * submitted sync once the chain is complete. If none of those
5761          * conditions are true (normal request), then just queue it.
5762          */
5763         if (*link) {
5764                 struct io_kiocb *head = *link;
5765
5766                 /*
5767                  * Taking sequential execution of a link, draining both sides
5768                  * of the link also fullfils IOSQE_IO_DRAIN semantics for all
5769                  * requests in the link. So, it drains the head and the
5770                  * next after the link request. The last one is done via
5771                  * drain_next flag to persist the effect across calls.
5772                  */
5773                 if (req->flags & REQ_F_IO_DRAIN) {
5774                         head->flags |= REQ_F_IO_DRAIN;
5775                         ctx->drain_next = 1;
5776                 }
5777                 if (io_alloc_async_ctx(req))
5778                         return -EAGAIN;
5779
5780                 ret = io_req_defer_prep(req, sqe);
5781                 if (ret) {
5782                         /* fail even hard links since we don't submit */
5783                         head->flags |= REQ_F_FAIL_LINK;
5784                         return ret;
5785                 }
5786                 trace_io_uring_link(ctx, req, head);
5787                 list_add_tail(&req->link_list, &head->link_list);
5788
5789                 /* last request of a link, enqueue the link */
5790                 if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
5791                         io_queue_link_head(head);
5792                         *link = NULL;
5793                 }
5794         } else {
5795                 if (unlikely(ctx->drain_next)) {
5796                         req->flags |= REQ_F_IO_DRAIN;
5797                         ctx->drain_next = 0;
5798                 }
5799                 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
5800                         req->flags |= REQ_F_LINK_HEAD;
5801                         INIT_LIST_HEAD(&req->link_list);
5802
5803                         if (io_alloc_async_ctx(req))
5804                                 return -EAGAIN;
5805
5806                         ret = io_req_defer_prep(req, sqe);
5807                         if (ret)
5808                                 req->flags |= REQ_F_FAIL_LINK;
5809                         *link = req;
5810                 } else {
5811                         io_queue_sqe(req, sqe);
5812                 }
5813         }
5814
5815         return 0;
5816 }
5817
5818 /*
5819  * Batched submission is done, ensure local IO is flushed out.
5820  */
5821 static void io_submit_state_end(struct io_submit_state *state)
5822 {
5823         blk_finish_plug(&state->plug);
5824         io_file_put(state);
5825         if (state->free_reqs)
5826                 kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
5827 }
5828
5829 /*
5830  * Start submission side cache.
5831  */
5832 static void io_submit_state_start(struct io_submit_state *state,
5833                                   unsigned int max_ios)
5834 {
5835         blk_start_plug(&state->plug);
5836         state->free_reqs = 0;
5837         state->file = NULL;
5838         state->ios_left = max_ios;
5839 }
5840
5841 static void io_commit_sqring(struct io_ring_ctx *ctx)
5842 {
5843         struct io_rings *rings = ctx->rings;
5844
5845         /*
5846          * Ensure any loads from the SQEs are done at this point,
5847          * since once we write the new head, the application could
5848          * write new data to them.
5849          */
5850         smp_store_release(&rings->sq.head, ctx->cached_sq_head);
5851 }
5852
5853 /*
5854  * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
5855  * that is mapped by userspace. This means that care needs to be taken to
5856  * ensure that reads are stable, as we cannot rely on userspace always
5857  * being a good citizen. If members of the sqe are validated and then later
5858  * used, it's important that those reads are done through READ_ONCE() to
5859  * prevent a re-load down the line.
5860  */
5861 static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
5862 {
5863         u32 *sq_array = ctx->sq_array;
5864         unsigned head;
5865
5866         /*
5867          * The cached sq head (or cq tail) serves two purposes:
5868          *
5869          * 1) allows us to batch the cost of updating the user visible
5870          *    head updates.
5871          * 2) allows the kernel side to track the head on its own, even
5872          *    though the application is the one updating it.
5873          */
5874         head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]);
5875         if (likely(head < ctx->sq_entries))
5876                 return &ctx->sq_sqes[head];
5877
5878         /* drop invalid entries */
5879         ctx->cached_sq_dropped++;
5880         WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
5881         return NULL;
5882 }
5883
5884 static inline void io_consume_sqe(struct io_ring_ctx *ctx)
5885 {
5886         ctx->cached_sq_head++;
5887 }
5888
5889 #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
5890                                 IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
5891                                 IOSQE_BUFFER_SELECT)
5892
5893 static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
5894                        const struct io_uring_sqe *sqe,
5895                        struct io_submit_state *state, bool async)
5896 {
5897         unsigned int sqe_flags;
5898         int id;
5899
5900         /*
5901          * All io need record the previous position, if LINK vs DARIN,
5902          * it can be used to mark the position of the first IO in the
5903          * link list.
5904          */
5905         req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
5906         req->opcode = READ_ONCE(sqe->opcode);
5907         req->user_data = READ_ONCE(sqe->user_data);
5908         req->io = NULL;
5909         req->file = NULL;
5910         req->ctx = ctx;
5911         req->flags = 0;
5912         /* one is dropped after submission, the other at completion */
5913         refcount_set(&req->refs, 2);
5914         req->task = NULL;
5915         req->result = 0;
5916         req->needs_fixed_file = async;
5917         INIT_IO_WORK(&req->work, io_wq_submit_work);
5918
5919         if (unlikely(req->opcode >= IORING_OP_LAST))
5920                 return -EINVAL;
5921
5922         if (unlikely(io_sq_thread_acquire_mm(ctx, req)))
5923                 return -EFAULT;
5924
5925         sqe_flags = READ_ONCE(sqe->flags);
5926         /* enforce forwards compatibility on users */
5927         if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
5928                 return -EINVAL;
5929
5930         if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
5931             !io_op_defs[req->opcode].buffer_select)
5932                 return -EOPNOTSUPP;
5933
5934         id = READ_ONCE(sqe->personality);
5935         if (id) {
5936                 req->work.creds = idr_find(&ctx->personality_idr, id);
5937                 if (unlikely(!req->work.creds))
5938                         return -EINVAL;
5939                 get_cred(req->work.creds);
5940         }
5941
5942         /* same numerical values with corresponding REQ_F_*, safe to copy */
5943         req->flags |= sqe_flags & (IOSQE_IO_DRAIN | IOSQE_IO_HARDLINK |
5944                                         IOSQE_ASYNC | IOSQE_FIXED_FILE |
5945                                         IOSQE_BUFFER_SELECT | IOSQE_IO_LINK);
5946
5947         if (!io_op_defs[req->opcode].needs_file)
5948                 return 0;
5949
5950         return io_req_set_file(state, req, READ_ONCE(sqe->fd));
5951 }
5952
5953 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
5954                           struct file *ring_file, int ring_fd, bool async)
5955 {
5956         struct io_submit_state state, *statep = NULL;
5957         struct io_kiocb *link = NULL;
5958         int i, submitted = 0;
5959
5960         /* if we have a backlog and couldn't flush it all, return BUSY */
5961         if (test_bit(0, &ctx->sq_check_overflow)) {
5962                 if (!list_empty(&ctx->cq_overflow_list) &&
5963                     !io_cqring_overflow_flush(ctx, false))
5964                         return -EBUSY;
5965         }
5966
5967         /* make sure SQ entry isn't read before tail */
5968         nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
5969
5970         if (!percpu_ref_tryget_many(&ctx->refs, nr))
5971                 return -EAGAIN;
5972
5973         if (nr > IO_PLUG_THRESHOLD) {
5974                 io_submit_state_start(&state, nr);
5975                 statep = &state;
5976         }
5977
5978         ctx->ring_fd = ring_fd;
5979         ctx->ring_file = ring_file;
5980
5981         for (i = 0; i < nr; i++) {
5982                 const struct io_uring_sqe *sqe;
5983                 struct io_kiocb *req;
5984                 int err;
5985
5986                 sqe = io_get_sqe(ctx);
5987                 if (unlikely(!sqe)) {
5988                         io_consume_sqe(ctx);
5989                         break;
5990                 }
5991                 req = io_alloc_req(ctx, statep);
5992                 if (unlikely(!req)) {
5993                         if (!submitted)
5994                                 submitted = -EAGAIN;
5995                         break;
5996                 }
5997
5998                 err = io_init_req(ctx, req, sqe, statep, async);
5999                 io_consume_sqe(ctx);
6000                 /* will complete beyond this point, count as submitted */
6001                 submitted++;
6002
6003                 if (unlikely(err)) {
6004 fail_req:
6005                         io_cqring_add_event(req, err);
6006                         io_double_put_req(req);
6007                         break;
6008                 }
6009
6010                 trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
6011                                                 true, async);
6012                 err = io_submit_sqe(req, sqe, statep, &link);
6013                 if (err)
6014                         goto fail_req;
6015         }
6016
6017         if (unlikely(submitted != nr)) {
6018                 int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
6019
6020                 percpu_ref_put_many(&ctx->refs, nr - ref_used);
6021         }
6022         if (link)
6023                 io_queue_link_head(link);
6024         if (statep)
6025                 io_submit_state_end(&state);
6026
6027          /* Commit SQ ring head once we've consumed and submitted all SQEs */
6028         io_commit_sqring(ctx);
6029
6030         return submitted;
6031 }
6032
6033 static int io_sq_thread(void *data)
6034 {
6035         struct io_ring_ctx *ctx = data;
6036         const struct cred *old_cred;
6037         mm_segment_t old_fs;
6038         DEFINE_WAIT(wait);
6039         unsigned long timeout;
6040         int ret = 0;
6041
6042         complete(&ctx->completions[1]);
6043
6044         old_fs = get_fs();
6045         set_fs(USER_DS);
6046         old_cred = override_creds(ctx->creds);
6047
6048         timeout = jiffies + ctx->sq_thread_idle;
6049         while (!kthread_should_park()) {
6050                 unsigned int to_submit;
6051
6052                 if (!list_empty(&ctx->poll_list)) {
6053                         unsigned nr_events = 0;
6054
6055                         mutex_lock(&ctx->uring_lock);
6056                         if (!list_empty(&ctx->poll_list))
6057                                 io_iopoll_getevents(ctx, &nr_events, 0);
6058                         else
6059                                 timeout = jiffies + ctx->sq_thread_idle;
6060                         mutex_unlock(&ctx->uring_lock);
6061                 }
6062
6063                 to_submit = io_sqring_entries(ctx);
6064
6065                 /*
6066                  * If submit got -EBUSY, flag us as needing the application
6067                  * to enter the kernel to reap and flush events.
6068                  */
6069                 if (!to_submit || ret == -EBUSY) {
6070                         /*
6071                          * Drop cur_mm before scheduling, we can't hold it for
6072                          * long periods (or over schedule()). Do this before
6073                          * adding ourselves to the waitqueue, as the unuse/drop
6074                          * may sleep.
6075                          */
6076                         io_sq_thread_drop_mm(ctx);
6077
6078                         /*
6079                          * We're polling. If we're within the defined idle
6080                          * period, then let us spin without work before going
6081                          * to sleep. The exception is if we got EBUSY doing
6082                          * more IO, we should wait for the application to
6083                          * reap events and wake us up.
6084                          */
6085                         if (!list_empty(&ctx->poll_list) ||
6086                             (!time_after(jiffies, timeout) && ret != -EBUSY &&
6087                             !percpu_ref_is_dying(&ctx->refs))) {
6088                                 if (current->task_works)
6089                                         task_work_run();
6090                                 cond_resched();
6091                                 continue;
6092                         }
6093
6094                         prepare_to_wait(&ctx->sqo_wait, &wait,
6095                                                 TASK_INTERRUPTIBLE);
6096
6097                         /*
6098                          * While doing polled IO, before going to sleep, we need
6099                          * to check if there are new reqs added to poll_list, it
6100                          * is because reqs may have been punted to io worker and
6101                          * will be added to poll_list later, hence check the
6102                          * poll_list again.
6103                          */
6104                         if ((ctx->flags & IORING_SETUP_IOPOLL) &&
6105                             !list_empty_careful(&ctx->poll_list)) {
6106                                 finish_wait(&ctx->sqo_wait, &wait);
6107                                 continue;
6108                         }
6109
6110                         /* Tell userspace we may need a wakeup call */
6111                         ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
6112                         /* make sure to read SQ tail after writing flags */
6113                         smp_mb();
6114
6115                         to_submit = io_sqring_entries(ctx);
6116                         if (!to_submit || ret == -EBUSY) {
6117                                 if (kthread_should_park()) {
6118                                         finish_wait(&ctx->sqo_wait, &wait);
6119                                         break;
6120                                 }
6121                                 if (current->task_works) {
6122                                         task_work_run();
6123                                         finish_wait(&ctx->sqo_wait, &wait);
6124                                         continue;
6125                                 }
6126                                 if (signal_pending(current))
6127                                         flush_signals(current);
6128                                 schedule();
6129                                 finish_wait(&ctx->sqo_wait, &wait);
6130
6131                                 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6132                                 ret = 0;
6133                                 continue;
6134                         }
6135                         finish_wait(&ctx->sqo_wait, &wait);
6136
6137                         ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6138                 }
6139
6140                 mutex_lock(&ctx->uring_lock);
6141                 ret = io_submit_sqes(ctx, to_submit, NULL, -1, true);
6142                 mutex_unlock(&ctx->uring_lock);
6143                 timeout = jiffies + ctx->sq_thread_idle;
6144         }
6145
6146         if (current->task_works)
6147                 task_work_run();
6148
6149         set_fs(old_fs);
6150         io_sq_thread_drop_mm(ctx);
6151         revert_creds(old_cred);
6152
6153         kthread_parkme();
6154
6155         return 0;
6156 }
6157
6158 struct io_wait_queue {
6159         struct wait_queue_entry wq;
6160         struct io_ring_ctx *ctx;
6161         unsigned to_wait;
6162         unsigned nr_timeouts;
6163 };
6164
6165 static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush)
6166 {
6167         struct io_ring_ctx *ctx = iowq->ctx;
6168
6169         /*
6170          * Wake up if we have enough events, or if a timeout occurred since we
6171          * started waiting. For timeouts, we always want to return to userspace,
6172          * regardless of event count.
6173          */
6174         return io_cqring_events(ctx, noflush) >= iowq->to_wait ||
6175                         atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
6176 }
6177
6178 static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
6179                             int wake_flags, void *key)
6180 {
6181         struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
6182                                                         wq);
6183
6184         /* use noflush == true, as we can't safely rely on locking context */
6185         if (!io_should_wake(iowq, true))
6186                 return -1;
6187
6188         return autoremove_wake_function(curr, mode, wake_flags, key);
6189 }
6190
6191 /*
6192  * Wait until events become available, if we don't already have some. The
6193  * application must reap them itself, as they reside on the shared cq ring.
6194  */
6195 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
6196                           const sigset_t __user *sig, size_t sigsz)
6197 {
6198         struct io_wait_queue iowq = {
6199                 .wq = {
6200                         .private        = current,
6201                         .func           = io_wake_function,
6202                         .entry          = LIST_HEAD_INIT(iowq.wq.entry),
6203                 },
6204                 .ctx            = ctx,
6205                 .to_wait        = min_events,
6206         };
6207         struct io_rings *rings = ctx->rings;
6208         int ret = 0;
6209
6210         do {
6211                 if (io_cqring_events(ctx, false) >= min_events)
6212                         return 0;
6213                 if (!current->task_works)
6214                         break;
6215                 task_work_run();
6216         } while (1);
6217
6218         if (sig) {
6219 #ifdef CONFIG_COMPAT
6220                 if (in_compat_syscall())
6221                         ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
6222                                                       sigsz);
6223                 else
6224 #endif
6225                         ret = set_user_sigmask(sig, sigsz);
6226
6227                 if (ret)
6228                         return ret;
6229         }
6230
6231         iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
6232         trace_io_uring_cqring_wait(ctx, min_events);
6233         do {
6234                 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
6235                                                 TASK_INTERRUPTIBLE);
6236                 if (current->task_works)
6237                         task_work_run();
6238                 if (io_should_wake(&iowq, false))
6239                         break;
6240                 schedule();
6241                 if (signal_pending(current)) {
6242                         ret = -EINTR;
6243                         break;
6244                 }
6245         } while (1);
6246         finish_wait(&ctx->wait, &iowq.wq);
6247
6248         restore_saved_sigmask_unless(ret == -EINTR);
6249
6250         return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
6251 }
6252
6253 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
6254 {
6255 #if defined(CONFIG_UNIX)
6256         if (ctx->ring_sock) {
6257                 struct sock *sock = ctx->ring_sock->sk;
6258                 struct sk_buff *skb;
6259
6260                 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
6261                         kfree_skb(skb);
6262         }
6263 #else
6264         int i;
6265
6266         for (i = 0; i < ctx->nr_user_files; i++) {
6267                 struct file *file;
6268
6269                 file = io_file_from_index(ctx, i);
6270                 if (file)
6271                         fput(file);
6272         }
6273 #endif
6274 }
6275
6276 static void io_file_ref_kill(struct percpu_ref *ref)
6277 {
6278         struct fixed_file_data *data;
6279
6280         data = container_of(ref, struct fixed_file_data, refs);
6281         complete(&data->done);
6282 }
6283
6284 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
6285 {
6286         struct fixed_file_data *data = ctx->file_data;
6287         struct fixed_file_ref_node *ref_node = NULL;
6288         unsigned nr_tables, i;
6289         unsigned long flags;
6290
6291         if (!data)
6292                 return -ENXIO;
6293
6294         spin_lock_irqsave(&data->lock, flags);
6295         if (!list_empty(&data->ref_list))
6296                 ref_node = list_first_entry(&data->ref_list,
6297                                 struct fixed_file_ref_node, node);
6298         spin_unlock_irqrestore(&data->lock, flags);
6299         if (ref_node)
6300                 percpu_ref_kill(&ref_node->refs);
6301
6302         percpu_ref_kill(&data->refs);
6303
6304         /* wait for all refs nodes to complete */
6305         wait_for_completion(&data->done);
6306
6307         __io_sqe_files_unregister(ctx);
6308         nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
6309         for (i = 0; i < nr_tables; i++)
6310                 kfree(data->table[i].files);
6311         kfree(data->table);
6312         percpu_ref_exit(&data->refs);
6313         kfree(data);
6314         ctx->file_data = NULL;
6315         ctx->nr_user_files = 0;
6316         return 0;
6317 }
6318
6319 static void io_sq_thread_stop(struct io_ring_ctx *ctx)
6320 {
6321         if (ctx->sqo_thread) {
6322                 wait_for_completion(&ctx->completions[1]);
6323                 /*
6324                  * The park is a bit of a work-around, without it we get
6325                  * warning spews on shutdown with SQPOLL set and affinity
6326                  * set to a single CPU.
6327                  */
6328                 kthread_park(ctx->sqo_thread);
6329                 kthread_stop(ctx->sqo_thread);
6330                 ctx->sqo_thread = NULL;
6331         }
6332 }
6333
6334 static void io_finish_async(struct io_ring_ctx *ctx)
6335 {
6336         io_sq_thread_stop(ctx);
6337
6338         if (ctx->io_wq) {
6339                 io_wq_destroy(ctx->io_wq);
6340                 ctx->io_wq = NULL;
6341         }
6342 }
6343
6344 #if defined(CONFIG_UNIX)
6345 /*
6346  * Ensure the UNIX gc is aware of our file set, so we are certain that
6347  * the io_uring can be safely unregistered on process exit, even if we have
6348  * loops in the file referencing.
6349  */
6350 static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
6351 {
6352         struct sock *sk = ctx->ring_sock->sk;
6353         struct scm_fp_list *fpl;
6354         struct sk_buff *skb;
6355         int i, nr_files;
6356
6357         fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
6358         if (!fpl)
6359                 return -ENOMEM;
6360
6361         skb = alloc_skb(0, GFP_KERNEL);
6362         if (!skb) {
6363                 kfree(fpl);
6364                 return -ENOMEM;
6365         }
6366
6367         skb->sk = sk;
6368
6369         nr_files = 0;
6370         fpl->user = get_uid(ctx->user);
6371         for (i = 0; i < nr; i++) {
6372                 struct file *file = io_file_from_index(ctx, i + offset);
6373
6374                 if (!file)
6375                         continue;
6376                 fpl->fp[nr_files] = get_file(file);
6377                 unix_inflight(fpl->user, fpl->fp[nr_files]);
6378                 nr_files++;
6379         }
6380
6381         if (nr_files) {
6382                 fpl->max = SCM_MAX_FD;
6383                 fpl->count = nr_files;
6384                 UNIXCB(skb).fp = fpl;
6385                 skb->destructor = unix_destruct_scm;
6386                 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
6387                 skb_queue_head(&sk->sk_receive_queue, skb);
6388
6389                 for (i = 0; i < nr_files; i++)
6390                         fput(fpl->fp[i]);
6391         } else {
6392                 kfree_skb(skb);
6393                 kfree(fpl);
6394         }
6395
6396         return 0;
6397 }
6398
6399 /*
6400  * If UNIX sockets are enabled, fd passing can cause a reference cycle which
6401  * causes regular reference counting to break down. We rely on the UNIX
6402  * garbage collection to take care of this problem for us.
6403  */
6404 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
6405 {
6406         unsigned left, total;
6407         int ret = 0;
6408
6409         total = 0;
6410         left = ctx->nr_user_files;
6411         while (left) {
6412                 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
6413
6414                 ret = __io_sqe_files_scm(ctx, this_files, total);
6415                 if (ret)
6416                         break;
6417                 left -= this_files;
6418                 total += this_files;
6419         }
6420
6421         if (!ret)
6422                 return 0;
6423
6424         while (total < ctx->nr_user_files) {
6425                 struct file *file = io_file_from_index(ctx, total);
6426
6427                 if (file)
6428                         fput(file);
6429                 total++;
6430         }
6431
6432         return ret;
6433 }
6434 #else
6435 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
6436 {
6437         return 0;
6438 }
6439 #endif
6440
6441 static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
6442                                     unsigned nr_files)
6443 {
6444         int i;
6445
6446         for (i = 0; i < nr_tables; i++) {
6447                 struct fixed_file_table *table = &ctx->file_data->table[i];
6448                 unsigned this_files;
6449
6450                 this_files = min(nr_files, IORING_MAX_FILES_TABLE);
6451                 table->files = kcalloc(this_files, sizeof(struct file *),
6452                                         GFP_KERNEL);
6453                 if (!table->files)
6454                         break;
6455                 nr_files -= this_files;
6456         }
6457
6458         if (i == nr_tables)
6459                 return 0;
6460
6461         for (i = 0; i < nr_tables; i++) {
6462                 struct fixed_file_table *table = &ctx->file_data->table[i];
6463                 kfree(table->files);
6464         }
6465         return 1;
6466 }
6467
6468 static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
6469 {
6470 #if defined(CONFIG_UNIX)
6471         struct sock *sock = ctx->ring_sock->sk;
6472         struct sk_buff_head list, *head = &sock->sk_receive_queue;
6473         struct sk_buff *skb;
6474         int i;
6475
6476         __skb_queue_head_init(&list);
6477
6478         /*
6479          * Find the skb that holds this file in its SCM_RIGHTS. When found,
6480          * remove this entry and rearrange the file array.
6481          */
6482         skb = skb_dequeue(head);
6483         while (skb) {
6484                 struct scm_fp_list *fp;
6485
6486                 fp = UNIXCB(skb).fp;
6487                 for (i = 0; i < fp->count; i++) {
6488                         int left;
6489
6490                         if (fp->fp[i] != file)
6491                                 continue;
6492
6493                         unix_notinflight(fp->user, fp->fp[i]);
6494                         left = fp->count - 1 - i;
6495                         if (left) {
6496                                 memmove(&fp->fp[i], &fp->fp[i + 1],
6497                                                 left * sizeof(struct file *));
6498                         }
6499                         fp->count--;
6500                         if (!fp->count) {
6501                                 kfree_skb(skb);
6502                                 skb = NULL;
6503                         } else {
6504                                 __skb_queue_tail(&list, skb);
6505                         }
6506                         fput(file);
6507                         file = NULL;
6508                         break;
6509                 }
6510
6511                 if (!file)
6512                         break;
6513
6514                 __skb_queue_tail(&list, skb);
6515
6516                 skb = skb_dequeue(head);
6517         }
6518
6519         if (skb_peek(&list)) {
6520                 spin_lock_irq(&head->lock);
6521                 while ((skb = __skb_dequeue(&list)) != NULL)
6522                         __skb_queue_tail(head, skb);
6523                 spin_unlock_irq(&head->lock);
6524         }
6525 #else
6526         fput(file);
6527 #endif
6528 }
6529
6530 struct io_file_put {
6531         struct list_head list;
6532         struct file *file;
6533 };
6534
6535 static void io_file_put_work(struct work_struct *work)
6536 {
6537         struct fixed_file_ref_node *ref_node;
6538         struct fixed_file_data *file_data;
6539         struct io_ring_ctx *ctx;
6540         struct io_file_put *pfile, *tmp;
6541         unsigned long flags;
6542
6543         ref_node = container_of(work, struct fixed_file_ref_node, work);
6544         file_data = ref_node->file_data;
6545         ctx = file_data->ctx;
6546
6547         list_for_each_entry_safe(pfile, tmp, &ref_node->file_list, list) {
6548                 list_del_init(&pfile->list);
6549                 io_ring_file_put(ctx, pfile->file);
6550                 kfree(pfile);
6551         }
6552
6553         spin_lock_irqsave(&file_data->lock, flags);
6554         list_del_init(&ref_node->node);
6555         spin_unlock_irqrestore(&file_data->lock, flags);
6556
6557         percpu_ref_exit(&ref_node->refs);
6558         kfree(ref_node);
6559         percpu_ref_put(&file_data->refs);
6560 }
6561
6562 static void io_file_data_ref_zero(struct percpu_ref *ref)
6563 {
6564         struct fixed_file_ref_node *ref_node;
6565
6566         ref_node = container_of(ref, struct fixed_file_ref_node, refs);
6567
6568         queue_work(system_wq, &ref_node->work);
6569 }
6570
6571 static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
6572                         struct io_ring_ctx *ctx)
6573 {
6574         struct fixed_file_ref_node *ref_node;
6575
6576         ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
6577         if (!ref_node)
6578                 return ERR_PTR(-ENOMEM);
6579
6580         if (percpu_ref_init(&ref_node->refs, io_file_data_ref_zero,
6581                             0, GFP_KERNEL)) {
6582                 kfree(ref_node);
6583                 return ERR_PTR(-ENOMEM);
6584         }
6585         INIT_LIST_HEAD(&ref_node->node);
6586         INIT_LIST_HEAD(&ref_node->file_list);
6587         INIT_WORK(&ref_node->work, io_file_put_work);
6588         ref_node->file_data = ctx->file_data;
6589         return ref_node;
6590
6591 }
6592
6593 static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node)
6594 {
6595         percpu_ref_exit(&ref_node->refs);
6596         kfree(ref_node);
6597 }
6598
6599 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
6600                                  unsigned nr_args)
6601 {
6602         __s32 __user *fds = (__s32 __user *) arg;
6603         unsigned nr_tables;
6604         struct file *file;
6605         int fd, ret = 0;
6606         unsigned i;
6607         struct fixed_file_ref_node *ref_node;
6608         unsigned long flags;
6609
6610         if (ctx->file_data)
6611                 return -EBUSY;
6612         if (!nr_args)
6613                 return -EINVAL;
6614         if (nr_args > IORING_MAX_FIXED_FILES)
6615                 return -EMFILE;
6616
6617         ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL);
6618         if (!ctx->file_data)
6619                 return -ENOMEM;
6620         ctx->file_data->ctx = ctx;
6621         init_completion(&ctx->file_data->done);
6622         INIT_LIST_HEAD(&ctx->file_data->ref_list);
6623         spin_lock_init(&ctx->file_data->lock);
6624
6625         nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
6626         ctx->file_data->table = kcalloc(nr_tables,
6627                                         sizeof(struct fixed_file_table),
6628                                         GFP_KERNEL);
6629         if (!ctx->file_data->table) {
6630                 kfree(ctx->file_data);
6631                 ctx->file_data = NULL;
6632                 return -ENOMEM;
6633         }
6634
6635         if (percpu_ref_init(&ctx->file_data->refs, io_file_ref_kill,
6636                                 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
6637                 kfree(ctx->file_data->table);
6638                 kfree(ctx->file_data);
6639                 ctx->file_data = NULL;
6640                 return -ENOMEM;
6641         }
6642
6643         if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
6644                 percpu_ref_exit(&ctx->file_data->refs);
6645                 kfree(ctx->file_data->table);
6646                 kfree(ctx->file_data);
6647                 ctx->file_data = NULL;
6648                 return -ENOMEM;
6649         }
6650
6651         for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
6652                 struct fixed_file_table *table;
6653                 unsigned index;
6654
6655                 ret = -EFAULT;
6656                 if (copy_from_user(&fd, &fds[i], sizeof(fd)))
6657                         break;
6658                 /* allow sparse sets */
6659                 if (fd == -1) {
6660                         ret = 0;
6661                         continue;
6662                 }
6663
6664                 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
6665                 index = i & IORING_FILE_TABLE_MASK;
6666                 file = fget(fd);
6667
6668                 ret = -EBADF;
6669                 if (!file)
6670                         break;
6671
6672                 /*
6673                  * Don't allow io_uring instances to be registered. If UNIX
6674                  * isn't enabled, then this causes a reference cycle and this
6675                  * instance can never get freed. If UNIX is enabled we'll
6676                  * handle it just fine, but there's still no point in allowing
6677                  * a ring fd as it doesn't support regular read/write anyway.
6678                  */
6679                 if (file->f_op == &io_uring_fops) {
6680                         fput(file);
6681                         break;
6682                 }
6683                 ret = 0;
6684                 table->files[index] = file;
6685         }
6686
6687         if (ret) {
6688                 for (i = 0; i < ctx->nr_user_files; i++) {
6689                         file = io_file_from_index(ctx, i);
6690                         if (file)
6691                                 fput(file);
6692                 }
6693                 for (i = 0; i < nr_tables; i++)
6694                         kfree(ctx->file_data->table[i].files);
6695
6696                 kfree(ctx->file_data->table);
6697                 kfree(ctx->file_data);
6698                 ctx->file_data = NULL;
6699                 ctx->nr_user_files = 0;
6700                 return ret;
6701         }
6702
6703         ret = io_sqe_files_scm(ctx);
6704         if (ret) {
6705                 io_sqe_files_unregister(ctx);
6706                 return ret;
6707         }
6708
6709         ref_node = alloc_fixed_file_ref_node(ctx);
6710         if (IS_ERR(ref_node)) {
6711                 io_sqe_files_unregister(ctx);
6712                 return PTR_ERR(ref_node);
6713         }
6714
6715         ctx->file_data->cur_refs = &ref_node->refs;
6716         spin_lock_irqsave(&ctx->file_data->lock, flags);
6717         list_add(&ref_node->node, &ctx->file_data->ref_list);
6718         spin_unlock_irqrestore(&ctx->file_data->lock, flags);
6719         percpu_ref_get(&ctx->file_data->refs);
6720         return ret;
6721 }
6722
6723 static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
6724                                 int index)
6725 {
6726 #if defined(CONFIG_UNIX)
6727         struct sock *sock = ctx->ring_sock->sk;
6728         struct sk_buff_head *head = &sock->sk_receive_queue;
6729         struct sk_buff *skb;
6730
6731         /*
6732          * See if we can merge this file into an existing skb SCM_RIGHTS
6733          * file set. If there's no room, fall back to allocating a new skb
6734          * and filling it in.
6735          */
6736         spin_lock_irq(&head->lock);
6737         skb = skb_peek(head);
6738         if (skb) {
6739                 struct scm_fp_list *fpl = UNIXCB(skb).fp;
6740
6741                 if (fpl->count < SCM_MAX_FD) {
6742                         __skb_unlink(skb, head);
6743                         spin_unlock_irq(&head->lock);
6744                         fpl->fp[fpl->count] = get_file(file);
6745                         unix_inflight(fpl->user, fpl->fp[fpl->count]);
6746                         fpl->count++;
6747                         spin_lock_irq(&head->lock);
6748                         __skb_queue_head(head, skb);
6749                 } else {
6750                         skb = NULL;
6751                 }
6752         }
6753         spin_unlock_irq(&head->lock);
6754
6755         if (skb) {
6756                 fput(file);
6757                 return 0;
6758         }
6759
6760         return __io_sqe_files_scm(ctx, 1, index);
6761 #else
6762         return 0;
6763 #endif
6764 }
6765
6766 static int io_queue_file_removal(struct fixed_file_data *data,
6767                                  struct file *file)
6768 {
6769         struct io_file_put *pfile;
6770         struct percpu_ref *refs = data->cur_refs;
6771         struct fixed_file_ref_node *ref_node;
6772
6773         pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
6774         if (!pfile)
6775                 return -ENOMEM;
6776
6777         ref_node = container_of(refs, struct fixed_file_ref_node, refs);
6778         pfile->file = file;
6779         list_add(&pfile->list, &ref_node->file_list);
6780
6781         return 0;
6782 }
6783
6784 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
6785                                  struct io_uring_files_update *up,
6786                                  unsigned nr_args)
6787 {
6788         struct fixed_file_data *data = ctx->file_data;
6789         struct fixed_file_ref_node *ref_node;
6790         struct file *file;
6791         __s32 __user *fds;
6792         int fd, i, err;
6793         __u32 done;
6794         unsigned long flags;
6795         bool needs_switch = false;
6796
6797         if (check_add_overflow(up->offset, nr_args, &done))
6798                 return -EOVERFLOW;
6799         if (done > ctx->nr_user_files)
6800                 return -EINVAL;
6801
6802         ref_node = alloc_fixed_file_ref_node(ctx);
6803         if (IS_ERR(ref_node))
6804                 return PTR_ERR(ref_node);
6805
6806         done = 0;
6807         fds = u64_to_user_ptr(up->fds);
6808         while (nr_args) {
6809                 struct fixed_file_table *table;
6810                 unsigned index;
6811
6812                 err = 0;
6813                 if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
6814                         err = -EFAULT;
6815                         break;
6816                 }
6817                 i = array_index_nospec(up->offset, ctx->nr_user_files);
6818                 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
6819                 index = i & IORING_FILE_TABLE_MASK;
6820                 if (table->files[index]) {
6821                         file = io_file_from_index(ctx, index);
6822                         err = io_queue_file_removal(data, file);
6823                         if (err)
6824                                 break;
6825                         table->files[index] = NULL;
6826                         needs_switch = true;
6827                 }
6828                 if (fd != -1) {
6829                         file = fget(fd);
6830                         if (!file) {
6831                                 err = -EBADF;
6832                                 break;
6833                         }
6834                         /*
6835                          * Don't allow io_uring instances to be registered. If
6836                          * UNIX isn't enabled, then this causes a reference
6837                          * cycle and this instance can never get freed. If UNIX
6838                          * is enabled we'll handle it just fine, but there's
6839                          * still no point in allowing a ring fd as it doesn't
6840                          * support regular read/write anyway.
6841                          */
6842                         if (file->f_op == &io_uring_fops) {
6843                                 fput(file);
6844                                 err = -EBADF;
6845                                 break;
6846                         }
6847                         table->files[index] = file;
6848                         err = io_sqe_file_register(ctx, file, i);
6849                         if (err)
6850                                 break;
6851                 }
6852                 nr_args--;
6853                 done++;
6854                 up->offset++;
6855         }
6856
6857         if (needs_switch) {
6858                 percpu_ref_kill(data->cur_refs);
6859                 spin_lock_irqsave(&data->lock, flags);
6860                 list_add(&ref_node->node, &data->ref_list);
6861                 data->cur_refs = &ref_node->refs;
6862                 spin_unlock_irqrestore(&data->lock, flags);
6863                 percpu_ref_get(&ctx->file_data->refs);
6864         } else
6865                 destroy_fixed_file_ref_node(ref_node);
6866
6867         return done ? done : err;
6868 }
6869
6870 static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
6871                                unsigned nr_args)
6872 {
6873         struct io_uring_files_update up;
6874
6875         if (!ctx->file_data)
6876                 return -ENXIO;
6877         if (!nr_args)
6878                 return -EINVAL;
6879         if (copy_from_user(&up, arg, sizeof(up)))
6880                 return -EFAULT;
6881         if (up.resv)
6882                 return -EINVAL;
6883
6884         return __io_sqe_files_update(ctx, &up, nr_args);
6885 }
6886
6887 static void io_free_work(struct io_wq_work *work)
6888 {
6889         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6890
6891         /* Consider that io_steal_work() relies on this ref */
6892         io_put_req(req);
6893 }
6894
6895 static int io_init_wq_offload(struct io_ring_ctx *ctx,
6896                               struct io_uring_params *p)
6897 {
6898         struct io_wq_data data;
6899         struct fd f;
6900         struct io_ring_ctx *ctx_attach;
6901         unsigned int concurrency;
6902         int ret = 0;
6903
6904         data.user = ctx->user;
6905         data.free_work = io_free_work;
6906
6907         if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
6908                 /* Do QD, or 4 * CPUS, whatever is smallest */
6909                 concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
6910
6911                 ctx->io_wq = io_wq_create(concurrency, &data);
6912                 if (IS_ERR(ctx->io_wq)) {
6913                         ret = PTR_ERR(ctx->io_wq);
6914                         ctx->io_wq = NULL;
6915                 }
6916                 return ret;
6917         }
6918
6919         f = fdget(p->wq_fd);
6920         if (!f.file)
6921                 return -EBADF;
6922
6923         if (f.file->f_op != &io_uring_fops) {
6924                 ret = -EINVAL;
6925                 goto out_fput;
6926         }
6927
6928         ctx_attach = f.file->private_data;
6929         /* @io_wq is protected by holding the fd */
6930         if (!io_wq_get(ctx_attach->io_wq, &data)) {
6931                 ret = -EINVAL;
6932                 goto out_fput;
6933         }
6934
6935         ctx->io_wq = ctx_attach->io_wq;
6936 out_fput:
6937         fdput(f);
6938         return ret;
6939 }
6940
6941 static int io_sq_offload_start(struct io_ring_ctx *ctx,
6942                                struct io_uring_params *p)
6943 {
6944         int ret;
6945
6946         mmgrab(current->mm);
6947         ctx->sqo_mm = current->mm;
6948
6949         if (ctx->flags & IORING_SETUP_SQPOLL) {
6950                 ret = -EPERM;
6951                 if (!capable(CAP_SYS_ADMIN))
6952                         goto err;
6953
6954                 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
6955                 if (!ctx->sq_thread_idle)
6956                         ctx->sq_thread_idle = HZ;
6957
6958                 if (p->flags & IORING_SETUP_SQ_AFF) {
6959                         int cpu = p->sq_thread_cpu;
6960
6961                         ret = -EINVAL;
6962                         if (cpu >= nr_cpu_ids)
6963                                 goto err;
6964                         if (!cpu_online(cpu))
6965                                 goto err;
6966
6967                         ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
6968                                                         ctx, cpu,
6969                                                         "io_uring-sq");
6970                 } else {
6971                         ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
6972                                                         "io_uring-sq");
6973                 }
6974                 if (IS_ERR(ctx->sqo_thread)) {
6975                         ret = PTR_ERR(ctx->sqo_thread);
6976                         ctx->sqo_thread = NULL;
6977                         goto err;
6978                 }
6979                 wake_up_process(ctx->sqo_thread);
6980         } else if (p->flags & IORING_SETUP_SQ_AFF) {
6981                 /* Can't have SQ_AFF without SQPOLL */
6982                 ret = -EINVAL;
6983                 goto err;
6984         }
6985
6986         ret = io_init_wq_offload(ctx, p);
6987         if (ret)
6988                 goto err;
6989
6990         return 0;
6991 err:
6992         io_finish_async(ctx);
6993         mmdrop(ctx->sqo_mm);
6994         ctx->sqo_mm = NULL;
6995         return ret;
6996 }
6997
6998 static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
6999 {
7000         atomic_long_sub(nr_pages, &user->locked_vm);
7001 }
7002
7003 static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
7004 {
7005         unsigned long page_limit, cur_pages, new_pages;
7006
7007         /* Don't allow more pages than we can safely lock */
7008         page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
7009
7010         do {
7011                 cur_pages = atomic_long_read(&user->locked_vm);
7012                 new_pages = cur_pages + nr_pages;
7013                 if (new_pages > page_limit)
7014                         return -ENOMEM;
7015         } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
7016                                         new_pages) != cur_pages);
7017
7018         return 0;
7019 }
7020
7021 static void io_mem_free(void *ptr)
7022 {
7023         struct page *page;
7024
7025         if (!ptr)
7026                 return;
7027
7028         page = virt_to_head_page(ptr);
7029         if (put_page_testzero(page))
7030                 free_compound_page(page);
7031 }
7032
7033 static void *io_mem_alloc(size_t size)
7034 {
7035         gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
7036                                 __GFP_NORETRY;
7037
7038         return (void *) __get_free_pages(gfp_flags, get_order(size));
7039 }
7040
7041 static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
7042                                 size_t *sq_offset)
7043 {
7044         struct io_rings *rings;
7045         size_t off, sq_array_size;
7046
7047         off = struct_size(rings, cqes, cq_entries);
7048         if (off == SIZE_MAX)
7049                 return SIZE_MAX;
7050
7051 #ifdef CONFIG_SMP
7052         off = ALIGN(off, SMP_CACHE_BYTES);
7053         if (off == 0)
7054                 return SIZE_MAX;
7055 #endif
7056
7057         sq_array_size = array_size(sizeof(u32), sq_entries);
7058         if (sq_array_size == SIZE_MAX)
7059                 return SIZE_MAX;
7060
7061         if (check_add_overflow(off, sq_array_size, &off))
7062                 return SIZE_MAX;
7063
7064         if (sq_offset)
7065                 *sq_offset = off;
7066
7067         return off;
7068 }
7069
7070 static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
7071 {
7072         size_t pages;
7073
7074         pages = (size_t)1 << get_order(
7075                 rings_size(sq_entries, cq_entries, NULL));
7076         pages += (size_t)1 << get_order(
7077                 array_size(sizeof(struct io_uring_sqe), sq_entries));
7078
7079         return pages;
7080 }
7081
7082 static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
7083 {
7084         int i, j;
7085
7086         if (!ctx->user_bufs)
7087                 return -ENXIO;
7088
7089         for (i = 0; i < ctx->nr_user_bufs; i++) {
7090                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
7091
7092                 for (j = 0; j < imu->nr_bvecs; j++)
7093                         unpin_user_page(imu->bvec[j].bv_page);
7094
7095                 if (ctx->account_mem)
7096                         io_unaccount_mem(ctx->user, imu->nr_bvecs);
7097                 kvfree(imu->bvec);
7098                 imu->nr_bvecs = 0;
7099         }
7100
7101         kfree(ctx->user_bufs);
7102         ctx->user_bufs = NULL;
7103         ctx->nr_user_bufs = 0;
7104         return 0;
7105 }
7106
7107 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
7108                        void __user *arg, unsigned index)
7109 {
7110         struct iovec __user *src;
7111
7112 #ifdef CONFIG_COMPAT
7113         if (ctx->compat) {
7114                 struct compat_iovec __user *ciovs;
7115                 struct compat_iovec ciov;
7116
7117                 ciovs = (struct compat_iovec __user *) arg;
7118                 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
7119                         return -EFAULT;
7120
7121                 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
7122                 dst->iov_len = ciov.iov_len;
7123                 return 0;
7124         }
7125 #endif
7126         src = (struct iovec __user *) arg;
7127         if (copy_from_user(dst, &src[index], sizeof(*dst)))
7128                 return -EFAULT;
7129         return 0;
7130 }
7131
7132 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
7133                                   unsigned nr_args)
7134 {
7135         struct vm_area_struct **vmas = NULL;
7136         struct page **pages = NULL;
7137         int i, j, got_pages = 0;
7138         int ret = -EINVAL;
7139
7140         if (ctx->user_bufs)
7141                 return -EBUSY;
7142         if (!nr_args || nr_args > UIO_MAXIOV)
7143                 return -EINVAL;
7144
7145         ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
7146                                         GFP_KERNEL);
7147         if (!ctx->user_bufs)
7148                 return -ENOMEM;
7149
7150         for (i = 0; i < nr_args; i++) {
7151                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
7152                 unsigned long off, start, end, ubuf;
7153                 int pret, nr_pages;
7154                 struct iovec iov;
7155                 size_t size;
7156
7157                 ret = io_copy_iov(ctx, &iov, arg, i);
7158                 if (ret)
7159                         goto err;
7160
7161                 /*
7162                  * Don't impose further limits on the size and buffer
7163                  * constraints here, we'll -EINVAL later when IO is
7164                  * submitted if they are wrong.
7165                  */
7166                 ret = -EFAULT;
7167                 if (!iov.iov_base || !iov.iov_len)
7168                         goto err;
7169
7170                 /* arbitrary limit, but we need something */
7171                 if (iov.iov_len > SZ_1G)
7172                         goto err;
7173
7174                 ubuf = (unsigned long) iov.iov_base;
7175                 end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
7176                 start = ubuf >> PAGE_SHIFT;
7177                 nr_pages = end - start;
7178
7179                 if (ctx->account_mem) {
7180                         ret = io_account_mem(ctx->user, nr_pages);
7181                         if (ret)
7182                                 goto err;
7183                 }
7184
7185                 ret = 0;
7186                 if (!pages || nr_pages > got_pages) {
7187                         kvfree(vmas);
7188                         kvfree(pages);
7189                         pages = kvmalloc_array(nr_pages, sizeof(struct page *),
7190                                                 GFP_KERNEL);
7191                         vmas = kvmalloc_array(nr_pages,
7192                                         sizeof(struct vm_area_struct *),
7193                                         GFP_KERNEL);
7194                         if (!pages || !vmas) {
7195                                 ret = -ENOMEM;
7196                                 if (ctx->account_mem)
7197                                         io_unaccount_mem(ctx->user, nr_pages);
7198                                 goto err;
7199                         }
7200                         got_pages = nr_pages;
7201                 }
7202
7203                 imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
7204                                                 GFP_KERNEL);
7205                 ret = -ENOMEM;
7206                 if (!imu->bvec) {
7207                         if (ctx->account_mem)
7208                                 io_unaccount_mem(ctx->user, nr_pages);
7209                         goto err;
7210                 }
7211
7212                 ret = 0;
7213                 down_read(&current->mm->mmap_sem);
7214                 pret = pin_user_pages(ubuf, nr_pages,
7215                                       FOLL_WRITE | FOLL_LONGTERM,
7216                                       pages, vmas);
7217                 if (pret == nr_pages) {
7218                         /* don't support file backed memory */
7219                         for (j = 0; j < nr_pages; j++) {
7220                                 struct vm_area_struct *vma = vmas[j];
7221
7222                                 if (vma->vm_file &&
7223                                     !is_file_hugepages(vma->vm_file)) {
7224                                         ret = -EOPNOTSUPP;
7225                                         break;
7226                                 }
7227                         }
7228                 } else {
7229                         ret = pret < 0 ? pret : -EFAULT;
7230                 }
7231                 up_read(&current->mm->mmap_sem);
7232                 if (ret) {
7233                         /*
7234                          * if we did partial map, or found file backed vmas,
7235                          * release any pages we did get
7236                          */
7237                         if (pret > 0)
7238                                 unpin_user_pages(pages, pret);
7239                         if (ctx->account_mem)
7240                                 io_unaccount_mem(ctx->user, nr_pages);
7241                         kvfree(imu->bvec);
7242                         goto err;
7243                 }
7244
7245                 off = ubuf & ~PAGE_MASK;
7246                 size = iov.iov_len;
7247                 for (j = 0; j < nr_pages; j++) {
7248                         size_t vec_len;
7249
7250                         vec_len = min_t(size_t, size, PAGE_SIZE - off);
7251                         imu->bvec[j].bv_page = pages[j];
7252                         imu->bvec[j].bv_len = vec_len;
7253                         imu->bvec[j].bv_offset = off;
7254                         off = 0;
7255                         size -= vec_len;
7256                 }
7257                 /* store original address for later verification */
7258                 imu->ubuf = ubuf;
7259                 imu->len = iov.iov_len;
7260                 imu->nr_bvecs = nr_pages;
7261
7262                 ctx->nr_user_bufs++;
7263         }
7264         kvfree(pages);
7265         kvfree(vmas);
7266         return 0;
7267 err:
7268         kvfree(pages);
7269         kvfree(vmas);
7270         io_sqe_buffer_unregister(ctx);
7271         return ret;
7272 }
7273
7274 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
7275 {
7276         __s32 __user *fds = arg;
7277         int fd;
7278
7279         if (ctx->cq_ev_fd)
7280                 return -EBUSY;
7281
7282         if (copy_from_user(&fd, fds, sizeof(*fds)))
7283                 return -EFAULT;
7284
7285         ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
7286         if (IS_ERR(ctx->cq_ev_fd)) {
7287                 int ret = PTR_ERR(ctx->cq_ev_fd);
7288                 ctx->cq_ev_fd = NULL;
7289                 return ret;
7290         }
7291
7292         return 0;
7293 }
7294
7295 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
7296 {
7297         if (ctx->cq_ev_fd) {
7298                 eventfd_ctx_put(ctx->cq_ev_fd);
7299                 ctx->cq_ev_fd = NULL;
7300                 return 0;
7301         }
7302
7303         return -ENXIO;
7304 }
7305
7306 static int __io_destroy_buffers(int id, void *p, void *data)
7307 {
7308         struct io_ring_ctx *ctx = data;
7309         struct io_buffer *buf = p;
7310
7311         __io_remove_buffers(ctx, buf, id, -1U);
7312         return 0;
7313 }
7314
7315 static void io_destroy_buffers(struct io_ring_ctx *ctx)
7316 {
7317         idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx);
7318         idr_destroy(&ctx->io_buffer_idr);
7319 }
7320
7321 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
7322 {
7323         io_finish_async(ctx);
7324         if (ctx->sqo_mm)
7325                 mmdrop(ctx->sqo_mm);
7326
7327         io_iopoll_reap_events(ctx);
7328         io_sqe_buffer_unregister(ctx);
7329         io_sqe_files_unregister(ctx);
7330         io_eventfd_unregister(ctx);
7331         io_destroy_buffers(ctx);
7332         idr_destroy(&ctx->personality_idr);
7333
7334 #if defined(CONFIG_UNIX)
7335         if (ctx->ring_sock) {
7336                 ctx->ring_sock->file = NULL; /* so that iput() is called */
7337                 sock_release(ctx->ring_sock);
7338         }
7339 #endif
7340
7341         io_mem_free(ctx->rings);
7342         io_mem_free(ctx->sq_sqes);
7343
7344         percpu_ref_exit(&ctx->refs);
7345         if (ctx->account_mem)
7346                 io_unaccount_mem(ctx->user,
7347                                 ring_pages(ctx->sq_entries, ctx->cq_entries));
7348         free_uid(ctx->user);
7349         put_cred(ctx->creds);
7350         kfree(ctx->completions);
7351         kfree(ctx->cancel_hash);
7352         kmem_cache_free(req_cachep, ctx->fallback_req);
7353         kfree(ctx);
7354 }
7355
7356 static __poll_t io_uring_poll(struct file *file, poll_table *wait)
7357 {
7358         struct io_ring_ctx *ctx = file->private_data;
7359         __poll_t mask = 0;
7360
7361         poll_wait(file, &ctx->cq_wait, wait);
7362         /*
7363          * synchronizes with barrier from wq_has_sleeper call in
7364          * io_commit_cqring
7365          */
7366         smp_rmb();
7367         if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
7368             ctx->rings->sq_ring_entries)
7369                 mask |= EPOLLOUT | EPOLLWRNORM;
7370         if (io_cqring_events(ctx, false))
7371                 mask |= EPOLLIN | EPOLLRDNORM;
7372
7373         return mask;
7374 }
7375
7376 static int io_uring_fasync(int fd, struct file *file, int on)
7377 {
7378         struct io_ring_ctx *ctx = file->private_data;
7379
7380         return fasync_helper(fd, file, on, &ctx->cq_fasync);
7381 }
7382
7383 static int io_remove_personalities(int id, void *p, void *data)
7384 {
7385         struct io_ring_ctx *ctx = data;
7386         const struct cred *cred;
7387
7388         cred = idr_remove(&ctx->personality_idr, id);
7389         if (cred)
7390                 put_cred(cred);
7391         return 0;
7392 }
7393
7394 static void io_ring_exit_work(struct work_struct *work)
7395 {
7396         struct io_ring_ctx *ctx;
7397
7398         ctx = container_of(work, struct io_ring_ctx, exit_work);
7399         if (ctx->rings)
7400                 io_cqring_overflow_flush(ctx, true);
7401
7402         /*
7403          * If we're doing polled IO and end up having requests being
7404          * submitted async (out-of-line), then completions can come in while
7405          * we're waiting for refs to drop. We need to reap these manually,
7406          * as nobody else will be looking for them.
7407          */
7408         while (!wait_for_completion_timeout(&ctx->completions[0], HZ/20)) {
7409                 io_iopoll_reap_events(ctx);
7410                 if (ctx->rings)
7411                         io_cqring_overflow_flush(ctx, true);
7412         }
7413         io_ring_ctx_free(ctx);
7414 }
7415
7416 static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
7417 {
7418         mutex_lock(&ctx->uring_lock);
7419         percpu_ref_kill(&ctx->refs);
7420         mutex_unlock(&ctx->uring_lock);
7421
7422         /*
7423          * Wait for sq thread to idle, if we have one. It won't spin on new
7424          * work after we've killed the ctx ref above. This is important to do
7425          * before we cancel existing commands, as the thread could otherwise
7426          * be queueing new work post that. If that's work we need to cancel,
7427          * it could cause shutdown to hang.
7428          */
7429         while (ctx->sqo_thread && !wq_has_sleeper(&ctx->sqo_wait))
7430                 cond_resched();
7431
7432         io_kill_timeouts(ctx);
7433         io_poll_remove_all(ctx);
7434
7435         if (ctx->io_wq)
7436                 io_wq_cancel_all(ctx->io_wq);
7437
7438         io_iopoll_reap_events(ctx);
7439         /* if we failed setting up the ctx, we might not have any rings */
7440         if (ctx->rings)
7441                 io_cqring_overflow_flush(ctx, true);
7442         idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
7443         INIT_WORK(&ctx->exit_work, io_ring_exit_work);
7444         queue_work(system_wq, &ctx->exit_work);
7445 }
7446
7447 static int io_uring_release(struct inode *inode, struct file *file)
7448 {
7449         struct io_ring_ctx *ctx = file->private_data;
7450
7451         file->private_data = NULL;
7452         io_ring_ctx_wait_and_kill(ctx);
7453         return 0;
7454 }
7455
7456 static void io_uring_cancel_files(struct io_ring_ctx *ctx,
7457                                   struct files_struct *files)
7458 {
7459         while (!list_empty_careful(&ctx->inflight_list)) {
7460                 struct io_kiocb *cancel_req = NULL, *req;
7461                 DEFINE_WAIT(wait);
7462
7463                 spin_lock_irq(&ctx->inflight_lock);
7464                 list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
7465                         if (req->work.files != files)
7466                                 continue;
7467                         /* req is being completed, ignore */
7468                         if (!refcount_inc_not_zero(&req->refs))
7469                                 continue;
7470                         cancel_req = req;
7471                         break;
7472                 }
7473                 if (cancel_req)
7474                         prepare_to_wait(&ctx->inflight_wait, &wait,
7475                                                 TASK_UNINTERRUPTIBLE);
7476                 spin_unlock_irq(&ctx->inflight_lock);
7477
7478                 /* We need to keep going until we don't find a matching req */
7479                 if (!cancel_req)
7480                         break;
7481
7482                 if (cancel_req->flags & REQ_F_OVERFLOW) {
7483                         spin_lock_irq(&ctx->completion_lock);
7484                         list_del(&cancel_req->list);
7485                         cancel_req->flags &= ~REQ_F_OVERFLOW;
7486                         if (list_empty(&ctx->cq_overflow_list)) {
7487                                 clear_bit(0, &ctx->sq_check_overflow);
7488                                 clear_bit(0, &ctx->cq_check_overflow);
7489                         }
7490                         spin_unlock_irq(&ctx->completion_lock);
7491
7492                         WRITE_ONCE(ctx->rings->cq_overflow,
7493                                 atomic_inc_return(&ctx->cached_cq_overflow));
7494
7495                         /*
7496                          * Put inflight ref and overflow ref. If that's
7497                          * all we had, then we're done with this request.
7498                          */
7499                         if (refcount_sub_and_test(2, &cancel_req->refs)) {
7500                                 io_free_req(cancel_req);
7501                                 finish_wait(&ctx->inflight_wait, &wait);
7502                                 continue;
7503                         }
7504                 } else {
7505                         io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
7506                         io_put_req(cancel_req);
7507                 }
7508
7509                 schedule();
7510                 finish_wait(&ctx->inflight_wait, &wait);
7511         }
7512 }
7513
7514 static int io_uring_flush(struct file *file, void *data)
7515 {
7516         struct io_ring_ctx *ctx = file->private_data;
7517
7518         io_uring_cancel_files(ctx, data);
7519
7520         /*
7521          * If the task is going away, cancel work it may have pending
7522          */
7523         if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
7524                 io_wq_cancel_pid(ctx->io_wq, task_pid_vnr(current));
7525
7526         return 0;
7527 }
7528
7529 static void *io_uring_validate_mmap_request(struct file *file,
7530                                             loff_t pgoff, size_t sz)
7531 {
7532         struct io_ring_ctx *ctx = file->private_data;
7533         loff_t offset = pgoff << PAGE_SHIFT;
7534         struct page *page;
7535         void *ptr;
7536
7537         switch (offset) {
7538         case IORING_OFF_SQ_RING:
7539         case IORING_OFF_CQ_RING:
7540                 ptr = ctx->rings;
7541                 break;
7542         case IORING_OFF_SQES:
7543                 ptr = ctx->sq_sqes;
7544                 break;
7545         default:
7546                 return ERR_PTR(-EINVAL);
7547         }
7548
7549         page = virt_to_head_page(ptr);
7550         if (sz > page_size(page))
7551                 return ERR_PTR(-EINVAL);
7552
7553         return ptr;
7554 }
7555
7556 #ifdef CONFIG_MMU
7557
7558 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
7559 {
7560         size_t sz = vma->vm_end - vma->vm_start;
7561         unsigned long pfn;
7562         void *ptr;
7563
7564         ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
7565         if (IS_ERR(ptr))
7566                 return PTR_ERR(ptr);
7567
7568         pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
7569         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
7570 }
7571
7572 #else /* !CONFIG_MMU */
7573
7574 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
7575 {
7576         return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
7577 }
7578
7579 static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
7580 {
7581         return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
7582 }
7583
7584 static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
7585         unsigned long addr, unsigned long len,
7586         unsigned long pgoff, unsigned long flags)
7587 {
7588         void *ptr;
7589
7590         ptr = io_uring_validate_mmap_request(file, pgoff, len);
7591         if (IS_ERR(ptr))
7592                 return PTR_ERR(ptr);
7593
7594         return (unsigned long) ptr;
7595 }
7596
7597 #endif /* !CONFIG_MMU */
7598
7599 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
7600                 u32, min_complete, u32, flags, const sigset_t __user *, sig,
7601                 size_t, sigsz)
7602 {
7603         struct io_ring_ctx *ctx;
7604         long ret = -EBADF;
7605         int submitted = 0;
7606         struct fd f;
7607
7608         if (current->task_works)
7609                 task_work_run();
7610
7611         if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
7612                 return -EINVAL;
7613
7614         f = fdget(fd);
7615         if (!f.file)
7616                 return -EBADF;
7617
7618         ret = -EOPNOTSUPP;
7619         if (f.file->f_op != &io_uring_fops)
7620                 goto out_fput;
7621
7622         ret = -ENXIO;
7623         ctx = f.file->private_data;
7624         if (!percpu_ref_tryget(&ctx->refs))
7625                 goto out_fput;
7626
7627         /*
7628          * For SQ polling, the thread will do all submissions and completions.
7629          * Just return the requested submit count, and wake the thread if
7630          * we were asked to.
7631          */
7632         ret = 0;
7633         if (ctx->flags & IORING_SETUP_SQPOLL) {
7634                 if (!list_empty_careful(&ctx->cq_overflow_list))
7635                         io_cqring_overflow_flush(ctx, false);
7636                 if (flags & IORING_ENTER_SQ_WAKEUP)
7637                         wake_up(&ctx->sqo_wait);
7638                 submitted = to_submit;
7639         } else if (to_submit) {
7640                 mutex_lock(&ctx->uring_lock);
7641                 submitted = io_submit_sqes(ctx, to_submit, f.file, fd, false);
7642                 mutex_unlock(&ctx->uring_lock);
7643
7644                 if (submitted != to_submit)
7645                         goto out;
7646         }
7647         if (flags & IORING_ENTER_GETEVENTS) {
7648                 unsigned nr_events = 0;
7649
7650                 min_complete = min(min_complete, ctx->cq_entries);
7651
7652                 /*
7653                  * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
7654                  * space applications don't need to do io completion events
7655                  * polling again, they can rely on io_sq_thread to do polling
7656                  * work, which can reduce cpu usage and uring_lock contention.
7657                  */
7658                 if (ctx->flags & IORING_SETUP_IOPOLL &&
7659                     !(ctx->flags & IORING_SETUP_SQPOLL)) {
7660                         ret = io_iopoll_check(ctx, &nr_events, min_complete);
7661                 } else {
7662                         ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
7663                 }
7664         }
7665
7666 out:
7667         percpu_ref_put(&ctx->refs);
7668 out_fput:
7669         fdput(f);
7670         return submitted ? submitted : ret;
7671 }
7672
7673 #ifdef CONFIG_PROC_FS
7674 static int io_uring_show_cred(int id, void *p, void *data)
7675 {
7676         const struct cred *cred = p;
7677         struct seq_file *m = data;
7678         struct user_namespace *uns = seq_user_ns(m);
7679         struct group_info *gi;
7680         kernel_cap_t cap;
7681         unsigned __capi;
7682         int g;
7683
7684         seq_printf(m, "%5d\n", id);
7685         seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
7686         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
7687         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
7688         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
7689         seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
7690         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
7691         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
7692         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
7693         seq_puts(m, "\n\tGroups:\t");
7694         gi = cred->group_info;
7695         for (g = 0; g < gi->ngroups; g++) {
7696                 seq_put_decimal_ull(m, g ? " " : "",
7697                                         from_kgid_munged(uns, gi->gid[g]));
7698         }
7699         seq_puts(m, "\n\tCapEff:\t");
7700         cap = cred->cap_effective;
7701         CAP_FOR_EACH_U32(__capi)
7702                 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
7703         seq_putc(m, '\n');
7704         return 0;
7705 }
7706
7707 static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
7708 {
7709         int i;
7710
7711         mutex_lock(&ctx->uring_lock);
7712         seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
7713         for (i = 0; i < ctx->nr_user_files; i++) {
7714                 struct fixed_file_table *table;
7715                 struct file *f;
7716
7717                 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
7718                 f = table->files[i & IORING_FILE_TABLE_MASK];
7719                 if (f)
7720                         seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
7721                 else
7722                         seq_printf(m, "%5u: <none>\n", i);
7723         }
7724         seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
7725         for (i = 0; i < ctx->nr_user_bufs; i++) {
7726                 struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
7727
7728                 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
7729                                                 (unsigned int) buf->len);
7730         }
7731         if (!idr_is_empty(&ctx->personality_idr)) {
7732                 seq_printf(m, "Personalities:\n");
7733                 idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
7734         }
7735         seq_printf(m, "PollList:\n");
7736         spin_lock_irq(&ctx->completion_lock);
7737         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
7738                 struct hlist_head *list = &ctx->cancel_hash[i];
7739                 struct io_kiocb *req;
7740
7741                 hlist_for_each_entry(req, list, hash_node)
7742                         seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
7743                                         req->task->task_works != NULL);
7744         }
7745         spin_unlock_irq(&ctx->completion_lock);
7746         mutex_unlock(&ctx->uring_lock);
7747 }
7748
7749 static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
7750 {
7751         struct io_ring_ctx *ctx = f->private_data;
7752
7753         if (percpu_ref_tryget(&ctx->refs)) {
7754                 __io_uring_show_fdinfo(ctx, m);
7755                 percpu_ref_put(&ctx->refs);
7756         }
7757 }
7758 #endif
7759
7760 static const struct file_operations io_uring_fops = {
7761         .release        = io_uring_release,
7762         .flush          = io_uring_flush,
7763         .mmap           = io_uring_mmap,
7764 #ifndef CONFIG_MMU
7765         .get_unmapped_area = io_uring_nommu_get_unmapped_area,
7766         .mmap_capabilities = io_uring_nommu_mmap_capabilities,
7767 #endif
7768         .poll           = io_uring_poll,
7769         .fasync         = io_uring_fasync,
7770 #ifdef CONFIG_PROC_FS
7771         .show_fdinfo    = io_uring_show_fdinfo,
7772 #endif
7773 };
7774
7775 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
7776                                   struct io_uring_params *p)
7777 {
7778         struct io_rings *rings;
7779         size_t size, sq_array_offset;
7780
7781         size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
7782         if (size == SIZE_MAX)
7783                 return -EOVERFLOW;
7784
7785         rings = io_mem_alloc(size);
7786         if (!rings)
7787                 return -ENOMEM;
7788
7789         ctx->rings = rings;
7790         ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
7791         rings->sq_ring_mask = p->sq_entries - 1;
7792         rings->cq_ring_mask = p->cq_entries - 1;
7793         rings->sq_ring_entries = p->sq_entries;
7794         rings->cq_ring_entries = p->cq_entries;
7795         ctx->sq_mask = rings->sq_ring_mask;
7796         ctx->cq_mask = rings->cq_ring_mask;
7797         ctx->sq_entries = rings->sq_ring_entries;
7798         ctx->cq_entries = rings->cq_ring_entries;
7799
7800         size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
7801         if (size == SIZE_MAX) {
7802                 io_mem_free(ctx->rings);
7803                 ctx->rings = NULL;
7804                 return -EOVERFLOW;
7805         }
7806
7807         ctx->sq_sqes = io_mem_alloc(size);
7808         if (!ctx->sq_sqes) {
7809                 io_mem_free(ctx->rings);
7810                 ctx->rings = NULL;
7811                 return -ENOMEM;
7812         }
7813
7814         return 0;
7815 }
7816
7817 /*
7818  * Allocate an anonymous fd, this is what constitutes the application
7819  * visible backing of an io_uring instance. The application mmaps this
7820  * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
7821  * we have to tie this fd to a socket for file garbage collection purposes.
7822  */
7823 static int io_uring_get_fd(struct io_ring_ctx *ctx)
7824 {
7825         struct file *file;
7826         int ret;
7827
7828 #if defined(CONFIG_UNIX)
7829         ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
7830                                 &ctx->ring_sock);
7831         if (ret)
7832                 return ret;
7833 #endif
7834
7835         ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
7836         if (ret < 0)
7837                 goto err;
7838
7839         file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
7840                                         O_RDWR | O_CLOEXEC);
7841         if (IS_ERR(file)) {
7842                 put_unused_fd(ret);
7843                 ret = PTR_ERR(file);
7844                 goto err;
7845         }
7846
7847 #if defined(CONFIG_UNIX)
7848         ctx->ring_sock->file = file;
7849 #endif
7850         fd_install(ret, file);
7851         return ret;
7852 err:
7853 #if defined(CONFIG_UNIX)
7854         sock_release(ctx->ring_sock);
7855         ctx->ring_sock = NULL;
7856 #endif
7857         return ret;
7858 }
7859
7860 static int io_uring_create(unsigned entries, struct io_uring_params *p,
7861                            struct io_uring_params __user *params)
7862 {
7863         struct user_struct *user = NULL;
7864         struct io_ring_ctx *ctx;
7865         bool account_mem;
7866         int ret;
7867
7868         if (!entries)
7869                 return -EINVAL;
7870         if (entries > IORING_MAX_ENTRIES) {
7871                 if (!(p->flags & IORING_SETUP_CLAMP))
7872                         return -EINVAL;
7873                 entries = IORING_MAX_ENTRIES;
7874         }
7875
7876         /*
7877          * Use twice as many entries for the CQ ring. It's possible for the
7878          * application to drive a higher depth than the size of the SQ ring,
7879          * since the sqes are only used at submission time. This allows for
7880          * some flexibility in overcommitting a bit. If the application has
7881          * set IORING_SETUP_CQSIZE, it will have passed in the desired number
7882          * of CQ ring entries manually.
7883          */
7884         p->sq_entries = roundup_pow_of_two(entries);
7885         if (p->flags & IORING_SETUP_CQSIZE) {
7886                 /*
7887                  * If IORING_SETUP_CQSIZE is set, we do the same roundup
7888                  * to a power-of-two, if it isn't already. We do NOT impose
7889                  * any cq vs sq ring sizing.
7890                  */
7891                 if (p->cq_entries < p->sq_entries)
7892                         return -EINVAL;
7893                 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
7894                         if (!(p->flags & IORING_SETUP_CLAMP))
7895                                 return -EINVAL;
7896                         p->cq_entries = IORING_MAX_CQ_ENTRIES;
7897                 }
7898                 p->cq_entries = roundup_pow_of_two(p->cq_entries);
7899         } else {
7900                 p->cq_entries = 2 * p->sq_entries;
7901         }
7902
7903         user = get_uid(current_user());
7904         account_mem = !capable(CAP_IPC_LOCK);
7905
7906         if (account_mem) {
7907                 ret = io_account_mem(user,
7908                                 ring_pages(p->sq_entries, p->cq_entries));
7909                 if (ret) {
7910                         free_uid(user);
7911                         return ret;
7912                 }
7913         }
7914
7915         ctx = io_ring_ctx_alloc(p);
7916         if (!ctx) {
7917                 if (account_mem)
7918                         io_unaccount_mem(user, ring_pages(p->sq_entries,
7919                                                                 p->cq_entries));
7920                 free_uid(user);
7921                 return -ENOMEM;
7922         }
7923         ctx->compat = in_compat_syscall();
7924         ctx->account_mem = account_mem;
7925         ctx->user = user;
7926         ctx->creds = get_current_cred();
7927
7928         ret = io_allocate_scq_urings(ctx, p);
7929         if (ret)
7930                 goto err;
7931
7932         ret = io_sq_offload_start(ctx, p);
7933         if (ret)
7934                 goto err;
7935
7936         memset(&p->sq_off, 0, sizeof(p->sq_off));
7937         p->sq_off.head = offsetof(struct io_rings, sq.head);
7938         p->sq_off.tail = offsetof(struct io_rings, sq.tail);
7939         p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
7940         p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
7941         p->sq_off.flags = offsetof(struct io_rings, sq_flags);
7942         p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
7943         p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
7944
7945         memset(&p->cq_off, 0, sizeof(p->cq_off));
7946         p->cq_off.head = offsetof(struct io_rings, cq.head);
7947         p->cq_off.tail = offsetof(struct io_rings, cq.tail);
7948         p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
7949         p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
7950         p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
7951         p->cq_off.cqes = offsetof(struct io_rings, cqes);
7952
7953         p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
7954                         IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
7955                         IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL;
7956
7957         if (copy_to_user(params, p, sizeof(*p))) {
7958                 ret = -EFAULT;
7959                 goto err;
7960         }
7961         /*
7962          * Install ring fd as the very last thing, so we don't risk someone
7963          * having closed it before we finish setup
7964          */
7965         ret = io_uring_get_fd(ctx);
7966         if (ret < 0)
7967                 goto err;
7968
7969         trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
7970         return ret;
7971 err:
7972         io_ring_ctx_wait_and_kill(ctx);
7973         return ret;
7974 }
7975
7976 /*
7977  * Sets up an aio uring context, and returns the fd. Applications asks for a
7978  * ring size, we return the actual sq/cq ring sizes (among other things) in the
7979  * params structure passed in.
7980  */
7981 static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
7982 {
7983         struct io_uring_params p;
7984         int i;
7985
7986         if (copy_from_user(&p, params, sizeof(p)))
7987                 return -EFAULT;
7988         for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
7989                 if (p.resv[i])
7990                         return -EINVAL;
7991         }
7992
7993         if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
7994                         IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
7995                         IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ))
7996                 return -EINVAL;
7997
7998         return  io_uring_create(entries, &p, params);
7999 }
8000
8001 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
8002                 struct io_uring_params __user *, params)
8003 {
8004         return io_uring_setup(entries, params);
8005 }
8006
8007 static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
8008 {
8009         struct io_uring_probe *p;
8010         size_t size;
8011         int i, ret;
8012
8013         size = struct_size(p, ops, nr_args);
8014         if (size == SIZE_MAX)
8015                 return -EOVERFLOW;
8016         p = kzalloc(size, GFP_KERNEL);
8017         if (!p)
8018                 return -ENOMEM;
8019
8020         ret = -EFAULT;
8021         if (copy_from_user(p, arg, size))
8022                 goto out;
8023         ret = -EINVAL;
8024         if (memchr_inv(p, 0, size))
8025                 goto out;
8026
8027         p->last_op = IORING_OP_LAST - 1;
8028         if (nr_args > IORING_OP_LAST)
8029                 nr_args = IORING_OP_LAST;
8030
8031         for (i = 0; i < nr_args; i++) {
8032                 p->ops[i].op = i;
8033                 if (!io_op_defs[i].not_supported)
8034                         p->ops[i].flags = IO_URING_OP_SUPPORTED;
8035         }
8036         p->ops_len = i;
8037
8038         ret = 0;
8039         if (copy_to_user(arg, p, size))
8040                 ret = -EFAULT;
8041 out:
8042         kfree(p);
8043         return ret;
8044 }
8045
8046 static int io_register_personality(struct io_ring_ctx *ctx)
8047 {
8048         const struct cred *creds = get_current_cred();
8049         int id;
8050
8051         id = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
8052                                 USHRT_MAX, GFP_KERNEL);
8053         if (id < 0)
8054                 put_cred(creds);
8055         return id;
8056 }
8057
8058 static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
8059 {
8060         const struct cred *old_creds;
8061
8062         old_creds = idr_remove(&ctx->personality_idr, id);
8063         if (old_creds) {
8064                 put_cred(old_creds);
8065                 return 0;
8066         }
8067
8068         return -EINVAL;
8069 }
8070
8071 static bool io_register_op_must_quiesce(int op)
8072 {
8073         switch (op) {
8074         case IORING_UNREGISTER_FILES:
8075         case IORING_REGISTER_FILES_UPDATE:
8076         case IORING_REGISTER_PROBE:
8077         case IORING_REGISTER_PERSONALITY:
8078         case IORING_UNREGISTER_PERSONALITY:
8079                 return false;
8080         default:
8081                 return true;
8082         }
8083 }
8084
8085 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
8086                                void __user *arg, unsigned nr_args)
8087         __releases(ctx->uring_lock)
8088         __acquires(ctx->uring_lock)
8089 {
8090         int ret;
8091
8092         /*
8093          * We're inside the ring mutex, if the ref is already dying, then
8094          * someone else killed the ctx or is already going through
8095          * io_uring_register().
8096          */
8097         if (percpu_ref_is_dying(&ctx->refs))
8098                 return -ENXIO;
8099
8100         if (io_register_op_must_quiesce(opcode)) {
8101                 percpu_ref_kill(&ctx->refs);
8102
8103                 /*
8104                  * Drop uring mutex before waiting for references to exit. If
8105                  * another thread is currently inside io_uring_enter() it might
8106                  * need to grab the uring_lock to make progress. If we hold it
8107                  * here across the drain wait, then we can deadlock. It's safe
8108                  * to drop the mutex here, since no new references will come in
8109                  * after we've killed the percpu ref.
8110                  */
8111                 mutex_unlock(&ctx->uring_lock);
8112                 ret = wait_for_completion_interruptible(&ctx->completions[0]);
8113                 mutex_lock(&ctx->uring_lock);
8114                 if (ret) {
8115                         percpu_ref_resurrect(&ctx->refs);
8116                         ret = -EINTR;
8117                         goto out;
8118                 }
8119         }
8120
8121         switch (opcode) {
8122         case IORING_REGISTER_BUFFERS:
8123                 ret = io_sqe_buffer_register(ctx, arg, nr_args);
8124                 break;
8125         case IORING_UNREGISTER_BUFFERS:
8126                 ret = -EINVAL;
8127                 if (arg || nr_args)
8128                         break;
8129                 ret = io_sqe_buffer_unregister(ctx);
8130                 break;
8131         case IORING_REGISTER_FILES:
8132                 ret = io_sqe_files_register(ctx, arg, nr_args);
8133                 break;
8134         case IORING_UNREGISTER_FILES:
8135                 ret = -EINVAL;
8136                 if (arg || nr_args)
8137                         break;
8138                 ret = io_sqe_files_unregister(ctx);
8139                 break;
8140         case IORING_REGISTER_FILES_UPDATE:
8141                 ret = io_sqe_files_update(ctx, arg, nr_args);
8142                 break;
8143         case IORING_REGISTER_EVENTFD:
8144         case IORING_REGISTER_EVENTFD_ASYNC:
8145                 ret = -EINVAL;
8146                 if (nr_args != 1)
8147                         break;
8148                 ret = io_eventfd_register(ctx, arg);
8149                 if (ret)
8150                         break;
8151                 if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
8152                         ctx->eventfd_async = 1;
8153                 else
8154                         ctx->eventfd_async = 0;
8155                 break;
8156         case IORING_UNREGISTER_EVENTFD:
8157                 ret = -EINVAL;
8158                 if (arg || nr_args)
8159                         break;
8160                 ret = io_eventfd_unregister(ctx);
8161                 break;
8162         case IORING_REGISTER_PROBE:
8163                 ret = -EINVAL;
8164                 if (!arg || nr_args > 256)
8165                         break;
8166                 ret = io_probe(ctx, arg, nr_args);
8167                 break;
8168         case IORING_REGISTER_PERSONALITY:
8169                 ret = -EINVAL;
8170                 if (arg || nr_args)
8171                         break;
8172                 ret = io_register_personality(ctx);
8173                 break;
8174         case IORING_UNREGISTER_PERSONALITY:
8175                 ret = -EINVAL;
8176                 if (arg)
8177                         break;
8178                 ret = io_unregister_personality(ctx, nr_args);
8179                 break;
8180         default:
8181                 ret = -EINVAL;
8182                 break;
8183         }
8184
8185         if (io_register_op_must_quiesce(opcode)) {
8186                 /* bring the ctx back to life */
8187                 percpu_ref_reinit(&ctx->refs);
8188 out:
8189                 reinit_completion(&ctx->completions[0]);
8190         }
8191         return ret;
8192 }
8193
8194 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
8195                 void __user *, arg, unsigned int, nr_args)
8196 {
8197         struct io_ring_ctx *ctx;
8198         long ret = -EBADF;
8199         struct fd f;
8200
8201         f = fdget(fd);
8202         if (!f.file)
8203                 return -EBADF;
8204
8205         ret = -EOPNOTSUPP;
8206         if (f.file->f_op != &io_uring_fops)
8207                 goto out_fput;
8208
8209         ctx = f.file->private_data;
8210
8211         mutex_lock(&ctx->uring_lock);
8212         ret = __io_uring_register(ctx, opcode, arg, nr_args);
8213         mutex_unlock(&ctx->uring_lock);
8214         trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
8215                                                         ctx->cq_ev_fd != NULL, ret);
8216 out_fput:
8217         fdput(f);
8218         return ret;
8219 }
8220
8221 static int __init io_uring_init(void)
8222 {
8223 #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
8224         BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
8225         BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
8226 } while (0)
8227
8228 #define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
8229         __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
8230         BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
8231         BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
8232         BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
8233         BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
8234         BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
8235         BUILD_BUG_SQE_ELEM(8,  __u64,  off);
8236         BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
8237         BUILD_BUG_SQE_ELEM(16, __u64,  addr);
8238         BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
8239         BUILD_BUG_SQE_ELEM(24, __u32,  len);
8240         BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
8241         BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
8242         BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
8243         BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
8244         BUILD_BUG_SQE_ELEM(28, __u16,  poll_events);
8245         BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
8246         BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
8247         BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
8248         BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
8249         BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
8250         BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
8251         BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
8252         BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
8253         BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
8254         BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
8255         BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
8256         BUILD_BUG_SQE_ELEM(42, __u16,  personality);
8257         BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
8258
8259         BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
8260         BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
8261         req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
8262         return 0;
8263 };
8264 __initcall(io_uring_init);