fs/eventfd.c

   1 /*
   2  *  fs/eventfd.c
   3  *
   4  *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
   5  *
   6  */
   7
   8 #include <linux/file.h>
   9 #include <linux/poll.h>
  10 #include <linux/init.h>
  11 #include <linux/fs.h>
  12 #include <linux/sched/signal.h>
  13 #include <linux/kernel.h>
  14 #include <linux/slab.h>
  15 #include <linux/list.h>
  16 #include <linux/spinlock.h>
  17 #include <linux/anon_inodes.h>
  18 #include <linux/syscalls.h>
  19 #include <linux/export.h>
  20 #include <linux/kref.h>
  21 #include <linux/eventfd.h>
  22 #include <linux/proc_fs.h>
  23 #include <linux/seq_file.h>
  24
/*
 * Per-CPU marker used by eventfd_signal() to detect recursion: it is
 * incremented right before the wait queue wakeup and decremented right
 * after it, and eventfd_signal() refuses to run (with a one-time warning)
 * when it finds the value non-zero on entry.
 */
DEFINE_PER_CPU(int, eventfd_wake_count);
  26
/*
 * State of one eventfd.  It is stored in file->private_data of the eventfd
 * file and may additionally be referenced by kernel users through
 * eventfd_ctx_fdget() / eventfd_ctx_fileget().
 */
struct eventfd_ctx {
        /* Reference count; eventfd_free() runs when the last one is dropped. */
        struct kref kref;
        /*
         * Wait queue used by read(2), write(2) and poll(2).  Its embedded
         * lock (wqh.lock) is also the lock under which "count" is updated.
         */
        wait_queue_head_t wqh;
        /*
         * Every time a write(2) is performed on an eventfd, the value of
         * the __u64 being written is added to "count" and a wakeup is
         * performed on "wqh".  A read(2) returns the "count" value to
         * userspace and resets "count" to zero (with EFD_SEMAPHORE set it
         * returns 1 and decrements "count" by one instead).  The kernel
         * side eventfd_signal() also adds to "count" and issues a wakeup.
         */
        __u64 count;
        /* Flags passed in at creation time; EFD_SEMAPHORE is tested on read. */
        unsigned int flags;
};
  41
  42 /**
  43  * eventfd_signal - Adds @n to the eventfd counter.
  44  * @ctx: [in] Pointer to the eventfd context.
  45  * @n: [in] Value of the counter to be added to the eventfd internal counter.
  46  *          The value cannot be negative.
  47  *
  48  * This function is supposed to be called by the kernel in paths that do not
  49  * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
  50  * value, and we signal this as overflow condition by returning a EPOLLERR
  51  * to poll(2).
  52  *
  53  * Returns the amount by which the counter was incremented.  This will be less
  54  * than @n if the counter has overflowed.
  55  */
  56 __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
  57 {
  58         unsigned long flags;
  59
  60         /*
  61          * Deadlock or stack overflow issues can happen if we recurse here
  62          * through waitqueue wakeup handlers. If the caller users potentially
  63          * nested waitqueues with custom wakeup handlers, then it should
  64          * check eventfd_signal_count() before calling this function. If
  65          * it returns true, the eventfd_signal() call should be deferred to a
  66          * safe context.
  67          */
  68         if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
  69                 return 0;
  70
  71         spin_lock_irqsave(&ctx->wqh.lock, flags);
  72         this_cpu_inc(eventfd_wake_count);
  73         if (ULLONG_MAX - ctx->count < n)
  74                 n = ULLONG_MAX - ctx->count;
  75         ctx->count += n;
  76         if (waitqueue_active(&ctx->wqh))
  77                 wake_up_locked_poll(&ctx->wqh, EPOLLIN);
  78         this_cpu_dec(eventfd_wake_count);
  79         spin_unlock_irqrestore(&ctx->wqh.lock, flags);
  80
  81         return n;
  82 }
  83 EXPORT_SYMBOL_GPL(eventfd_signal);
  84
/*
 * Release the memory backing an eventfd context.  Used by eventfd_free()
 * once the last reference is gone, and by do_eventfd() when creating the
 * file descriptor fails.
 */
static void eventfd_free_ctx(struct eventfd_ctx *ctx)
{
        kfree(ctx);
}
  89
  90 static void eventfd_free(struct kref *kref)
  91 {
  92         struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);
  93
  94         eventfd_free_ctx(ctx);
  95 }
  96
/**
 * eventfd_ctx_put - Releases a reference to the internal eventfd context.
 * @ctx: [in] Pointer to eventfd context.
 *
 * The eventfd context reference must have been previously acquired either
 * with eventfd_ctx_fdget() or eventfd_ctx_fileget().
 *
 * The reference is dropped through kref_put() with eventfd_free() as the
 * release callback, which frees the context.
 */
void eventfd_ctx_put(struct eventfd_ctx *ctx)
{
        kref_put(&ctx->kref, eventfd_free);
}
EXPORT_SYMBOL_GPL(eventfd_ctx_put);
 109
 110 static int eventfd_release(struct inode *inode, struct file *file)
 111 {
 112         struct eventfd_ctx *ctx = file->private_data;
 113
 114         wake_up_poll(&ctx->wqh, EPOLLHUP);
 115         eventfd_ctx_put(ctx);
 116         return 0;
 117 }
 118
 119 static __poll_t eventfd_poll(struct file *file, poll_table *wait)
 120 {
 121         struct eventfd_ctx *ctx = file->private_data;
 122         __poll_t events = 0;
 123         u64 count;
 124
 125         poll_wait(file, &ctx->wqh, wait);
 126
 127         /*
 128          * All writes to ctx->count occur within ctx->wqh.lock.  This read
 129          * can be done outside ctx->wqh.lock because we know that poll_wait
 130          * takes that lock (through add_wait_queue) if our caller will sleep.
 131          *
 132          * The read _can_ therefore seep into add_wait_queue's critical
 133          * section, but cannot move above it!  add_wait_queue's spin_lock acts
 134          * as an acquire barrier and ensures that the read be ordered properly
 135          * against the writes.  The following CAN happen and is safe:
 136          *
 137          *     poll                               write
 138          *     -----------------                  ------------
 139          *     lock ctx->wqh.lock (in poll_wait)
 140          *     count = ctx->count
 141          *     __add_wait_queue
 142          *     unlock ctx->wqh.lock
 143          *                                        lock ctx->qwh.lock
 144          *                                        ctx->count += n
 145          *                                        if (waitqueue_active)
 146          *                                          wake_up_locked_poll
 147          *                                        unlock ctx->qwh.lock
 148          *     eventfd_poll returns 0
 149          *
 150          * but the following, which would miss a wakeup, cannot happen:
 151          *
 152          *     poll                               write
 153          *     -----------------                  ------------
 154          *     count = ctx->count (INVALID!)
 155          *                                        lock ctx->qwh.lock
 156          *                                        ctx->count += n
 157          *                                        **waitqueue_active is false**
 158          *                                        **no wake_up_locked_poll!**
 159          *                                        unlock ctx->qwh.lock
 160          *     lock ctx->wqh.lock (in poll_wait)
 161          *     __add_wait_queue
 162          *     unlock ctx->wqh.lock
 163          *     eventfd_poll returns 0
 164          */
 165         count = READ_ONCE(ctx->count);
 166
 167         if (count > 0)
 168                 events |= EPOLLIN;
 169         if (count == ULLONG_MAX)
 170                 events |= EPOLLERR;
 171         if (ULLONG_MAX - 1 > count)
 172                 events |= EPOLLOUT;
 173
 174         return events;
 175 }
 176
 177 static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
 178 {
 179         *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
 180         ctx->count -= *cnt;
 181 }
 182
 183 /**
 184  * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue.
 185  * @ctx: [in] Pointer to eventfd context.
 186  * @wait: [in] Wait queue to be removed.
 187  * @cnt: [out] Pointer to the 64-bit counter value.
 188  *
 189  * Returns %0 if successful, or the following error codes:
 190  *
 191  * -EAGAIN      : The operation would have blocked.
 192  *
 193  * This is used to atomically remove a wait queue entry from the eventfd wait
 194  * queue head, and read/reset the counter value.
 195  */
 196 int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
 197                                   __u64 *cnt)
 198 {
 199         unsigned long flags;
 200
 201         spin_lock_irqsave(&ctx->wqh.lock, flags);
 202         eventfd_ctx_do_read(ctx, cnt);
 203         __remove_wait_queue(&ctx->wqh, wait);
 204         if (*cnt != 0 && waitqueue_active(&ctx->wqh))
 205                 wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
 206         spin_unlock_irqrestore(&ctx->wqh.lock, flags);
 207
 208         return *cnt != 0 ? 0 : -EAGAIN;
 209 }
 210 EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
 211
/*
 * ->read handler of eventfd_fops.
 *
 * Copies one __u64 taken from the counter (see eventfd_ctx_do_read()) to
 * @buf.  Returns sizeof(__u64) on success, -EINVAL if @count is smaller
 * than that, -EAGAIN if the counter is zero and the file is O_NONBLOCK,
 * -ERESTARTSYS if a signal is pending while waiting, or -EFAULT if the
 * copy to userspace fails.
 */
static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
                            loff_t *ppos)
{
        struct eventfd_ctx *ctx = file->private_data;
        ssize_t res;
        __u64 ucnt = 0;
        DECLARE_WAITQUEUE(wait, current);

        if (count < sizeof(ucnt))
                return -EINVAL;

        spin_lock_irq(&ctx->wqh.lock);
        /* Default outcome: nothing to read and not allowed to block. */
        res = -EAGAIN;
        if (ctx->count > 0)
                res = sizeof(ucnt);
        else if (!(file->f_flags & O_NONBLOCK)) {
                /* Blocking read: queue ourselves while still holding the lock. */
                __add_wait_queue(&ctx->wqh, &wait);
                for (;;) {
                        /*
                         * The task state is set before the counter is
                         * re-checked; the check itself runs under the lock.
                         */
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (ctx->count > 0) {
                                res = sizeof(ucnt);
                                break;
                        }
                        if (signal_pending(current)) {
                                res = -ERESTARTSYS;
                                break;
                        }
                        /* The lock is dropped only for the duration of the sleep. */
                        spin_unlock_irq(&ctx->wqh.lock);
                        schedule();
                        spin_lock_irq(&ctx->wqh.lock);
                }
                __remove_wait_queue(&ctx->wqh, &wait);
                __set_current_state(TASK_RUNNING);
        }
        if (likely(res > 0)) {
                /* Take the value and tell waiters the counter has room again. */
                eventfd_ctx_do_read(ctx, &ucnt);
                if (waitqueue_active(&ctx->wqh))
                        wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
        }
        spin_unlock_irq(&ctx->wqh.lock);

        /*
         * The copy to userspace happens after the lock is released.  Note
         * that the counter has already been consumed if this fails.
         */
        if (res > 0 && put_user(ucnt, (__u64 __user *)buf))
                return -EFAULT;

        return res;
}
 258
/*
 * ->write handler of eventfd_fops.
 *
 * Reads one __u64 from @buf and adds it to the counter.  Returns
 * sizeof(__u64) on success, -EINVAL if @count is smaller than that or the
 * value is ULLONG_MAX, -EFAULT if the copy from userspace fails, -EAGAIN if
 * the addition does not fit and the file is O_NONBLOCK, or -ERESTARTSYS if
 * a signal is pending while waiting for room.
 */
static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
                             loff_t *ppos)
{
        struct eventfd_ctx *ctx = file->private_data;
        ssize_t res;
        __u64 ucnt;
        DECLARE_WAITQUEUE(wait, current);

        if (count < sizeof(ucnt))
                return -EINVAL;
        if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
                return -EFAULT;
        if (ucnt == ULLONG_MAX)
                return -EINVAL;
        spin_lock_irq(&ctx->wqh.lock);
        /* Default outcome: no room for the value and not allowed to block. */
        res = -EAGAIN;
        /*
         * The comparison is strict, so a write is only accepted when the
         * resulting counter stays below ULLONG_MAX.
         */
        if (ULLONG_MAX - ctx->count > ucnt)
                res = sizeof(ucnt);
        else if (!(file->f_flags & O_NONBLOCK)) {
                /* Blocking write: queue ourselves while still holding the lock. */
                __add_wait_queue(&ctx->wqh, &wait);
                /* res is always overwritten before either break below. */
                for (res = 0;;) {
                        /*
                         * The task state is set before the room check is
                         * repeated; the check itself runs under the lock.
                         */
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (ULLONG_MAX - ctx->count > ucnt) {
                                res = sizeof(ucnt);
                                break;
                        }
                        if (signal_pending(current)) {
                                res = -ERESTARTSYS;
                                break;
                        }
                        /* The lock is dropped only for the duration of the sleep. */
                        spin_unlock_irq(&ctx->wqh.lock);
                        schedule();
                        spin_lock_irq(&ctx->wqh.lock);
                }
                __remove_wait_queue(&ctx->wqh, &wait);
                __set_current_state(TASK_RUNNING);
        }
        if (likely(res > 0)) {
                /* Add the value and tell waiters there is something to read. */
                ctx->count += ucnt;
                if (waitqueue_active(&ctx->wqh))
                        wake_up_locked_poll(&ctx->wqh, EPOLLIN);
        }
        spin_unlock_irq(&ctx->wqh.lock);

        return res;
}
 305
 306 #ifdef CONFIG_PROC_FS
 307 static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
 308 {
 309         struct eventfd_ctx *ctx = f->private_data;
 310
 311         spin_lock_irq(&ctx->wqh.lock);
 312         seq_printf(m, "eventfd-count: %16llx\n",
 313                    (unsigned long long)ctx->count);
 314         spin_unlock_irq(&ctx->wqh.lock);
 315 }
 316 #endif
 317
 318 static const struct file_operations eventfd_fops = {
 319 #ifdef CONFIG_PROC_FS
 320         .show_fdinfo    = eventfd_show_fdinfo,
 321 #endif
 322         .release        = eventfd_release,
 323         .poll           = eventfd_poll,
 324         .read           = eventfd_read,
 325         .write          = eventfd_write,
 326         .llseek         = noop_llseek,
 327 };
 328
 329 /**
 330  * eventfd_fget - Acquire a reference of an eventfd file descriptor.
 331  * @fd: [in] Eventfd file descriptor.
 332  *
 333  * Returns a pointer to the eventfd file structure in case of success, or the
 334  * following error pointer:
 335  *
 336  * -EBADF    : Invalid @fd file descriptor.
 337  * -EINVAL   : The @fd file descriptor is not an eventfd file.
 338  */
 339 struct file *eventfd_fget(int fd)
 340 {
 341         struct file *file;
 342
 343         file = fget(fd);
 344         if (!file)
 345                 return ERR_PTR(-EBADF);
 346         if (file->f_op != &eventfd_fops) {
 347                 fput(file);
 348                 return ERR_PTR(-EINVAL);
 349         }
 350
 351         return file;
 352 }
 353 EXPORT_SYMBOL_GPL(eventfd_fget);
 354
 355 /**
 356  * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
 357  * @fd: [in] Eventfd file descriptor.
 358  *
 359  * Returns a pointer to the internal eventfd context, otherwise the error
 360  * pointers returned by the following functions:
 361  *
 362  * eventfd_fget
 363  */
 364 struct eventfd_ctx *eventfd_ctx_fdget(int fd)
 365 {
 366         struct eventfd_ctx *ctx;
 367         struct fd f = fdget(fd);
 368         if (!f.file)
 369                 return ERR_PTR(-EBADF);
 370         ctx = eventfd_ctx_fileget(f.file);
 371         fdput(f);
 372         return ctx;
 373 }
 374 EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
 375
 376 /**
 377  * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
 378  * @file: [in] Eventfd file pointer.
 379  *
 380  * Returns a pointer to the internal eventfd context, otherwise the error
 381  * pointer:
 382  *
 383  * -EINVAL   : The @fd file descriptor is not an eventfd file.
 384  */
 385 struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
 386 {
 387         struct eventfd_ctx *ctx;
 388
 389         if (file->f_op != &eventfd_fops)
 390                 return ERR_PTR(-EINVAL);
 391
 392         ctx = file->private_data;
 393         kref_get(&ctx->kref);
 394         return ctx;
 395 }
 396 EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
 397
 398 static int do_eventfd(unsigned int count, int flags)
 399 {
 400         struct eventfd_ctx *ctx;
 401         int fd;
 402
 403         /* Check the EFD_* constants for consistency.  */
 404         BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
 405         BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
 406
 407         if (flags & ~EFD_FLAGS_SET)
 408                 return -EINVAL;
 409
 410         ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
 411         if (!ctx)
 412                 return -ENOMEM;
 413
 414         kref_init(&ctx->kref);
 415         init_waitqueue_head(&ctx->wqh);
 416         ctx->count = count;
 417         ctx->flags = flags;
 418
 419         fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
 420                               O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
 421         if (fd < 0)
 422                 eventfd_free_ctx(ctx);
 423
 424         return fd;
 425 }
 426
/*
 * eventfd2() system call: creates an eventfd whose counter starts at @count,
 * with behavior selected by @flags.  All validation is done in do_eventfd().
 */
SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
{
        return do_eventfd(count, flags);
}
 431
/*
 * eventfd() system call: same as eventfd2() with no flags set.
 */
SYSCALL_DEFINE1(eventfd, unsigned int, count)
{
        return do_eventfd(count, 0);
}
 436