fs/file.c

   1 /*
   2  *  linux/fs/file.c
   3  *
   4  *  Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
   5  *
   6  *  Manage the dynamic fd arrays in the process files_struct.
   7  */
   8
   9 #include <linux/syscalls.h>
  10 #include <linux/export.h>
  11 #include <linux/fs.h>
  12 #include <linux/mm.h>
  13 #include <linux/mmzone.h>
  14 #include <linux/time.h>
  15 #include <linux/sched.h>
  16 #include <linux/slab.h>
  17 #include <linux/vmalloc.h>
  18 #include <linux/file.h>
  19 #include <linux/fdtable.h>
  20 #include <linux/bitops.h>
  21 #include <linux/interrupt.h>
  22 #include <linux/spinlock.h>
  23 #include <linux/rcupdate.h>
  24 #include <linux/workqueue.h>
  25
  26 int sysctl_nr_open __read_mostly = 1024*1024;
  27 int sysctl_nr_open_min = BITS_PER_LONG;
  28 /* our max() is unusable in constant expressions ;-/ */
  29 #define __const_max(x, y) ((x) < (y) ? (x) : (y))
  30 int sysctl_nr_open_max = __const_max(INT_MAX, ~(size_t)0/sizeof(void *)) &
  31                          -BITS_PER_LONG;
  32
  33 static void *alloc_fdmem(size_t size)
  34 {
  35         /*
  36          * Very large allocations can stress page reclaim, so fall back to
  37          * vmalloc() if the allocation size will be considered "large" by the VM.
  38          */
  39         if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
  40                 void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY);
  41                 if (data != NULL)
  42                         return data;
  43         }
  44         return vmalloc(size);
  45 }
  46
  47 static void free_fdmem(void *ptr)
  48 {
  49         is_vmalloc_addr(ptr) ? vfree(ptr) : kfree(ptr);
  50 }
  51
  52 static void __free_fdtable(struct fdtable *fdt)
  53 {
  54         free_fdmem(fdt->fd);
  55         free_fdmem(fdt->open_fds);
  56         kfree(fdt);
  57 }
  58
  59 static void free_fdtable_rcu(struct rcu_head *rcu)
  60 {
  61         __free_fdtable(container_of(rcu, struct fdtable, rcu));
  62 }
  63
  64 /*
  65  * Expand the fdset in the files_struct.  Called with the files spinlock
  66  * held for write.
  67  */
  68 static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
  69 {
  70         unsigned int cpy, set;
  71
  72         BUG_ON(nfdt->max_fds < ofdt->max_fds);
  73
  74         cpy = ofdt->max_fds * sizeof(struct file *);
  75         set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
  76         memcpy(nfdt->fd, ofdt->fd, cpy);
  77         memset((char *)(nfdt->fd) + cpy, 0, set);
  78
  79         cpy = ofdt->max_fds / BITS_PER_BYTE;
  80         set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE;
  81         memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
  82         memset((char *)(nfdt->open_fds) + cpy, 0, set);
  83         memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
  84         memset((char *)(nfdt->close_on_exec) + cpy, 0, set);
  85 }
  86
  87 static struct fdtable * alloc_fdtable(unsigned int nr)
  88 {
  89         struct fdtable *fdt;
  90         void *data;
  91
  92         /*
  93          * Figure out how many fds we actually want to support in this fdtable.
  94          * Allocation steps are keyed to the size of the fdarray, since it
  95          * grows far faster than any of the other dynamic data. We try to fit
  96          * the fdarray into comfortable page-tuned chunks: starting at 1024B
  97          * and growing in powers of two from there on.
  98          */
  99         nr /= (1024 / sizeof(struct file *));
 100         nr = roundup_pow_of_two(nr + 1);
 101         nr *= (1024 / sizeof(struct file *));
 102         /*
 103          * Note that this can drive nr *below* what we had passed if sysctl_nr_open
 104          * had been set lower between the check in expand_files() and here.  Deal
 105          * with that in caller, it's cheaper that way.
 106          *
 107          * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
 108          * bitmaps handling below becomes unpleasant, to put it mildly...
 109          */
 110         if (unlikely(nr > sysctl_nr_open))
 111                 nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
 112
 113         fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
 114         if (!fdt)
 115                 goto out;
 116         fdt->max_fds = nr;
 117         data = alloc_fdmem(nr * sizeof(struct file *));
 118         if (!data)
 119                 goto out_fdt;
 120         fdt->fd = data;
 121
 122         data = alloc_fdmem(max_t(size_t,
 123                                  2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES));
 124         if (!data)
 125                 goto out_arr;
 126         fdt->open_fds = data;
 127         data += nr / BITS_PER_BYTE;
 128         fdt->close_on_exec = data;
 129
 130         return fdt;
 131
 132 out_arr:
 133         free_fdmem(fdt->fd);
 134 out_fdt:
 135         kfree(fdt);
 136 out:
 137         return NULL;
 138 }
 139
/*
 * Expand the file descriptor table.
 * This function will allocate a new fdtable and both fd array and fdset,
 * large enough to hold descriptor number @nr, and install it in @files if
 * the table still needs growing once the lock has been retaken.
 *
 * Return <0 error code on error (-ENOMEM if the allocation failed, -EMFILE
 * if the new table still cannot hold @nr); 1 otherwise, including the case
 * where another task expanded the table while the lock was dropped.
 * The files->file_lock should be held on entry, and will be held on exit;
 * it is dropped around the allocation.
 */
static int expand_fdtable(struct files_struct *files, int nr)
	__releases(files->file_lock)
	__acquires(files->file_lock)
{
	struct fdtable *new_fdt, *cur_fdt;

	/* Allocate without holding the spinlock. */
	spin_unlock(&files->file_lock);
	new_fdt = alloc_fdtable(nr);
	spin_lock(&files->file_lock);
	if (!new_fdt)
		return -ENOMEM;
	/*
	 * extremely unlikely race - sysctl_nr_open decreased between the check in
	 * caller and alloc_fdtable().  Cheaper to catch it here...
	 */
	if (unlikely(new_fdt->max_fds <= nr)) {
		__free_fdtable(new_fdt);
		return -EMFILE;
	}
	/*
	 * Check again since another task may have expanded the fd table while
	 * we dropped the lock
	 */
	cur_fdt = files_fdtable(files);
	if (nr >= cur_fdt->max_fds) {
		/* Continue as planned */
		copy_fdtable(new_fdt, cur_fdt);
		rcu_assign_pointer(files->fdt, new_fdt);
		/*
		 * The table embedded in files_struct is never freed on its own;
		 * any other old table is freed through call_rcu().
		 */
		if (cur_fdt != &files->fdtab)
			call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
	} else {
		/* Somebody else expanded, so undo our attempt */
		__free_fdtable(new_fdt);
	}
	return 1;
}
 183
 184 /*
 185  * Expand files.
 186  * This function will expand the file structures, if the requested size exceeds
 187  * the current capacity and there is room for expansion.
 188  * Return <0 error code on error; 0 when nothing done; 1 when files were
 189  * expanded and execution may have blocked.
 190  * The files->file_lock should be held on entry, and will be held on exit.
 191  */
 192 static int expand_files(struct files_struct *files, int nr)
 193 {
 194         struct fdtable *fdt;
 195
 196         fdt = files_fdtable(files);
 197
 198         /* Do we need to expand? */
 199         if (nr < fdt->max_fds)
 200                 return 0;
 201
 202         /* Can we expand? */
 203         if (nr >= sysctl_nr_open)
 204                 return -EMFILE;
 205
 206         /* All good, so we try */
 207         return expand_fdtable(files, nr);
 208 }
 209
 210 static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
 211 {
 212         __set_bit(fd, fdt->close_on_exec);
 213 }
 214
 215 static inline void __clear_close_on_exec(int fd, struct fdtable *fdt)
 216 {
 217         __clear_bit(fd, fdt->close_on_exec);
 218 }
 219
 220 static inline void __set_open_fd(int fd, struct fdtable *fdt)
 221 {
 222         __set_bit(fd, fdt->open_fds);
 223 }
 224
 225 static inline void __clear_open_fd(int fd, struct fdtable *fdt)
 226 {
 227         __clear_bit(fd, fdt->open_fds);
 228 }
 229
 230 static int count_open_files(struct fdtable *fdt)
 231 {
 232         int size = fdt->max_fds;
 233         int i;
 234
 235         /* Find the last open fd */
 236         for (i = size / BITS_PER_LONG; i > 0; ) {
 237                 if (fdt->open_fds[--i])
 238                         break;
 239         }
 240         i = (i + 1) * BITS_PER_LONG;
 241         return i;
 242 }
 243
 244 /*
 245  * Allocate a new files structure and copy contents from the
 246  * passed in files structure.
 247  * errorp will be valid only when the returned files_struct is NULL.
 248  */
 249 struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 250 {
 251         struct files_struct *newf;
 252         struct file **old_fds, **new_fds;
 253         int open_files, size, i;
 254         struct fdtable *old_fdt, *new_fdt;
 255
 256         *errorp = -ENOMEM;
 257         newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
 258         if (!newf)
 259                 goto out;
 260
 261         atomic_set(&newf->count, 1);
 262
 263         spin_lock_init(&newf->file_lock);
 264         newf->next_fd = 0;
 265         new_fdt = &newf->fdtab;
 266         new_fdt->max_fds = NR_OPEN_DEFAULT;
 267         new_fdt->close_on_exec = newf->close_on_exec_init;
 268         new_fdt->open_fds = newf->open_fds_init;
 269         new_fdt->fd = &newf->fd_array[0];
 270
 271         spin_lock(&oldf->file_lock);
 272         old_fdt = files_fdtable(oldf);
 273         open_files = count_open_files(old_fdt);
 274
 275         /*
 276          * Check whether we need to allocate a larger fd array and fd set.
 277          */
 278         while (unlikely(open_files > new_fdt->max_fds)) {
 279                 spin_unlock(&oldf->file_lock);
 280
 281                 if (new_fdt != &newf->fdtab)
 282                         __free_fdtable(new_fdt);
 283
 284                 new_fdt = alloc_fdtable(open_files - 1);
 285                 if (!new_fdt) {
 286                         *errorp = -ENOMEM;
 287                         goto out_release;
 288                 }
 289
 290                 /* beyond sysctl_nr_open; nothing to do */
 291                 if (unlikely(new_fdt->max_fds < open_files)) {
 292                         __free_fdtable(new_fdt);
 293                         *errorp = -EMFILE;
 294                         goto out_release;
 295                 }
 296
 297                 /*
 298                  * Reacquire the oldf lock and a pointer to its fd table
 299                  * who knows it may have a new bigger fd table. We need
 300                  * the latest pointer.
 301                  */
 302                 spin_lock(&oldf->file_lock);
 303                 old_fdt = files_fdtable(oldf);
 304                 open_files = count_open_files(old_fdt);
 305         }
 306
 307         old_fds = old_fdt->fd;
 308         new_fds = new_fdt->fd;
 309
 310         memcpy(new_fdt->open_fds, old_fdt->open_fds, open_files / 8);
 311         memcpy(new_fdt->close_on_exec, old_fdt->close_on_exec, open_files / 8);
 312
 313         for (i = open_files; i != 0; i--) {
 314                 struct file *f = *old_fds++;
 315                 if (f) {
 316                         get_file(f);
 317                 } else {
 318                         /*
 319                          * The fd may be claimed in the fd bitmap but not yet
 320                          * instantiated in the files array if a sibling thread
 321                          * is partway through open().  So make sure that this
 322                          * fd is available to the new process.
 323                          */
 324                         __clear_open_fd(open_files - i, new_fdt);
 325                 }
 326                 rcu_assign_pointer(*new_fds++, f);
 327         }
 328         spin_unlock(&oldf->file_lock);
 329
 330         /* compute the remainder to be cleared */
 331         size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
 332
 333         /* This is long word aligned thus could use a optimized version */
 334         memset(new_fds, 0, size);
 335
 336         if (new_fdt->max_fds > open_files) {
 337                 int left = (new_fdt->max_fds - open_files) / 8;
 338                 int start = open_files / BITS_PER_LONG;
 339
 340                 memset(&new_fdt->open_fds[start], 0, left);
 341                 memset(&new_fdt->close_on_exec[start], 0, left);
 342         }
 343
 344         rcu_assign_pointer(newf->fdt, new_fdt);
 345
 346         return newf;
 347
 348 out_release:
 349         kmem_cache_free(files_cachep, newf);
 350 out:
 351         return NULL;
 352 }
 353
 354 static struct fdtable *close_files(struct files_struct * files)
 355 {
 356         /*
 357          * It is safe to dereference the fd table without RCU or
 358          * ->file_lock because this is the last reference to the
 359          * files structure.
 360          */
 361         struct fdtable *fdt = rcu_dereference_raw(files->fdt);
 362         int i, j = 0;
 363
 364         for (;;) {
 365                 unsigned long set;
 366                 i = j * BITS_PER_LONG;
 367                 if (i >= fdt->max_fds)
 368                         break;
 369                 set = fdt->open_fds[j++];
 370                 while (set) {
 371                         if (set & 1) {
 372                                 struct file * file = xchg(&fdt->fd[i], NULL);
 373                                 if (file) {
 374                                         filp_close(file, files);
 375                                         cond_resched();
 376                                 }
 377                         }
 378                         i++;
 379                         set >>= 1;
 380                 }
 381         }
 382
 383         return fdt;
 384 }
 385
 386 struct files_struct *get_files_struct(struct task_struct *task)
 387 {
 388         struct files_struct *files;
 389
 390         task_lock(task);
 391         files = task->files;
 392         if (files)
 393                 atomic_inc(&files->count);
 394         task_unlock(task);
 395
 396         return files;
 397 }
 398
 399 void put_files_struct(struct files_struct *files)
 400 {
 401         if (atomic_dec_and_test(&files->count)) {
 402                 struct fdtable *fdt = close_files(files);
 403
 404                 /* free the arrays if they are not embedded */
 405                 if (fdt != &files->fdtab)
 406                         __free_fdtable(fdt);
 407                 kmem_cache_free(files_cachep, files);
 408         }
 409 }
 410
 411 void reset_files_struct(struct files_struct *files)
 412 {
 413         struct task_struct *tsk = current;
 414         struct files_struct *old;
 415
 416         old = tsk->files;
 417         task_lock(tsk);
 418         tsk->files = files;
 419         task_unlock(tsk);
 420         put_files_struct(old);
 421 }
 422
 423 void exit_files(struct task_struct *tsk)
 424 {
 425         struct files_struct * files = tsk->files;
 426
 427         if (files) {
 428                 task_lock(tsk);
 429                 tsk->files = NULL;
 430                 task_unlock(tsk);
 431                 put_files_struct(files);
 432         }
 433 }
 434
 435 struct files_struct init_files = {
 436         .count          = ATOMIC_INIT(1),
 437         .fdt            = &init_files.fdtab,
 438         .fdtab          = {
 439                 .max_fds        = NR_OPEN_DEFAULT,
 440                 .fd             = &init_files.fd_array[0],
 441                 .close_on_exec  = init_files.close_on_exec_init,
 442                 .open_fds       = init_files.open_fds_init,
 443         },
 444         .file_lock      = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
 445 };
 446
/*
 * allocate a file descriptor, mark it busy.
 *
 * @files: descriptor table to allocate from
 * @start: lowest descriptor number to consider
 * @end:   exclusive upper bound; a candidate >= @end fails with -EMFILE
 * @flags: only O_CLOEXEC is examined; it decides whether the close_on_exec
 *         bit of the new descriptor is set or cleared
 *
 * Returns the allocated descriptor number, or a negative error code
 * (-EMFILE, or an error from expand_files()).  Takes and releases
 * files->file_lock.  The fd array slot is left NULL.
 */
int __alloc_fd(struct files_struct *files,
	       unsigned start, unsigned end, unsigned flags)
{
	unsigned int fd;
	int error;
	struct fdtable *fdt;

	spin_lock(&files->file_lock);
repeat:
	fdt = files_fdtable(files);
	/* Begin the search at the larger of @start and the next_fd hint. */
	fd = start;
	if (fd < files->next_fd)
		fd = files->next_fd;

	/*
	 * When the starting point is already past the table, fd is left as is
	 * and expand_files() below is asked to grow the table.
	 */
	if (fd < fdt->max_fds)
		fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd);

	/*
	 * N.B. For clone tasks sharing a files structure, this test
	 * will limit the total number of files that can be opened.
	 */
	error = -EMFILE;
	if (fd >= end)
		goto out;

	error = expand_files(files, fd);
	if (error < 0)
		goto out;

	/*
	 * If we needed to expand the fd array we
	 * might have blocked - try again.
	 */
	if (error)
		goto repeat;

	/* Only advance the hint when the search was not started above it. */
	if (start <= files->next_fd)
		files->next_fd = fd + 1;

	__set_open_fd(fd, fdt);
	if (flags & O_CLOEXEC)
		__set_close_on_exec(fd, fdt);
	else
		__clear_close_on_exec(fd, fdt);
	error = fd;
#if 1
	/* Sanity check: a free descriptor must not have a file installed. */
	if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
		printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
		rcu_assign_pointer(fdt->fd[fd], NULL);
	}
#endif

out:
	spin_unlock(&files->file_lock);
	return error;
}
 507
 508 static int alloc_fd(unsigned start, unsigned flags)
 509 {
 510         return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags);
 511 }
 512
 513 int get_unused_fd_flags(unsigned flags)
 514 {
 515         return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags);
 516 }
 517 EXPORT_SYMBOL(get_unused_fd_flags);
 518
 519 static void __put_unused_fd(struct files_struct *files, unsigned int fd)
 520 {
 521         struct fdtable *fdt = files_fdtable(files);
 522         __clear_open_fd(fd, fdt);
 523         if (fd < files->next_fd)
 524                 files->next_fd = fd;
 525 }
 526
 527 void put_unused_fd(unsigned int fd)
 528 {
 529         struct files_struct *files = current->files;
 530         spin_lock(&files->file_lock);
 531         __put_unused_fd(files, fd);
 532         spin_unlock(&files->file_lock);
 533 }
 534
 535 EXPORT_SYMBOL(put_unused_fd);
 536
 537 /*
 538  * Install a file pointer in the fd array.
 539  *
 540  * The VFS is full of places where we drop the files lock between
 541  * setting the open_fds bitmap and installing the file in the file
 542  * array.  At any such point, we are vulnerable to a dup2() race
 543  * installing a file in the array before us.  We need to detect this and
 544  * fput() the struct file we are about to overwrite in this case.
 545  *
 546  * It should never happen - if we allow dup2() do it, _really_ bad things
 547  * will follow.
 548  *
 549  * NOTE: __fd_install() variant is really, really low-level; don't
 550  * use it unless you are forced to by truly lousy API shoved down
 551  * your throat.  'files' *MUST* be either current->files or obtained
 552  * by get_files_struct(current) done by whoever had given it to you,
 553  * or really bad things will happen.  Normally you want to use
 554  * fd_install() instead.
 555  */
 556
 557 void __fd_install(struct files_struct *files, unsigned int fd,
 558                 struct file *file)
 559 {
 560         struct fdtable *fdt;
 561         spin_lock(&files->file_lock);
 562         fdt = files_fdtable(files);
 563         BUG_ON(fdt->fd[fd] != NULL);
 564         rcu_assign_pointer(fdt->fd[fd], file);
 565         spin_unlock(&files->file_lock);
 566 }
 567
 568 void fd_install(unsigned int fd, struct file *file)
 569 {
 570         __fd_install(current->files, fd, file);
 571 }
 572
 573 EXPORT_SYMBOL(fd_install);
 574
 575 /*
 576  * The same warnings as for __alloc_fd()/__fd_install() apply here...
 577  */
 578 int __close_fd(struct files_struct *files, unsigned fd)
 579 {
 580         struct file *file;
 581         struct fdtable *fdt;
 582
 583         spin_lock(&files->file_lock);
 584         fdt = files_fdtable(files);
 585         if (fd >= fdt->max_fds)
 586                 goto out_unlock;
 587         file = fdt->fd[fd];
 588         if (!file)
 589                 goto out_unlock;
 590         rcu_assign_pointer(fdt->fd[fd], NULL);
 591         __clear_close_on_exec(fd, fdt);
 592         __put_unused_fd(files, fd);
 593         spin_unlock(&files->file_lock);
 594         return filp_close(file, files);
 595
 596 out_unlock:
 597         spin_unlock(&files->file_lock);
 598         return -EBADF;
 599 }
 600
/*
 * Close every descriptor in @files whose close_on_exec bit is set.
 *
 * The close_on_exec bitmap is processed one word at a time: the word is
 * read and zeroed, then each descriptor whose bit was set and which has a
 * file installed is removed from the table and passed to filp_close().
 * files->file_lock is held while the table is examined and modified, and
 * dropped around each filp_close()/cond_resched().
 */
void do_close_on_exec(struct files_struct *files)
{
	unsigned i;
	struct fdtable *fdt;

	/* exec unshares first */
	spin_lock(&files->file_lock);
	for (i = 0; ; i++) {
		unsigned long set;
		unsigned fd = i * BITS_PER_LONG;
		/* Fetch the table pointer afresh for every bitmap word. */
		fdt = files_fdtable(files);
		if (fd >= fdt->max_fds)
			break;
		set = fdt->close_on_exec[i];
		if (!set)
			continue;
		/* All bits of this word are handled below; clear them in one go. */
		fdt->close_on_exec[i] = 0;
		for ( ; set ; fd++, set >>= 1) {
			struct file *file;
			if (!(set & 1))
				continue;
			file = fdt->fd[fd];
			if (!file)
				continue;
			rcu_assign_pointer(fdt->fd[fd], NULL);
			__put_unused_fd(files, fd);
			/* Drop the spinlock while closing the file. */
			spin_unlock(&files->file_lock);
			filp_close(file, files);
			cond_resched();
			spin_lock(&files->file_lock);
		}

	}
	spin_unlock(&files->file_lock);
}
 636
 637 static struct file *__fget(unsigned int fd, fmode_t mask)
 638 {
 639         struct files_struct *files = current->files;
 640         struct file *file;
 641
 642         rcu_read_lock();
 643         file = fcheck_files(files, fd);
 644         if (file) {
 645                 /* File object ref couldn't be taken */
 646                 if ((file->f_mode & mask) ||
 647                     !atomic_long_inc_not_zero(&file->f_count))
 648                         file = NULL;
 649         }
 650         rcu_read_unlock();
 651
 652         return file;
 653 }
 654
 655 struct file *fget(unsigned int fd)
 656 {
 657         return __fget(fd, FMODE_PATH);
 658 }
 659 EXPORT_SYMBOL(fget);
 660
 661 struct file *fget_raw(unsigned int fd)
 662 {
 663         return __fget(fd, 0);
 664 }
 665 EXPORT_SYMBOL(fget_raw);
 666
 667 /*
 668  * Lightweight file lookup - no refcnt increment if fd table isn't shared.
 669  *
 670  * You can use this instead of fget if you satisfy all of the following
 671  * conditions:
 672  * 1) You must call fput_light before exiting the syscall and returning control
 673  *    to userspace (i.e. you cannot remember the returned struct file * after
 674  *    returning to userspace).
 675  * 2) You must not call filp_close on the returned struct file * in between
 676  *    calls to fget_light and fput_light.
 677  * 3) You must not clone the current task in between the calls to fget_light
 678  *    and fput_light.
 679  *
 680  * The fput_needed flag returned by fget_light should be passed to the
 681  * corresponding fput_light.
 682  */
 683 static unsigned long __fget_light(unsigned int fd, fmode_t mask)
 684 {
 685         struct files_struct *files = current->files;
 686         struct file *file;
 687
 688         if (atomic_read(&files->count) == 1) {
 689                 file = __fcheck_files(files, fd);
 690                 if (!file || unlikely(file->f_mode & mask))
 691                         return 0;
 692                 return (unsigned long)file;
 693         } else {
 694                 file = __fget(fd, mask);
 695                 if (!file)
 696                         return 0;
 697                 return FDPUT_FPUT | (unsigned long)file;
 698         }
 699 }
 700 unsigned long __fdget(unsigned int fd)
 701 {
 702         return __fget_light(fd, FMODE_PATH);
 703 }
 704 EXPORT_SYMBOL(__fdget);
 705
 706 unsigned long __fdget_raw(unsigned int fd)
 707 {
 708         return __fget_light(fd, 0);
 709 }
 710
 711 unsigned long __fdget_pos(unsigned int fd)
 712 {
 713         unsigned long v = __fdget(fd);
 714         struct file *file = (struct file *)(v & ~3);
 715
 716         if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
 717                 if (file_count(file) > 1) {
 718                         v |= FDPUT_POS_UNLOCK;
 719                         mutex_lock(&file->f_pos_lock);
 720                 }
 721         }
 722         return v;
 723 }
 724
 725 /*
 726  * We only lock f_pos if we have threads or if the file might be
 727  * shared with another process. In both cases we'll have an elevated
 728  * file count (done either by fdget() or by fork()).
 729  */
 730
 731 void set_close_on_exec(unsigned int fd, int flag)
 732 {
 733         struct files_struct *files = current->files;
 734         struct fdtable *fdt;
 735         spin_lock(&files->file_lock);
 736         fdt = files_fdtable(files);
 737         if (flag)
 738                 __set_close_on_exec(fd, fdt);
 739         else
 740                 __clear_close_on_exec(fd, fdt);
 741         spin_unlock(&files->file_lock);
 742 }
 743
 744 bool get_close_on_exec(unsigned int fd)
 745 {
 746         struct files_struct *files = current->files;
 747         struct fdtable *fdt;
 748         bool res;
 749         rcu_read_lock();
 750         fdt = files_fdtable(files);
 751         res = close_on_exec(fd, fdt);
 752         rcu_read_unlock();
 753         return res;
 754 }
 755
/*
 * Install @file at descriptor @fd, replacing whatever file was there.
 *
 * Must be entered with files->file_lock held; the lock is released on
 * every return path.  On success an extra reference is taken on @file,
 * the close_on_exec bit is set or cleared according to O_CLOEXEC in
 * @flags, the previously installed file (if any) is passed to
 * filp_close(), and @fd is returned.  Returns -EBUSY if @fd is marked
 * open but has no file installed yet.
 */
static int do_dup2(struct files_struct *files,
	struct file *file, unsigned fd, unsigned flags)
{
	struct file *tofree;
	struct fdtable *fdt;

	/*
	 * We need to detect attempts to do dup2() over allocated but still
	 * not finished descriptor.  NB: OpenBSD avoids that at the price of
	 * extra work in their equivalent of fget() - they insert struct
	 * file immediately after grabbing descriptor, mark it larval if
	 * more work (e.g. actual opening) is needed and make sure that
	 * fget() treats larval files as absent.  Potentially interesting,
	 * but while extra work in fget() is trivial, locking implications
	 * and amount of surgery on open()-related paths in VFS are not.
	 * FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
	 * deadlocks in rather amusing ways, AFAICS.  All of that is out of
	 * scope of POSIX or SUS, since neither considers shared descriptor
	 * tables and this condition does not arise without those.
	 */
	fdt = files_fdtable(files);
	tofree = fdt->fd[fd];
	/* Bit set but slot empty: someone is partway through opening @fd. */
	if (!tofree && fd_is_open(fd, fdt))
		goto Ebusy;
	get_file(file);
	rcu_assign_pointer(fdt->fd[fd], file);
	__set_open_fd(fd, fdt);
	if (flags & O_CLOEXEC)
		__set_close_on_exec(fd, fdt);
	else
		__clear_close_on_exec(fd, fdt);
	spin_unlock(&files->file_lock);

	/* Close the displaced file only after the lock has been dropped. */
	if (tofree)
		filp_close(tofree, files);

	return fd;

Ebusy:
	spin_unlock(&files->file_lock);
	return -EBUSY;
}
 798
 799 int replace_fd(unsigned fd, struct file *file, unsigned flags)
 800 {
 801         int err;
 802         struct files_struct *files = current->files;
 803
 804         if (!file)
 805                 return __close_fd(files, fd);
 806
 807         if (fd >= rlimit(RLIMIT_NOFILE))
 808                 return -EBADF;
 809
 810         spin_lock(&files->file_lock);
 811         err = expand_files(files, fd);
 812         if (unlikely(err < 0))
 813                 goto out_unlock;
 814         return do_dup2(files, file, fd, flags);
 815
 816 out_unlock:
 817         spin_unlock(&files->file_lock);
 818         return err;
 819 }
 820
/*
 * dup3(oldfd, newfd, flags): make @newfd refer to the file open at @oldfd.
 *
 * Returns the result of do_dup2() on success.  Errors: -EINVAL if @flags
 * contains anything other than O_CLOEXEC or if @oldfd == @newfd; -EBADF
 * if @newfd is at or above RLIMIT_NOFILE, if @oldfd has no file, or if
 * the table cannot be grown to hold @newfd (-EMFILE from expand_files()
 * is reported as -EBADF); any other expand_files() error is returned
 * unchanged.
 */
SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
{
	int err = -EBADF;
	struct file *file;
	struct files_struct *files = current->files;

	if ((flags & ~O_CLOEXEC) != 0)
		return -EINVAL;

	if (unlikely(oldfd == newfd))
		return -EINVAL;

	if (newfd >= rlimit(RLIMIT_NOFILE))
		return -EBADF;

	spin_lock(&files->file_lock);
	/*
	 * Grow the table first and look up oldfd afterwards: expanding may
	 * drop and retake the lock.  A missing oldfd takes precedence over
	 * any expansion error.
	 */
	err = expand_files(files, newfd);
	file = fcheck(oldfd);
	if (unlikely(!file))
		goto Ebadf;
	if (unlikely(err < 0)) {
		if (err == -EMFILE)
			goto Ebadf;
		goto out_unlock;
	}
	/* do_dup2() releases files->file_lock. */
	return do_dup2(files, file, newfd, flags);

Ebadf:
	err = -EBADF;
out_unlock:
	spin_unlock(&files->file_lock);
	return err;
}
 854
 855 SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
 856 {
 857         if (unlikely(newfd == oldfd)) { /* corner case */
 858                 struct files_struct *files = current->files;
 859                 int retval = oldfd;
 860
 861                 rcu_read_lock();
 862                 if (!fcheck_files(files, oldfd))
 863                         retval = -EBADF;
 864                 rcu_read_unlock();
 865                 return retval;
 866         }
 867         return sys_dup3(oldfd, newfd, 0);
 868 }
 869
 870 SYSCALL_DEFINE1(dup, unsigned int, fildes)
 871 {
 872         int ret = -EBADF;
 873         struct file *file = fget_raw(fildes);
 874
 875         if (file) {
 876                 ret = get_unused_fd();
 877                 if (ret >= 0)
 878                         fd_install(ret, file);
 879                 else
 880                         fput(file);
 881         }
 882         return ret;
 883 }
 884
 885 int f_dupfd(unsigned int from, struct file *file, unsigned flags)
 886 {
 887         int err;
 888         if (from >= rlimit(RLIMIT_NOFILE))
 889                 return -EINVAL;
 890         err = alloc_fd(from, flags);
 891         if (err >= 0) {
 892                 get_file(file);
 893                 fd_install(err, file);
 894         }
 895         return err;
 896 }
 897
 898 int iterate_fd(struct files_struct *files, unsigned n,
 899                 int (*f)(const void *, struct file *, unsigned),
 900                 const void *p)
 901 {
 902         struct fdtable *fdt;
 903         int res = 0;
 904         if (!files)
 905                 return 0;
 906         spin_lock(&files->file_lock);
 907         for (fdt = files_fdtable(files); n < fdt->max_fds; n++) {
 908                 struct file *file;
 909                 file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
 910                 if (!file)
 911                         continue;
 912                 res = f(p, file, n);
 913                 if (res)
 914                         break;
 915         }
 916         spin_unlock(&files->file_lock);
 917         return res;
 918 }
 919 EXPORT_SYMBOL(iterate_fd);