/*
 * Copyright 2005-2013, Ingo Weinhold, ingo_weinhold@gmx.de.
 * Copyright 2002-2017, Axel Dörfler, axeld@pinc-software.de.
 * Distributed under the terms of the MIT License.
 *
 * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
 * Distributed under the terms of the NewOS License.
 */


/*! Virtual File System and File System Interface Layer */

#include <ctype.h>
#include <fcntl.h>
#include <limits.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <sys/file.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <unistd.h>

#include <fs_attr.h>
#include <fs_info.h>
#include <fs_interface.h>
#include <fs_volume.h>
#include <OS.h>
#include <StorageDefs.h>

#include <AutoDeleter.h>
#include <block_cache.h>
#include <boot/kernel_args.h>
#include <debug_heap.h>
#include <disk_device_manager/KDiskDevice.h>
#include <disk_device_manager/KDiskDeviceManager.h>
#include <disk_device_manager/KDiskDeviceUtils.h>
#include <disk_device_manager/KDiskSystem.h>
#include <fd.h>
#include <file_cache.h>
#include <fs/node_monitor.h>
#include <KPath.h>
#include <lock.h>
#include <low_resource_manager.h>
#include <syscalls.h>
#include <syscall_restart.h>
#include <tracing.h>
#include <util/atomic.h>
#include <util/AutoLock.h>
#include <util/DoublyLinkedList.h>
#include <vfs.h>
#include <vm/vm.h>
#include <vm/VMCache.h>

#include "EntryCache.h"
#include "fifo.h"
#include "IORequest.h"
#include "unused_vnodes.h"
#include "vfs_tracing.h"
#include "Vnode.h"
#include "../cache/vnode_store.h"
//#define TRACE_VFS
#ifdef TRACE_VFS
#	define TRACE(x) dprintf x
#	define FUNCTION(x) dprintf x
#else
#	define TRACE(x) ;
#	define FUNCTION(x) ;
#endif

#define ADD_DEBUGGER_COMMANDS
#define HAS_FS_CALL(vnode, op)			(vnode->ops->op != NULL)
#define HAS_FS_MOUNT_CALL(mount, op)	(mount->volume->ops->op != NULL)

#if KDEBUG
#	define FS_CALL(vnode, op, params...) \
		( HAS_FS_CALL(vnode, op) ? \
			vnode->ops->op(vnode->mount->volume, vnode, params) \
			: (panic("FS_CALL op " #op " is NULL"), 0))
#	define FS_CALL_NO_PARAMS(vnode, op) \
		( HAS_FS_CALL(vnode, op) ? \
			vnode->ops->op(vnode->mount->volume, vnode) \
			: (panic("FS_CALL_NO_PARAMS op " #op " is NULL"), 0))
#	define FS_MOUNT_CALL(mount, op, params...) \
		( HAS_FS_MOUNT_CALL(mount, op) ? \
			mount->volume->ops->op(mount->volume, params) \
			: (panic("FS_MOUNT_CALL op " #op " is NULL"), 0))
#	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
		( HAS_FS_MOUNT_CALL(mount, op) ? \
			mount->volume->ops->op(mount->volume) \
			: (panic("FS_MOUNT_CALL_NO_PARAMS op " #op " is NULL"), 0))
#else
#	define FS_CALL(vnode, op, params...) \
		vnode->ops->op(vnode->mount->volume, vnode, params)
#	define FS_CALL_NO_PARAMS(vnode, op) \
		vnode->ops->op(vnode->mount->volume, vnode)
#	define FS_MOUNT_CALL(mount, op, params...) \
		mount->volume->ops->op(mount->volume, params)
#	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
		mount->volume->ops->op(mount->volume)
#endif
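
// Illustrative sketch (not part of the build): with KDEBUG enabled, a call
// such as
//
//	status_t status = FS_CALL(vnode, read_stat, &stat);
//
// first checks HAS_FS_CALL(vnode, read_stat) and panics with a descriptive
// message if the file system left the hook NULL; without KDEBUG it expands
// directly to vnode->ops->read_stat(vnode->mount->volume, vnode, &stat).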
const static size_t kMaxPathLength = 65536;
	// The absolute maximum path length (for getcwd() - this does not depend
	// on PATH_MAX).


typedef DoublyLinkedList<vnode> VnodeList;
/*!	\brief Structure to manage a mounted file system

	Note: The root_vnode and root_vnode->covers fields (what others?) are
	initialized in fs_mount() and not changed afterwards. That is, as soon
	as the mount is mounted and it is made sure it won't be unmounted
	(e.g. by holding a reference to a vnode of that mount), (read) access
	to those fields is always safe, even without additional locking. Moreover,
	while mounted the mount holds a reference to the root_vnode->covers vnode,
	thus making the access path vnode->mount->root_vnode->covers->mount->...
	safe if a reference to vnode is held (note that for the root mount
	root_vnode->covers is NULL, though).
*/
struct fs_mount {
	fs_mount()
		:
		volume(NULL),
		device_name(NULL)
	{
		recursive_lock_init(&rlock, "mount rlock");
	}

	~fs_mount()
	{
		recursive_lock_destroy(&rlock);
		free(device_name);

		while (volume) {
			fs_volume* superVolume = volume->super_volume;

			if (volume->file_system != NULL)
				put_module(volume->file_system->info.name);

			free(volume->file_system_name);
			free(volume);
			volume = superVolume;
		}
	}

	struct fs_mount*	next;
	dev_t				id;
	fs_volume*			volume;
	char*				device_name;
	recursive_lock		rlock;	// guards the vnodes list
		// TODO: Make this a mutex! It is never used recursively.
	struct vnode*		root_vnode;
	struct vnode*		covers_vnode;	// immutable
	KPartition*			partition;
	VnodeList			vnodes;
	EntryCache			entry_cache;
	bool				unmounting;
	bool				owns_file_device;
};
namespace {

struct advisory_lock : public DoublyLinkedListLinkImpl<advisory_lock> {
	list_link	link;
	team_id		team;
	pid_t		session;
	off_t		start;
	off_t		end;
	bool		shared;
};

typedef DoublyLinkedList<advisory_lock> LockList;

} // namespace
struct advisory_locking {
	sem_id		lock;
	sem_id		wait_sem;
	LockList	locks;

	advisory_locking()
		:
		lock(-1),
		wait_sem(-1)
	{
	}

	~advisory_locking()
	{
		if (lock >= 0)
			delete_sem(lock);
		if (wait_sem >= 0)
			delete_sem(wait_sem);
	}
};
/*!	\brief Guards sMountsTable.

	The holder is allowed read/write access to sMountsTable.
	Manipulation of the fs_mount structures themselves
	(and their destruction) requires different locks though.
*/
static mutex sMountMutex = MUTEX_INITIALIZER("vfs_mount_lock");

/*!	\brief Guards mount/unmount operations.

	fs_mount() and fs_unmount() hold the lock during their whole operation.
	That is, locking the lock ensures that no FS is mounted/unmounted. In
	particular this means that
	- sMountsTable will not be modified,
	- the fields of the fs_mount structures in sMountsTable that are immutable
	  after initialization will not be modified.

	The thread trying to lock the lock must not hold sVnodeLock or
	sMountMutex.
*/
static recursive_lock sMountOpLock;

/*!	\brief Guards sVnodeTable.

	The holder is allowed read/write access to sVnodeTable and to
	any unbusy vnode in that table, save for the immutable fields (device, id,
	private_node, mount) to which only read-only access is allowed.
	The mutable fields advisory_locking, mandatory_locked_by, and ref_count, as
	well as the busy, removed, and unused flags, and the vnode's type can also
	be write accessed when holding a read lock to sVnodeLock *and* having the
	vnode locked. Write access to covered_by and covers requires write locking
	sVnodeLock.

	The thread trying to acquire the lock must not hold sMountMutex.
	You must not hold this lock when calling create_sem(), as this might call
	vfs_free_unused_vnodes() and thus cause a deadlock.
*/
static rw_lock sVnodeLock = RW_LOCK_INITIALIZER("vfs_vnode_lock");

/*!	\brief Guards io_context::root.

	Must be held when setting or getting the io_context::root field.
	The only operation allowed while holding this lock besides getting or
	setting the field is inc_vnode_ref_count() on io_context::root.
*/
static mutex sIOContextRootLock = MUTEX_INITIALIZER("io_context::root lock");
namespace {

struct vnode_hash_key {
	dev_t	device;
	ino_t	vnode;
};

struct VnodeHash {
	typedef vnode_hash_key	KeyType;
	typedef	struct vnode	ValueType;

#define VHASH(mountid, vnodeid) \
	(((uint32)((vnodeid) >> 32) + (uint32)(vnodeid)) ^ (uint32)(mountid))

	size_t HashKey(KeyType key) const
	{
		return VHASH(key.device, key.vnode);
	}

	size_t Hash(ValueType* vnode) const
	{
		return VHASH(vnode->device, vnode->id);
	}

#undef VHASH

	bool Compare(KeyType key, ValueType* vnode) const
	{
		return vnode->device == key.device && vnode->id == key.vnode;
	}

	ValueType*& GetLink(ValueType* value) const
	{
		return value->next;
	}
};

typedef BOpenHashTable<VnodeHash> VnodeTable;


struct MountHash {
	typedef dev_t			KeyType;
	typedef struct fs_mount	ValueType;

	size_t HashKey(KeyType key) const
	{
		return key;
	}

	size_t Hash(ValueType* mount) const
	{
		return mount->id;
	}

	bool Compare(KeyType key, ValueType* mount) const
	{
		return mount->id == key;
	}

	ValueType*& GetLink(ValueType* value) const
	{
		return value->next;
	}
};

typedef BOpenHashTable<MountHash> MountTable;

} // namespace
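
// Illustrative sketch (assumed usage, mirroring calls made later in this
// file): both tables are BOpenHashTable instances keyed as defined above.
//
//	vnode_hash_key key;
//	key.device = mountID;
//	key.vnode = vnodeID;
//	struct vnode* vnode = sVnodeTable->Lookup(key);
//		// requires sVnodeLock (read lock at least)
//	struct fs_mount* mount = sMountsTable->Lookup(id);
//		// requires sMountMutex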
#define VNODE_HASH_TABLE_SIZE 1024
static VnodeTable* sVnodeTable;
static struct vnode* sRoot;

#define MOUNTS_HASH_TABLE_SIZE 16
static MountTable* sMountsTable;
static dev_t sNextMountID = 1;

#define MAX_TEMP_IO_VECS 8

// How long to wait for busy vnodes: 2000 retries of 5000 µs each, i.e. 10 s.
#define BUSY_VNODE_RETRIES 2000
#define BUSY_VNODE_DELAY 5000

mode_t __gUmask = 022;
/* function declarations */

static void free_unused_vnodes();

// file descriptor operation prototypes
static status_t file_read(struct file_descriptor* descriptor, off_t pos,
	void* buffer, size_t* _bytes);
static status_t file_write(struct file_descriptor* descriptor, off_t pos,
	const void* buffer, size_t* _bytes);
static off_t file_seek(struct file_descriptor* descriptor, off_t pos,
	int seekType);
static void file_free_fd(struct file_descriptor* descriptor);
static status_t file_close(struct file_descriptor* descriptor);
static status_t file_select(struct file_descriptor* descriptor, uint8 event,
	struct selectsync* sync);
static status_t file_deselect(struct file_descriptor* descriptor, uint8 event,
	struct selectsync* sync);
static status_t dir_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t dir_read(struct io_context* ioContext, struct vnode* vnode,
	void* cookie, struct dirent* buffer, size_t bufferSize, uint32* _count);
static status_t dir_rewind(struct file_descriptor* descriptor);
static void dir_free_fd(struct file_descriptor* descriptor);
static status_t dir_close(struct file_descriptor* descriptor);
static status_t attr_dir_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t attr_dir_rewind(struct file_descriptor* descriptor);
static void attr_dir_free_fd(struct file_descriptor* descriptor);
static status_t attr_dir_close(struct file_descriptor* descriptor);
static status_t attr_read(struct file_descriptor* descriptor, off_t pos,
	void* buffer, size_t* _bytes);
static status_t attr_write(struct file_descriptor* descriptor, off_t pos,
	const void* buffer, size_t* _bytes);
static off_t attr_seek(struct file_descriptor* descriptor, off_t pos,
	int seekType);
static void attr_free_fd(struct file_descriptor* descriptor);
static status_t attr_close(struct file_descriptor* descriptor);
static status_t attr_read_stat(struct file_descriptor* descriptor,
	struct stat* statData);
static status_t attr_write_stat(struct file_descriptor* descriptor,
	const struct stat* stat, int statMask);
static status_t index_dir_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t index_dir_rewind(struct file_descriptor* descriptor);
static void index_dir_free_fd(struct file_descriptor* descriptor);
static status_t index_dir_close(struct file_descriptor* descriptor);
static status_t query_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t query_rewind(struct file_descriptor* descriptor);
static void query_free_fd(struct file_descriptor* descriptor);
static status_t query_close(struct file_descriptor* descriptor);

static status_t common_ioctl(struct file_descriptor* descriptor, ulong op,
	void* buffer, size_t length);
static status_t common_read_stat(struct file_descriptor* descriptor,
	struct stat* statData);
static status_t common_write_stat(struct file_descriptor* descriptor,
	const struct stat* statData, int statMask);
static status_t common_path_read_stat(int fd, char* path, bool traverseLeafLink,
	struct stat* stat, bool kernel);

static status_t vnode_path_to_vnode(struct vnode* vnode, char* path,
	bool traverseLeafLink, int count, bool kernel,
	struct vnode** _vnode, ino_t* _parentID);
static status_t dir_vnode_to_path(struct vnode* vnode, char* buffer,
	size_t bufferSize, bool kernel);
static status_t fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
	struct vnode** _vnode, ino_t* _parentID, bool kernel);
static void inc_vnode_ref_count(struct vnode* vnode);
static status_t dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree,
	bool reenter);
static inline void put_vnode(struct vnode* vnode);
static status_t fs_unmount(char* path, dev_t mountID, uint32 flags,
	bool kernel);
static int open_vnode(struct vnode* vnode, int openMode, bool kernel);
static struct fd_ops sFileOps = {
	file_read,
	file_write,
	file_seek,
	common_ioctl,
	NULL,		// set_flags
	file_select,
	file_deselect,
	NULL,		// read_dir()
	NULL,		// rewind_dir()
	common_read_stat,
	common_write_stat,
	file_close,
	file_free_fd
};

static struct fd_ops sDirectoryOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	common_ioctl,
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	dir_read,
	dir_rewind,
	common_read_stat,
	common_write_stat,
	dir_close,
	dir_free_fd
};

static struct fd_ops sAttributeDirectoryOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	common_ioctl,
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	attr_dir_read,
	attr_dir_rewind,
	common_read_stat,
	common_write_stat,
	attr_dir_close,
	attr_dir_free_fd
};

static struct fd_ops sAttributeOps = {
	attr_read,
	attr_write,
	attr_seek,
	common_ioctl,
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	NULL,		// read_dir()
	NULL,		// rewind_dir()
	attr_read_stat,
	attr_write_stat,
	attr_close,
	attr_free_fd
};

static struct fd_ops sIndexDirectoryOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	NULL,		// ioctl()
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	index_dir_read,
	index_dir_rewind,
	NULL,		// read_stat()
	NULL,		// write_stat()
	index_dir_close,
	index_dir_free_fd
};

#if 0
static struct fd_ops sIndexOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	NULL,		// ioctl()
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	NULL,		// dir_read()
	NULL,		// dir_rewind()
	index_read_stat,	// read_stat()
	NULL,		// write_stat()
	NULL,		// dir_close()
	NULL		// free_fd()
};
#endif

static struct fd_ops sQueryOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	NULL,		// ioctl()
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	query_read,
	query_rewind,
	NULL,		// read_stat()
	NULL,		// write_stat()
	query_close,
	query_free_fd
};
namespace {

class VNodePutter {
public:
	VNodePutter(struct vnode* vnode = NULL) : fVNode(vnode) {}

	~VNodePutter()
	{
		Put();
	}

	void SetTo(struct vnode* vnode)
	{
		Put();
		fVNode = vnode;
	}

	void Put()
	{
		if (fVNode) {
			put_vnode(fVNode);
			fVNode = NULL;
		}
	}

	struct vnode* Detach()
	{
		struct vnode* vnode = fVNode;
		fVNode = NULL;
		return vnode;
	}

private:
	struct vnode* fVNode;
};


class FDCloser {
public:
	FDCloser() : fFD(-1), fKernel(true) {}

	FDCloser(int fd, bool kernel) : fFD(fd), fKernel(kernel) {}

	~FDCloser()
	{
		Close();
	}

	void SetTo(int fd, bool kernel)
	{
		Close();
		fFD = fd;
		fKernel = kernel;
	}

	void Close()
	{
		if (fFD >= 0) {
			if (fKernel)
				_kern_close(fFD);
			else
				_user_close(fFD);
			fFD = -1;
		}
	}

	int Detach()
	{
		int fd = fFD;
		fFD = -1;
		return fd;
	}

private:
	int fFD;
	bool fKernel;
};

} // namespace
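
// Illustrative sketch (assumed usage, following the pattern of the many call
// sites later in vfs.cpp): both helpers are RAII guards.
//
//	struct vnode* vnode;
//	if (get_vnode(mountID, vnodeID, &vnode, true, false) == B_OK) {
//		VNodePutter putter(vnode);
//		// ... use vnode; put_vnode() runs automatically on scope exit,
//		// unless putter.Detach() is called to hand the reference on.
//	}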
#if VFS_PAGES_IO_TRACING

namespace VFSPagesIOTracing {

class PagesIOTraceEntry : public AbstractTraceEntry {
protected:
	PagesIOTraceEntry(struct vnode* vnode, void* cookie, off_t pos,
		const generic_io_vec* vecs, uint32 count, uint32 flags,
		generic_size_t bytesRequested, status_t status,
		generic_size_t bytesTransferred)
		:
		fVnode(vnode),
		fMountID(vnode->mount->id),
		fNodeID(vnode->id),
		fCookie(cookie),
		fPos(pos),
		fCount(count),
		fFlags(flags),
		fBytesRequested(bytesRequested),
		fStatus(status),
		fBytesTransferred(bytesTransferred)
	{
		fVecs = (generic_io_vec*)alloc_tracing_buffer_memcpy(vecs,
			sizeof(generic_io_vec) * count, false);
	}

	void AddDump(TraceOutput& out, const char* mode)
	{
		out.Print("vfs pages io %5s: vnode: %p (%" B_PRId32 ", %" B_PRId64 "), "
			"cookie: %p, pos: %" B_PRIdOFF ", size: %" B_PRIu64 ", vecs: {",
			mode, fVnode, fMountID, fNodeID, fCookie, fPos,
			(uint64)fBytesRequested);

		if (fVecs != NULL) {
			for (uint32 i = 0; i < fCount; i++) {
				if (i > 0)
					out.Print(", ");
				out.Print("(%" B_PRIx64 ", %" B_PRIu64 ")",
					(uint64)fVecs[i].base, (uint64)fVecs[i].length);
			}
		}

		out.Print("}, flags: %#" B_PRIx32 " -> status: %#" B_PRIx32 ", "
			"transferred: %" B_PRIu64, fFlags, fStatus,
			(uint64)fBytesTransferred);
	}

protected:
	struct vnode*	fVnode;
	dev_t			fMountID;
	ino_t			fNodeID;
	void*			fCookie;
	off_t			fPos;
	generic_io_vec*	fVecs;
	uint32			fCount;
	uint32			fFlags;
	generic_size_t	fBytesRequested;
	status_t		fStatus;
	generic_size_t	fBytesTransferred;
};


class ReadPages : public PagesIOTraceEntry {
public:
	ReadPages(struct vnode* vnode, void* cookie, off_t pos,
		const generic_io_vec* vecs, uint32 count, uint32 flags,
		generic_size_t bytesRequested, status_t status,
		generic_size_t bytesTransferred)
		:
		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
			bytesRequested, status, bytesTransferred)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		PagesIOTraceEntry::AddDump(out, "read");
	}
};


class WritePages : public PagesIOTraceEntry {
public:
	WritePages(struct vnode* vnode, void* cookie, off_t pos,
		const generic_io_vec* vecs, uint32 count, uint32 flags,
		generic_size_t bytesRequested, status_t status,
		generic_size_t bytesTransferred)
		:
		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
			bytesRequested, status, bytesTransferred)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		PagesIOTraceEntry::AddDump(out, "write");
	}
};

} // namespace VFSPagesIOTracing

#	define TPIO(x) new(std::nothrow) VFSPagesIOTracing::x;
#else
#	define TPIO(x) ;
#endif	// VFS_PAGES_IO_TRACING
/*!	Finds the mounted device (the fs_mount structure) with the given ID.
	Note, you must hold the sMountMutex lock when you call this function.
*/
static struct fs_mount*
find_mount(dev_t id)
{
	ASSERT_LOCKED_MUTEX(&sMountMutex);

	return sMountsTable->Lookup(id);
}
static status_t
get_mount(dev_t id, struct fs_mount** _mount)
{
	struct fs_mount* mount;

	ReadLocker nodeLocker(sVnodeLock);
	MutexLocker mountLocker(sMountMutex);

	mount = find_mount(id);
	if (mount == NULL)
		return B_BAD_VALUE;

	struct vnode* rootNode = mount->root_vnode;
	if (mount->unmounting || rootNode == NULL || rootNode->IsBusy()
		|| rootNode->ref_count == 0) {
		// might have been called during a mount/unmount operation
		return B_BUSY;
	}

	inc_vnode_ref_count(rootNode);
	*_mount = mount;
	return B_OK;
}
static void
put_mount(struct fs_mount* mount)
{
	if (mount)
		put_vnode(mount->root_vnode);
}
/*!	Tries to open the specified file system module.
	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1".
	Returns a pointer to the file system module interface, or NULL if it
	could not open the module.
*/
static file_system_module_info*
get_file_system(const char* fsName)
{
	char name[B_FILE_NAME_LENGTH];
	if (strncmp(fsName, "file_systems/", strlen("file_systems/")) != 0) {
		// construct module name if we didn't get one
		// (we currently support only one API)
		snprintf(name, sizeof(name), "file_systems/%s/v1", fsName);
		fsName = NULL;
	}

	file_system_module_info* info;
	if (get_module(fsName ? fsName : name, (module_info**)&info) != B_OK)
		return NULL;

	return info;
}
/*!	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1"
	and returns a compatible fs_info.fsh_name name ("bfs" in both cases).
	The name is allocated for you, and you have to free() it when you're
	done with it.
	Returns NULL if the required memory is not available.
*/
static char*
get_file_system_name(const char* fsName)
{
	const size_t length = strlen("file_systems/");

	if (strncmp(fsName, "file_systems/", length) != 0) {
		// the name already seems to be the module's file name
		return strdup(fsName);
	}

	fsName += length;
	const char* end = strchr(fsName, '/');
	if (end == NULL) {
		// this doesn't seem to be a valid name, but well...
		return strdup(fsName);
	}

	// cut off the trailing /v1

	char* name = (char*)malloc(end + 1 - fsName);
	if (name == NULL)
		return NULL;

	strlcpy(name, fsName, end + 1 - fsName);
	return name;
}
/*!	Accepts a list of file system names separated by a colon, one for each
	layer, and returns the file system name for the specified layer.
	The name is allocated for you, and you have to free() it when you're
	done with it.
	Returns NULL if the required memory is not available or if there is no
	name for the specified layer.
*/
static char*
get_file_system_name_for_layer(const char* fsNames, int32 layer)
{
	while (layer >= 0) {
		const char* end = strchr(fsNames, ':');
		if (end == NULL) {
			if (layer == 0)
				return strdup(fsNames);
			return NULL;
		}

		if (layer == 0) {
			size_t length = end - fsNames + 1;
			char* result = (char*)malloc(length);
			if (result == NULL)
				return NULL;
			strlcpy(result, fsNames, length);
			return result;
		}

		fsNames = end + 1;
		layer--;
	}

	return NULL;
}
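
// Worked example (illustrative; "write_overlay" is just a sample layer name):
// for a layered stack registered as "bfs:write_overlay", layer 0 yields
// "bfs" and layer 1 yields "write_overlay"; layer 2 yields NULL since there
// is no third name.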
static void
add_vnode_to_mount_list(struct vnode* vnode, struct fs_mount* mount)
{
	RecursiveLocker _(mount->rlock);
	mount->vnodes.Add(vnode);
}


static void
remove_vnode_from_mount_list(struct vnode* vnode, struct fs_mount* mount)
{
	RecursiveLocker _(mount->rlock);
	mount->vnodes.Remove(vnode);
}
/*!	\brief Looks up a vnode by mount and node ID in the sVnodeTable.

	The caller must hold the sVnodeLock (read lock at least).

	\param mountID the mount ID.
	\param vnodeID the node ID.

	\return The vnode structure, if it was found in the hash table, \c NULL
		otherwise.
*/
static struct vnode*
lookup_vnode(dev_t mountID, ino_t vnodeID)
{
	struct vnode_hash_key key;

	key.device = mountID;
	key.vnode = vnodeID;

	return sVnodeTable->Lookup(key);
}
/*!	\brief Checks whether or not a busy vnode should be waited for (again).

	This will also wait for BUSY_VNODE_DELAY before returning if one should
	still wait for the vnode becoming unbusy.

	\return \c true if one should retry, \c false if not.
*/
static bool
retry_busy_vnode(int32& tries, dev_t mountID, ino_t vnodeID)
{
	if (--tries < 0) {
		// vnode doesn't seem to become unbusy
		dprintf("vnode %" B_PRIdDEV ":%" B_PRIdINO
			" is not becoming unbusy!\n", mountID, vnodeID);
		return false;
	}
	snooze(BUSY_VNODE_DELAY);
	return true;
}
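
// Simplified caller sketch (assumed; the real caller, get_vnode() below, also
// drops and re-acquires sVnodeLock around each retry): start with a budget of
// BUSY_VNODE_RETRIES and re-check the vnode after each snooze until it either
// becomes unbusy or the budget runs out.
//
//	int32 tries = BUSY_VNODE_RETRIES;
//	while (vnode->IsBusy()) {
//		if (!retry_busy_vnode(tries, mountID, vnodeID))
//			return B_BUSY;
//	}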
/*!	Creates a new vnode with the given mount and node ID.
	If the node already exists, it is returned instead and no new node is
	created. In either case -- but not, if an error occurs -- the function write
	locks \c sVnodeLock and keeps it locked for the caller when returning. On
	error the lock is not held on return.

	\param mountID The mount ID.
	\param vnodeID The vnode ID.
	\param _vnode Will be set to the new vnode on success.
	\param _nodeCreated Will be set to \c true when the returned vnode has
		been newly created, \c false when it already existed. Will not be
		changed on error.
	\return \c B_OK, when the vnode was successfully created and inserted or
		a node with the given ID was found, \c B_NO_MEMORY or
		\c B_ENTRY_NOT_FOUND on error.
*/
static status_t
create_new_vnode_and_lock(dev_t mountID, ino_t vnodeID, struct vnode*& _vnode,
	bool& _nodeCreated)
{
	FUNCTION(("create_new_vnode_and_lock()\n"));

	struct vnode* vnode = (struct vnode*)malloc(sizeof(struct vnode));
	if (vnode == NULL)
		return B_NO_MEMORY;

	// initialize basic values
	memset(vnode, 0, sizeof(struct vnode));
	vnode->device = mountID;
	vnode->id = vnodeID;
	vnode->ref_count = 1;
	vnode->SetBusy(true);

	// look up the node -- it might have been added by someone else in the
	// meantime
	rw_lock_write_lock(&sVnodeLock);
	struct vnode* existingVnode = lookup_vnode(mountID, vnodeID);
	if (existingVnode != NULL) {
		free(vnode);
		_vnode = existingVnode;
		_nodeCreated = false;
		return B_OK;
	}

	// get the mount structure
	mutex_lock(&sMountMutex);
	vnode->mount = find_mount(mountID);
	if (!vnode->mount || vnode->mount->unmounting) {
		mutex_unlock(&sMountMutex);
		rw_lock_write_unlock(&sVnodeLock);
		free(vnode);
		return B_ENTRY_NOT_FOUND;
	}

	// add the vnode to the mount's node list and the hash table
	sVnodeTable->Insert(vnode);
	add_vnode_to_mount_list(vnode, vnode->mount);

	mutex_unlock(&sMountMutex);

	_vnode = vnode;
	_nodeCreated = true;

	// keep the vnode lock locked
	return B_OK;
}
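
// Illustrative caller sketch (assumed, condensed from get_vnode() below): on
// B_OK the caller owns sVnodeLock write-locked and must release it itself,
// regardless of whether the node was newly created or already existed.
//
//	bool nodeCreated;
//	struct vnode* vnode;
//	if (create_new_vnode_and_lock(mountID, vnodeID, vnode,
//			nodeCreated) == B_OK) {
//		if (!nodeCreated) {
//			// someone else inserted it first; vnode is the existing one
//		}
//		rw_lock_write_unlock(&sVnodeLock);
//	}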
/*!	Frees the vnode and all resources it has acquired, and removes
	it from the vnode hash as well as from its mount structure.
	Will also make sure that any cache modifications are written back.
*/
static void
free_vnode(struct vnode* vnode, bool reenter)
{
	ASSERT_PRINT(vnode->ref_count == 0 && vnode->IsBusy(), "vnode: %p\n",
		vnode);

	// write back any changes in this vnode's cache -- but only
	// if the vnode won't be deleted, in which case the changes
	// will be discarded

	if (!vnode->IsRemoved() && HAS_FS_CALL(vnode, fsync))
		FS_CALL_NO_PARAMS(vnode, fsync);

	// Note: If this vnode has a cache attached, there will still be two
	// references to that cache at this point. The last one belongs to the vnode
	// itself (cf. vfs_get_vnode_cache()) and one belongs to the node's file
	// cache. Each but the last reference to a cache also includes a reference
	// to the vnode. The file cache, however, released its reference (cf.
	// file_cache_create()), so that this vnode's ref count has the chance to
	// ever drop to 0. Deleting the file cache now, will cause the next to last
	// cache reference to be released, which will also release a (no longer
	// existing) vnode reference. To avoid problems, we set the vnode's ref
	// count, so that it will neither become negative nor 0.
	vnode->ref_count = 2;

	if (!vnode->IsUnpublished()) {
		if (vnode->IsRemoved())
			FS_CALL(vnode, remove_vnode, reenter);
		else
			FS_CALL(vnode, put_vnode, reenter);
	}

	// If the vnode has a VMCache attached, make sure that it won't try to get
	// another reference via VMVnodeCache::AcquireUnreferencedStoreRef(). As
	// long as the vnode is busy and in the hash, that won't happen, but as
	// soon as we've removed it from the hash, it could reload the vnode -- with
	// a new cache attached!
	if (vnode->cache != NULL)
		((VMVnodeCache*)vnode->cache)->VnodeDeleted();

	// The file system has removed the resources of the vnode now, so we can
	// make it available again (by removing the busy vnode from the hash).
	rw_lock_write_lock(&sVnodeLock);
	sVnodeTable->Remove(vnode);
	rw_lock_write_unlock(&sVnodeLock);

	// if we have a VMCache attached, remove it
	if (vnode->cache)
		vnode->cache->ReleaseRef();

	vnode->cache = NULL;

	remove_vnode_from_mount_list(vnode, vnode->mount);

	free(vnode);
}
/*!	\brief Decrements the reference counter of the given vnode and deletes it,
	if the counter dropped to 0.

	The caller must, of course, own a reference to the vnode to call this
	function.
	The caller must not hold the sVnodeLock or the sMountMutex.

	\param vnode the vnode.
	\param alwaysFree don't move this vnode into the unused list, but really
		delete it if possible.
	\param reenter \c true, if this function is called (indirectly) from within
		a file system. This will be passed to file system hooks only.
	\return \c B_OK, if everything went fine, an error code otherwise.
*/
static status_t
dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree, bool reenter)
{
	ReadLocker locker(sVnodeLock);
	AutoLocker<Vnode> nodeLocker(vnode);

	int32 oldRefCount = atomic_add(&vnode->ref_count, -1);

	ASSERT_PRINT(oldRefCount > 0, "vnode %p\n", vnode);

	TRACE(("dec_vnode_ref_count: vnode %p, ref now %" B_PRId32 "\n", vnode,
		vnode->ref_count));

	if (oldRefCount != 1)
		return B_OK;

	if (vnode->IsBusy())
		panic("dec_vnode_ref_count: called on busy vnode %p\n", vnode);

	bool freeNode = false;
	bool freeUnusedNodes = false;

	// Just insert the vnode into an unused list if we don't need
	// to delete it
	if (vnode->IsRemoved() || alwaysFree) {
		vnode_to_be_freed(vnode);
		vnode->SetBusy(true);
		freeNode = true;
	} else
		freeUnusedNodes = vnode_unused(vnode);

	nodeLocker.Unlock();
	locker.Unlock();

	if (freeNode)
		free_vnode(vnode, reenter);
	else if (freeUnusedNodes)
		free_unused_vnodes();

	return B_OK;
}
/*!	\brief Increments the reference counter of the given vnode.

	The caller must make sure that the node isn't deleted while this function
	is called. This can be done either:
	- by ensuring that a reference to the node exists and remains in existence,
	  or
	- by holding the vnode's lock (which also requires read locking sVnodeLock)
	  or by holding sVnodeLock write locked.

	In the second case the caller is responsible for dealing with the ref count
	0 -> 1 transition. That is 1. this function must not be invoked when the
	node is busy in the first place and 2. vnode_used() must be called for the
	node.

	\param vnode the vnode.
*/
static void
inc_vnode_ref_count(struct vnode* vnode)
{
	atomic_add(&vnode->ref_count, 1);
	TRACE(("inc_vnode_ref_count: vnode %p, ref now %" B_PRId32 "\n", vnode,
		vnode->ref_count));
}
static bool
is_special_node_type(int type)
{
	// at the moment only FIFOs are supported
	return S_ISFIFO(type);
}


static status_t
create_special_sub_node(struct vnode* vnode, uint32 flags)
{
	if (S_ISFIFO(vnode->Type()))
		return create_fifo_vnode(vnode->mount->volume, vnode);

	return B_BAD_VALUE;
}
/*!	\brief Retrieves a vnode for a given mount ID, node ID pair.

	If the node is not yet in memory, it will be loaded.

	The caller must not hold the sVnodeLock or the sMountMutex.

	\param mountID the mount ID.
	\param vnodeID the node ID.
	\param _vnode Pointer to a vnode* variable into which the pointer to the
		retrieved vnode structure shall be written.
	\param canWait \c true, if this function may wait for a busy vnode to
		become unbusy.
	\param reenter \c true, if this function is called (indirectly) from within
		a file system.
	\return \c B_OK, if everything went fine, an error code otherwise.
*/
static status_t
get_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode, bool canWait,
	int reenter)
{
	FUNCTION(("get_vnode: mountid %" B_PRId32 " vnid 0x%" B_PRIx64 " %p\n",
		mountID, vnodeID, _vnode));

	rw_lock_read_lock(&sVnodeLock);

	int32 tries = BUSY_VNODE_RETRIES;
restart:
	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
	AutoLocker<Vnode> nodeLocker(vnode);

	if (vnode && vnode->IsBusy()) {
		nodeLocker.Unlock();
		rw_lock_read_unlock(&sVnodeLock);
		if (!canWait) {
			dprintf("vnode %" B_PRIdDEV ":%" B_PRIdINO " is busy!\n",
				mountID, vnodeID);
			return B_BUSY;
		}
		if (!retry_busy_vnode(tries, mountID, vnodeID))
			return B_BUSY;

		rw_lock_read_lock(&sVnodeLock);
		goto restart;
	}

	TRACE(("get_vnode: tried to lookup vnode, got %p\n", vnode));

	status_t status;

	if (vnode) {
		if (vnode->ref_count == 0) {
			// this vnode has been unused before
			vnode_used(vnode);
		}
		inc_vnode_ref_count(vnode);

		nodeLocker.Unlock();
		rw_lock_read_unlock(&sVnodeLock);
	} else {
		// we need to create a new vnode and read it in
		rw_lock_read_unlock(&sVnodeLock);
			// unlock -- create_new_vnode_and_lock() write-locks on success
		bool nodeCreated;
		status = create_new_vnode_and_lock(mountID, vnodeID, vnode,
			nodeCreated);
		if (status != B_OK)
			return status;

		if (!nodeCreated) {
			rw_lock_read_lock(&sVnodeLock);
			rw_lock_write_unlock(&sVnodeLock);
			goto restart;
		}

		rw_lock_write_unlock(&sVnodeLock);

		int type;
		uint32 flags;
		status = FS_MOUNT_CALL(vnode->mount, get_vnode, vnodeID, vnode, &type,
			&flags, reenter);
		if (status == B_OK && vnode->private_node == NULL)
			status = B_BAD_VALUE;

		bool gotNode = status == B_OK;
		bool publishSpecialSubNode = false;
		if (gotNode) {
			vnode->SetType(type);
			publishSpecialSubNode = is_special_node_type(type)
				&& (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
		}

		if (gotNode && publishSpecialSubNode)
			status = create_special_sub_node(vnode, flags);

		if (status != B_OK) {
			if (gotNode)
				FS_CALL(vnode, put_vnode, reenter);

			rw_lock_write_lock(&sVnodeLock);
			sVnodeTable->Remove(vnode);
			remove_vnode_from_mount_list(vnode, vnode->mount);
			rw_lock_write_unlock(&sVnodeLock);

			free(vnode);
			return status;
		}

		rw_lock_read_lock(&sVnodeLock);
		vnode->Lock();

		vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
		vnode->SetBusy(false);

		vnode->Unlock();
		rw_lock_read_unlock(&sVnodeLock);
	}

	TRACE(("get_vnode: returning %p\n", vnode));

	*_vnode = vnode;
	return B_OK;
}
/*!	\brief Decrements the reference counter of the given vnode and deletes it,
	if the counter dropped to 0.

	The caller must, of course, own a reference to the vnode to call this
	function.
	The caller must not hold the sVnodeLock or the sMountMutex.

	\param vnode the vnode.
*/
static inline void
put_vnode(struct vnode* vnode)
{
	dec_vnode_ref_count(vnode, false, false);
}
static void
free_unused_vnodes(int32 level)
{
	unused_vnodes_check_started();

	if (level == B_NO_LOW_RESOURCE) {
		unused_vnodes_check_done();
		return;
	}

	flush_hot_vnodes();

	// determine how many nodes to free
	uint32 count = 1;
	{
		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);

		switch (level) {
			case B_LOW_RESOURCE_NOTE:
				count = sUnusedVnodes / 100;
				break;
			case B_LOW_RESOURCE_WARNING:
				count = sUnusedVnodes / 10;
				break;
			case B_LOW_RESOURCE_CRITICAL:
				count = sUnusedVnodes;
				break;
		}

		if (count > sUnusedVnodes)
			count = sUnusedVnodes;
	}

	// Write back the modified pages of some unused vnodes and free them.

	for (uint32 i = 0; i < count; i++) {
		ReadLocker vnodesReadLocker(sVnodeLock);

		// get the first node
		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);
		struct vnode* vnode = (struct vnode*)list_get_first_item(
			&sUnusedVnodeList);
		unusedVnodesLocker.Unlock();

		if (vnode == NULL)
			break;

		// lock the node
		AutoLocker<Vnode> nodeLocker(vnode);

		// Check whether the node is still unused -- since we only append to the
		// tail of the unused queue, the vnode should still be at its head.
		// Alternatively we could check its ref count for 0 and its busy flag,
		// but if the node is no longer at the head of the queue, it means it
		// has been touched in the meantime, i.e. it is no longer the least
		// recently used unused vnode and we rather don't free it.
		unusedVnodesLocker.Lock();
		if (vnode != list_get_first_item(&sUnusedVnodeList))
			continue;
		unusedVnodesLocker.Unlock();

		ASSERT(!vnode->IsBusy());

		// grab a reference
		inc_vnode_ref_count(vnode);
		vnode_used(vnode);

		// write back changes and free the node
		nodeLocker.Unlock();
		vnodesReadLocker.Unlock();

		if (vnode->cache != NULL)
			vnode->cache->WriteModified();

		dec_vnode_ref_count(vnode, true, false);
			// this should free the vnode when it's still unused
	}

	unused_vnodes_check_done();
}
/*!	Gets the vnode the given vnode is covering.

	The caller must have \c sVnodeLock read-locked at least.

	The function returns a reference to the retrieved vnode (if any), the
	caller is responsible to free.

	\param vnode The vnode whose covered node shall be returned.
	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
		vnode.
*/
static inline Vnode*
get_covered_vnode_locked(Vnode* vnode)
{
	if (Vnode* coveredNode = vnode->covers) {
		while (coveredNode->covers != NULL)
			coveredNode = coveredNode->covers;

		inc_vnode_ref_count(coveredNode);
		return coveredNode;
	}

	return NULL;
}


/*!	Gets the vnode the given vnode is covering.

	The caller must not hold \c sVnodeLock. Note that this implies a race
	condition, since the situation can change at any time.

	The function returns a reference to the retrieved vnode (if any), the
	caller is responsible to free.

	\param vnode The vnode whose covered node shall be returned.
	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
		vnode.
*/
static inline Vnode*
get_covered_vnode(Vnode* vnode)
{
	if (!vnode->IsCovering())
		return NULL;

	ReadLocker vnodeReadLocker(sVnodeLock);
	return get_covered_vnode_locked(vnode);
}


/*!	Gets the vnode the given vnode is covered by.

	The caller must have \c sVnodeLock read-locked at least.

	The function returns a reference to the retrieved vnode (if any), the
	caller is responsible to free.

	\param vnode The vnode whose covering node shall be returned.
	\return The covering vnode, or \c NULL if the given vnode isn't covered by
		any vnode.
*/
static Vnode*
get_covering_vnode_locked(Vnode* vnode)
{
	if (Vnode* coveringNode = vnode->covered_by) {
		while (coveringNode->covered_by != NULL)
			coveringNode = coveringNode->covered_by;

		inc_vnode_ref_count(coveringNode);
		return coveringNode;
	}

	return NULL;
}


/*!	Gets the vnode the given vnode is covered by.

	The caller must not hold \c sVnodeLock. Note that this implies a race
	condition, since the situation can change at any time.

	The function returns a reference to the retrieved vnode (if any), the
	caller is responsible to free.

	\param vnode The vnode whose covering node shall be returned.
	\return The covering vnode, or \c NULL if the given vnode isn't covered by
		any vnode.
*/
static inline Vnode*
get_covering_vnode(Vnode* vnode)
{
	if (!vnode->IsCovered())
		return NULL;

	ReadLocker vnodeReadLocker(sVnodeLock);
	return get_covering_vnode_locked(vnode);
}
static void
free_unused_vnodes()
{
	free_unused_vnodes(
		low_resource_state(B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
			| B_KERNEL_RESOURCE_ADDRESS_SPACE));
}


static void
vnode_low_resource_handler(void* /*data*/, uint32 resources, int32 level)
{
	TRACE(("vnode_low_resource_handler(level = %" B_PRId32 ")\n", level));

	free_unused_vnodes(level);
}
static inline void
put_advisory_locking(struct advisory_locking* locking)
{
	release_sem(locking->lock);
}


/*!	Returns the advisory_locking object of the \a vnode in case it
	has one, and locks it.
	You have to call put_advisory_locking() when you're done with it.
	Note, you must not have the vnode mutex locked when calling
	this function.
*/
static struct advisory_locking*
get_advisory_locking(struct vnode* vnode)
{
	rw_lock_read_lock(&sVnodeLock);
	vnode->Lock();

	struct advisory_locking* locking = vnode->advisory_locking;
	sem_id lock = locking != NULL ? locking->lock : B_ERROR;

	vnode->Unlock();
	rw_lock_read_unlock(&sVnodeLock);

	if (lock >= 0)
		lock = acquire_sem(lock);
	if (lock < 0) {
		// This means the locking has been deleted in the mean time
		// or had never existed in the first place - otherwise, we
		// would get the lock at some point.
		return NULL;
	}

	return locking;
}
/*!	Creates a locked advisory_locking object, and attaches it to the
	given \a vnode.
	Returns B_OK in case of success - also if the vnode got such an
	object from someone else in the mean time, you'll still get this
	one locked then.
*/
static status_t
create_advisory_locking(struct vnode* vnode)
{
	if (vnode == NULL)
		return B_FILE_ERROR;

	ObjectDeleter<advisory_locking> lockingDeleter;
	struct advisory_locking* locking = NULL;

	while (get_advisory_locking(vnode) == NULL) {
		// no locking object set on the vnode yet, create one
		if (locking == NULL) {
			locking = new(std::nothrow) advisory_locking;
			if (locking == NULL)
				return B_NO_MEMORY;
			lockingDeleter.SetTo(locking);

			locking->wait_sem = create_sem(0, "advisory lock");
			if (locking->wait_sem < 0)
				return locking->wait_sem;

			locking->lock = create_sem(0, "advisory locking");
			if (locking->lock < 0)
				return locking->lock;
		}

		// set our newly created locking object
		ReadLocker _(sVnodeLock);
		AutoLocker<Vnode> nodeLocker(vnode);
		if (vnode->advisory_locking == NULL) {
			vnode->advisory_locking = locking;
			lockingDeleter.Detach();
			return B_OK;
		}
	}

	// The vnode already had a locking object. That's just as well.

	return B_OK;
}
/*!	Returns \c true when either \a flock is \c NULL or the \a flock intersects
	with the advisory_lock \a lock.
*/
static bool
advisory_lock_intersects(struct advisory_lock* lock, struct flock* flock)
{
	if (flock == NULL)
		return true;

	return lock->start <= flock->l_start - 1 + flock->l_len
		&& lock->end >= flock->l_start;
}
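
// Worked example (illustrative): advisory_lock ranges are closed intervals
// [start, end], while an flock describes [l_start, l_start + l_len - 1].
// For lock->start = 0, lock->end = 99 and flock->l_start = 100,
// flock->l_len = 50, the test is 0 <= 149 && 99 >= 100, i.e. false -- the
// regions merely touch, they do not overlap.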
/*!	Tests whether acquiring a lock would block.
*/
static status_t
test_advisory_lock(struct vnode* vnode, struct flock* flock)
{
	flock->l_type = F_UNLCK;

	struct advisory_locking* locking = get_advisory_locking(vnode);
	if (locking == NULL)
		return B_OK;

	team_id team = team_get_current_team_id();

	LockList::Iterator iterator = locking->locks.GetIterator();
	while (iterator.HasNext()) {
		struct advisory_lock* lock = iterator.Next();

		if (lock->team != team && advisory_lock_intersects(lock, flock)) {
			// locks do overlap
			if (flock->l_type != F_RDLCK || !lock->shared) {
				// collision
				flock->l_type = lock->shared ? F_RDLCK : F_WRLCK;
				flock->l_whence = SEEK_SET;
				flock->l_start = lock->start;
				flock->l_len = lock->end - lock->start + 1;
				flock->l_pid = lock->team;
				break;
			}
		}
	}

	put_advisory_locking(locking);
	return B_OK;
}
/*!	Removes the specified lock, or all locks of the calling team
	if \a flock is NULL.
*/
static status_t
release_advisory_lock(struct vnode* vnode, struct flock* flock)
{
	FUNCTION(("release_advisory_lock(vnode = %p, flock = %p)\n", vnode, flock));

	struct advisory_locking* locking = get_advisory_locking(vnode);
	if (locking == NULL)
		return B_OK;

	// TODO: use the thread ID instead??
	team_id team = team_get_current_team_id();
	pid_t session = thread_get_current_thread()->team->session_id;

	// find matching lock entries

	LockList::Iterator iterator = locking->locks.GetIterator();
	while (iterator.HasNext()) {
		struct advisory_lock* lock = iterator.Next();
		bool removeLock = false;

		if (lock->session == session)
			removeLock = true;
		else if (lock->team == team && advisory_lock_intersects(lock, flock)) {
			bool endsBeyond = false;
			bool startsBefore = false;
			if (flock != NULL) {
				startsBefore = lock->start < flock->l_start;
				endsBeyond = lock->end > flock->l_start - 1 + flock->l_len;
			}

			if (!startsBefore && !endsBeyond) {
				// lock is completely contained in flock
				removeLock = true;
			} else if (startsBefore && !endsBeyond) {
				// cut the end of the lock
				lock->end = flock->l_start - 1;
			} else if (!startsBefore && endsBeyond) {
				// cut the start of the lock
				lock->start = flock->l_start + flock->l_len;
			} else {
				// divide the lock into two locks
				struct advisory_lock* secondLock = (struct advisory_lock*)
					malloc(sizeof(struct advisory_lock));
					// allocated with malloc() to match the free() above and
					// the allocation in acquire_advisory_lock()
				if (secondLock == NULL) {
					// TODO: we should probably revert the locks we already
					// changed... (ie. allocate upfront)
					put_advisory_locking(locking);
					return B_NO_MEMORY;
				}

				// Remember the original end before cutting the first lock, so
				// the second lock keeps the region beyond the released range.
				off_t originalEnd = lock->end;
				lock->end = flock->l_start - 1;

				secondLock->team = lock->team;
				secondLock->session = lock->session;
				// values must already be normalized when getting here
				secondLock->start = flock->l_start + flock->l_len;
				secondLock->end = originalEnd;
				secondLock->shared = lock->shared;

				locking->locks.Add(secondLock);
			}
		}

		if (removeLock) {
			// this lock is no longer used
			iterator.Remove();
			free(lock);
		}
	}

	bool removeLocking = locking->locks.IsEmpty();
	release_sem_etc(locking->wait_sem, 1, B_RELEASE_ALL);

	put_advisory_locking(locking);

	if (removeLocking) {
		// We can remove the whole advisory locking structure; it's no
		// longer used
		locking = get_advisory_locking(vnode);
		if (locking != NULL) {
			ReadLocker locker(sVnodeLock);
			AutoLocker<Vnode> nodeLocker(vnode);

			// the locking could have been changed in the mean time
			if (locking->locks.IsEmpty()) {
				vnode->advisory_locking = NULL;
				nodeLocker.Unlock();
				locker.Unlock();

				// we've detached the locking from the vnode, so we can
				// safely delete it
				delete locking;
			} else {
				// the locking is in use again
				nodeLocker.Unlock();
				locker.Unlock();
				release_sem_etc(locking->lock, 1, B_DO_NOT_RESCHEDULE);
			}
		}
	}

	return B_OK;
}
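
// Worked example (illustrative) for the "divide" branch above: releasing
// [20, 29] (flock->l_start = 20, l_len = 10) from an existing lock spanning
// [0, 99] shrinks the original lock to [0, 19] and adds a second lock
// covering [30, 99] with the same team, session, and shared mode.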
/*!	Acquires an advisory lock for the \a vnode. If \a wait is \c true, it
	will wait for the lock to become available, if there are any collisions
	(it will return B_PERMISSION_DENIED in this case if \a wait is \c false).

	If \a session is -1, POSIX semantics are used for this lock. Otherwise,
	BSD flock() semantics are used, that is, all children can unlock the file
	in question (we even allow parents to remove the lock, though, but that
	seems to be in line with what the BSDs are doing).
*/
static status_t
acquire_advisory_lock(struct vnode* vnode, pid_t session, struct flock* flock,
	bool wait)
{
	FUNCTION(("acquire_advisory_lock(vnode = %p, flock = %p, wait = %s)\n",
		vnode, flock, wait ? "yes" : "no"));

	bool shared = flock->l_type == F_RDLCK;
	status_t status = B_OK;

	// TODO: do deadlock detection!

	struct advisory_locking* locking;

	while (true) {
		// if this vnode has an advisory_locking structure attached,
		// lock that one and search for any colliding file lock
		status = create_advisory_locking(vnode);
		if (status != B_OK)
			return status;

		locking = vnode->advisory_locking;
		team_id team = team_get_current_team_id();
		sem_id waitForLock = -1;

		// test for collisions
		LockList::Iterator iterator = locking->locks.GetIterator();
		while (iterator.HasNext()) {
			struct advisory_lock* lock = iterator.Next();

			// TODO: locks from the same team might be joinable!
			if (lock->team != team && advisory_lock_intersects(lock, flock)) {
				// locks do overlap
				if (!shared || !lock->shared) {
					// we need to wait
					waitForLock = locking->wait_sem;
					break;
				}
			}
		}

		if (waitForLock < 0)
			break;

		// We need to wait. Do that or fail now, if we've been asked not to.

		if (!wait) {
			put_advisory_locking(locking);
			return session != -1 ? B_WOULD_BLOCK : B_PERMISSION_DENIED;
		}

		status = switch_sem_etc(locking->lock, waitForLock, 1,
			B_CAN_INTERRUPT, 0);
		if (status != B_OK && status != B_BAD_SEM_ID)
			return status;

		// We have been notified, but we need to re-lock the locking object. So
		// go another round...
	}

	// install new lock

	struct advisory_lock* lock = (struct advisory_lock*)malloc(
		sizeof(struct advisory_lock));
	if (lock == NULL) {
		put_advisory_locking(locking);
		return B_NO_MEMORY;
	}

	lock->team = team_get_current_team_id();
	lock->session = session;
	// values must already be normalized when getting here
	lock->start = flock->l_start;
	lock->end = flock->l_start - 1 + flock->l_len;
	lock->shared = shared;

	locking->locks.Add(lock);
	put_advisory_locking(locking);

	return status;
}
/*!	Normalizes the \a flock structure to make it easier to compare the
	structure with others. The l_start and l_len fields are set to absolute
	values according to the l_whence field.
*/
static status_t
normalize_flock(struct file_descriptor* descriptor, struct flock* flock)
{
	switch (flock->l_whence) {
		case SEEK_SET:
			break;
		case SEEK_CUR:
			flock->l_start += descriptor->pos;
			break;
		case SEEK_END:
		{
			struct vnode* vnode = descriptor->u.vnode;
			struct stat stat;
			status_t status;

			if (!HAS_FS_CALL(vnode, read_stat))
				return B_UNSUPPORTED;

			status = FS_CALL(vnode, read_stat, &stat);
			if (status != B_OK)
				return status;

			flock->l_start += stat.st_size;
			break;
		}
		default:
			return B_BAD_VALUE;
	}

	if (flock->l_start < 0)
		flock->l_start = 0;
	if (flock->l_len == 0)
		flock->l_len = OFF_MAX;

	// don't let the offset and length overflow
	if (flock->l_start > 0 && OFF_MAX - flock->l_start < flock->l_len)
		flock->l_len = OFF_MAX - flock->l_start;

	if (flock->l_len < 0) {
		// a negative length reverses the region
		flock->l_start += flock->l_len;
		flock->l_len = -flock->l_len;
	}

	return B_OK;
}
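
// Worked example (illustrative): with descriptor->pos = 100, an flock of
// { l_whence = SEEK_CUR, l_start = -20, l_len = -30 } first becomes
// l_start = 80, and the negative length then reverses the region to
// l_start = 50, l_len = 30 -- i.e. the byte range [50, 79].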
static void
replace_vnode_if_disconnected(struct fs_mount* mount,
	struct vnode* vnodeToDisconnect, struct vnode*& vnode,
	struct vnode* fallBack, bool lockRootLock)
{
	struct vnode* givenVnode = vnode;
	bool vnodeReplaced = false;

	ReadLocker vnodeReadLocker(sVnodeLock);

	if (lockRootLock)
		mutex_lock(&sIOContextRootLock);

	while (vnode != NULL && vnode->mount == mount
		&& (vnodeToDisconnect == NULL || vnodeToDisconnect == vnode)) {
		if (vnode->covers != NULL) {
			// redirect the vnode to the covered vnode
			vnode = vnode->covers;
		} else
			vnode = fallBack;

		vnodeReplaced = true;
	}

	// If we've replaced the node, grab a reference for the new one.
	if (vnodeReplaced && vnode != NULL)
		inc_vnode_ref_count(vnode);

	if (lockRootLock)
		mutex_unlock(&sIOContextRootLock);

	vnodeReadLocker.Unlock();

	if (vnodeReplaced)
		put_vnode(givenVnode);
}
/*!	Disconnects all file descriptors that are associated with the
	\a vnodeToDisconnect, or if this is NULL, all vnodes of the specified
	\a mount object.

	Note, after you've called this function, there might still be ongoing
	accesses - they won't be interrupted if they already happened before.
	However, any subsequent access will fail.

	This is not a cheap function and should be used with care and rarely.
	TODO: there is currently no means to stop a blocking read/write!
*/
static void
disconnect_mount_or_vnode_fds(struct fs_mount* mount,
	struct vnode* vnodeToDisconnect)
{
	// iterate over all teams and peek into their file descriptors
	TeamListIterator teamIterator;
	while (Team* team = teamIterator.Next()) {
		BReference<Team> teamReference(team, true);

		// lock the I/O context
		io_context* context = team->io_context;
		MutexLocker contextLocker(context->io_mutex);

		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->root,
			sRoot, true);
		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->cwd,
			sRoot, false);

		for (uint32 i = 0; i < context->table_size; i++) {
			if (struct file_descriptor* descriptor = context->fds[i]) {
				inc_fd_ref_count(descriptor);

				// if this descriptor points at this mount, we
				// need to disconnect it to be able to unmount
				struct vnode* vnode = fd_vnode(descriptor);
				if (vnodeToDisconnect != NULL) {
					if (vnode == vnodeToDisconnect)
						disconnect_fd(descriptor);
				} else if ((vnode != NULL && vnode->mount == mount)
					|| (vnode == NULL && descriptor->u.mount == mount))
					disconnect_fd(descriptor);

				put_fd(descriptor);
			}
		}
	}
}
/*!	\brief Gets the root node of the current IO context.
	If \a kernel is \c true, the kernel IO context will be used.
	The caller obtains a reference to the returned node.
*/
struct vnode*
get_root_vnode(bool kernel)
{
	if (!kernel) {
		// Get current working directory from io context
		struct io_context* context = get_current_io_context(kernel);

		mutex_lock(&sIOContextRootLock);

		struct vnode* root = context->root;
		if (root != NULL)
			inc_vnode_ref_count(root);

		mutex_unlock(&sIOContextRootLock);

		if (root != NULL)
			return root;

		// That should never happen.
		dprintf("get_root_vnode(): IO context for team %" B_PRId32 " doesn't "
			"have a root\n", team_get_current_team_id());
	}

	inc_vnode_ref_count(sRoot);
	return sRoot;
}
/*!	\brief Gets the directory path and leaf name for a given path.

	The supplied \a path is transformed to refer to the directory part of
	the entry identified by the original path, and into the buffer \a filename
	the leaf name of the original entry is written.
	Neither the returned path nor the leaf name can be expected to be
	canonical.

	\param path The path to be analyzed. Must be able to store at least one
		additional character.
	\param filename The buffer into which the leaf name will be written.
		Must be of size B_FILE_NAME_LENGTH at least.
	\return \c B_OK, if everything went fine, \c B_NAME_TOO_LONG, if the leaf
		name is longer than \c B_FILE_NAME_LENGTH, or \c B_ENTRY_NOT_FOUND,
		if the given path name is empty.
*/
static status_t
get_dir_path_and_leaf(char* path, char* filename)
{
	if (*path == '\0')
		return B_ENTRY_NOT_FOUND;

	char* last = strrchr(path, '/');
		// '/' are not allowed in file names!

	FUNCTION(("get_dir_path_and_leaf(path = %s)\n", path));

	if (last == NULL) {
		// this path is single segment with no '/' in it
		// ex. "foo"
		if (strlcpy(filename, path, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
			return B_NAME_TOO_LONG;

		strcpy(path, ".");
	} else {
		last++;
		if (last[0] == '\0') {
			// special case: the path ends in one or more '/' - remove them
			while (*--last == '/' && last != path);
			last[1] = '\0';

			if (last == path && last[0] == '/') {
				// This path points to the root of the file system
				strcpy(filename, ".");
				return B_OK;
			}
			for (; last != path && *(last - 1) != '/'; last--);
				// rewind to the start of the leaf before the '/'
		}

		// normal leaf: replace the leaf portion of the path with a '.'
		if (strlcpy(filename, last, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
			return B_NAME_TOO_LONG;

		last[0] = '.';
		last[1] = '\0';
	}
	return B_OK;
}
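
// Worked example (illustrative): passing path = "/boot/home/file.txt" leaves
// path as "/boot/home/." and writes "file.txt" into filename; with a trailing
// slash ("/boot/home/") the slashes are stripped first, so the leaf becomes
// "home" and path becomes "/boot/.".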
static status_t
entry_ref_to_vnode(dev_t mountID, ino_t directoryID, const char* name,
	bool traverse, bool kernel, struct vnode** _vnode)
{
	char clonedName[B_FILE_NAME_LENGTH + 1];
	if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
		return B_NAME_TOO_LONG;

	// get the directory vnode and let vnode_path_to_vnode() do the rest
	struct vnode* directory;

	status_t status = get_vnode(mountID, directoryID, &directory, true, false);
	if (status < 0)
		return status;

	return vnode_path_to_vnode(directory, clonedName, traverse, 0, kernel,
		_vnode, NULL);
}
2077 /*! Looks up the entry with name \a name in the directory represented by \a dir
2078 and returns the respective vnode.
2079 On success a reference to the vnode is acquired for the caller.
2081 static status_t
2082 lookup_dir_entry(struct vnode* dir, const char* name, struct vnode** _vnode)
2084 ino_t id;
2085 bool missing;
2087 if (dir->mount->entry_cache.Lookup(dir->id, name, id, missing)) {
2088 return missing ? B_ENTRY_NOT_FOUND
2089 : get_vnode(dir->device, id, _vnode, true, false);
2092 status_t status = FS_CALL(dir, lookup, name, &id);
2093 if (status != B_OK)
2094 return status;
2096 // The lookup() hook calls get_vnode() or publish_vnode(), so we already
2097 // have a reference and just need to look the node up.
2098 rw_lock_read_lock(&sVnodeLock);
2099 *_vnode = lookup_vnode(dir->device, id);
2100 rw_lock_read_unlock(&sVnodeLock);
2102 if (*_vnode == NULL) {
2103 panic("lookup_dir_entry(): could not lookup vnode (mountid 0x%" B_PRIx32
2104 " vnid 0x%" B_PRIx64 ")\n", dir->device, id);
2105 return B_ENTRY_NOT_FOUND;
2108 // ktrace_printf("lookup_dir_entry(): dir: %p (%ld, %lld), name: \"%s\" -> "
2109 // "%p (%ld, %lld)", dir, dir->mount->id, dir->id, name, *_vnode,
2110 // (*_vnode)->mount->id, (*_vnode)->id);
2112 return B_OK;
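// Typical use (sketch): the returned vnode carries a reference the caller
// has to release again; `dir` is a directory vnode the caller already
// holds a reference to:
//
//    struct vnode* child;
//    if (lookup_dir_entry(dir, "data", &child) == B_OK) {
//        // ... use child ...
//        put_vnode(child);
//    }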
2116 /*! Returns the vnode for the relative path starting at the specified \a vnode.
2117 \a path must not be NULL.
2118 If it returns successfully, \a path contains the name of the last path
2119 component. This function clobbers the buffer pointed to by \a path only
2120 if it contains more than one component.
2121 Note that this function decrements the ref count of the starting \a vnode,
2122 whether it succeeds or not!
2124 static status_t
2125 vnode_path_to_vnode(struct vnode* vnode, char* path, bool traverseLeafLink,
2126 int count, struct io_context* ioContext, struct vnode** _vnode,
2127 ino_t* _parentID)
2129 status_t status = B_OK;
2130 ino_t lastParentID = vnode->id;
2132 FUNCTION(("vnode_path_to_vnode(vnode = %p, path = %s)\n", vnode, path));
2134 if (path == NULL) {
2135 put_vnode(vnode);
2136 return B_BAD_VALUE;
2139 if (*path == '\0') {
2140 put_vnode(vnode);
2141 return B_ENTRY_NOT_FOUND;
2144 while (true) {
2145 struct vnode* nextVnode;
2146 char* nextPath;
2148 TRACE(("vnode_path_to_vnode: top of loop. path = %p '%s'\n", path,
2149 path));
2151 // done?
2152 if (path[0] == '\0')
2153 break;
2155 // walk to find the next path component ("path" will point to a single
2156 // path component), and filter out multiple slashes
2157 for (nextPath = path + 1; *nextPath != '\0' && *nextPath != '/';
2158 nextPath++);
2160 if (*nextPath == '/') {
2161 *nextPath = '\0';
2163 nextPath++;
2164 while (*nextPath == '/');
2167 // If the '..' is at a covering vnode, move to the covered
2168 // vnode, so we pass the '..' path to the underlying filesystem.
2169 // Also prevent breaking the root of the IO context.
2170 if (strcmp("..", path) == 0) {
2171 if (vnode == ioContext->root) {
2172 // Attempted prison break! Keep it contained.
2173 path = nextPath;
2174 continue;
2177 if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2178 nextVnode = coveredVnode;
2179 put_vnode(vnode);
2180 vnode = nextVnode;
2184 // check if vnode is really a directory
2185 if (status == B_OK && !S_ISDIR(vnode->Type()))
2186 status = B_NOT_A_DIRECTORY;
2188 // Check if we have the right to search the current directory vnode.
2189 // If a file system doesn't have the access() function, we assume that
2190 // searching a directory is always allowed
2191 if (status == B_OK && HAS_FS_CALL(vnode, access))
2192 status = FS_CALL(vnode, access, X_OK);
2194 // Tell the filesystem to get the vnode of this path component (if we
2195 // got the permission from the call above)
2196 if (status == B_OK)
2197 status = lookup_dir_entry(vnode, path, &nextVnode);
2199 if (status != B_OK) {
2200 put_vnode(vnode);
2201 return status;
2204 // If the new node is a symbolic link, resolve it (if we've been told
2205 // to do it)
2206 if (S_ISLNK(nextVnode->Type())
2207 && (traverseLeafLink || nextPath[0] != '\0')) {
2208 size_t bufferSize;
2209 char* buffer;
2211 TRACE(("traverse link\n"));
2213 // it's not exactly nice style using goto in this way, but hey,
2214 // it works :-/
2215 if (count + 1 > B_MAX_SYMLINKS) {
2216 status = B_LINK_LIMIT;
2217 goto resolve_link_error;
2220 buffer = (char*)malloc(bufferSize = B_PATH_NAME_LENGTH);
2221 if (buffer == NULL) {
2222 status = B_NO_MEMORY;
2223 goto resolve_link_error;
2226 if (HAS_FS_CALL(nextVnode, read_symlink)) {
2227 bufferSize--;
2228 status = FS_CALL(nextVnode, read_symlink, buffer, &bufferSize);
2229 // null-terminate
2230 if (status >= 0)
2231 buffer[bufferSize] = '\0';
2232 } else
2233 status = B_BAD_VALUE;
2235 if (status != B_OK) {
2236 free(buffer);
2238 resolve_link_error:
2239 put_vnode(vnode);
2240 put_vnode(nextVnode);
2242 return status;
2244 put_vnode(nextVnode);
2246 // Check if we start from the root directory or the current
2247 // directory ("vnode" still points to that one).
2248 // Cut off all leading slashes if it's the root directory
2249 path = buffer;
2250 bool absoluteSymlink = false;
2251 if (path[0] == '/') {
2252 // we don't need the old directory anymore
2253 put_vnode(vnode);
2255 while (*++path == '/')
2258 mutex_lock(&sIOContextRootLock);
2259 vnode = ioContext->root;
2260 inc_vnode_ref_count(vnode);
2261 mutex_unlock(&sIOContextRootLock);
2263 absoluteSymlink = true;
2266 inc_vnode_ref_count(vnode);
2267 // balance the next recursion - we will decrement the
2268 // ref_count of the vnode, whether we succeed or not
2270 if (absoluteSymlink && *path == '\0') {
2271 // symlink was just "/"
2272 nextVnode = vnode;
2273 } else {
2274 status = vnode_path_to_vnode(vnode, path, true, count + 1,
2275 ioContext, &nextVnode, &lastParentID);
2278 free(buffer);
2280 if (status != B_OK) {
2281 put_vnode(vnode);
2282 return status;
2284 } else
2285 lastParentID = vnode->id;
2287 // decrease the ref count on the old dir we just looked up into
2288 put_vnode(vnode);
2290 path = nextPath;
2291 vnode = nextVnode;
2293 // see if we hit a covered node
2294 if (Vnode* coveringNode = get_covering_vnode(vnode)) {
2295 put_vnode(vnode);
2296 vnode = coveringNode;
2300 *_vnode = vnode;
2301 if (_parentID)
2302 *_parentID = lastParentID;
2304 return B_OK;
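// Usage sketch: since this function always consumes one reference to the
// starting vnode, callers that want to keep their own reference acquire
// an extra one first (compare vnode_and_path_to_dir_vnode() below);
// `dir`, `mutablePath`, and `ioContext` are placeholders:
//
//    inc_vnode_ref_count(dir);
//        // consumed by the call below, whether it succeeds or not
//    struct vnode* result;
//    status_t error = vnode_path_to_vnode(dir, mutablePath, true, 0,
//        ioContext, &result, NULL);
//    if (error == B_OK)
//        put_vnode(result);
//
// Note that the path buffer may be clobbered by the call.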
2308 static status_t
2309 vnode_path_to_vnode(struct vnode* vnode, char* path, bool traverseLeafLink,
2310 int count, bool kernel, struct vnode** _vnode, ino_t* _parentID)
2312 return vnode_path_to_vnode(vnode, path, traverseLeafLink, count,
2313 get_current_io_context(kernel), _vnode, _parentID);
2317 static status_t
2318 path_to_vnode(char* path, bool traverseLink, struct vnode** _vnode,
2319 ino_t* _parentID, bool kernel)
2321 struct vnode* start = NULL;
2323 FUNCTION(("path_to_vnode(path = \"%s\")\n", path));
2325 if (!path)
2326 return B_BAD_VALUE;
2328 if (*path == '\0')
2329 return B_ENTRY_NOT_FOUND;
2331 // figure out if we need to start at root or at cwd
2332 if (*path == '/') {
2333 if (sRoot == NULL) {
2334 // we're a bit early, aren't we?
2335 return B_ERROR;
2338 while (*++path == '/')
2340 start = get_root_vnode(kernel);
2342 if (*path == '\0') {
2343 *_vnode = start;
2344 return B_OK;
2347 } else {
2348 struct io_context* context = get_current_io_context(kernel);
2350 mutex_lock(&context->io_mutex);
2351 start = context->cwd;
2352 if (start != NULL)
2353 inc_vnode_ref_count(start);
2354 mutex_unlock(&context->io_mutex);
2356 if (start == NULL)
2357 return B_ERROR;
2360 return vnode_path_to_vnode(start, path, traverseLink, 0, kernel, _vnode,
2361 _parentID);
2365 /*! Returns the vnode for the next-to-last segment of the path, and returns
2366 the last portion in \a filename.
2367 The path buffer must be able to store at least one additional character.
2369 static status_t
2370 path_to_dir_vnode(char* path, struct vnode** _vnode, char* filename,
2371 bool kernel)
2373 status_t status = get_dir_path_and_leaf(path, filename);
2374 if (status != B_OK)
2375 return status;
2377 return path_to_vnode(path, true, _vnode, NULL, kernel);
2381 /*! \brief Retrieves the directory vnode and the leaf name of an entry referred
2382 to by a FD + path pair.
2384 \a path must be given in either case. \a fd might be omitted, in which
2385 case \a path is either an absolute path or one relative to the current
2386 directory. If both are supplied and \a path is relative, it is reckoned off
2387 of the directory referred to by \a fd. If \a path is absolute \a fd is
2388 ignored.
2390 The caller has the responsibility to call put_vnode() on the returned
2391 directory vnode.
2393 \param fd The FD. May be < 0.
2394 \param path The absolute or relative path. Must not be \c NULL. The buffer
2395 is modified by this function. It must have at least room for a
2396 string one character longer than the path it contains.
2397 \param _vnode A pointer to a variable the directory vnode shall be written
2398 into.
2399 \param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2400 the leaf name of the specified entry will be written.
2401 \param kernel \c true, if invoked from inside the kernel, \c false if
2402 invoked from userland.
2403 \return \c B_OK, if everything went fine, another error code otherwise.
2405 static status_t
2406 fd_and_path_to_dir_vnode(int fd, char* path, struct vnode** _vnode,
2407 char* filename, bool kernel)
2409 if (!path)
2410 return B_BAD_VALUE;
2411 if (*path == '\0')
2412 return B_ENTRY_NOT_FOUND;
2413 if (fd < 0)
2414 return path_to_dir_vnode(path, _vnode, filename, kernel);
2416 status_t status = get_dir_path_and_leaf(path, filename);
2417 if (status != B_OK)
2418 return status;
2420 return fd_and_path_to_vnode(fd, path, true, _vnode, NULL, kernel);
2424 /*! \brief Retrieves the directory vnode and the leaf name of an entry referred
2425 to by a vnode + path pair.
2427 \a path must be given in either case. \a vnode might be omitted, in which
2428 case \a path is either an absolute path or one relative to the current
2429 directory. If both are supplied and \a path is relative, it is reckoned off
2430 of the directory referred to by \a vnode. If \a path is absolute \a vnode is
2431 ignored.
2433 The caller has the responsibility to call put_vnode() on the returned
2434 directory vnode.
2436 \param vnode The vnode. May be \c NULL.
2437 \param path The absolute or relative path. Must not be \c NULL. The buffer
2438 is modified by this function. It must have at least room for a
2439 string one character longer than the path it contains.
2440 \param _vnode A pointer to a variable the directory vnode shall be written
2441 into.
2442 \param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2443 the leaf name of the specified entry will be written.
2444 \param kernel \c true, if invoked from inside the kernel, \c false if
2445 invoked from userland.
2446 \return \c B_OK, if everything went fine, another error code otherwise.
2448 static status_t
2449 vnode_and_path_to_dir_vnode(struct vnode* vnode, char* path,
2450 struct vnode** _vnode, char* filename, bool kernel)
2452 if (!path)
2453 return B_BAD_VALUE;
2454 if (*path == '\0')
2455 return B_ENTRY_NOT_FOUND;
2456 if (vnode == NULL || path[0] == '/')
2457 return path_to_dir_vnode(path, _vnode, filename, kernel);
2459 status_t status = get_dir_path_and_leaf(path, filename);
2460 if (status != B_OK)
2461 return status;
2463 inc_vnode_ref_count(vnode);
2464 // vnode_path_to_vnode() always decrements the ref count
2466 return vnode_path_to_vnode(vnode, path, true, 0, kernel, _vnode, NULL);
2470 /*! Returns a vnode's name in the d_name field of a supplied dirent buffer.
2472 static status_t
2473 get_vnode_name(struct vnode* vnode, struct vnode* parent, struct dirent* buffer,
2474 size_t bufferSize, struct io_context* ioContext)
2476 if (bufferSize < sizeof(struct dirent))
2477 return B_BAD_VALUE;
2479 // See if the vnode is covering another vnode and move to the covered
2480 // vnode so we get the underlying file system
2481 VNodePutter vnodePutter;
2482 if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2483 vnode = coveredVnode;
2484 vnodePutter.SetTo(vnode);
2487 if (HAS_FS_CALL(vnode, get_vnode_name)) {
2488 // The FS supports getting the name of a vnode.
2489 if (FS_CALL(vnode, get_vnode_name, buffer->d_name,
2490 (char*)buffer + bufferSize - buffer->d_name) == B_OK)
2491 return B_OK;
2494 // The FS doesn't support getting the name of a vnode. So we search the
2495 // parent directory for the vnode, if the caller gave us one.
2497 if (parent == NULL || !HAS_FS_CALL(parent, read_dir))
2498 return B_UNSUPPORTED;
2500 void* cookie;
2502 status_t status = FS_CALL(parent, open_dir, &cookie);
2503 if (status >= B_OK) {
2504 while (true) {
2505 uint32 num = 1;
2506 // We use the FS hook directly instead of dir_read(), since we don't
2507 // want the entries to be fixed. We have already resolved vnode to
2508 // the covered node.
2509 status = FS_CALL(parent, read_dir, cookie, buffer, bufferSize,
2510 &num);
2511 if (status != B_OK)
2512 break;
2513 if (num == 0) {
2514 status = B_ENTRY_NOT_FOUND;
2515 break;
2518 if (vnode->id == buffer->d_ino) {
2519 // found correct entry!
2520 break;
2524 FS_CALL(parent, close_dir, cookie);
2525 FS_CALL(parent, free_dir_cookie, cookie);
2527 return status;
2531 static status_t
2532 get_vnode_name(struct vnode* vnode, struct vnode* parent, char* name,
2533 size_t nameSize, bool kernel)
2535 char buffer[sizeof(struct dirent) + B_FILE_NAME_LENGTH];
2536 struct dirent* dirent = (struct dirent*)buffer;
2538 status_t status = get_vnode_name(vnode, parent, dirent, sizeof(buffer),
2539 get_current_io_context(kernel));
2540 if (status != B_OK)
2541 return status;
2543 if (strlcpy(name, dirent->d_name, nameSize) >= nameSize)
2544 return B_BUFFER_OVERFLOW;
2546 return B_OK;
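// Example (sketch): resolving a node's name into a plain char buffer;
// `vnode` and `parent` are assumed to be held by the caller:
//
//    char name[B_FILE_NAME_LENGTH];
//    if (get_vnode_name(vnode, parent, name, sizeof(name), true) == B_OK)
//        dprintf("node is named \"%s\"\n", name);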
2550 /*! Gets the full path to a given directory vnode.
2551 It uses the fs_get_vnode_name() call to get the name of a vnode; if a
2552 file system doesn't support this call, it will fall back to iterating
2553 through the parent directory to get the name of the child.
2555 To protect against circular loops, it supports a maximum tree depth
2556 of 256 levels.
2558 Note that the path may not be correct by the time this function returns!
2559 It doesn't use any locking to ensure the returned path is correct, as
2560 paths aren't safe anyway: the path to a file can change at any time.
2562 It might be a good idea, though, for the calling function to check whether
2563 the returned path exists (it's not done here for efficiency reasons).
2565 static status_t
2566 dir_vnode_to_path(struct vnode* vnode, char* buffer, size_t bufferSize,
2567 bool kernel)
2569 FUNCTION(("dir_vnode_to_path(%p, %p, %lu)\n", vnode, buffer, bufferSize));
2571 if (vnode == NULL || buffer == NULL || bufferSize == 0)
2572 return B_BAD_VALUE;
2574 if (!S_ISDIR(vnode->Type()))
2575 return B_NOT_A_DIRECTORY;
2577 char* path = buffer;
2578 int32 insert = bufferSize;
2579 int32 maxLevel = 256;
2580 int32 length;
2581 status_t status = B_OK;
2582 struct io_context* ioContext = get_current_io_context(kernel);
2584 // we don't use get_vnode() here because this call is more
2585 // efficient and does all we need from get_vnode()
2586 inc_vnode_ref_count(vnode);
2588 path[--insert] = '\0';
2589 // the path is filled right to left
2591 while (true) {
2592 // If the node is the context's root, bail out. Otherwise resolve mount
2593 // points.
2594 if (vnode == ioContext->root)
2595 break;
2597 if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2598 put_vnode(vnode);
2599 vnode = coveredVnode;
2602 // lookup the parent vnode
2603 struct vnode* parentVnode;
2604 status = lookup_dir_entry(vnode, "..", &parentVnode);
2605 if (status != B_OK)
2606 goto out;
2608 if (parentVnode == vnode) {
2609 // The caller apparently got their hands on a node outside of their
2610 // context's root. Now we've hit the global root.
2611 put_vnode(parentVnode);
2612 break;
2615 // get the node's name
2616 char nameBuffer[sizeof(struct dirent) + B_FILE_NAME_LENGTH];
2617 // also used for fs_read_dir()
2618 char* name = &((struct dirent*)nameBuffer)->d_name[0];
2619 status = get_vnode_name(vnode, parentVnode, (struct dirent*)nameBuffer,
2620 sizeof(nameBuffer), ioContext);
2622 // release the current vnode, we only need its parent from now on
2623 put_vnode(vnode);
2624 vnode = parentVnode;
2626 if (status != B_OK)
2627 goto out;
2629 // TODO: add an explicit check for loops in about 10 levels to do
2630 // real loop detection
2632 // don't go deeper than 'maxLevel' to prevent circular loops
2633 if (maxLevel-- < 0) {
2634 status = B_LINK_LIMIT;
2635 goto out;
2638 // add the name in front of the current path
2639 name[B_FILE_NAME_LENGTH - 1] = '\0';
2640 length = strlen(name);
2641 insert -= length;
2642 if (insert <= 0) {
2643 status = B_RESULT_NOT_REPRESENTABLE;
2644 goto out;
2646 memcpy(path + insert, name, length);
2647 path[--insert] = '/';
2650 // the root dir will result in an empty path: fix it
2651 if (path[insert] == '\0')
2652 path[--insert] = '/';
2654 TRACE((" path is: %s\n", path + insert));
2656 // move the path to the start of the buffer
2657 length = bufferSize - insert;
2658 memmove(buffer, path + insert, length);
2660 out:
2661 put_vnode(vnode);
2662 return status;
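// Usage sketch: building the absolute path of a directory vnode into a
// caller-supplied buffer; as noted above, the result may already be stale
// by the time it is used:
//
//    char path[B_PATH_NAME_LENGTH];
//    if (dir_vnode_to_path(directory, path, sizeof(path), kernel) == B_OK)
//        TRACE(("resolved to %s\n", path));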
2666 /*! Checks the length of every path component, and adds a '.'
2667 if the path ends in a slash.
2668 The given path buffer must be able to store at least one
2669 additional character.
2671 static status_t
2672 check_path(char* to)
2674 int32 length = 0;
2676 // check length of every path component
2678 while (*to) {
2679 char* begin;
2680 if (*to == '/')
2681 to++, length++;
2683 begin = to;
2684 while (*to != '/' && *to)
2685 to++, length++;
2687 if (to - begin > B_FILE_NAME_LENGTH)
2688 return B_NAME_TOO_LONG;
2691 if (length == 0)
2692 return B_ENTRY_NOT_FOUND;
2694 // complete path if there is a slash at the end
2696 if (*(to - 1) == '/') {
2697 if (length > B_PATH_NAME_LENGTH - 2)
2698 return B_NAME_TOO_LONG;
2700 to[0] = '.';
2701 to[1] = '\0';
2704 return B_OK;
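// Illustrative inputs (sketch):
//
//    "foo/bar"  -> B_OK, buffer unchanged
//    "foo/bar/" -> B_OK, buffer becomes "foo/bar/."
//    ""         -> B_ENTRY_NOT_FOUND
//    any component longer than B_FILE_NAME_LENGTH -> B_NAME_TOO_LONG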
2708 static struct file_descriptor*
2709 get_fd_and_vnode(int fd, struct vnode** _vnode, bool kernel)
2711 struct file_descriptor* descriptor
2712 = get_fd(get_current_io_context(kernel), fd);
2713 if (descriptor == NULL)
2714 return NULL;
2716 struct vnode* vnode = fd_vnode(descriptor);
2717 if (vnode == NULL) {
2718 put_fd(descriptor);
2719 return NULL;
2722 // ToDo: when we can close a file descriptor at any point, investigate
2723 // if this is still valid to do (accessing the vnode without ref_count
2724 // or locking)
2725 *_vnode = vnode;
2726 return descriptor;
2730 static struct vnode*
2731 get_vnode_from_fd(int fd, bool kernel)
2733 struct file_descriptor* descriptor;
2734 struct vnode* vnode;
2736 descriptor = get_fd(get_current_io_context(kernel), fd);
2737 if (descriptor == NULL)
2738 return NULL;
2740 vnode = fd_vnode(descriptor);
2741 if (vnode != NULL)
2742 inc_vnode_ref_count(vnode);
2744 put_fd(descriptor);
2745 return vnode;
2749 /*! Gets the vnode from an FD + path combination. If \a fd is lower than zero,
2750 only the path will be considered. In this case, the \a path must not be
2751 NULL.
2752 If \a fd is a valid file descriptor, \a path may be NULL for directories,
2753 and should be NULL for files.
2755 static status_t
2756 fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
2757 struct vnode** _vnode, ino_t* _parentID, bool kernel)
2759 if (fd < 0 && !path)
2760 return B_BAD_VALUE;
2762 if (path != NULL && *path == '\0')
2763 return B_ENTRY_NOT_FOUND;
2765 if (fd < 0 || (path != NULL && path[0] == '/')) {
2766 // no FD or absolute path
2767 return path_to_vnode(path, traverseLeafLink, _vnode, _parentID, kernel);
2770 // FD only, or FD + relative path
2771 struct vnode* vnode = get_vnode_from_fd(fd, kernel);
2772 if (vnode == NULL)
2773 return B_FILE_ERROR;
2775 if (path != NULL) {
2776 return vnode_path_to_vnode(vnode, path, traverseLeafLink, 0, kernel,
2777 _vnode, _parentID);
2780 // there is no relative path to take into account
2782 *_vnode = vnode;
2783 if (_parentID)
2784 *_parentID = -1;
2786 return B_OK;
2790 static int
2791 get_new_fd(int type, struct fs_mount* mount, struct vnode* vnode,
2792 void* cookie, int openMode, bool kernel)
2794 struct file_descriptor* descriptor;
2795 int fd;
2797 // If the vnode is locked, we don't allow creating a new file/directory
2798 // file_descriptor for it
2799 if (vnode && vnode->mandatory_locked_by != NULL
2800 && (type == FDTYPE_FILE || type == FDTYPE_DIR))
2801 return B_BUSY;
2803 descriptor = alloc_fd();
2804 if (!descriptor)
2805 return B_NO_MEMORY;
2807 if (vnode)
2808 descriptor->u.vnode = vnode;
2809 else
2810 descriptor->u.mount = mount;
2811 descriptor->cookie = cookie;
2813 switch (type) {
2814 // vnode types
2815 case FDTYPE_FILE:
2816 descriptor->ops = &sFileOps;
2817 break;
2818 case FDTYPE_DIR:
2819 descriptor->ops = &sDirectoryOps;
2820 break;
2821 case FDTYPE_ATTR:
2822 descriptor->ops = &sAttributeOps;
2823 break;
2824 case FDTYPE_ATTR_DIR:
2825 descriptor->ops = &sAttributeDirectoryOps;
2826 break;
2828 // mount types
2829 case FDTYPE_INDEX_DIR:
2830 descriptor->ops = &sIndexDirectoryOps;
2831 break;
2832 case FDTYPE_QUERY:
2833 descriptor->ops = &sQueryOps;
2834 break;
2836 default:
2837 panic("get_new_fd() called with unknown type %d\n", type);
2838 break;
2840 descriptor->type = type;
2841 descriptor->open_mode = openMode;
2843 io_context* context = get_current_io_context(kernel);
2844 fd = new_fd(context, descriptor);
2845 if (fd < 0) {
2846 free(descriptor);
2847 return B_NO_MORE_FDS;
2850 mutex_lock(&context->io_mutex);
2851 fd_set_close_on_exec(context, fd, (openMode & O_CLOEXEC) != 0);
2852 mutex_unlock(&context->io_mutex);
2854 return fd;
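// Typical call (sketch), as an open path would issue it: wrap a freshly
// opened vnode and its FS cookie into a new file descriptor. On failure
// the caller still owns both and has to clean them up itself:
//
//    int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode,
//        kernel);
//    if (fd < 0) {
//        // e.g. close the cookie via FS_CALL(vnode, close, cookie)
//        // and release the vnode reference
//    }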
2858 /*! In-place normalizes \a path. It's otherwise semantically equivalent to
2859 vfs_normalize_path(). See there for more documentation.
2861 static status_t
2862 normalize_path(char* path, size_t pathSize, bool traverseLink, bool kernel)
2864 VNodePutter dirPutter;
2865 struct vnode* dir = NULL;
2866 status_t error;
2868 for (int i = 0; i < B_MAX_SYMLINKS; i++) {
2869 // get dir vnode + leaf name
2870 struct vnode* nextDir;
2871 char leaf[B_FILE_NAME_LENGTH];
2872 error = vnode_and_path_to_dir_vnode(dir, path, &nextDir, leaf, kernel);
2873 if (error != B_OK)
2874 return error;
2876 dir = nextDir;
2877 strcpy(path, leaf);
2878 dirPutter.SetTo(dir);
2880 // get file vnode, if we shall resolve links
2881 bool fileExists = false;
2882 struct vnode* fileVnode;
2883 VNodePutter fileVnodePutter;
2884 if (traverseLink) {
2885 inc_vnode_ref_count(dir);
2886 if (vnode_path_to_vnode(dir, path, false, 0, kernel, &fileVnode,
2887 NULL) == B_OK) {
2888 fileVnodePutter.SetTo(fileVnode);
2889 fileExists = true;
2893 if (!fileExists || !traverseLink || !S_ISLNK(fileVnode->Type())) {
2894 // we're done -- construct the path
2895 bool hasLeaf = true;
2896 if (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0) {
2897 // special cases "." and ".." -- get the dir, forget the leaf
2898 inc_vnode_ref_count(dir);
2899 error = vnode_path_to_vnode(dir, leaf, false, 0, kernel,
2900 &nextDir, NULL);
2901 if (error != B_OK)
2902 return error;
2903 dir = nextDir;
2904 dirPutter.SetTo(dir);
2905 hasLeaf = false;
2908 // get the directory path
2909 error = dir_vnode_to_path(dir, path, B_PATH_NAME_LENGTH, kernel);
2910 if (error != B_OK)
2911 return error;
2913 // append the leaf name
2914 if (hasLeaf) {
2915 // insert a directory separator if this is not the file system
2916 // root
2917 if ((strcmp(path, "/") != 0
2918 && strlcat(path, "/", pathSize) >= pathSize)
2919 || strlcat(path, leaf, pathSize) >= pathSize) {
2920 return B_NAME_TOO_LONG;
2924 return B_OK;
2927 // read link
2928 if (HAS_FS_CALL(fileVnode, read_symlink)) {
2929 size_t bufferSize = B_PATH_NAME_LENGTH - 1;
2930 error = FS_CALL(fileVnode, read_symlink, path, &bufferSize);
2931 if (error != B_OK)
2932 return error;
2933 path[bufferSize] = '\0';
2934 } else
2935 return B_BAD_VALUE;
2938 return B_LINK_LIMIT;
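// Example (sketch): in-place normalization, assuming a writable buffer of
// B_PATH_NAME_LENGTH bytes:
//
//    char path[B_PATH_NAME_LENGTH] = "/boot/home//Desktop/../config";
//    if (normalize_path(path, sizeof(path), true, true) == B_OK) {
//        // path now holds the canonical form, e.g. "/boot/home/config"
//        // (with symlinks resolved along the way)
//    }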
2942 static status_t
2943 resolve_covered_parent(struct vnode* parent, dev_t* _device, ino_t* _node,
2944 struct io_context* ioContext)
2946 // Make sure the IO context root is not bypassed.
2947 if (parent == ioContext->root) {
2948 *_device = parent->device;
2949 *_node = parent->id;
2950 return B_OK;
2953 inc_vnode_ref_count(parent);
2954 // vnode_path_to_vnode() puts the node
2956 // ".." is guaranteed not to be clobbered by this call
2957 struct vnode* vnode;
2958 status_t status = vnode_path_to_vnode(parent, (char*)"..", false, 0,
2959 ioContext, &vnode, NULL);
2960 if (status == B_OK) {
2961 *_device = vnode->device;
2962 *_node = vnode->id;
2963 put_vnode(vnode);
2966 return status;
2970 #ifdef ADD_DEBUGGER_COMMANDS
2973 static void
2974 _dump_advisory_locking(advisory_locking* locking)
2976 if (locking == NULL)
2977 return;
2979 kprintf(" lock: %" B_PRId32, locking->lock);
2980 kprintf(" wait_sem: %" B_PRId32, locking->wait_sem);
2982 int32 index = 0;
2983 LockList::Iterator iterator = locking->locks.GetIterator();
2984 while (iterator.HasNext()) {
2985 struct advisory_lock* lock = iterator.Next();
2987 kprintf(" [%2" B_PRId32 "] team: %" B_PRId32 "\n", index++, lock->team);
2988 kprintf(" start: %" B_PRIdOFF "\n", lock->start);
2989 kprintf(" end: %" B_PRIdOFF "\n", lock->end);
2990 kprintf(" shared? %s\n", lock->shared ? "yes" : "no");
2995 static void
2996 _dump_mount(struct fs_mount* mount)
2998 kprintf("MOUNT: %p\n", mount);
2999 kprintf(" id: %" B_PRIdDEV "\n", mount->id);
3000 kprintf(" device_name: %s\n", mount->device_name);
3001 kprintf(" root_vnode: %p\n", mount->root_vnode);
3002 kprintf(" covers: %p\n", mount->root_vnode->covers);
3003 kprintf(" partition: %p\n", mount->partition);
3004 kprintf(" lock: %p\n", &mount->rlock);
3005 kprintf(" flags: %s%s\n", mount->unmounting ? " unmounting" : "",
3006 mount->owns_file_device ? " owns_file_device" : "");
3008 fs_volume* volume = mount->volume;
3009 while (volume != NULL) {
3010 kprintf(" volume %p:\n", volume);
3011 kprintf(" layer: %" B_PRId32 "\n", volume->layer);
3012 kprintf(" private_volume: %p\n", volume->private_volume);
3013 kprintf(" ops: %p\n", volume->ops);
3014 kprintf(" file_system: %p\n", volume->file_system);
3015 kprintf(" file_system_name: %s\n", volume->file_system_name);
3016 volume = volume->super_volume;
3019 set_debug_variable("_volume", (addr_t)mount->volume->private_volume);
3020 set_debug_variable("_root", (addr_t)mount->root_vnode);
3021 set_debug_variable("_covers", (addr_t)mount->root_vnode->covers);
3022 set_debug_variable("_partition", (addr_t)mount->partition);
3026 static bool
3027 debug_prepend_vnode_name_to_path(char* buffer, size_t& bufferSize,
3028 const char* name)
3030 bool insertSlash = buffer[bufferSize] != '\0';
3031 size_t nameLength = strlen(name);
3033 if (bufferSize < nameLength + (insertSlash ? 1 : 0))
3034 return false;
3036 if (insertSlash)
3037 buffer[--bufferSize] = '/';
3039 bufferSize -= nameLength;
3040 memcpy(buffer + bufferSize, name, nameLength);
3042 return true;
3046 static bool
3047 debug_prepend_vnode_id_to_path(char* buffer, size_t& bufferSize, dev_t devID,
3048 ino_t nodeID)
3050 if (bufferSize == 0)
3051 return false;
3053 bool insertSlash = buffer[bufferSize] != '\0';
3054 if (insertSlash)
3055 buffer[--bufferSize] = '/';
3057 size_t size = snprintf(buffer, bufferSize,
3058 "<%" B_PRIdDEV ",%" B_PRIdINO ">", devID, nodeID);
3059 if (size > bufferSize) {
3060 if (insertSlash)
3061 bufferSize++;
3062 return false;
3065 if (size < bufferSize)
3066 memmove(buffer + bufferSize - size, buffer, size);
3068 bufferSize -= size;
3069 return true;
3073 static char*
3074 debug_resolve_vnode_path(struct vnode* vnode, char* buffer, size_t bufferSize,
3075 bool& _truncated)
3077 // null-terminate the path
3078 buffer[--bufferSize] = '\0';
3080 while (true) {
3081 while (vnode->covers != NULL)
3082 vnode = vnode->covers;
3084 if (vnode == sRoot) {
3085 _truncated = bufferSize == 0;
3086 if (!_truncated)
3087 buffer[--bufferSize] = '/';
3088 return buffer + bufferSize;
3091 // resolve the name
3092 ino_t dirID;
3093 const char* name = vnode->mount->entry_cache.DebugReverseLookup(
3094 vnode->id, dirID);
3095 if (name == NULL) {
3096 // Failed to resolve the name -- prepend "<dev,node>/".
3097 _truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3098 vnode->mount->id, vnode->id);
3099 return buffer + bufferSize;
3102 // prepend the name
3103 if (!debug_prepend_vnode_name_to_path(buffer, bufferSize, name)) {
3104 _truncated = true;
3105 return buffer + bufferSize;
3108 // resolve the directory node
3109 struct vnode* nextVnode = lookup_vnode(vnode->mount->id, dirID);
3110 if (nextVnode == NULL) {
3111 _truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3112 vnode->mount->id, dirID);
3113 return buffer + bufferSize;
3116 vnode = nextVnode;
3121 static void
3122 _dump_vnode(struct vnode* vnode, bool printPath)
3124 kprintf("VNODE: %p\n", vnode);
3125 kprintf(" device: %" B_PRIdDEV "\n", vnode->device);
3126 kprintf(" id: %" B_PRIdINO "\n", vnode->id);
3127 kprintf(" ref_count: %" B_PRId32 "\n", vnode->ref_count);
3128 kprintf(" private_node: %p\n", vnode->private_node);
3129 kprintf(" mount: %p\n", vnode->mount);
3130 kprintf(" covered_by: %p\n", vnode->covered_by);
3131 kprintf(" covers: %p\n", vnode->covers);
3132 kprintf(" cache: %p\n", vnode->cache);
3133 kprintf(" type: %#" B_PRIx32 "\n", vnode->Type());
3134 kprintf(" flags: %s%s%s\n", vnode->IsRemoved() ? "r" : "-",
3135 vnode->IsBusy() ? "b" : "-", vnode->IsUnpublished() ? "u" : "-");
3136 kprintf(" advisory_lock: %p\n", vnode->advisory_locking);
3138 _dump_advisory_locking(vnode->advisory_locking);
3140 if (printPath) {
3141 void* buffer = debug_malloc(B_PATH_NAME_LENGTH);
3142 if (buffer != NULL) {
3143 bool truncated;
3144 char* path = debug_resolve_vnode_path(vnode, (char*)buffer,
3145 B_PATH_NAME_LENGTH, truncated);
3146 if (path != NULL) {
3147 kprintf(" path: ");
3148 if (truncated)
3149 kputs("<truncated>/");
3150 kputs(path);
3151 kputs("\n");
3152 } else
3153 kprintf("Failed to resolve vnode path.\n");
3155 debug_free(buffer);
3156 } else
3157 kprintf("Failed to allocate memory for constructing the path.\n");
3160 set_debug_variable("_node", (addr_t)vnode->private_node);
3161 set_debug_variable("_mount", (addr_t)vnode->mount);
3162 set_debug_variable("_covered_by", (addr_t)vnode->covered_by);
3163 set_debug_variable("_covers", (addr_t)vnode->covers);
3164 set_debug_variable("_adv_lock", (addr_t)vnode->advisory_locking);
3168 static int
3169 dump_mount(int argc, char** argv)
3171 if (argc != 2 || !strcmp(argv[1], "--help")) {
3172 kprintf("usage: %s [id|address]\n", argv[0]);
3173 return 0;
3176 ulong val = parse_expression(argv[1]);
3177 uint32 id = val;
3179 struct fs_mount* mount = sMountsTable->Lookup(id);
3180 if (mount == NULL) {
3181 if (IS_USER_ADDRESS(id)) {
3182 kprintf("fs_mount not found\n");
3183 return 0;
3185 mount = (fs_mount*)val;
3188 _dump_mount(mount);
3189 return 0;
3193 static int
3194 dump_mounts(int argc, char** argv)
3196 if (argc != 1) {
3197 kprintf("usage: %s\n", argv[0]);
3198 return 0;
3201 kprintf("%-*s id %-*s %-*s %-*s fs_name\n",
3202 B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "root",
3203 B_PRINTF_POINTER_WIDTH, "covers", B_PRINTF_POINTER_WIDTH, "cookie");
3205 struct fs_mount* mount;
3207 MountTable::Iterator iterator(sMountsTable);
3208 while (iterator.HasNext()) {
3209 mount = iterator.Next();
3210 kprintf("%p%4" B_PRIdDEV " %p %p %p %s\n", mount, mount->id, mount->root_vnode,
3211 mount->root_vnode->covers, mount->volume->private_volume,
3212 mount->volume->file_system_name);
3214 fs_volume* volume = mount->volume;
3215 while (volume->super_volume != NULL) {
3216 volume = volume->super_volume;
3217 kprintf(" %p %s\n",
3218 volume->private_volume, volume->file_system_name);
3222 return 0;
3226 static int
3227 dump_vnode(int argc, char** argv)
3229 bool printPath = false;
3230 int argi = 1;
3231 if (argc >= 2 && strcmp(argv[argi], "-p") == 0) {
3232 printPath = true;
3233 argi++;
3236 if (argi >= argc || argi + 2 < argc) {
3237 print_debugger_command_usage(argv[0]);
3238 return 0;
3241 struct vnode* vnode = NULL;
3243 if (argi + 1 == argc) {
3244 vnode = (struct vnode*)parse_expression(argv[argi]);
3245 if (IS_USER_ADDRESS(vnode)) {
3246 kprintf("invalid vnode address\n");
3247 return 0;
3249 _dump_vnode(vnode, printPath);
3250 return 0;
3253 dev_t device = parse_expression(argv[argi]);
3254 ino_t id = parse_expression(argv[argi + 1]);
3256 VnodeTable::Iterator iterator(sVnodeTable);
3257 while (iterator.HasNext()) {
3258 vnode = iterator.Next();
3259 if (vnode->id != id || vnode->device != device)
3260 continue;
3262 _dump_vnode(vnode, printPath);
3265 return 0;
3269 static int
3270 dump_vnodes(int argc, char** argv)
3272 if (argc != 2 || !strcmp(argv[1], "--help")) {
3273 kprintf("usage: %s [device]\n", argv[0]);
3274 return 0;
3277 // restrict dumped nodes to a certain device if requested
3278 dev_t device = parse_expression(argv[1]);
3280 struct vnode* vnode;
3282 kprintf("%-*s dev inode ref %-*s %-*s %-*s flags\n",
3283 B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache",
3284 B_PRINTF_POINTER_WIDTH, "fs-node", B_PRINTF_POINTER_WIDTH, "locking");
3286 VnodeTable::Iterator iterator(sVnodeTable);
3287 while (iterator.HasNext()) {
3288 vnode = iterator.Next();
3289 if (vnode->device != device)
3290 continue;
3292 kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO "%5" B_PRId32 " %p %p %p %s%s%s\n",
3293 vnode, vnode->device, vnode->id, vnode->ref_count, vnode->cache,
3294 vnode->private_node, vnode->advisory_locking,
3295 vnode->IsRemoved() ? "r" : "-", vnode->IsBusy() ? "b" : "-",
3296 vnode->IsUnpublished() ? "u" : "-");
3299 return 0;
3303 static int
3304 dump_vnode_caches(int argc, char** argv)
3306 struct vnode* vnode;
3308 if (argc > 2 || !strcmp(argv[1], "--help")) {
3309 kprintf("usage: %s [device]\n", argv[0]);
3310 return 0;
3313 // restrict dumped nodes to a certain device if requested
3314 dev_t device = -1;
3315 if (argc > 1)
3316 device = parse_expression(argv[1]);
3318 kprintf("%-*s dev inode %-*s size pages\n",
3319 B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache");
3321 VnodeTable::Iterator iterator(sVnodeTable);
3322 while (iterator.HasNext()) {
3323 vnode = iterator.Next();
3324 if (vnode->cache == NULL)
3325 continue;
3326 if (device != -1 && vnode->device != device)
3327 continue;
3329 kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO " %p %8" B_PRIdOFF "%8" B_PRId32 "\n",
3330 vnode, vnode->device, vnode->id, vnode->cache,
3331 (vnode->cache->virtual_end + B_PAGE_SIZE - 1) / B_PAGE_SIZE,
3332 vnode->cache->page_count);
3335 return 0;
3339 static int
3340 dump_io_context(int argc, char** argv)
3342 if (argc > 2 || !strcmp(argv[1], "--help")) {
3343 kprintf("usage: %s [team-id|address]\n", argv[0]);
3344 return 0;
3347 struct io_context* context = NULL;
3349 if (argc > 1) {
3350 ulong num = parse_expression(argv[1]);
3351 if (IS_KERNEL_ADDRESS(num))
3352 context = (struct io_context*)num;
3353 else {
3354 Team* team = team_get_team_struct_locked(num);
3355 if (team == NULL) {
3356 kprintf("could not find team with ID %lu\n", num);
3357 return 0;
3359 context = (struct io_context*)team->io_context;
3361 } else
3362 context = get_current_io_context(true);
3364 kprintf("I/O CONTEXT: %p\n", context);
3365 kprintf(" root vnode:\t%p\n", context->root);
3366 kprintf(" cwd vnode:\t%p\n", context->cwd);
3367 kprintf(" used fds:\t%" B_PRIu32 "\n", context->num_used_fds);
3368 kprintf(" max fds:\t%" B_PRIu32 "\n", context->table_size);
3370 if (context->num_used_fds) {
3371 kprintf(" no. type %*s ref open mode pos %*s\n",
3372 B_PRINTF_POINTER_WIDTH, "ops", B_PRINTF_POINTER_WIDTH, "cookie");
3375 for (uint32 i = 0; i < context->table_size; i++) {
3376 struct file_descriptor* fd = context->fds[i];
3377 if (fd == NULL)
3378 continue;
3380 kprintf(" %3" B_PRIu32 ": %4" B_PRId32 " %p %3" B_PRId32 " %4"
3381 B_PRIu32 " %4" B_PRIx32 " %10" B_PRIdOFF " %p %s %p\n", i,
3382 fd->type, fd->ops, fd->ref_count, fd->open_count, fd->open_mode,
3383 fd->pos, fd->cookie,
3384 fd->type >= FDTYPE_INDEX && fd->type <= FDTYPE_QUERY
3385 ? "mount" : "vnode",
3386 fd->u.vnode);
3389 kprintf(" used monitors:\t%" B_PRIu32 "\n", context->num_monitors);
3390 kprintf(" max monitors:\t%" B_PRIu32 "\n", context->max_monitors);
3392 set_debug_variable("_cwd", (addr_t)context->cwd);
3394 return 0;
3398 static int
3399 dump_vnode_usage(int argc, char** argv)
3401 if (argc != 1) {
3402 kprintf("usage: %s\n", argv[0]);
3403 return 0;
3406 kprintf("Unused vnodes: %" B_PRIu32 " (max unused %" B_PRIu32 ")\n",
3407 sUnusedVnodes, kMaxUnusedVnodes);
3409 uint32 count = sVnodeTable->CountElements();
3411 kprintf("%" B_PRIu32 " vnodes total (%" B_PRIu32 " in use).\n", count,
3412 count - sUnusedVnodes);
3413 return 0;
3416 #endif // ADD_DEBUGGER_COMMANDS
3419 /*! Clears memory specified by an iovec array.
3421 static void
3422 zero_iovecs(const iovec* vecs, size_t vecCount, size_t bytes)
3424 for (size_t i = 0; i < vecCount && bytes > 0; i++) {
3425 size_t length = std::min(vecs[i].iov_len, bytes);
3426 memset(vecs[i].iov_base, 0, length);
3427 bytes -= length;
3432 /*! Does the dirty work of combining the file_io_vecs with the iovecs
3433 and calls the file system hooks to read/write the request to disk.
3435 static status_t
3436 common_file_io_vec_pages(struct vnode* vnode, void* cookie,
3437 const file_io_vec* fileVecs, size_t fileVecCount, const iovec* vecs,
3438 size_t vecCount, uint32* _vecIndex, size_t* _vecOffset, size_t* _numBytes,
3439 bool doWrite)
3441 if (fileVecCount == 0) {
3442 // There are no file vecs at this offset, so we're obviously trying
3443 // to access the file outside of its bounds
3444 return B_BAD_VALUE;
3447 size_t numBytes = *_numBytes;
3448 uint32 fileVecIndex;
3449 size_t vecOffset = *_vecOffset;
3450 uint32 vecIndex = *_vecIndex;
3451 status_t status;
3452 size_t size;
3454 if (!doWrite && vecOffset == 0) {
3455 // now directly read the data from the device
3456 // the first file_io_vec can be read directly
3458 if (fileVecs[0].length < (off_t)numBytes)
3459 size = fileVecs[0].length;
3460 else
3461 size = numBytes;
3463 if (fileVecs[0].offset >= 0) {
3464 status = FS_CALL(vnode, read_pages, cookie, fileVecs[0].offset,
3465 &vecs[vecIndex], vecCount - vecIndex, &size);
3466 } else {
3467 // sparse read
3468 zero_iovecs(&vecs[vecIndex], vecCount - vecIndex, size);
3469 status = B_OK;
3471 if (status != B_OK)
3472 return status;
3474 // TODO: this is a work-around for buggy device drivers!
3475 // When our own drivers honour the length, we can:
3476 // a) also use this direct I/O for writes (otherwise, it would
3477 // overwrite precious data)
3478 // b) panic if the term below is true (at least for writes)
3479 if ((off_t)size > fileVecs[0].length) {
3480 //dprintf("warning: device driver %p doesn't respect total length "
3481 // "in read_pages() call!\n", ref->device);
3482 size = fileVecs[0].length;
3485 ASSERT((off_t)size <= fileVecs[0].length);
3487 // If the file portion was contiguous, we're already done now
3488 if (size == numBytes)
3489 return B_OK;
3491 // if we reached the end of the file, we can return as well
3492 if ((off_t)size != fileVecs[0].length) {
3493 *_numBytes = size;
3494 return B_OK;
3497 fileVecIndex = 1;
3499 // first, find out where we have to continue in our iovecs
3500 for (; vecIndex < vecCount; vecIndex++) {
3501 if (size < vecs[vecIndex].iov_len)
3502 break;
3504 size -= vecs[vecIndex].iov_len;
3507 vecOffset = size;
3508 } else {
3509 fileVecIndex = 0;
3510 size = 0;
3513 // Too bad, let's process the rest of the file_io_vecs
3515 size_t totalSize = size;
3516 size_t bytesLeft = numBytes - size;
3518 for (; fileVecIndex < fileVecCount; fileVecIndex++) {
3519 const file_io_vec &fileVec = fileVecs[fileVecIndex];
3520 off_t fileOffset = fileVec.offset;
3521 off_t fileLeft = min_c(fileVec.length, (off_t)bytesLeft);
3523 TRACE(("FILE VEC [%" B_PRIu32 "] length %" B_PRIdOFF "\n", fileVecIndex,
3524 fileLeft));
3526 // process the complete fileVec
3527 while (fileLeft > 0) {
3528 iovec tempVecs[MAX_TEMP_IO_VECS];
3529 uint32 tempCount = 0;
3531 // size tracks how much of what is left of the current fileVec
3532 // (fileLeft) has been assigned to tempVecs
3533 size = 0;
3535 // assign what is left of the current fileVec to the tempVecs
3536 for (size = 0; (off_t)size < fileLeft && vecIndex < vecCount
3537 && tempCount < MAX_TEMP_IO_VECS;) {
3538 // try to satisfy one iovec per iteration (or as much as
3539 // possible)
3541 // bytes left of the current iovec
3542 size_t vecLeft = vecs[vecIndex].iov_len - vecOffset;
3543 if (vecLeft == 0) {
3544 vecOffset = 0;
3545 vecIndex++;
3546 continue;
3549 TRACE(("fill vec %" B_PRIu32 ", offset = %lu, size = %lu\n",
3550 vecIndex, vecOffset, size));
3552 // actually available bytes
3553 size_t tempVecSize = min_c(vecLeft, fileLeft - size);
3555 tempVecs[tempCount].iov_base
3556 = (void*)((addr_t)vecs[vecIndex].iov_base + vecOffset);
3557 tempVecs[tempCount].iov_len = tempVecSize;
3558 tempCount++;
3560 size += tempVecSize;
3561 vecOffset += tempVecSize;
3564 size_t bytes = size;
3566 if (fileOffset == -1) {
3567 if (doWrite) {
3568 panic("sparse write attempt: vnode %p", vnode);
3569 status = B_IO_ERROR;
3570 } else {
3571 // sparse read
3572 zero_iovecs(tempVecs, tempCount, bytes);
3573 status = B_OK;
3575 } else if (doWrite) {
3576 status = FS_CALL(vnode, write_pages, cookie, fileOffset,
3577 tempVecs, tempCount, &bytes);
3578 } else {
3579 status = FS_CALL(vnode, read_pages, cookie, fileOffset,
3580 tempVecs, tempCount, &bytes);
3582 if (status != B_OK)
3583 return status;
3585 totalSize += bytes;
3586 bytesLeft -= size;
3587 if (fileOffset >= 0)
3588 fileOffset += size;
3589 fileLeft -= size;
3590 //dprintf("-> file left = %Lu\n", fileLeft);
3592 if (size != bytes || vecIndex >= vecCount) {
3593 // there are no more bytes or iovecs, let's bail out
3594 *_numBytes = totalSize;
3595 return B_OK;
3600 *_vecIndex = vecIndex;
3601 *_vecOffset = vecOffset;
3602 *_numBytes = totalSize;
3603 return B_OK;
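// Worked example (sketch) of the vec combination above: suppose the caller
// passes two memory vecs of 4096 bytes each, and the file vecs describe an
// on-disk extent of 6144 bytes followed by one of 2048 bytes. The direct
// read_pages() call for the first extent fills vec 0 completely plus 2048
// bytes of vec 1; the loop then resumes with vecIndex 1 and vecOffset 2048,
// so the tempVecs built for the second extent continue exactly where the
// first one ended.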
3607 static bool
3608 is_user_in_group(gid_t gid)
3610 if (gid == getegid())
3611 return true;
3613 gid_t groups[NGROUPS_MAX];
3614 int groupCount = getgroups(NGROUPS_MAX, groups);
3615 for (int i = 0; i < groupCount; i++) {
3616 if (gid == groups[i])
3617 return true;
3620 return false;
3624 static status_t
3625 free_io_context(io_context* context)
3627 uint32 i;
3629 TIOC(FreeIOContext(context));
3631 if (context->root)
3632 put_vnode(context->root);
3634 if (context->cwd)
3635 put_vnode(context->cwd);
3637 mutex_lock(&context->io_mutex);
3639 for (i = 0; i < context->table_size; i++) {
3640 if (struct file_descriptor* descriptor = context->fds[i]) {
3641 close_fd(descriptor);
3642 put_fd(descriptor);
3646 mutex_destroy(&context->io_mutex);
3648 remove_node_monitors(context);
3649 free(context->fds);
3650 free(context);
3652 return B_OK;
3656 static status_t
3657 resize_monitor_table(struct io_context* context, const int newSize)
3659 int status = B_OK;
3661 if (newSize <= 0 || newSize > MAX_NODE_MONITORS)
3662 return B_BAD_VALUE;
3664 mutex_lock(&context->io_mutex);
3666 if ((size_t)newSize < context->num_monitors) {
3667 status = B_BUSY;
3668 goto out;
3670 context->max_monitors = newSize;
3672 out:
3673 mutex_unlock(&context->io_mutex);
3674 return status;
3678 // #pragma mark - public API for file systems
3681 extern "C" status_t
3682 new_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3683 fs_vnode_ops* ops)
3685 FUNCTION(("new_vnode(volume = %p (%" B_PRId32 "), vnodeID = %" B_PRId64
3686 ", node = %p)\n", volume, volume->id, vnodeID, privateNode));
3688 if (privateNode == NULL)
3689 return B_BAD_VALUE;
3691 int32 tries = BUSY_VNODE_RETRIES;
3692 restart:
3693 // create the node
3694 bool nodeCreated;
3695 struct vnode* vnode;
3696 status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3697 nodeCreated);
3698 if (status != B_OK)
3699 return status;
3701 WriteLocker nodeLocker(sVnodeLock, true);
3702 // create_new_vnode_and_lock() has locked for us
3704 if (!nodeCreated && vnode->IsBusy()) {
3705 nodeLocker.Unlock();
3706 if (!retry_busy_vnode(tries, volume->id, vnodeID))
3707 return B_BUSY;
3708 goto restart;
3711 // file system integrity check:
3712 // test if the vnode already exists and bail out if this is the case!
3713 if (!nodeCreated) {
3714 panic("vnode %" B_PRIdDEV ":%" B_PRIdINO " already exists (node = %p, "
3715 "vnode->node = %p)!", volume->id, vnodeID, privateNode,
3716 vnode->private_node);
3717 return B_ERROR;
3720 vnode->private_node = privateNode;
3721 vnode->ops = ops;
3722 vnode->SetUnpublished(true);
3724 TRACE(("returns: %s\n", strerror(status)));
3726 return status;
3730 extern "C" status_t
3731 publish_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3732 fs_vnode_ops* ops, int type, uint32 flags)
3734 FUNCTION(("publish_vnode()\n"));
3736 int32 tries = BUSY_VNODE_RETRIES;
3737 restart:
3738 WriteLocker locker(sVnodeLock);
3740 struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3742 bool nodeCreated = false;
3743 if (vnode == NULL) {
3744 if (privateNode == NULL)
3745 return B_BAD_VALUE;
3747 // create the node
3748 locker.Unlock();
3749 // create_new_vnode_and_lock() will re-lock for us on success
3750 status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3751 nodeCreated);
3752 if (status != B_OK)
3753 return status;
3755 locker.SetTo(sVnodeLock, true);
3758 if (nodeCreated) {
3759 vnode->private_node = privateNode;
3760 vnode->ops = ops;
3761 vnode->SetUnpublished(true);
3762 } else if (vnode->IsBusy() && vnode->IsUnpublished()
3763 && vnode->private_node == privateNode && vnode->ops == ops) {
3764 // already known, but not published
3765 } else if (vnode->IsBusy()) {
3766 locker.Unlock();
3767 if (!retry_busy_vnode(tries, volume->id, vnodeID))
3768 return B_BUSY;
3769 goto restart;
3770 } else
3771 return B_BAD_VALUE;
3773 bool publishSpecialSubNode = false;
3775 vnode->SetType(type);
3776 vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
3777 publishSpecialSubNode = is_special_node_type(type)
3778 && (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
3780 status_t status = B_OK;
3782 // create sub vnodes, if necessary
3783 if (volume->sub_volume != NULL || publishSpecialSubNode) {
3784 locker.Unlock();
3786 fs_volume* subVolume = volume;
3787 if (volume->sub_volume != NULL) {
3788 while (status == B_OK && subVolume->sub_volume != NULL) {
3789 subVolume = subVolume->sub_volume;
3790 status = subVolume->ops->create_sub_vnode(subVolume, vnodeID,
3791 vnode);
3795 if (status == B_OK && publishSpecialSubNode)
3796 status = create_special_sub_node(vnode, flags);
3798 if (status != B_OK) {
3799 // error -- clean up the created sub vnodes
3800 while (subVolume->super_volume != volume) {
3801 subVolume = subVolume->super_volume;
3802 subVolume->ops->delete_sub_vnode(subVolume, vnode);
3806 if (status == B_OK) {
3807 ReadLocker vnodesReadLocker(sVnodeLock);
3808 AutoLocker<Vnode> nodeLocker(vnode);
3809 vnode->SetBusy(false);
3810 vnode->SetUnpublished(false);
3811 } else {
3812 locker.Lock();
3813 sVnodeTable->Remove(vnode);
3814 remove_vnode_from_mount_list(vnode, vnode->mount);
3815 free(vnode);
3817 } else {
3818 // we still hold the write lock -- mark the node unbusy and published
3819 vnode->SetBusy(false);
3820 vnode->SetUnpublished(false);
3823 TRACE(("returns: %s\n", strerror(status)));
3825 return status;
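// Typical file system usage (sketch): a volume that just created a new
// on-disk inode announces it in two steps; `inode` and `gMyVnodeOps` are
// hypothetical:
//
//    status_t error = new_vnode(volume, inode->ID(), inode, &gMyVnodeOps);
//    if (error == B_OK) {
//        // ... create the directory entry ...
//        error = publish_vnode(volume, inode->ID(), inode, &gMyVnodeOps,
//            S_IFREG, 0);
//    }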
3829 extern "C" status_t
3830 get_vnode(fs_volume* volume, ino_t vnodeID, void** _privateNode)
3832 struct vnode* vnode;
3834 if (volume == NULL)
3835 return B_BAD_VALUE;
3837 status_t status = get_vnode(volume->id, vnodeID, &vnode, true, true);
3838 if (status != B_OK)
3839 return status;
3841 // If this is a layered FS, we need to get the node cookie for the requested
3842 // layer.
3843 if (HAS_FS_CALL(vnode, get_super_vnode)) {
3844 fs_vnode resolvedNode;
3845 status_t status = FS_CALL(vnode, get_super_vnode, volume,
3846 &resolvedNode);
3847 if (status != B_OK) {
3848 panic("get_vnode(): Failed to get super node for vnode %p, "
3849 "volume: %p", vnode, volume);
3850 put_vnode(vnode);
3851 return status;
3854 if (_privateNode != NULL)
3855 *_privateNode = resolvedNode.private_node;
3856 } else if (_privateNode != NULL)
3857 *_privateNode = vnode->private_node;
3859 return B_OK;
3863 extern "C" status_t
3864 acquire_vnode(fs_volume* volume, ino_t vnodeID)
3866 struct vnode* vnode;
3868 rw_lock_read_lock(&sVnodeLock);
3869 vnode = lookup_vnode(volume->id, vnodeID);
3870 rw_lock_read_unlock(&sVnodeLock);
3872 if (vnode == NULL)
3873 return B_BAD_VALUE;
3875 inc_vnode_ref_count(vnode);
3876 return B_OK;
3880 extern "C" status_t
3881 put_vnode(fs_volume* volume, ino_t vnodeID)
3883 struct vnode* vnode;
3885 rw_lock_read_lock(&sVnodeLock);
3886 vnode = lookup_vnode(volume->id, vnodeID);
3887 rw_lock_read_unlock(&sVnodeLock);
3889 if (vnode == NULL)
3890 return B_BAD_VALUE;
3892 dec_vnode_ref_count(vnode, false, true);
3893 return B_OK;
3897 extern "C" status_t
3898 remove_vnode(fs_volume* volume, ino_t vnodeID)
3900 ReadLocker locker(sVnodeLock);
3902 struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3903 if (vnode == NULL)
3904 return B_ENTRY_NOT_FOUND;
3906 if (vnode->covered_by != NULL || vnode->covers != NULL) {
3907 // this vnode is in use
3908 return B_BUSY;
3911 vnode->Lock();
3913 vnode->SetRemoved(true);
3914 bool removeUnpublished = false;
3916 if (vnode->IsUnpublished()) {
3917 // prepare the vnode for deletion
3918 removeUnpublished = true;
3919 vnode->SetBusy(true);
3922 vnode->Unlock();
3923 locker.Unlock();
3925 if (removeUnpublished) {
3926 // If the vnode hasn't been published yet, we delete it here
3927 atomic_add(&vnode->ref_count, -1);
3928 free_vnode(vnode, true);
3931 return B_OK;
3935 extern "C" status_t
3936 unremove_vnode(fs_volume* volume, ino_t vnodeID)
3938 struct vnode* vnode;
3940 rw_lock_read_lock(&sVnodeLock);
3942 vnode = lookup_vnode(volume->id, vnodeID);
3943 if (vnode) {
3944 AutoLocker<Vnode> nodeLocker(vnode);
3945 vnode->SetRemoved(false);
3948 rw_lock_read_unlock(&sVnodeLock);
3949 return B_OK;
3953 extern "C" status_t
3954 get_vnode_removed(fs_volume* volume, ino_t vnodeID, bool* _removed)
3956 ReadLocker _(sVnodeLock);
3958 if (struct vnode* vnode = lookup_vnode(volume->id, vnodeID)) {
3959 if (_removed != NULL)
3960 *_removed = vnode->IsRemoved();
3961 return B_OK;
3964 return B_BAD_VALUE;
3968 extern "C" fs_volume*
3969 volume_for_vnode(fs_vnode* _vnode)
3971 if (_vnode == NULL)
3972 return NULL;
3974 struct vnode* vnode = static_cast<struct vnode*>(_vnode);
3975 return vnode->mount->volume;
3979 extern "C" status_t
3980 check_access_permissions(int accessMode, mode_t mode, gid_t nodeGroupID,
3981 uid_t nodeUserID)
3983 // get node permissions
3984 int userPermissions = (mode & S_IRWXU) >> 6;
3985 int groupPermissions = (mode & S_IRWXG) >> 3;
3986 int otherPermissions = mode & S_IRWXO;
3988 // get the node permissions for this uid/gid
3989 int permissions = 0;
3990 uid_t uid = geteuid();
3992 if (uid == 0) {
3993 // user is root
3994 // root has always read/write permission, but at least one of the
3995 // X bits must be set for execute permission
3996 permissions = userPermissions | groupPermissions | otherPermissions
3997 | S_IROTH | S_IWOTH;
3998 if (S_ISDIR(mode))
3999 permissions |= S_IXOTH;
4000 } else if (uid == nodeUserID) {
4001 // user is node owner
4002 permissions = userPermissions;
4003 } else if (is_user_in_group(nodeGroupID)) {
4004 // user is in owning group
4005 permissions = groupPermissions;
4006 } else {
4007 // user is one of the others
4008 permissions = otherPermissions;
4011 return (accessMode & ~permissions) == 0 ? B_OK : B_PERMISSION_DENIED;
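// Worked example (sketch): a file with mode 0644 that belongs to another
// user. A caller that is merely a member of the owning group is granted
// the group bits only (read), so:
//
//    check_access_permissions(W_OK, 0644, fileGroup, fileOwner);
//        // -> B_PERMISSION_DENIED (group permissions lack the write bit)
//    check_access_permissions(R_OK, 0644, fileGroup, fileOwner);
//        // -> B_OK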
4015 #if 0
4016 extern "C" status_t
4017 read_pages(int fd, off_t pos, const iovec* vecs, size_t count,
4018 size_t* _numBytes)
4020 struct file_descriptor* descriptor;
4021 struct vnode* vnode;
4023 descriptor = get_fd_and_vnode(fd, &vnode, true);
4024 if (descriptor == NULL)
4025 return B_FILE_ERROR;
4027 status_t status = vfs_read_pages(vnode, descriptor->cookie, pos, vecs,
4028 count, 0, _numBytes);
4030 put_fd(descriptor);
4031 return status;
4035 extern "C" status_t
4036 write_pages(int fd, off_t pos, const iovec* vecs, size_t count,
4037 size_t* _numBytes)
4039 struct file_descriptor* descriptor;
4040 struct vnode* vnode;
4042 descriptor = get_fd_and_vnode(fd, &vnode, true);
4043 if (descriptor == NULL)
4044 return B_FILE_ERROR;
4046 status_t status = vfs_write_pages(vnode, descriptor->cookie, pos, vecs,
4047 count, 0, _numBytes);
4049 put_fd(descriptor);
4050 return status;
4052 #endif
4055 extern "C" status_t
4056 read_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
4057 const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
4058 size_t* _bytes)
4060 struct file_descriptor* descriptor;
4061 struct vnode* vnode;
4063 descriptor = get_fd_and_vnode(fd, &vnode, true);
4064 if (descriptor == NULL)
4065 return B_FILE_ERROR;
4067 status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
4068 fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
4069 false);
4071 put_fd(descriptor);
4072 return status;
4076 extern "C" status_t
4077 write_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
4078 const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
4079 size_t* _bytes)
4081 struct file_descriptor* descriptor;
4082 struct vnode* vnode;
4084 descriptor = get_fd_and_vnode(fd, &vnode, true);
4085 if (descriptor == NULL)
4086 return B_FILE_ERROR;
4088 status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
4089 fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
4090 true);
4092 put_fd(descriptor);
4093 return status;
4097 extern "C" status_t
4098 entry_cache_add(dev_t mountID, ino_t dirID, const char* name, ino_t nodeID)
4100 // lookup mount -- the caller is required to make sure that the mount
4101 // won't go away
4102 MutexLocker locker(sMountMutex);
4103 struct fs_mount* mount = find_mount(mountID);
4104 if (mount == NULL)
4105 return B_BAD_VALUE;
4106 locker.Unlock();
4108 return mount->entry_cache.Add(dirID, name, nodeID, false);
4112 extern "C" status_t
4113 entry_cache_add_missing(dev_t mountID, ino_t dirID, const char* name)
4115 // lookup mount -- the caller is required to make sure that the mount
4116 // won't go away
4117 MutexLocker locker(sMountMutex);
4118 struct fs_mount* mount = find_mount(mountID);
4119 if (mount == NULL)
4120 return B_BAD_VALUE;
4121 locker.Unlock();
4123 return mount->entry_cache.Add(dirID, name, -1, true);
4127 extern "C" status_t
4128 entry_cache_remove(dev_t mountID, ino_t dirID, const char* name)
4130 // lookup mount -- the caller is required to make sure that the mount
4131 // won't go away
4132 MutexLocker locker(sMountMutex);
4133 struct fs_mount* mount = find_mount(mountID);
4134 if (mount == NULL)
4135 return B_BAD_VALUE;
4136 locker.Unlock();
4138 return mount->entry_cache.Remove(dirID, name);
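// Usage sketch for file systems: after resolving a directory entry, cache
// the result; after a failed lookup, cache the negative result; after
// unlinking, drop the entry again:
//
//    entry_cache_add(volume->id, dirID, "config", nodeID);
//    entry_cache_add_missing(volume->id, dirID, "no-such-file");
//    entry_cache_remove(volume->id, dirID, "config");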
4142 // #pragma mark - private VFS API
4143 // Functions the VFS exports for other parts of the kernel
4146 /*! Acquires another reference to the vnode that has to be released
4147 by calling vfs_put_vnode().
4149 void
4150 vfs_acquire_vnode(struct vnode* vnode)
4152 inc_vnode_ref_count(vnode);
4156 /*! This is currently called from file_cache_create() only.
4157 It's probably a temporary solution as long as devfs requires that
4158 fs_read_pages()/fs_write_pages() are called with the standard
4159 open cookie and not with a device cookie.
4160 If that's done differently, remove this call; it has no other
4161 purpose.
4163 extern "C" status_t
4164 vfs_get_cookie_from_fd(int fd, void** _cookie)
4166 struct file_descriptor* descriptor;
4168 descriptor = get_fd(get_current_io_context(true), fd);
4169 if (descriptor == NULL)
4170 return B_FILE_ERROR;
4172 *_cookie = descriptor->cookie;
4173 return B_OK;
4177 extern "C" status_t
4178 vfs_get_vnode_from_fd(int fd, bool kernel, struct vnode** vnode)
4180 *vnode = get_vnode_from_fd(fd, kernel);
4182 if (*vnode == NULL)
4183 return B_FILE_ERROR;
4185 return B_NO_ERROR;
4189 extern "C" status_t
4190 vfs_get_vnode_from_path(const char* path, bool kernel, struct vnode** _vnode)
4192 TRACE(("vfs_get_vnode_from_path: entry. path = '%s', kernel %d\n",
4193 path, kernel));
4195 KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
4196 if (pathBuffer.InitCheck() != B_OK)
4197 return B_NO_MEMORY;
4199 char* buffer = pathBuffer.LockBuffer();
4200 strlcpy(buffer, path, pathBuffer.BufferSize());
4202 struct vnode* vnode;
4203 status_t status = path_to_vnode(buffer, true, &vnode, NULL, kernel);
4204 if (status != B_OK)
4205 return status;
4207 *_vnode = vnode;
4208 return B_OK;
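// Usage sketch (illustrative only, hypothetical path): resolving a path to
// a vnode from kernel code and releasing the reference when done:
//
//	struct vnode* vnode;
//	if (vfs_get_vnode_from_path("/boot/system/settings/network", true,
//			&vnode) == B_OK) {
//		// ... use the vnode ...
//		vfs_put_vnode(vnode);
//	}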
4212 extern "C" status_t
4213 vfs_get_vnode(dev_t mountID, ino_t vnodeID, bool canWait, struct vnode** _vnode)
4215 struct vnode* vnode = NULL;
4217 status_t status = get_vnode(mountID, vnodeID, &vnode, canWait, false);
4218 if (status != B_OK)
4219 return status;
4221 *_vnode = vnode;
4222 return B_OK;
4226 extern "C" status_t
4227 vfs_entry_ref_to_vnode(dev_t mountID, ino_t directoryID,
4228 const char* name, struct vnode** _vnode)
4230 return entry_ref_to_vnode(mountID, directoryID, name, false, true, _vnode);
4234 extern "C" void
4235 vfs_vnode_to_node_ref(struct vnode* vnode, dev_t* _mountID, ino_t* _vnodeID)
4237 *_mountID = vnode->device;
4238 *_vnodeID = vnode->id;
4243 Helper function abstracting the process of "converting" a given
4244 vnode-pointer to a fs_vnode-pointer.
4245 Currently only used in bindfs.
4247 extern "C" fs_vnode*
4248 vfs_fsnode_for_vnode(struct vnode* vnode)
4250 return vnode;
4255 Calls fs_open() on the given vnode and returns a new
4256 file descriptor for it
4259 vfs_open_vnode(struct vnode* vnode, int openMode, bool kernel)
4261 return open_vnode(vnode, openMode, kernel);
4265 /*! Looks up a vnode with the given mount and vnode ID.
4266 Must only be used with "in-use" vnodes as it doesn't grab a reference
4267 to the node.
4268 It's currently only used by file_cache_create().
4270 extern "C" status_t
4271 vfs_lookup_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode)
4273 rw_lock_read_lock(&sVnodeLock);
4274 struct vnode* vnode = lookup_vnode(mountID, vnodeID);
4275 rw_lock_read_unlock(&sVnodeLock);
4277 if (vnode == NULL)
4278 return B_ERROR;
4280 *_vnode = vnode;
4281 return B_OK;
4285 extern "C" status_t
4286 vfs_get_fs_node_from_path(fs_volume* volume, const char* path,
4287 bool traverseLeafLink, bool kernel, void** _node)
4289 TRACE(("vfs_get_fs_node_from_path(volume = %p, path = \"%s\", kernel %d)\n",
4290 volume, path, kernel));
4292 KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
4293 if (pathBuffer.InitCheck() != B_OK)
4294 return B_NO_MEMORY;
4296 fs_mount* mount;
4297 status_t status = get_mount(volume->id, &mount);
4298 if (status != B_OK)
4299 return status;
4301 char* buffer = pathBuffer.LockBuffer();
4302 strlcpy(buffer, path, pathBuffer.BufferSize());
4304 struct vnode* vnode = mount->root_vnode;
4306 if (buffer[0] == '/')
4307 status = path_to_vnode(buffer, traverseLeafLink, &vnode, NULL, kernel);
4308 else {
4309 inc_vnode_ref_count(vnode);
4310 // vnode_path_to_vnode() releases a reference to the starting vnode
4311 status = vnode_path_to_vnode(vnode, buffer, traverseLeafLink, 0,
4312 kernel, &vnode, NULL);
4315 put_mount(mount);
4317 if (status != B_OK)
4318 return status;
4320 if (vnode->device != volume->id) {
4321 // wrong mount ID - must not gain access on foreign file system nodes
4322 put_vnode(vnode);
4323 return B_BAD_VALUE;
4326 // Use get_vnode() to resolve the cookie for the right layer.
4327 status = get_vnode(volume, vnode->id, _node);
4328 put_vnode(vnode);
4330 return status;
4334 status_t
4335 vfs_read_stat(int fd, const char* path, bool traverseLeafLink,
4336 struct stat* stat, bool kernel)
4338 status_t status;
4340 if (path != NULL) {
4341 // path given: get the stat of the node referred to by (fd, path)
4342 KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
4343 if (pathBuffer.InitCheck() != B_OK)
4344 return B_NO_MEMORY;
4346 status = common_path_read_stat(fd, pathBuffer.LockBuffer(),
4347 traverseLeafLink, stat, kernel);
4348 } else {
4349 // no path given: get the FD and use the FD operation
4350 struct file_descriptor* descriptor
4351 = get_fd(get_current_io_context(kernel), fd);
4352 if (descriptor == NULL)
4353 return B_FILE_ERROR;
4355 if (descriptor->ops->fd_read_stat)
4356 status = descriptor->ops->fd_read_stat(descriptor, stat);
4357 else
4358 status = B_UNSUPPORTED;
4360 put_fd(descriptor);
4363 return status;
4367 /*! Finds the full path to the file that contains the module \a moduleName,
4368 puts it into \a pathBuffer, and returns B_OK for success.
4369 If \a pathBuffer was too small, it returns \c B_BUFFER_OVERFLOW, and
4370 \c B_ENTRY_NOT_FOUND if no file could be found.
4371 \a pathBuffer is clobbered in any case and must not be relied on if this
4372 function returns unsuccessfully.
4373 \a basePath and \a pathBuffer must not point to the same space.
4375 status_t
4376 vfs_get_module_path(const char* basePath, const char* moduleName,
4377 char* pathBuffer, size_t bufferSize)
4379 struct vnode* dir;
4380 struct vnode* file;
4381 status_t status;
4382 size_t length;
4383 char* path;
4385 if (bufferSize == 0
4386 || strlcpy(pathBuffer, basePath, bufferSize) >= bufferSize)
4387 return B_BUFFER_OVERFLOW;
4389 status = path_to_vnode(pathBuffer, true, &dir, NULL, true);
4390 if (status != B_OK)
4391 return status;
4393 // the path buffer had been clobbered by the above call
4394 length = strlcpy(pathBuffer, basePath, bufferSize);
4395 if (pathBuffer[length - 1] != '/')
4396 pathBuffer[length++] = '/';
4398 path = pathBuffer + length;
4399 bufferSize -= length;
4401 while (moduleName) {
4402 char* nextPath = strchr(moduleName, '/');
4403 if (nextPath == NULL)
4404 length = strlen(moduleName);
4405 else {
4406 length = nextPath - moduleName;
4407 nextPath++;
4410 if (length + 1 >= bufferSize) {
4411 status = B_BUFFER_OVERFLOW;
4412 goto err;
4415 memcpy(path, moduleName, length);
4416 path[length] = '\0';
4417 moduleName = nextPath;
4419 status = vnode_path_to_vnode(dir, path, true, 0, true, &file, NULL);
4420 if (status != B_OK) {
4421 // vnode_path_to_vnode() has already released the reference to dir
4422 return status;
4425 if (S_ISDIR(file->Type())) {
4426 // go on to the next directory
4427 path[length] = '/';
4428 path[length + 1] = '\0';
4429 path += length + 1;
4430 bufferSize -= length + 1;
4432 dir = file;
4433 } else if (S_ISREG(file->Type())) {
4434 // it's a file so it should be what we've searched for
4435 put_vnode(file);
4437 return B_OK;
4438 } else {
4439 TRACE(("vfs_get_module_path(): something is strange here: "
4440 "0x%08" B_PRIx32 "...\n", file->Type()));
4441 status = B_ERROR;
4442 dir = file;
4443 goto err;
4447 // if we got here, the moduleName just pointed to a directory, not to
4448 // a real module - what should we do in this case?
4449 status = B_ENTRY_NOT_FOUND;
4451 err:
4452 put_vnode(dir);
4453 return status;
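// Worked example (hypothetical names): with basePath
// "/boot/system/add-ons/kernel" and moduleName "bus_managers/pci/v1", the
// loop above first descends into "bus_managers"; if "pci" then turns out to
// be a regular file, the function returns B_OK with pathBuffer holding
// "/boot/system/add-ons/kernel/bus_managers/pci". If the components run out
// while still on a directory, B_ENTRY_NOT_FOUND is returned instead.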
4457 /*! \brief Normalizes a given path.
4459 The path must refer to an existing or non-existing entry in an existing
4460 directory; that is, after chopping off the leaf component, the remaining
4461 path must refer to an existing directory.
4463 The returned path will be canonical in that it will be absolute, will not
4464 contain any "." or ".." components or duplicate occurrences of '/'s,
4465 and none of the directory components will be symbolic links.
4467 Any two paths referring to the same entry will result in the same
4468 normalized path (well, that is pretty much the definition of `normalized',
4469 isn't it :-).
4471 \param path The path to be normalized.
4472 \param buffer The buffer into which the normalized path will be written.
4473 May be the same one as \a path.
4474 \param bufferSize The size of \a buffer.
4475 \param traverseLink If \c true, the function also resolves leaf symlinks.
4476 \param kernel \c true, if the IO context of the kernel shall be used,
4477 otherwise that of the team this thread belongs to. Only relevant
4478 if the path is relative (to get the CWD).
4479 \return \c B_OK if everything went fine, another error code otherwise.
4481 status_t
4482 vfs_normalize_path(const char* path, char* buffer, size_t bufferSize,
4483 bool traverseLink, bool kernel)
4485 if (!path || !buffer || bufferSize < 1)
4486 return B_BAD_VALUE;
4488 if (path != buffer) {
4489 if (strlcpy(buffer, path, bufferSize) >= bufferSize)
4490 return B_BUFFER_OVERFLOW;
4493 return normalize_path(buffer, bufferSize, traverseLink, kernel);
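// Example (illustrative): assuming the intermediate directories exist and
// no symlinks are involved, both of these calls yield "/boot/home/config":
//
//	char buffer[B_PATH_NAME_LENGTH];
//	vfs_normalize_path("/boot/home/./config", buffer, sizeof(buffer),
//		true, true);
//	vfs_normalize_path("/boot//home/Desktop/../config", buffer,
//		sizeof(buffer), true, true);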
4497 /*! \brief Gets the parent of the passed in node.
4499 Gets the parent of the passed in node, and correctly resolves covered
4500 nodes.
4502 extern "C" status_t
4503 vfs_resolve_parent(struct vnode* parent, dev_t* device, ino_t* node)
4505 return resolve_covered_parent(parent, device, node,
4506 get_current_io_context(true));
4510 /*! \brief Creates a special node in the file system.
4512 The caller gets a reference to the newly created node (which is passed
4513 back through \a _createdVnode) and is responsible for releasing it.
4515 \param path The path where to create the entry for the node. Can be \c NULL,
4516 in which case the node is created without an entry in the root FS -- it
4517 will automatically be deleted when the last reference has been released.
4518 \param subVnode The definition of the subnode. Can be \c NULL, in which case
4519 the target file system will just create the node with its standard
4520 operations. Depending on the type of the node a subnode might be created
4521 automatically, though.
4522 \param mode The type and permissions for the node to be created.
4523 \param flags Flags to be passed to the creating FS.
4524 \param kernel \c true, if called in the kernel context (relevant only if
4525 \a path is not \c NULL and not absolute).
4526 \param _superVnode Pointer to a pre-allocated structure to be filled by the
4527 file system creating the node, with the private data pointer and
4528 operations for the super node. Can be \c NULL.
4529 \param _createdVnode Pointer to pre-allocated storage in which to store the
4530 pointer to the newly created node.
4531 \return \c B_OK, if everything went fine, another error code otherwise.
4533 status_t
4534 vfs_create_special_node(const char* path, fs_vnode* subVnode, mode_t mode,
4535 uint32 flags, bool kernel, fs_vnode* _superVnode,
4536 struct vnode** _createdVnode)
4538 struct vnode* dirNode;
4539 char _leaf[B_FILE_NAME_LENGTH];
4540 char* leaf = NULL;
4542 if (path) {
4543 // We've got a path. Get the dir vnode and the leaf name.
4544 KPath tmpPathBuffer(B_PATH_NAME_LENGTH + 1);
4545 if (tmpPathBuffer.InitCheck() != B_OK)
4546 return B_NO_MEMORY;
4548 char* tmpPath = tmpPathBuffer.LockBuffer();
4549 if (strlcpy(tmpPath, path, B_PATH_NAME_LENGTH) >= B_PATH_NAME_LENGTH)
4550 return B_NAME_TOO_LONG;
4552 // get the dir vnode and the leaf name
4553 leaf = _leaf;
4554 status_t error = path_to_dir_vnode(tmpPath, &dirNode, leaf, kernel);
4555 if (error != B_OK)
4556 return error;
4557 } else {
4558 // No path. Create the node in the root FS.
4559 dirNode = sRoot;
4560 inc_vnode_ref_count(dirNode);
4563 VNodePutter _(dirNode);
4565 // check support for creating special nodes
4566 if (!HAS_FS_CALL(dirNode, create_special_node))
4567 return B_UNSUPPORTED;
4569 // create the node
4570 fs_vnode superVnode;
4571 ino_t nodeID;
4572 status_t status = FS_CALL(dirNode, create_special_node, leaf, subVnode,
4573 mode, flags, _superVnode != NULL ? _superVnode : &superVnode, &nodeID);
4574 if (status != B_OK)
4575 return status;
4577 // lookup the node
4578 rw_lock_read_lock(&sVnodeLock);
4579 *_createdVnode = lookup_vnode(dirNode->mount->id, nodeID);
4580 rw_lock_read_unlock(&sVnodeLock);
4582 if (*_createdVnode == NULL) {
4583 panic("vfs_create_special_node(): lookup of node failed");
4584 return B_ERROR;
4587 return B_OK;
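// Sketch (illustrative, hypothetical path): creating a named FIFO through
// this API. Passing NULL for \a subVnode lets the target FS create the node
// with its standard operations; the caller owns one reference afterwards:
//
//	struct vnode* createdVnode;
//	status_t error = vfs_create_special_node("/var/run/my_fifo", NULL,
//		S_IFIFO | 0666, 0, true, NULL, &createdVnode);
//	if (error == B_OK)
//		vfs_put_vnode(createdVnode);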
4591 extern "C" void
4592 vfs_put_vnode(struct vnode* vnode)
4594 put_vnode(vnode);
4598 extern "C" status_t
4599 vfs_get_cwd(dev_t* _mountID, ino_t* _vnodeID)
4601 // Get current working directory from io context
4602 struct io_context* context = get_current_io_context(false);
4603 status_t status = B_OK;
4605 mutex_lock(&context->io_mutex);
4607 if (context->cwd != NULL) {
4608 *_mountID = context->cwd->device;
4609 *_vnodeID = context->cwd->id;
4610 } else
4611 status = B_ERROR;
4613 mutex_unlock(&context->io_mutex);
4614 return status;
4618 status_t
4619 vfs_unmount(dev_t mountID, uint32 flags)
4621 return fs_unmount(NULL, mountID, flags, true);
4625 extern "C" status_t
4626 vfs_disconnect_vnode(dev_t mountID, ino_t vnodeID)
4628 struct vnode* vnode;
4630 status_t status = get_vnode(mountID, vnodeID, &vnode, true, true);
4631 if (status != B_OK)
4632 return status;
4634 disconnect_mount_or_vnode_fds(vnode->mount, vnode);
4635 put_vnode(vnode);
4636 return B_OK;
4640 extern "C" void
4641 vfs_free_unused_vnodes(int32 level)
4643 vnode_low_resource_handler(NULL,
4644 B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
4645 | B_KERNEL_RESOURCE_ADDRESS_SPACE,
4646 level);
4650 extern "C" bool
4651 vfs_can_page(struct vnode* vnode, void* cookie)
4653 FUNCTION(("vfs_canpage: vnode %p\n", vnode));
4655 if (HAS_FS_CALL(vnode, can_page))
4656 return FS_CALL(vnode, can_page, cookie);
4657 return false;
4661 extern "C" status_t
4662 vfs_read_pages(struct vnode* vnode, void* cookie, off_t pos,
4663 const generic_io_vec* vecs, size_t count, uint32 flags,
4664 generic_size_t* _numBytes)
4666 FUNCTION(("vfs_read_pages: vnode %p, vecs %p, pos %" B_PRIdOFF "\n", vnode,
4667 vecs, pos));
4669 #if VFS_PAGES_IO_TRACING
4670 generic_size_t bytesRequested = *_numBytes;
4671 #endif
4673 IORequest request;
4674 status_t status = request.Init(pos, vecs, count, *_numBytes, false, flags);
4675 if (status == B_OK) {
4676 status = vfs_vnode_io(vnode, cookie, &request);
4677 if (status == B_OK)
4678 status = request.Wait();
4679 *_numBytes = request.TransferredBytes();
4682 TPIO(ReadPages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4683 status, *_numBytes));
4685 return status;
4689 extern "C" status_t
4690 vfs_write_pages(struct vnode* vnode, void* cookie, off_t pos,
4691 const generic_io_vec* vecs, size_t count, uint32 flags,
4692 generic_size_t* _numBytes)
4694 FUNCTION(("vfs_write_pages: vnode %p, vecs %p, pos %" B_PRIdOFF "\n", vnode,
4695 vecs, pos));
4697 #if VFS_PAGES_IO_TRACING
4698 generic_size_t bytesRequested = *_numBytes;
4699 #endif
4701 IORequest request;
4702 status_t status = request.Init(pos, vecs, count, *_numBytes, true, flags);
4703 if (status == B_OK) {
4704 status = vfs_vnode_io(vnode, cookie, &request);
4705 if (status == B_OK)
4706 status = request.Wait();
4707 *_numBytes = request.TransferredBytes();
4710 TPIO(WritePages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4711 status, *_numBytes));
4713 return status;
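// Usage sketch (illustrative; "vnode", "cookie" and "physicalAddress" are
// assumed to be provided by the caller): reading one page into a physical
// buffer, which is what the B_PHYSICAL_IO_REQUEST flag requests:
//
//	generic_io_vec vec;
//	vec.base = physicalAddress;
//	vec.length = B_PAGE_SIZE;
//	generic_size_t bytes = B_PAGE_SIZE;
//	status_t error = vfs_read_pages(vnode, cookie, 0, &vec, 1,
//		B_PHYSICAL_IO_REQUEST, &bytes);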
4717 /*! Gets the vnode's VMCache object. If the vnode doesn't have one yet, it
4718 will be created, provided \a allocate is \c true.
4719 On success, the function also acquires a reference to the cache
4720 it returns.
4722 extern "C" status_t
4723 vfs_get_vnode_cache(struct vnode* vnode, VMCache** _cache, bool allocate)
4725 if (vnode->cache != NULL) {
4726 vnode->cache->AcquireRef();
4727 *_cache = vnode->cache;
4728 return B_OK;
4731 rw_lock_read_lock(&sVnodeLock);
4732 vnode->Lock();
4734 status_t status = B_OK;
4736 // The cache could have been created in the meantime
4737 if (vnode->cache == NULL) {
4738 if (allocate) {
4739 // TODO: actually the vnode needs to be busy already here, or
4740 // else this won't work...
4741 bool wasBusy = vnode->IsBusy();
4742 vnode->SetBusy(true);
4744 vnode->Unlock();
4745 rw_lock_read_unlock(&sVnodeLock);
4747 status = vm_create_vnode_cache(vnode, &vnode->cache);
4749 rw_lock_read_lock(&sVnodeLock);
4750 vnode->Lock();
4751 vnode->SetBusy(wasBusy);
4752 } else
4753 status = B_BAD_VALUE;
4756 vnode->Unlock();
4757 rw_lock_read_unlock(&sVnodeLock);
4759 if (status == B_OK) {
4760 vnode->cache->AcquireRef();
4761 *_cache = vnode->cache;
4764 return status;
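// Sketch (illustrative): a typical caller, e.g. code mapping a file, asks
// for the cache to be created on demand and drops its reference later:
//
//	VMCache* cache;
//	if (vfs_get_vnode_cache(vnode, &cache, true) == B_OK) {
//		// ... use the cache ...
//		cache->ReleaseRef();
//	}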
4768 status_t
4769 vfs_get_file_map(struct vnode* vnode, off_t offset, size_t size,
4770 file_io_vec* vecs, size_t* _count)
4772 FUNCTION(("vfs_get_file_map: vnode %p, vecs %p, offset %" B_PRIdOFF
4773 ", size = %" B_PRIuSIZE "\n", vnode, vecs, offset, size));
4775 return FS_CALL(vnode, get_file_map, offset, size, vecs, _count);
4779 status_t
4780 vfs_stat_vnode(struct vnode* vnode, struct stat* stat)
4782 status_t status = FS_CALL(vnode, read_stat, stat);
4784 // fill in the st_dev and st_ino fields
4785 if (status == B_OK) {
4786 stat->st_dev = vnode->device;
4787 stat->st_ino = vnode->id;
4788 // the rdev field must stay unset for non-special files
4789 if (!S_ISBLK(stat->st_mode) && !S_ISCHR(stat->st_mode))
4790 stat->st_rdev = -1;
4793 return status;
4797 status_t
4798 vfs_stat_node_ref(dev_t device, ino_t inode, struct stat* stat)
4800 struct vnode* vnode;
4801 status_t status = get_vnode(device, inode, &vnode, true, false);
4802 if (status != B_OK)
4803 return status;
4805 status = vfs_stat_vnode(vnode, stat);
4807 put_vnode(vnode);
4808 return status;
4812 status_t
4813 vfs_get_vnode_name(struct vnode* vnode, char* name, size_t nameSize)
4815 return get_vnode_name(vnode, NULL, name, nameSize, true);
4819 status_t
4820 vfs_entry_ref_to_path(dev_t device, ino_t inode, const char* leaf,
4821 bool kernel, char* path, size_t pathLength)
4823 struct vnode* vnode;
4824 status_t status;
4826 // filter invalid leaf names
4827 if (leaf != NULL && (leaf[0] == '\0' || strchr(leaf, '/')))
4828 return B_BAD_VALUE;
4830 // get the vnode matching the dir's node_ref
4831 if (leaf && (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0)) {
4832 // special cases "." and "..": we can directly get the vnode of the
4833 // referenced directory
4834 status = entry_ref_to_vnode(device, inode, leaf, false, kernel, &vnode);
4835 leaf = NULL;
4836 } else
4837 status = get_vnode(device, inode, &vnode, true, false);
4838 if (status != B_OK)
4839 return status;
4841 // get the directory path
4842 status = dir_vnode_to_path(vnode, path, pathLength, kernel);
4843 put_vnode(vnode);
4844 // we don't need the vnode anymore
4845 if (status != B_OK)
4846 return status;
4848 // append the leaf name
4849 if (leaf) {
4850 // insert a directory separator if this is not the file system root
4851 if ((strcmp(path, "/") && strlcat(path, "/", pathLength)
4852 >= pathLength)
4853 || strlcat(path, leaf, pathLength) >= pathLength) {
4854 return B_NAME_TOO_LONG;
4858 return B_OK;
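// Example (hypothetical values): given the node_ref of "/etc" and the leaf
// name "passwd", this builds the absolute path "/etc/passwd":
//
//	char path[B_PATH_NAME_LENGTH];
//	status_t error = vfs_entry_ref_to_path(device, directoryInode, "passwd",
//		true, path, sizeof(path));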
4862 /*! If the given descriptor locked its vnode, that lock will be released. */
4863 void
4864 vfs_unlock_vnode_if_locked(struct file_descriptor* descriptor)
4866 struct vnode* vnode = fd_vnode(descriptor);
4868 if (vnode != NULL && vnode->mandatory_locked_by == descriptor)
4869 vnode->mandatory_locked_by = NULL;
4873 /*! Closes all file descriptors of the specified I/O context that
4874 have the O_CLOEXEC flag set.
4876 void
4877 vfs_exec_io_context(io_context* context)
4879 uint32 i;
4881 for (i = 0; i < context->table_size; i++) {
4882 mutex_lock(&context->io_mutex);
4884 struct file_descriptor* descriptor = context->fds[i];
4885 bool remove = false;
4887 if (descriptor != NULL && fd_close_on_exec(context, i)) {
4888 context->fds[i] = NULL;
4889 context->num_used_fds--;
4891 remove = true;
4894 mutex_unlock(&context->io_mutex);
4896 if (remove) {
4897 close_fd(descriptor);
4898 put_fd(descriptor);
4904 /*! Sets up a new io_context structure, and inherits the properties
4905 of the parent io_context if one is given.
4907 io_context*
4908 vfs_new_io_context(io_context* parentContext, bool purgeCloseOnExec)
4910 io_context* context = (io_context*)malloc(sizeof(io_context));
4911 if (context == NULL)
4912 return NULL;
4914 TIOC(NewIOContext(context, parentContext));
4916 memset(context, 0, sizeof(io_context));
4917 context->ref_count = 1;
4919 MutexLocker parentLocker;
4921 size_t tableSize;
4922 if (parentContext != NULL) {
4923 parentLocker.SetTo(parentContext->io_mutex, false);
4924 tableSize = parentContext->table_size;
4925 } else
4926 tableSize = DEFAULT_FD_TABLE_SIZE;
4928 // allocate space for FDs and their close-on-exec flag
4929 context->fds = (file_descriptor**)malloc(
4930 sizeof(struct file_descriptor*) * tableSize
4931 + sizeof(struct select_sync*) * tableSize
4932 + (tableSize + 7) / 8);
4933 if (context->fds == NULL) {
4934 free(context);
4935 return NULL;
4938 context->select_infos = (select_info**)(context->fds + tableSize);
4939 context->fds_close_on_exec = (uint8*)(context->select_infos + tableSize);
4941 memset(context->fds, 0, sizeof(struct file_descriptor*) * tableSize
4942 + sizeof(struct select_sync*) * tableSize
4943 + (tableSize + 7) / 8);
4945 mutex_init(&context->io_mutex, "I/O context");
4947 // Copy all parent file descriptors
4949 if (parentContext != NULL) {
4950 size_t i;
4952 mutex_lock(&sIOContextRootLock);
4953 context->root = parentContext->root;
4954 if (context->root)
4955 inc_vnode_ref_count(context->root);
4956 mutex_unlock(&sIOContextRootLock);
4958 context->cwd = parentContext->cwd;
4959 if (context->cwd)
4960 inc_vnode_ref_count(context->cwd);
4962 if (parentContext->inherit_fds) {
4963 for (i = 0; i < tableSize; i++) {
4964 struct file_descriptor* descriptor = parentContext->fds[i];
4966 if (descriptor != NULL
4967 && (descriptor->open_mode & O_DISCONNECTED) == 0) {
4968 bool closeOnExec = fd_close_on_exec(parentContext, i);
4969 if (closeOnExec && purgeCloseOnExec)
4970 continue;
4972 TFD(InheritFD(context, i, descriptor, parentContext));
4974 context->fds[i] = descriptor;
4975 context->num_used_fds++;
4976 atomic_add(&descriptor->ref_count, 1);
4977 atomic_add(&descriptor->open_count, 1);
4979 if (closeOnExec)
4980 fd_set_close_on_exec(context, i, true);
4985 parentLocker.Unlock();
4986 } else {
4987 context->root = sRoot;
4988 context->cwd = sRoot;
4990 if (context->root)
4991 inc_vnode_ref_count(context->root);
4993 if (context->cwd)
4994 inc_vnode_ref_count(context->cwd);
4997 context->table_size = tableSize;
4998 context->inherit_fds = parentContext != NULL;
5000 list_init(&context->node_monitors);
5001 context->max_monitors = DEFAULT_NODE_MONITORS;
5003 return context;
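// Sketch (illustrative): the two typical uses. A fork()-style caller
// inherits everything; a caller spawning a fresh team can purge
// close-on-exec descriptors right away (an exec() on an existing team goes
// through vfs_exec_io_context() instead):
//
//	io_context* forkContext = vfs_new_io_context(parentContext, false);
//	io_context* spawnContext = vfs_new_io_context(parentContext, true);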
5007 void
5008 vfs_get_io_context(io_context* context)
5010 atomic_add(&context->ref_count, 1);
5014 void
5015 vfs_put_io_context(io_context* context)
5017 if (atomic_add(&context->ref_count, -1) == 1)
5018 free_io_context(context);
5022 status_t
5023 vfs_resize_fd_table(struct io_context* context, uint32 newSize)
5025 if (newSize == 0 || newSize > MAX_FD_TABLE_SIZE)
5026 return B_BAD_VALUE;
5028 TIOC(ResizeIOContext(context, newSize));
5030 MutexLocker _(context->io_mutex);
5032 uint32 oldSize = context->table_size;
5033 int oldCloseOnExitBitmapSize = (oldSize + 7) / 8;
5034 int newCloseOnExitBitmapSize = (newSize + 7) / 8;
5036 // If the tables shrink, make sure none of the fds being dropped are in use.
5037 if (newSize < oldSize) {
5038 for (uint32 i = oldSize; i-- > newSize;) {
5039 if (context->fds[i])
5040 return B_BUSY;
5044 // store pointers to the old tables
5045 file_descriptor** oldFDs = context->fds;
5046 select_info** oldSelectInfos = context->select_infos;
5047 uint8* oldCloseOnExecTable = context->fds_close_on_exec;
5049 // allocate new tables
5050 file_descriptor** newFDs = (file_descriptor**)malloc(
5051 sizeof(struct file_descriptor*) * newSize
5052 + sizeof(struct select_sync*) * newSize
5053 + newCloseOnExitBitmapSize);
5054 if (newFDs == NULL)
5055 return B_NO_MEMORY;
5057 context->fds = newFDs;
5058 context->select_infos = (select_info**)(context->fds + newSize);
5059 context->fds_close_on_exec = (uint8*)(context->select_infos + newSize);
5060 context->table_size = newSize;
5062 // copy entries from old tables
5063 uint32 toCopy = min_c(oldSize, newSize);
5065 memcpy(context->fds, oldFDs, sizeof(void*) * toCopy);
5066 memcpy(context->select_infos, oldSelectInfos, sizeof(void*) * toCopy);
5067 memcpy(context->fds_close_on_exec, oldCloseOnExecTable,
5068 min_c(oldCloseOnExitBitmapSize, newCloseOnExitBitmapSize));
5070 // clear additional entries, if the tables grow
5071 if (newSize > oldSize) {
5072 memset(context->fds + oldSize, 0, sizeof(void*) * (newSize - oldSize));
5073 memset(context->select_infos + oldSize, 0,
5074 sizeof(void*) * (newSize - oldSize));
5075 memset(context->fds_close_on_exec + oldCloseOnExitBitmapSize, 0,
5076 newCloseOnExitBitmapSize - oldCloseOnExitBitmapSize);
5079 free(oldFDs);
5081 return B_OK;
5085 /*! \brief Resolves a vnode to the vnode it is covered by, if any.
5087 Given an arbitrary vnode (identified by mount and node ID), the function
5088 checks whether the vnode is covered by another vnode. If it is, the
5089 function returns the mount and node ID of the covering vnode. Otherwise
5090 it simply returns the supplied mount and node ID.
5092 In case of error (e.g. the supplied node could not be found) the variables
5093 for storing the resolved mount and node ID remain untouched and an error
5094 code is returned.
5096 \param mountID The mount ID of the vnode in question.
5097 \param nodeID The node ID of the vnode in question.
5098 \param resolvedMountID Pointer to storage for the resolved mount ID.
5099 \param resolvedNodeID Pointer to storage for the resolved node ID.
5100 \return
5101 - \c B_OK, if everything went fine,
5102 - another error code, if something went wrong.
5104 status_t
5105 vfs_resolve_vnode_to_covering_vnode(dev_t mountID, ino_t nodeID,
5106 dev_t* resolvedMountID, ino_t* resolvedNodeID)
5108 // get the node
5109 struct vnode* node;
5110 status_t error = get_vnode(mountID, nodeID, &node, true, false);
5111 if (error != B_OK)
5112 return error;
5114 // resolve the node
5115 if (Vnode* coveringNode = get_covering_vnode(node)) {
5116 put_vnode(node);
5117 node = coveringNode;
5120 // set the return values
5121 *resolvedMountID = node->device;
5122 *resolvedNodeID = node->id;
5124 put_vnode(node);
5126 return B_OK;
5130 status_t
5131 vfs_get_mount_point(dev_t mountID, dev_t* _mountPointMountID,
5132 ino_t* _mountPointNodeID)
5134 ReadLocker nodeLocker(sVnodeLock);
5135 MutexLocker mountLocker(sMountMutex);
5137 struct fs_mount* mount = find_mount(mountID);
5138 if (mount == NULL)
5139 return B_BAD_VALUE;
5141 Vnode* mountPoint = mount->covers_vnode;
5143 *_mountPointMountID = mountPoint->device;
5144 *_mountPointNodeID = mountPoint->id;
5146 return B_OK;
5150 status_t
5151 vfs_bind_mount_directory(dev_t mountID, ino_t nodeID, dev_t coveredMountID,
5152 ino_t coveredNodeID)
5154 // get the vnodes
5155 Vnode* vnode;
5156 status_t error = get_vnode(mountID, nodeID, &vnode, true, false);
5157 if (error != B_OK)
5158 return B_BAD_VALUE;
5159 VNodePutter vnodePutter(vnode);
5161 Vnode* coveredVnode;
5162 error = get_vnode(coveredMountID, coveredNodeID, &coveredVnode, true,
5163 false);
5164 if (error != B_OK)
5165 return B_BAD_VALUE;
5166 VNodePutter coveredVnodePutter(coveredVnode);
5168 // establish the covered/covering links
5169 WriteLocker locker(sVnodeLock);
5171 if (vnode->covers != NULL || coveredVnode->covered_by != NULL
5172 || vnode->mount->unmounting || coveredVnode->mount->unmounting) {
5173 return B_BUSY;
5176 vnode->covers = coveredVnode;
5177 vnode->SetCovering(true);
5179 coveredVnode->covered_by = vnode;
5180 coveredVnode->SetCovered(true);
5182 	// the two vnodes now reference each other
5183 inc_vnode_ref_count(vnode);
5184 inc_vnode_ref_count(coveredVnode);
5186 return B_OK;
5191 vfs_getrlimit(int resource, struct rlimit* rlp)
5193 if (!rlp)
5194 return B_BAD_ADDRESS;
5196 switch (resource) {
5197 case RLIMIT_NOFILE:
5199 struct io_context* context = get_current_io_context(false);
5200 MutexLocker _(context->io_mutex);
5202 rlp->rlim_cur = context->table_size;
5203 rlp->rlim_max = MAX_FD_TABLE_SIZE;
5204 return 0;
5207 case RLIMIT_NOVMON:
5209 struct io_context* context = get_current_io_context(false);
5210 MutexLocker _(context->io_mutex);
5212 rlp->rlim_cur = context->max_monitors;
5213 rlp->rlim_max = MAX_NODE_MONITORS;
5214 return 0;
5217 default:
5218 return B_BAD_VALUE;
5224 vfs_setrlimit(int resource, const struct rlimit* rlp)
5226 if (!rlp)
5227 return B_BAD_ADDRESS;
5229 switch (resource) {
5230 case RLIMIT_NOFILE:
5231 /* TODO: check getuid() */
5232 if (rlp->rlim_max != RLIM_SAVED_MAX
5233 && rlp->rlim_max != MAX_FD_TABLE_SIZE)
5234 return B_NOT_ALLOWED;
5236 return vfs_resize_fd_table(get_current_io_context(false),
5237 rlp->rlim_cur);
5239 case RLIMIT_NOVMON:
5240 /* TODO: check getuid() */
5241 if (rlp->rlim_max != RLIM_SAVED_MAX
5242 && rlp->rlim_max != MAX_NODE_MONITORS)
5243 return B_NOT_ALLOWED;
5245 return resize_monitor_table(get_current_io_context(false),
5246 rlp->rlim_cur);
5248 default:
5249 return B_BAD_VALUE;
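// Usage sketch (illustrative, userland view): growing the FD table via the
// POSIX call that ends up here; rlim_max must be RLIM_SAVED_MAX or the
// allowed maximum for the request to be accepted:
//
//	struct rlimit rl;
//	rl.rlim_cur = 1024;
//	rl.rlim_max = RLIM_SAVED_MAX;
//	setrlimit(RLIMIT_NOFILE, &rl);
//		// ends up in vfs_setrlimit()/vfs_resize_fd_table()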
5254 status_t
5255 vfs_init(kernel_args* args)
5257 vnode::StaticInit();
5259 sVnodeTable = new(std::nothrow) VnodeTable();
5260 if (sVnodeTable == NULL || sVnodeTable->Init(VNODE_HASH_TABLE_SIZE) != B_OK)
5261 panic("vfs_init: error creating vnode hash table\n");
5263 struct vnode dummy_vnode;
5264 list_init_etc(&sUnusedVnodeList, offset_of_member(dummy_vnode, unused_link));
5266 struct fs_mount dummyMount;
5267 sMountsTable = new(std::nothrow) MountTable();
5268 if (sMountsTable == NULL
5269 || sMountsTable->Init(MOUNTS_HASH_TABLE_SIZE) != B_OK)
5270 panic("vfs_init: error creating mounts hash table\n");
5272 node_monitor_init();
5274 sRoot = NULL;
5276 recursive_lock_init(&sMountOpLock, "vfs_mount_op_lock");
5278 if (block_cache_init() != B_OK)
5279 return B_ERROR;
5281 #ifdef ADD_DEBUGGER_COMMANDS
5282 // add some debugger commands
5283 add_debugger_command_etc("vnode", &dump_vnode,
5284 "Print info about the specified vnode",
5285 "[ \"-p\" ] ( <vnode> | <devID> <nodeID> )\n"
5286 "Prints information about the vnode specified by address <vnode> or\n"
5287 "<devID>, <vnodeID> pair. If \"-p\" is given, a path of the vnode is\n"
5288 "constructed and printed. It might not be possible to construct a\n"
5289 "complete path, though.\n",
5291 add_debugger_command("vnodes", &dump_vnodes,
5292 "list all vnodes (from the specified device)");
5293 add_debugger_command("vnode_caches", &dump_vnode_caches,
5294 "list all vnode caches");
5295 add_debugger_command("mount", &dump_mount,
5296 "info about the specified fs_mount");
5297 add_debugger_command("mounts", &dump_mounts, "list all fs_mounts");
5298 add_debugger_command("io_context", &dump_io_context,
5299 "info about the I/O context");
5300 add_debugger_command("vnode_usage", &dump_vnode_usage,
5301 "info about vnode usage");
5302 #endif
5304 register_low_resource_handler(&vnode_low_resource_handler, NULL,
5305 B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
5306 | B_KERNEL_RESOURCE_ADDRESS_SPACE,
5309 fifo_init();
5310 file_map_init();
5312 return file_cache_init();
5316 // #pragma mark - fd_ops implementations
5320 Calls fs_open() on the given vnode and returns a new
5321 file descriptor for it
5323 static int
5324 open_vnode(struct vnode* vnode, int openMode, bool kernel)
5326 void* cookie;
5327 status_t status = FS_CALL(vnode, open, openMode, &cookie);
5328 if (status != B_OK)
5329 return status;
5331 int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode, kernel);
5332 if (fd < 0) {
5333 FS_CALL(vnode, close, cookie);
5334 FS_CALL(vnode, free_cookie, cookie);
5336 return fd;
5341 Calls the FS's create() hook (or opens an already existing entry, unless
5342 O_EXCL is set) and returns a new file descriptor for it
5344 static int
5345 create_vnode(struct vnode* directory, const char* name, int openMode,
5346 int perms, bool kernel)
5348 bool traverse = ((openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0);
5349 status_t status = B_ERROR;
5350 struct vnode* vnode;
5351 void* cookie;
5352 ino_t newID;
5354 // This is somewhat tricky: If the entry already exists, the FS responsible
5355 // for the directory might not necessarily also be the one responsible for
5356 // the node the entry refers to (e.g. in case of mount points or FIFOs). So
5357 // we can actually never call the create() hook without O_EXCL. Instead we
5358 // try to look the entry up first. If it already exists, we just open the
5359 // node (unless O_EXCL), otherwise we call create() with O_EXCL. This
5360 // introduces a race condition, since someone else might have created the
5361 // entry in the meantime. We hope the respective FS returns the correct
5362 // error code, in which case we retry (up to 3 times).
5364 for (int i = 0; i < 3 && status != B_OK; i++) {
5365 // look the node up
5366 status = lookup_dir_entry(directory, name, &vnode);
5367 if (status == B_OK) {
5368 VNodePutter putter(vnode);
5370 if ((openMode & O_EXCL) != 0)
5371 return B_FILE_EXISTS;
5373 // If the node is a symlink, we have to follow it, unless
5374 // O_NOTRAVERSE is set.
5375 if (S_ISLNK(vnode->Type()) && traverse) {
5376 putter.Put();
5377 char clonedName[B_FILE_NAME_LENGTH + 1];
5378 if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH)
5379 >= B_FILE_NAME_LENGTH) {
5380 return B_NAME_TOO_LONG;
5383 inc_vnode_ref_count(directory);
5384 status = vnode_path_to_vnode(directory, clonedName, true, 0,
5385 kernel, &vnode, NULL);
5386 if (status != B_OK)
5387 return status;
5389 putter.SetTo(vnode);
5392 if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type()))
5393 return B_LINK_LIMIT;
5395 int fd = open_vnode(vnode, openMode & ~O_CREAT, kernel);
5396 // on success keep the vnode reference for the FD
5397 if (fd >= 0)
5398 putter.Detach();
5400 return fd;
5403 // it doesn't exist yet -- try to create it
5405 if (!HAS_FS_CALL(directory, create))
5406 return B_READ_ONLY_DEVICE;
5408 status = FS_CALL(directory, create, name, openMode | O_EXCL, perms,
5409 &cookie, &newID);
5410 if (status != B_OK
5411 && ((openMode & O_EXCL) != 0 || status != B_FILE_EXISTS)) {
5412 return status;
5416 if (status != B_OK)
5417 return status;
5419 // the node has been created successfully
5421 rw_lock_read_lock(&sVnodeLock);
5422 vnode = lookup_vnode(directory->device, newID);
5423 rw_lock_read_unlock(&sVnodeLock);
5425 if (vnode == NULL) {
5426 panic("vfs: fs_create() returned success but there is no vnode, "
5427 "mount ID %" B_PRIdDEV "!\n", directory->device);
5428 return B_BAD_VALUE;
5431 int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode, kernel);
5432 if (fd >= 0)
5433 return fd;
5435 status = fd;
5437 // something went wrong, clean up
5439 FS_CALL(vnode, close, cookie);
5440 FS_CALL(vnode, free_cookie, cookie);
5441 put_vnode(vnode);
5443 FS_CALL(directory, unlink, name);
5445 return status;
5449 /*! Calls fs open_dir() on the given vnode and returns a new
5450 file descriptor for it
5452 static int
5453 open_dir_vnode(struct vnode* vnode, bool kernel)
5455 void* cookie;
5456 status_t status = FS_CALL(vnode, open_dir, &cookie);
5457 if (status != B_OK)
5458 return status;
5460 // directory is opened, create a fd
5461 status = get_new_fd(FDTYPE_DIR, NULL, vnode, cookie, O_CLOEXEC, kernel);
5462 if (status >= 0)
5463 return status;
5465 FS_CALL(vnode, close_dir, cookie);
5466 FS_CALL(vnode, free_dir_cookie, cookie);
5468 return status;
5472 /*! Calls fs open_attr_dir() on the given vnode and returns a new
5473 file descriptor for it.
5474 Used by attr_dir_open() and attr_dir_open_fd().
5476 static int
5477 open_attr_dir_vnode(struct vnode* vnode, bool kernel)
5479 if (!HAS_FS_CALL(vnode, open_attr_dir))
5480 return B_UNSUPPORTED;
5482 void* cookie;
5483 status_t status = FS_CALL(vnode, open_attr_dir, &cookie);
5484 if (status != B_OK)
5485 return status;
5487 // directory is opened, create a fd
5488 status = get_new_fd(FDTYPE_ATTR_DIR, NULL, vnode, cookie, O_CLOEXEC,
5489 kernel);
5490 if (status >= 0)
5491 return status;
5493 FS_CALL(vnode, close_attr_dir, cookie);
5494 FS_CALL(vnode, free_attr_dir_cookie, cookie);
5496 return status;
5500 static int
5501 file_create_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5502 int openMode, int perms, bool kernel)
5504 FUNCTION(("file_create_entry_ref: name = '%s', omode %x, perms %d, "
5505 "kernel %d\n", name, openMode, perms, kernel));
5507 // get directory to put the new file in
5508 struct vnode* directory;
5509 status_t status = get_vnode(mountID, directoryID, &directory, true, false);
5510 if (status != B_OK)
5511 return status;
5513 status = create_vnode(directory, name, openMode, perms, kernel);
5514 put_vnode(directory);
5516 return status;
5520 static int
5521 file_create(int fd, char* path, int openMode, int perms, bool kernel)
5523 FUNCTION(("file_create: path '%s', omode %x, perms %d, kernel %d\n", path,
5524 openMode, perms, kernel));
5526 // get directory to put the new file in
5527 char name[B_FILE_NAME_LENGTH];
5528 struct vnode* directory;
5529 status_t status = fd_and_path_to_dir_vnode(fd, path, &directory, name,
5530 kernel);
5531 if (status < 0)
5532 return status;
5534 status = create_vnode(directory, name, openMode, perms, kernel);
5536 put_vnode(directory);
5537 return status;
5541 static int
5542 file_open_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5543 int openMode, bool kernel)
5545 if (name == NULL || *name == '\0')
5546 return B_BAD_VALUE;
5548 FUNCTION(("file_open_entry_ref(ref = (%" B_PRId32 ", %" B_PRId64 ", %s), "
5549 "openMode = %d)\n", mountID, directoryID, name, openMode));
5551 bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5553 // get the vnode matching the entry_ref
5554 struct vnode* vnode;
5555 status_t status = entry_ref_to_vnode(mountID, directoryID, name, traverse,
5556 kernel, &vnode);
5557 if (status != B_OK)
5558 return status;
5560 if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
5561 put_vnode(vnode);
5562 return B_LINK_LIMIT;
5565 int newFD = open_vnode(vnode, openMode, kernel);
5566 if (newFD >= 0) {
5567 // The vnode reference has been transferred to the FD
5568 cache_node_opened(vnode, FDTYPE_FILE, vnode->cache, mountID,
5569 directoryID, vnode->id, name);
5570 } else
5571 put_vnode(vnode);
5573 return newFD;
5577 static int
5578 file_open(int fd, char* path, int openMode, bool kernel)
5580 bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5582 FUNCTION(("file_open: fd: %d, entry path = '%s', omode %d, kernel %d\n",
5583 fd, path, openMode, kernel));
5585 // get the vnode matching the vnode + path combination
5586 struct vnode* vnode;
5587 ino_t parentID;
5588 status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode,
5589 &parentID, kernel);
5590 if (status != B_OK)
5591 return status;
5593 if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
5594 put_vnode(vnode);
5595 return B_LINK_LIMIT;
5598 // open the vnode
5599 int newFD = open_vnode(vnode, openMode, kernel);
5600 if (newFD >= 0) {
5601 // The vnode reference has been transferred to the FD
5602 cache_node_opened(vnode, FDTYPE_FILE, vnode->cache,
5603 vnode->device, parentID, vnode->id, NULL);
5604 } else
5605 put_vnode(vnode);
5607 return newFD;
5611 static status_t
5612 file_close(struct file_descriptor* descriptor)
5614 struct vnode* vnode = descriptor->u.vnode;
5615 status_t status = B_OK;
5617 FUNCTION(("file_close(descriptor = %p)\n", descriptor));
5619 cache_node_closed(vnode, FDTYPE_FILE, vnode->cache, vnode->device,
5620 vnode->id);
5621 if (HAS_FS_CALL(vnode, close)) {
5622 status = FS_CALL(vnode, close, descriptor->cookie);
5625 if (status == B_OK) {
5626 // remove all outstanding locks for this team
5627 if (HAS_FS_CALL(vnode, release_lock))
5628 status = FS_CALL(vnode, release_lock, descriptor->cookie, NULL);
5629 else
5630 status = release_advisory_lock(vnode, NULL);
5632 return status;
5636 static void
5637 file_free_fd(struct file_descriptor* descriptor)
5639 struct vnode* vnode = descriptor->u.vnode;
5641 if (vnode != NULL) {
5642 FS_CALL(vnode, free_cookie, descriptor->cookie);
5643 put_vnode(vnode);
5648 static status_t
5649 file_read(struct file_descriptor* descriptor, off_t pos, void* buffer,
5650 size_t* length)
5652 struct vnode* vnode = descriptor->u.vnode;
5653 FUNCTION(("file_read: buf %p, pos %" B_PRIdOFF ", len %p = %ld\n", buffer,
5654 pos, length, *length));
5656 if (S_ISDIR(vnode->Type()))
5657 return B_IS_A_DIRECTORY;
5659 return FS_CALL(vnode, read, descriptor->cookie, pos, buffer, length);
5663 static status_t
5664 file_write(struct file_descriptor* descriptor, off_t pos, const void* buffer,
5665 size_t* length)
5667 struct vnode* vnode = descriptor->u.vnode;
5668 FUNCTION(("file_write: buf %p, pos %" B_PRIdOFF ", len %p\n", buffer, pos,
5669 length));
5671 if (S_ISDIR(vnode->Type()))
5672 return B_IS_A_DIRECTORY;
5673 if (!HAS_FS_CALL(vnode, write))
5674 return B_READ_ONLY_DEVICE;
5676 return FS_CALL(vnode, write, descriptor->cookie, pos, buffer, length);
5680 static off_t
5681 file_seek(struct file_descriptor* descriptor, off_t pos, int seekType)
5683 struct vnode* vnode = descriptor->u.vnode;
5684 off_t offset;
5685 bool isDevice = false;
5687 FUNCTION(("file_seek(pos = %" B_PRIdOFF ", seekType = %d)\n", pos,
5688 seekType));
5690 // some kinds of files are not seekable
5691 switch (vnode->Type() & S_IFMT) {
5692 case S_IFIFO:
5693 case S_IFSOCK:
5694 return ESPIPE;
5696 		// drivers publish block devices as character devices, so handle both
5697 case S_IFBLK:
5698 case S_IFCHR:
5699 isDevice = true;
5700 break;
5701 		// The Open Group Base Specs don't treat any file types besides pipes,
5702 		// FIFOs, and sockets specially, so we allow seeking all other types.
5703 case S_IFREG:
5704 case S_IFDIR:
5705 case S_IFLNK:
5706 break;
5709 switch (seekType) {
5710 case SEEK_SET:
5711 offset = 0;
5712 break;
5713 case SEEK_CUR:
5714 offset = descriptor->pos;
5715 break;
5716 case SEEK_END:
5718 // stat() the node
5719 if (!HAS_FS_CALL(vnode, read_stat))
5720 return B_UNSUPPORTED;
5722 struct stat stat;
5723 status_t status = FS_CALL(vnode, read_stat, &stat);
5724 if (status != B_OK)
5725 return status;
5727 offset = stat.st_size;
5729 if (offset == 0 && isDevice) {
5730 // stat() on regular drivers doesn't report size
5731 device_geometry geometry;
5733 if (HAS_FS_CALL(vnode, ioctl)) {
5734 status = FS_CALL(vnode, ioctl, descriptor->cookie,
5735 B_GET_GEOMETRY, &geometry, sizeof(geometry));
5736 if (status == B_OK)
5737 offset = (off_t)geometry.bytes_per_sector
5738 * geometry.sectors_per_track
5739 * geometry.cylinder_count
5740 * geometry.head_count;
5744 break;
5746 default:
5747 return B_BAD_VALUE;
5750 // assumes off_t is 64 bits wide
5751 if (offset > 0 && LONGLONG_MAX - offset < pos)
5752 return B_BUFFER_OVERFLOW;
5754 pos += offset;
5755 if (pos < 0)
5756 return B_BAD_VALUE;
5758 return descriptor->pos = pos;
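// Worked example for the device branch above (hypothetical geometry): with
// 512 bytes per sector, 63 sectors per track, 1024 cylinders and 16 heads,
// SEEK_END resolves the offset to 512 * 63 * 1024 * 16 = 528482304 bytes.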
5762 static status_t
5763 file_select(struct file_descriptor* descriptor, uint8 event,
5764 struct selectsync* sync)
5766 FUNCTION(("file_select(%p, %u, %p)\n", descriptor, event, sync));
5768 struct vnode* vnode = descriptor->u.vnode;
5770 // If the FS has no select() hook, notify select() now.
5771 if (!HAS_FS_CALL(vnode, select))
5772 return notify_select_event(sync, event);
5774 return FS_CALL(vnode, select, descriptor->cookie, event, sync);
5778 static status_t
5779 file_deselect(struct file_descriptor* descriptor, uint8 event,
5780 struct selectsync* sync)
5782 struct vnode* vnode = descriptor->u.vnode;
5784 if (!HAS_FS_CALL(vnode, deselect))
5785 return B_OK;
5787 return FS_CALL(vnode, deselect, descriptor->cookie, event, sync);
5791 static status_t
5792 dir_create_entry_ref(dev_t mountID, ino_t parentID, const char* name, int perms,
5793 bool kernel)
5795 struct vnode* vnode;
5796 status_t status;
5798 if (name == NULL || *name == '\0')
5799 return B_BAD_VALUE;
5801 FUNCTION(("dir_create_entry_ref(dev = %" B_PRId32 ", ino = %" B_PRId64 ", "
5802 "name = '%s', perms = %d)\n", mountID, parentID, name, perms));
5804 status = get_vnode(mountID, parentID, &vnode, true, false);
5805 if (status != B_OK)
5806 return status;
5808 if (HAS_FS_CALL(vnode, create_dir))
5809 status = FS_CALL(vnode, create_dir, name, perms);
5810 else
5811 status = B_READ_ONLY_DEVICE;
5813 put_vnode(vnode);
5814 return status;
5818 static status_t
5819 dir_create(int fd, char* path, int perms, bool kernel)
5821 char filename[B_FILE_NAME_LENGTH];
5822 struct vnode* vnode;
5823 status_t status;
5825 FUNCTION(("dir_create: path '%s', perms %d, kernel %d\n", path, perms,
5826 kernel));
5828 status = fd_and_path_to_dir_vnode(fd, path, &vnode, filename, kernel);
5829 if (status < 0)
5830 return status;
5832 if (HAS_FS_CALL(vnode, create_dir)) {
5833 status = FS_CALL(vnode, create_dir, filename, perms);
5834 } else
5835 status = B_READ_ONLY_DEVICE;
5837 put_vnode(vnode);
5838 return status;
5842 static int
5843 dir_open_entry_ref(dev_t mountID, ino_t parentID, const char* name, bool kernel)
5845 FUNCTION(("dir_open_entry_ref()\n"));
5847 if (name && name[0] == '\0')
5848 return B_BAD_VALUE;
5850 // get the vnode matching the entry_ref/node_ref
5851 struct vnode* vnode;
5852 status_t status;
5853 if (name) {
5854 status = entry_ref_to_vnode(mountID, parentID, name, true, kernel,
5855 &vnode);
5856 } else
5857 status = get_vnode(mountID, parentID, &vnode, true, false);
5858 if (status != B_OK)
5859 return status;
5861 int newFD = open_dir_vnode(vnode, kernel);
5862 if (newFD >= 0) {
5863 // The vnode reference has been transferred to the FD
5864 cache_node_opened(vnode, FDTYPE_DIR, vnode->cache, mountID, parentID,
5865 vnode->id, name);
5866 } else
5867 put_vnode(vnode);
5869 return newFD;
5873 static int
5874 dir_open(int fd, char* path, bool kernel)
5876 FUNCTION(("dir_open: fd: %d, entry path = '%s', kernel %d\n", fd, path,
5877 kernel));
5879 // get the vnode matching the vnode + path combination
5880 struct vnode* vnode = NULL;
5881 ino_t parentID;
5882 status_t status = fd_and_path_to_vnode(fd, path, true, &vnode, &parentID,
5883 kernel);
5884 if (status != B_OK)
5885 return status;
5887 // open the dir
5888 int newFD = open_dir_vnode(vnode, kernel);
5889 if (newFD >= 0) {
5890 // The vnode reference has been transferred to the FD
5891 cache_node_opened(vnode, FDTYPE_DIR, vnode->cache, vnode->device,
5892 parentID, vnode->id, NULL);
5893 } else
5894 put_vnode(vnode);
5896 return newFD;
5900 static status_t
5901 dir_close(struct file_descriptor* descriptor)
5903 struct vnode* vnode = descriptor->u.vnode;
5905 FUNCTION(("dir_close(descriptor = %p)\n", descriptor));
5907 cache_node_closed(vnode, FDTYPE_DIR, vnode->cache, vnode->device,
5908 vnode->id);
5909 if (HAS_FS_CALL(vnode, close_dir))
5910 return FS_CALL(vnode, close_dir, descriptor->cookie);
5912 return B_OK;
5916 static void
5917 dir_free_fd(struct file_descriptor* descriptor)
5919 struct vnode* vnode = descriptor->u.vnode;
5921 if (vnode != NULL) {
5922 FS_CALL(vnode, free_dir_cookie, descriptor->cookie);
5923 put_vnode(vnode);
5928 static status_t
5929 dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
5930 struct dirent* buffer, size_t bufferSize, uint32* _count)
5932 return dir_read(ioContext, descriptor->u.vnode, descriptor->cookie, buffer,
5933 bufferSize, _count);
5937 static status_t
5938 fix_dirent(struct vnode* parent, struct dirent* entry,
5939 struct io_context* ioContext)
5941 // set d_pdev and d_pino
5942 entry->d_pdev = parent->device;
5943 entry->d_pino = parent->id;
5945 	// If this is the ".." entry and the directory is covering another vnode,
5946 // we need to replace d_dev and d_ino with the actual values.
5947 if (strcmp(entry->d_name, "..") == 0 && parent->IsCovering()) {
5948 return resolve_covered_parent(parent, &entry->d_dev, &entry->d_ino,
5949 ioContext);
5952 // resolve covered vnodes
5953 ReadLocker _(&sVnodeLock);
5955 struct vnode* vnode = lookup_vnode(entry->d_dev, entry->d_ino);
5956 if (vnode != NULL && vnode->covered_by != NULL) {
5957 do {
5958 vnode = vnode->covered_by;
5959 } while (vnode->covered_by != NULL);
5961 entry->d_dev = vnode->device;
5962 entry->d_ino = vnode->id;
5965 return B_OK;
5969 static status_t
5970 dir_read(struct io_context* ioContext, struct vnode* vnode, void* cookie,
5971 struct dirent* buffer, size_t bufferSize, uint32* _count)
5973 if (!HAS_FS_CALL(vnode, read_dir))
5974 return B_UNSUPPORTED;
5976 status_t error = FS_CALL(vnode, read_dir, cookie, buffer, bufferSize,
5977 _count);
5978 if (error != B_OK)
5979 return error;
5981 // we need to adjust the read dirents
5982 uint32 count = *_count;
5983 for (uint32 i = 0; i < count; i++) {
5984 error = fix_dirent(vnode, buffer, ioContext);
5985 if (error != B_OK)
5986 return error;
5988 buffer = (struct dirent*)((uint8*)buffer + buffer->d_reclen);
5991 return error;
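// Sketch (illustrative): the d_reclen walk above is the same pattern a
// consumer of the filled buffer would use to visit each entry:
//
//	struct dirent* entry = buffer;
//	for (uint32 i = 0; i < count; i++) {
//		// ... use entry->d_name, entry->d_dev, entry->d_ino ...
//		entry = (struct dirent*)((uint8*)entry + entry->d_reclen);
//	}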
5995 static status_t
5996 dir_rewind(struct file_descriptor* descriptor)
5998 struct vnode* vnode = descriptor->u.vnode;
6000 if (HAS_FS_CALL(vnode, rewind_dir)) {
6001 return FS_CALL(vnode, rewind_dir, descriptor->cookie);
6004 return B_UNSUPPORTED;
6008 static status_t
6009 dir_remove(int fd, char* path, bool kernel)
6011 char name[B_FILE_NAME_LENGTH];
6012 struct vnode* directory;
6013 status_t status;
6015 if (path != NULL) {
6016 		// we need to make sure our path name doesn't end in "/", ".",
6017 		// or ".."
6018 char* lastSlash;
6019 while ((lastSlash = strrchr(path, '/')) != NULL) {
6020 char* leaf = lastSlash + 1;
6021 if (!strcmp(leaf, ".."))
6022 return B_NOT_ALLOWED;
6024 // omit multiple slashes
6025 while (lastSlash > path && lastSlash[-1] == '/')
6026 lastSlash--;
6028 if (leaf[0]
6029 && strcmp(leaf, ".")) {
6030 break;
6032 // "name/" -> "name", or "name/." -> "name"
6033 lastSlash[0] = '\0';
6036 if (!strcmp(path, ".") || !strcmp(path, ".."))
6037 return B_NOT_ALLOWED;
6040 status = fd_and_path_to_dir_vnode(fd, path, &directory, name, kernel);
6041 if (status != B_OK)
6042 return status;
6044 if (HAS_FS_CALL(directory, remove_dir))
6045 status = FS_CALL(directory, remove_dir, name);
6046 else
6047 status = B_READ_ONLY_DEVICE;
6049 put_vnode(directory);
6050 return status;
6054 static status_t
6055 common_ioctl(struct file_descriptor* descriptor, ulong op, void* buffer,
6056 size_t length)
6058 struct vnode* vnode = descriptor->u.vnode;
6060 if (HAS_FS_CALL(vnode, ioctl))
6061 return FS_CALL(vnode, ioctl, descriptor->cookie, op, buffer, length);
6063 return B_DEV_INVALID_IOCTL;
6067 static status_t
6068 common_fcntl(int fd, int op, size_t argument, bool kernel)
6070 struct flock flock;
6072 FUNCTION(("common_fcntl(fd = %d, op = %d, argument = %lx, %s)\n",
6073 fd, op, argument, kernel ? "kernel" : "user"));
6075 struct file_descriptor* descriptor = get_fd(get_current_io_context(kernel),
6076 fd);
6077 if (descriptor == NULL)
6078 return B_FILE_ERROR;
6080 struct vnode* vnode = fd_vnode(descriptor);
6082 status_t status = B_OK;
6084 if (op == F_SETLK || op == F_SETLKW || op == F_GETLK) {
6085 if (descriptor->type != FDTYPE_FILE)
6086 status = B_BAD_VALUE;
6087 else if (user_memcpy(&flock, (struct flock*)argument,
6088 sizeof(struct flock)) != B_OK)
6089 status = B_BAD_ADDRESS;
6091 if (status != B_OK) {
6092 put_fd(descriptor);
6093 return status;
6097 switch (op) {
6098 case F_SETFD:
6100 struct io_context* context = get_current_io_context(kernel);
6101 // Set file descriptor flags
6103 			// FD_CLOEXEC is the only flag available at this time
6104 mutex_lock(&context->io_mutex);
6105 fd_set_close_on_exec(context, fd, (argument & FD_CLOEXEC) != 0);
6106 mutex_unlock(&context->io_mutex);
6108 status = B_OK;
6109 break;
6112 case F_GETFD:
6114 struct io_context* context = get_current_io_context(kernel);
6116 // Get file descriptor flags
6117 mutex_lock(&context->io_mutex);
6118 status = fd_close_on_exec(context, fd) ? FD_CLOEXEC : 0;
6119 mutex_unlock(&context->io_mutex);
6120 break;
6123 case F_SETFL:
6124 // Set file descriptor open mode
6126 // we only accept changes to O_APPEND and O_NONBLOCK
6127 argument &= O_APPEND | O_NONBLOCK;
6128 if (descriptor->ops->fd_set_flags != NULL) {
6129 status = descriptor->ops->fd_set_flags(descriptor, argument);
6130 } else if (vnode != NULL && HAS_FS_CALL(vnode, set_flags)) {
6131 status = FS_CALL(vnode, set_flags, descriptor->cookie,
6132 (int)argument);
6133 } else
6134 status = B_UNSUPPORTED;
6136 if (status == B_OK) {
6137 // update this descriptor's open_mode field
6138 descriptor->open_mode = (descriptor->open_mode
6139 & ~(O_APPEND | O_NONBLOCK)) | argument;
6142 break;
6144 case F_GETFL:
6145 // Get file descriptor open mode
6146 status = descriptor->open_mode;
6147 break;
6149 case F_DUPFD:
6150 case F_DUPFD_CLOEXEC:
6152 struct io_context* context = get_current_io_context(kernel);
6154 status = new_fd_etc(context, descriptor, (int)argument);
6155 if (status >= 0) {
6156 mutex_lock(&context->io_mutex);
6157 fd_set_close_on_exec(context, fd, op == F_DUPFD_CLOEXEC);
6158 mutex_unlock(&context->io_mutex);
6160 atomic_add(&descriptor->ref_count, 1);
6162 break;
6165 case F_GETLK:
6166 if (vnode != NULL) {
6167 struct flock normalizedLock;
6169 memcpy(&normalizedLock, &flock, sizeof(struct flock));
6170 status = normalize_flock(descriptor, &normalizedLock);
6171 if (status != B_OK)
6172 break;
6174 if (HAS_FS_CALL(vnode, test_lock)) {
6175 status = FS_CALL(vnode, test_lock, descriptor->cookie,
6176 &normalizedLock);
6177 } else
6178 status = test_advisory_lock(vnode, &normalizedLock);
6179 if (status == B_OK) {
6180 if (normalizedLock.l_type == F_UNLCK) {
6181 // no conflicting lock found, copy back the same struct
6182 // we were given except change type to F_UNLCK
6183 flock.l_type = F_UNLCK;
6184 status = user_memcpy((struct flock*)argument, &flock,
6185 sizeof(struct flock));
6186 } else {
6187 // a conflicting lock was found, copy back its range and
6188 // type
6189 if (normalizedLock.l_len == OFF_MAX)
6190 normalizedLock.l_len = 0;
6192 status = user_memcpy((struct flock*)argument,
6193 &normalizedLock, sizeof(struct flock));
6196 } else
6197 status = B_BAD_VALUE;
6198 break;
6200 case F_SETLK:
6201 case F_SETLKW:
6202 status = normalize_flock(descriptor, &flock);
6203 if (status != B_OK)
6204 break;
6206 if (vnode == NULL) {
6207 status = B_BAD_VALUE;
6208 } else if (flock.l_type == F_UNLCK) {
6209 if (HAS_FS_CALL(vnode, release_lock)) {
6210 status = FS_CALL(vnode, release_lock, descriptor->cookie,
6211 &flock);
6212 } else
6213 status = release_advisory_lock(vnode, &flock);
6214 } else {
6215 // the open mode must match the lock type
6216 if (((descriptor->open_mode & O_RWMASK) == O_RDONLY
6217 && flock.l_type == F_WRLCK)
6218 || ((descriptor->open_mode & O_RWMASK) == O_WRONLY
6219 && flock.l_type == F_RDLCK))
6220 status = B_FILE_ERROR;
6221 else {
6222 if (HAS_FS_CALL(vnode, acquire_lock)) {
6223 status = FS_CALL(vnode, acquire_lock,
6224 descriptor->cookie, &flock, op == F_SETLKW);
6225 } else {
6226 status = acquire_advisory_lock(vnode, -1,
6227 &flock, op == F_SETLKW);
6231 break;
6233 // ToDo: add support for more ops?
6235 default:
6236 status = B_BAD_VALUE;
6239 put_fd(descriptor);
6240 return status;
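// Usage sketch (illustrative, userland view): the operations handled above
// as they are typically issued:
//
//	fcntl(fd, F_SETFD, FD_CLOEXEC);
//		// mark close-on-exec
//	fcntl(fd, F_SETFL, O_NONBLOCK);
//		// only O_APPEND/O_NONBLOCK may be changed
//	int clone = fcntl(fd, F_DUPFD, 10);
//		// returns the lowest free descriptor >= 10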
6244 static status_t
6245 common_sync(int fd, bool kernel)
6247 struct file_descriptor* descriptor;
6248 struct vnode* vnode;
6249 status_t status;
6251 FUNCTION(("common_fsync: entry. fd %d kernel %d\n", fd, kernel));
6253 descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6254 if (descriptor == NULL)
6255 return B_FILE_ERROR;
6257 if (HAS_FS_CALL(vnode, fsync))
6258 status = FS_CALL_NO_PARAMS(vnode, fsync);
6259 else
6260 status = B_UNSUPPORTED;
6262 put_fd(descriptor);
6263 return status;
6267 static status_t
6268 common_lock_node(int fd, bool kernel)
6270 struct file_descriptor* descriptor;
6271 struct vnode* vnode;
6273 descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6274 if (descriptor == NULL)
6275 return B_FILE_ERROR;
6277 status_t status = B_OK;
6279 // We need to set the locking atomically - someone
6280 // else might set one at the same time
6281 if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by, descriptor,
6282 (file_descriptor*)NULL) != NULL)
6283 status = B_BUSY;
6285 put_fd(descriptor);
6286 return status;
6290 static status_t
6291 common_unlock_node(int fd, bool kernel)
6293 struct file_descriptor* descriptor;
6294 struct vnode* vnode;
6296 descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6297 if (descriptor == NULL)
6298 return B_FILE_ERROR;
6300 status_t status = B_OK;
6302 // We need to set the locking atomically - someone
6303 // else might set one at the same time
6304 if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by,
6305 (file_descriptor*)NULL, descriptor) != descriptor)
6306 status = B_BAD_VALUE;
6308 put_fd(descriptor);
6309 return status;
6313 static status_t
6314 common_read_link(int fd, char* path, char* buffer, size_t* _bufferSize,
6315 bool kernel)
6317 struct vnode* vnode;
6318 status_t status;
6320 status = fd_and_path_to_vnode(fd, path, false, &vnode, NULL, kernel);
6321 if (status != B_OK)
6322 return status;
6324 if (HAS_FS_CALL(vnode, read_symlink)) {
6325 status = FS_CALL(vnode, read_symlink, buffer, _bufferSize);
6326 } else
6327 status = B_BAD_VALUE;
6329 put_vnode(vnode);
6330 return status;
6334 static status_t
6335 common_create_symlink(int fd, char* path, const char* toPath, int mode,
6336 bool kernel)
6338 // path validity checks have to be in the calling function!
6339 char name[B_FILE_NAME_LENGTH];
6340 struct vnode* vnode;
6341 status_t status;
6343 FUNCTION(("common_create_symlink(fd = %d, path = %s, toPath = %s, "
6344 "mode = %d, kernel = %d)\n", fd, path, toPath, mode, kernel));
6346 status = fd_and_path_to_dir_vnode(fd, path, &vnode, name, kernel);
6347 if (status != B_OK)
6348 return status;
6350 if (HAS_FS_CALL(vnode, create_symlink))
6351 status = FS_CALL(vnode, create_symlink, name, toPath, mode);
6352 else {
6353 status = HAS_FS_CALL(vnode, write)
6354 ? B_UNSUPPORTED : B_READ_ONLY_DEVICE;
6357 put_vnode(vnode);
6359 return status;
6363 static status_t
6364 common_create_link(int pathFD, char* path, int toFD, char* toPath,
6365 bool traverseLeafLink, bool kernel)
6367 // path validity checks have to be in the calling function!
6369 FUNCTION(("common_create_link(path = %s, toPath = %s, kernel = %d)\n", path,
6370 toPath, kernel));
6372 char name[B_FILE_NAME_LENGTH];
6373 struct vnode* directory;
6374 status_t status = fd_and_path_to_dir_vnode(pathFD, path, &directory, name,
6375 kernel);
6376 if (status != B_OK)
6377 return status;
6379 struct vnode* vnode;
6380 status = fd_and_path_to_vnode(toFD, toPath, traverseLeafLink, &vnode, NULL,
6381 kernel);
6382 if (status != B_OK)
6383 goto err;
6385 if (directory->mount != vnode->mount) {
6386 status = B_CROSS_DEVICE_LINK;
6387 goto err1;
6390 if (HAS_FS_CALL(directory, link))
6391 status = FS_CALL(directory, link, name, vnode);
6392 else
6393 status = B_READ_ONLY_DEVICE;
6395 err1:
6396 put_vnode(vnode);
6397 err:
6398 put_vnode(directory);
6400 return status;
6404 static status_t
6405 common_unlink(int fd, char* path, bool kernel)
6407 char filename[B_FILE_NAME_LENGTH];
6408 struct vnode* vnode;
6409 status_t status;
6411 FUNCTION(("common_unlink: fd: %d, path '%s', kernel %d\n", fd, path,
6412 kernel));
6414 status = fd_and_path_to_dir_vnode(fd, path, &vnode, filename, kernel);
6415 if (status < 0)
6416 return status;
6418 if (HAS_FS_CALL(vnode, unlink))
6419 status = FS_CALL(vnode, unlink, filename);
6420 else
6421 status = B_READ_ONLY_DEVICE;
6423 put_vnode(vnode);
6425 return status;
6429 static status_t
6430 common_access(int fd, char* path, int mode, bool effectiveUserGroup, bool kernel)
6432 struct vnode* vnode;
6433 status_t status;
6435 // TODO: honor effectiveUserGroup argument
6437 status = fd_and_path_to_vnode(fd, path, true, &vnode, NULL, kernel);
6438 if (status != B_OK)
6439 return status;
6441 if (HAS_FS_CALL(vnode, access))
6442 status = FS_CALL(vnode, access, mode);
6443 else
6444 status = B_OK;
6446 put_vnode(vnode);
6448 return status;
6452 static status_t
6453 common_rename(int fd, char* path, int newFD, char* newPath, bool kernel)
6455 struct vnode* fromVnode;
6456 struct vnode* toVnode;
6457 char fromName[B_FILE_NAME_LENGTH];
6458 char toName[B_FILE_NAME_LENGTH];
6459 status_t status;
6461 FUNCTION(("common_rename(fd = %d, path = %s, newFD = %d, newPath = %s, "
6462 "kernel = %d)\n", fd, path, newFD, newPath, kernel));
6464 status = fd_and_path_to_dir_vnode(fd, path, &fromVnode, fromName, kernel);
6465 if (status != B_OK)
6466 return status;
6468 status = fd_and_path_to_dir_vnode(newFD, newPath, &toVnode, toName, kernel);
6469 if (status != B_OK)
6470 goto err1;
6472 if (fromVnode->device != toVnode->device) {
6473 status = B_CROSS_DEVICE_LINK;
6474 goto err2;
6477 if (fromName[0] == '\0' || toName[0] == '\0'
6478 || !strcmp(fromName, ".") || !strcmp(fromName, "..")
6479 || !strcmp(toName, ".") || !strcmp(toName, "..")
6480 || (fromVnode == toVnode && !strcmp(fromName, toName))) {
6481 status = B_BAD_VALUE;
6482 goto err2;
6485 if (HAS_FS_CALL(fromVnode, rename))
6486 status = FS_CALL(fromVnode, rename, fromName, toVnode, toName);
6487 else
6488 status = B_READ_ONLY_DEVICE;
6490 err2:
6491 put_vnode(toVnode);
6492 err1:
6493 put_vnode(fromVnode);
6495 return status;
6499 static status_t
6500 common_read_stat(struct file_descriptor* descriptor, struct stat* stat)
6502 struct vnode* vnode = descriptor->u.vnode;
6504 FUNCTION(("common_read_stat: stat %p\n", stat));
6506 // TODO: remove this once all file systems properly set them!
6507 stat->st_crtim.tv_nsec = 0;
6508 stat->st_ctim.tv_nsec = 0;
6509 stat->st_mtim.tv_nsec = 0;
6510 stat->st_atim.tv_nsec = 0;
6512 return vfs_stat_vnode(vnode, stat);
6516 static status_t
6517 common_write_stat(struct file_descriptor* descriptor, const struct stat* stat,
6518 int statMask)
6520 struct vnode* vnode = descriptor->u.vnode;
6522 FUNCTION(("common_write_stat(vnode = %p, stat = %p, statMask = %d)\n",
6523 vnode, stat, statMask));
6525 if (!HAS_FS_CALL(vnode, write_stat))
6526 return B_READ_ONLY_DEVICE;
6528 return FS_CALL(vnode, write_stat, stat, statMask);
6532 static status_t
6533 common_path_read_stat(int fd, char* path, bool traverseLeafLink,
6534 struct stat* stat, bool kernel)
6536 FUNCTION(("common_path_read_stat: fd: %d, path '%s', stat %p\n", fd, path,
6537 stat));
6539 struct vnode* vnode;
6540 status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6541 NULL, kernel);
6542 if (status != B_OK)
6543 return status;
6545 status = vfs_stat_vnode(vnode, stat);
6547 put_vnode(vnode);
6548 return status;
6552 static status_t
6553 common_path_write_stat(int fd, char* path, bool traverseLeafLink,
6554 const struct stat* stat, int statMask, bool kernel)
6556 FUNCTION(("common_path_write_stat: fd: %d, path '%s', stat %p, stat_mask %d, "
6557 "kernel %d\n", fd, path, stat, statMask, kernel));
6559 struct vnode* vnode;
6560 status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6561 NULL, kernel);
6562 if (status != B_OK)
6563 return status;
6565 if (HAS_FS_CALL(vnode, write_stat))
6566 status = FS_CALL(vnode, write_stat, stat, statMask);
6567 else
6568 status = B_READ_ONLY_DEVICE;
6570 put_vnode(vnode);
6572 return status;
6576 static int
6577 attr_dir_open(int fd, char* path, bool traverseLeafLink, bool kernel)
6579 FUNCTION(("attr_dir_open(fd = %d, path = '%s', kernel = %d)\n", fd, path,
6580 kernel));
6582 struct vnode* vnode;
6583 status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6584 NULL, kernel);
6585 if (status != B_OK)
6586 return status;
6588 status = open_attr_dir_vnode(vnode, kernel);
6589 if (status < 0)
6590 put_vnode(vnode);
6592 return status;
6596 static status_t
6597 attr_dir_close(struct file_descriptor* descriptor)
6599 struct vnode* vnode = descriptor->u.vnode;
6601 FUNCTION(("attr_dir_close(descriptor = %p)\n", descriptor));
6603 if (HAS_FS_CALL(vnode, close_attr_dir))
6604 return FS_CALL(vnode, close_attr_dir, descriptor->cookie);
6606 return B_OK;
6610 static void
6611 attr_dir_free_fd(struct file_descriptor* descriptor)
6613 struct vnode* vnode = descriptor->u.vnode;
6615 if (vnode != NULL) {
6616 FS_CALL(vnode, free_attr_dir_cookie, descriptor->cookie);
6617 put_vnode(vnode);
6622 static status_t
6623 attr_dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
6624 struct dirent* buffer, size_t bufferSize, uint32* _count)
6626 struct vnode* vnode = descriptor->u.vnode;
6628 FUNCTION(("attr_dir_read(descriptor = %p)\n", descriptor));
6630 if (HAS_FS_CALL(vnode, read_attr_dir))
6631 return FS_CALL(vnode, read_attr_dir, descriptor->cookie, buffer,
6632 bufferSize, _count);
6634 return B_UNSUPPORTED;
6638 static status_t
6639 attr_dir_rewind(struct file_descriptor* descriptor)
6641 struct vnode* vnode = descriptor->u.vnode;
6643 FUNCTION(("attr_dir_rewind(descriptor = %p)\n", descriptor));
6645 if (HAS_FS_CALL(vnode, rewind_attr_dir))
6646 return FS_CALL(vnode, rewind_attr_dir, descriptor->cookie);
6648 return B_UNSUPPORTED;
6652 static int
6653 attr_create(int fd, char* path, const char* name, uint32 type,
6654 int openMode, bool kernel)
6656 if (name == NULL || *name == '\0')
6657 return B_BAD_VALUE;
6659 bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
6660 struct vnode* vnode;
6661 status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode, NULL,
6662 kernel);
6663 if (status != B_OK)
6664 return status;
6666 if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
6667 status = B_LINK_LIMIT;
6668 goto err;
6671 if (!HAS_FS_CALL(vnode, create_attr)) {
6672 status = B_READ_ONLY_DEVICE;
6673 goto err;
6676 void* cookie;
6677 status = FS_CALL(vnode, create_attr, name, type, openMode, &cookie);
6678 if (status != B_OK)
6679 goto err;
6681 fd = get_new_fd(FDTYPE_ATTR, NULL, vnode, cookie, openMode, kernel);
6682 if (fd >= 0)
6683 return fd;
6685 status = fd;
6687 FS_CALL(vnode, close_attr, cookie);
6688 FS_CALL(vnode, free_attr_cookie, cookie);
6690 FS_CALL(vnode, remove_attr, name);
6692 err:
6693 put_vnode(vnode);
6695 return status;
6699 static int
6700 attr_open(int fd, char* path, const char* name, int openMode, bool kernel)
6702 if (name == NULL || *name == '\0')
6703 return B_BAD_VALUE;
6705 bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
6706 struct vnode* vnode;
6707 status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode, NULL,
6708 kernel);
6709 if (status != B_OK)
6710 return status;
6712 if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
6713 status = B_LINK_LIMIT;
6714 goto err;
6717 if (!HAS_FS_CALL(vnode, open_attr)) {
6718 status = B_UNSUPPORTED;
6719 goto err;
6722 void* cookie;
6723 status = FS_CALL(vnode, open_attr, name, openMode, &cookie);
6724 if (status != B_OK)
6725 goto err;
6727 // now we only need a file descriptor for this attribute and we're done
6728 fd = get_new_fd(FDTYPE_ATTR, NULL, vnode, cookie, openMode, kernel);
6729 if (fd >= 0)
6730 return fd;
6732 status = fd;
6734 FS_CALL(vnode, close_attr, cookie);
6735 FS_CALL(vnode, free_attr_cookie, cookie);
6737 err:
6738 put_vnode(vnode);
6740 return status;
6744 static status_t
6745 attr_close(struct file_descriptor* descriptor)
6747 struct vnode* vnode = descriptor->u.vnode;
6749 FUNCTION(("attr_close(descriptor = %p)\n", descriptor));
6751 if (HAS_FS_CALL(vnode, close_attr))
6752 return FS_CALL(vnode, close_attr, descriptor->cookie);
6754 return B_OK;
6758 static void
6759 attr_free_fd(struct file_descriptor* descriptor)
6761 struct vnode* vnode = descriptor->u.vnode;
6763 if (vnode != NULL) {
6764 FS_CALL(vnode, free_attr_cookie, descriptor->cookie);
6765 put_vnode(vnode);
6770 static status_t
6771 attr_read(struct file_descriptor* descriptor, off_t pos, void* buffer,
6772 size_t* length)
6774 struct vnode* vnode = descriptor->u.vnode;
6776 FUNCTION(("attr_read: buf %p, pos %" B_PRIdOFF ", len %p = %ld\n", buffer,
6777 pos, length, *length));
6779 if (!HAS_FS_CALL(vnode, read_attr))
6780 return B_UNSUPPORTED;
6782 return FS_CALL(vnode, read_attr, descriptor->cookie, pos, buffer, length);
6786 static status_t
6787 attr_write(struct file_descriptor* descriptor, off_t pos, const void* buffer,
6788 size_t* length)
6790 struct vnode* vnode = descriptor->u.vnode;
6792 FUNCTION(("attr_write: buf %p, pos %" B_PRIdOFF ", len %p\n", buffer, pos,
6793 length));
6795 if (!HAS_FS_CALL(vnode, write_attr))
6796 return B_UNSUPPORTED;
6798 return FS_CALL(vnode, write_attr, descriptor->cookie, pos, buffer, length);
6802 static off_t
6803 attr_seek(struct file_descriptor* descriptor, off_t pos, int seekType)
6805 off_t offset;
6807 switch (seekType) {
6808 case SEEK_SET:
6809 offset = 0;
6810 break;
6811 case SEEK_CUR:
6812 offset = descriptor->pos;
6813 break;
6814 case SEEK_END:
6816 struct vnode* vnode = descriptor->u.vnode;
6817 if (!HAS_FS_CALL(vnode, read_attr_stat))
6818 return B_UNSUPPORTED;
6820 struct stat stat;
6821 status_t status = FS_CALL(vnode, read_attr_stat, descriptor->cookie,
6822 &stat);
6823 if (status != B_OK)
6824 return status;
6826 offset = stat.st_size;
6827 break;
6829 default:
6830 return B_BAD_VALUE;
6833 // assumes off_t is 64 bits wide
6834 if (offset > 0 && LONGLONG_MAX - offset < pos)
6835 return B_BUFFER_OVERFLOW;
6837 pos += offset;
6838 if (pos < 0)
6839 return B_BAD_VALUE;
6841 return descriptor->pos = pos;
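// Worked example of the overflow guard above: with offset == stat.st_size
// and a caller-supplied pos, "offset > 0 && LONGLONG_MAX - offset < pos"
// rejects any sum that would wrap past the 64-bit maximum. For offset ==
// LONGLONG_MAX and pos == 1, LONGLONG_MAX - offset == 0 < 1, so
// B_BUFFER_OVERFLOW is returned instead of a wrapped, negative position.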
6845 static status_t
6846 attr_read_stat(struct file_descriptor* descriptor, struct stat* stat)
6848 struct vnode* vnode = descriptor->u.vnode;
6850 FUNCTION(("attr_read_stat: stat 0x%p\n", stat));
6852 if (!HAS_FS_CALL(vnode, read_attr_stat))
6853 return B_UNSUPPORTED;
6855 return FS_CALL(vnode, read_attr_stat, descriptor->cookie, stat);
6859 static status_t
6860 attr_write_stat(struct file_descriptor* descriptor, const struct stat* stat,
6861 int statMask)
6863 struct vnode* vnode = descriptor->u.vnode;
6865 FUNCTION(("attr_write_stat: stat = %p, statMask %d\n", stat, statMask));
6867 if (!HAS_FS_CALL(vnode, write_attr_stat))
6868 return B_READ_ONLY_DEVICE;
6870 return FS_CALL(vnode, write_attr_stat, descriptor->cookie, stat, statMask);
6874 static status_t
6875 attr_remove(int fd, const char* name, bool kernel)
6877 struct file_descriptor* descriptor;
6878 struct vnode* vnode;
6879 status_t status;
6881 if (name == NULL || *name == '\0')
6882 return B_BAD_VALUE;
6884 FUNCTION(("attr_remove: fd = %d, name = \"%s\", kernel %d\n", fd, name,
6885 kernel));
6887 descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6888 if (descriptor == NULL)
6889 return B_FILE_ERROR;
6891 if (HAS_FS_CALL(vnode, remove_attr))
6892 status = FS_CALL(vnode, remove_attr, name);
6893 else
6894 status = B_READ_ONLY_DEVICE;
6896 put_fd(descriptor);
6898 return status;
6902 static status_t
6903 attr_rename(int fromFD, const char* fromName, int toFD, const char* toName,
6904 bool kernel)
6906 struct file_descriptor* fromDescriptor;
6907 struct file_descriptor* toDescriptor;
6908 struct vnode* fromVnode;
6909 struct vnode* toVnode;
6910 status_t status;
6912 if (fromName == NULL || *fromName == '\0' || toName == NULL
6913 || *toName == '\0')
6914 return B_BAD_VALUE;
6916 FUNCTION(("attr_rename: from fd = %d, from name = \"%s\", to fd = %d, to "
6917 "name = \"%s\", kernel %d\n", fromFD, fromName, toFD, toName, kernel));
6919 fromDescriptor = get_fd_and_vnode(fromFD, &fromVnode, kernel);
6920 if (fromDescriptor == NULL)
6921 return B_FILE_ERROR;
6923 toDescriptor = get_fd_and_vnode(toFD, &toVnode, kernel);
6924 if (toDescriptor == NULL) {
6925 status = B_FILE_ERROR;
6926 goto err;
6929 // are the files on the same volume?
6930 if (fromVnode->device != toVnode->device) {
6931 status = B_CROSS_DEVICE_LINK;
6932 goto err1;
6935 if (HAS_FS_CALL(fromVnode, rename_attr)) {
6936 status = FS_CALL(fromVnode, rename_attr, fromName, toVnode, toName);
6937 } else
6938 status = B_READ_ONLY_DEVICE;
6940 err1:
6941 put_fd(toDescriptor);
6942 err:
6943 put_fd(fromDescriptor);
6945 return status;
6949 static int
6950 index_dir_open(dev_t mountID, bool kernel)
6952 struct fs_mount* mount;
6953 void* cookie;
6955 FUNCTION(("index_dir_open(mountID = %" B_PRId32 ", kernel = %d)\n", mountID,
6956 kernel));
6958 status_t status = get_mount(mountID, &mount);
6959 if (status != B_OK)
6960 return status;
6962 if (!HAS_FS_MOUNT_CALL(mount, open_index_dir)) {
6963 status = B_UNSUPPORTED;
6964 goto error;
6967 status = FS_MOUNT_CALL(mount, open_index_dir, &cookie);
6968 if (status != B_OK)
6969 goto error;
6971 // get fd for the index directory
6972 int fd;
6973 fd = get_new_fd(FDTYPE_INDEX_DIR, mount, NULL, cookie, O_CLOEXEC, kernel);
6974 if (fd >= 0)
6975 return fd;
6977 // something went wrong
6978 FS_MOUNT_CALL(mount, close_index_dir, cookie);
6979 FS_MOUNT_CALL(mount, free_index_dir_cookie, cookie);
6981 status = fd;
6983 error:
6984 put_mount(mount);
6985 return status;
6989 static status_t
6990 index_dir_close(struct file_descriptor* descriptor)
6992 struct fs_mount* mount = descriptor->u.mount;
6994 FUNCTION(("index_dir_close(descriptor = %p)\n", descriptor));
6996 if (HAS_FS_MOUNT_CALL(mount, close_index_dir))
6997 return FS_MOUNT_CALL(mount, close_index_dir, descriptor->cookie);
6999 return B_OK;
7003 static void
7004 index_dir_free_fd(struct file_descriptor* descriptor)
7006 struct fs_mount* mount = descriptor->u.mount;
7008 if (mount != NULL) {
7009 FS_MOUNT_CALL(mount, free_index_dir_cookie, descriptor->cookie);
7010 put_mount(mount);
7015 static status_t
7016 index_dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
7017 struct dirent* buffer, size_t bufferSize, uint32* _count)
7019 struct fs_mount* mount = descriptor->u.mount;
7021 if (HAS_FS_MOUNT_CALL(mount, read_index_dir)) {
7022 return FS_MOUNT_CALL(mount, read_index_dir, descriptor->cookie, buffer,
7023 bufferSize, _count);
7026 return B_UNSUPPORTED;
7030 static status_t
7031 index_dir_rewind(struct file_descriptor* descriptor)
7033 struct fs_mount* mount = descriptor->u.mount;
7035 if (HAS_FS_MOUNT_CALL(mount, rewind_index_dir))
7036 return FS_MOUNT_CALL(mount, rewind_index_dir, descriptor->cookie);
7038 return B_UNSUPPORTED;
7042 static status_t
7043 index_create(dev_t mountID, const char* name, uint32 type, uint32 flags,
7044 bool kernel)
7046 FUNCTION(("index_create(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7047 mountID, name, kernel));
7049 struct fs_mount* mount;
7050 status_t status = get_mount(mountID, &mount);
7051 if (status != B_OK)
7052 return status;
7054 if (!HAS_FS_MOUNT_CALL(mount, create_index)) {
7055 status = B_READ_ONLY_DEVICE;
7056 goto out;
7059 status = FS_MOUNT_CALL(mount, create_index, name, type, flags);
7061 out:
7062 put_mount(mount);
7063 return status;
7067 #if 0
7068 static status_t
7069 index_read_stat(struct file_descriptor* descriptor, struct stat* stat)
7071 struct vnode* vnode = descriptor->u.vnode;
7073 // ToDo: currently unused!
7074 FUNCTION(("index_read_stat: stat 0x%p\n", stat));
7075 if (!HAS_FS_CALL(vnode, read_index_stat))
7076 return B_UNSUPPORTED;
7078 return B_UNSUPPORTED;
7079 //return FS_CALL(vnode, read_index_stat, descriptor->cookie, stat);
7083 static void
7084 index_free_fd(struct file_descriptor* descriptor)
7086 struct vnode* vnode = descriptor->u.vnode;
7088 if (vnode != NULL) {
7089 FS_CALL(vnode, free_index_cookie, descriptor->cookie);
7090 put_vnode(vnode);
7093 #endif
7096 static status_t
7097 index_name_read_stat(dev_t mountID, const char* name, struct stat* stat,
7098 bool kernel)
7100 FUNCTION(("index_remove(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7101 mountID, name, kernel));
7103 struct fs_mount* mount;
7104 status_t status = get_mount(mountID, &mount);
7105 if (status != B_OK)
7106 return status;
7108 if (!HAS_FS_MOUNT_CALL(mount, read_index_stat)) {
7109 status = B_UNSUPPORTED;
7110 goto out;
7113 status = FS_MOUNT_CALL(mount, read_index_stat, name, stat);
7115 out:
7116 put_mount(mount);
7117 return status;
7121 static status_t
7122 index_remove(dev_t mountID, const char* name, bool kernel)
7124 FUNCTION(("index_remove(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7125 mountID, name, kernel));
7127 struct fs_mount* mount;
7128 status_t status = get_mount(mountID, &mount);
7129 if (status != B_OK)
7130 return status;
7132 if (!HAS_FS_MOUNT_CALL(mount, remove_index)) {
7133 status = B_READ_ONLY_DEVICE;
7134 goto out;
7137 status = FS_MOUNT_CALL(mount, remove_index, name);
7139 out:
7140 put_mount(mount);
7141 return status;
7145 /*! TODO: the query FS API is still pretty much the same as in R5.
7146 It would be nice if the file systems got some more kernel support
7147 for queries.
7148 For example, query parsing should be moved into the kernel.
7150 static int
7151 query_open(dev_t device, const char* query, uint32 flags, port_id port,
7152 int32 token, bool kernel)
7154 struct fs_mount* mount;
7155 void* cookie;
7157 FUNCTION(("query_open(device = %" B_PRId32 ", query = \"%s\", kernel = %d)\n",
7158 device, query, kernel));
7160 status_t status = get_mount(device, &mount);
7161 if (status != B_OK)
7162 return status;
7164 if (!HAS_FS_MOUNT_CALL(mount, open_query)) {
7165 status = B_UNSUPPORTED;
7166 goto error;
7169 status = FS_MOUNT_CALL(mount, open_query, query, flags, port, token,
7170 &cookie);
7171 if (status != B_OK)
7172 goto error;
7174 // get fd for the query
7175 int fd;
7176 fd = get_new_fd(FDTYPE_QUERY, mount, NULL, cookie, O_CLOEXEC, kernel);
7177 if (fd >= 0)
7178 return fd;
7180 status = fd;
7182 // something went wrong
7183 FS_MOUNT_CALL(mount, close_query, cookie);
7184 FS_MOUNT_CALL(mount, free_query_cookie, cookie);
7186 error:
7187 put_mount(mount);
7188 return status;
7192 static status_t
7193 query_close(struct file_descriptor* descriptor)
7195 struct fs_mount* mount = descriptor->u.mount;
7197 FUNCTION(("query_close(descriptor = %p)\n", descriptor));
7199 if (HAS_FS_MOUNT_CALL(mount, close_query))
7200 return FS_MOUNT_CALL(mount, close_query, descriptor->cookie);
7202 return B_OK;
7206 static void
7207 query_free_fd(struct file_descriptor* descriptor)
7209 struct fs_mount* mount = descriptor->u.mount;
7211 if (mount != NULL) {
7212 FS_MOUNT_CALL(mount, free_query_cookie, descriptor->cookie);
7213 put_mount(mount);
7218 static status_t
7219 query_read(struct io_context* ioContext, struct file_descriptor* descriptor,
7220 struct dirent* buffer, size_t bufferSize, uint32* _count)
7222 struct fs_mount* mount = descriptor->u.mount;
7224 if (HAS_FS_MOUNT_CALL(mount, read_query)) {
7225 return FS_MOUNT_CALL(mount, read_query, descriptor->cookie, buffer,
7226 bufferSize, _count);
7229 return B_UNSUPPORTED;
7233 static status_t
7234 query_rewind(struct file_descriptor* descriptor)
7236 struct fs_mount* mount = descriptor->u.mount;
7238 if (HAS_FS_MOUNT_CALL(mount, rewind_query))
7239 return FS_MOUNT_CALL(mount, rewind_query, descriptor->cookie);
7241 return B_UNSUPPORTED;
7245 // #pragma mark - General File System functions
7248 static dev_t
7249 fs_mount(char* path, const char* device, const char* fsName, uint32 flags,
7250 const char* args, bool kernel)
7252 struct ::fs_mount* mount;
7253 status_t status = B_OK;
7254 fs_volume* volume = NULL;
7255 int32 layer = 0;
7256 Vnode* coveredNode = NULL;
7258 FUNCTION(("fs_mount: path = '%s', device = '%s', fs_name = '%s', flags = %#"
7259 B_PRIx32 ", args = '%s'\n", path, device, fsName, flags, args));
7261 // The path is always safe, we just have to make sure that fsName is
7262 // almost valid - we can't make any assumptions about args, though.
7263 // A NULL fsName is OK, if a device was given and the FS is not virtual.
7264 // We'll get it from the DDM later.
7265 if (fsName == NULL) {
7266 if (!device || flags & B_MOUNT_VIRTUAL_DEVICE)
7267 return B_BAD_VALUE;
7268 } else if (fsName[0] == '\0')
7269 return B_BAD_VALUE;
7271 RecursiveLocker mountOpLocker(sMountOpLock);
7273 // Helper to delete a newly created file device on failure.
7274 // Not exactly beautiful, but helps to keep the code below cleaner.
7275 struct FileDeviceDeleter {
7276 FileDeviceDeleter() : id(-1) {}
7277 ~FileDeviceDeleter()
7279 KDiskDeviceManager::Default()->DeleteFileDevice(id);
7282 partition_id id;
7283 } fileDeviceDeleter;
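// The deleter works RAII-style: if one of the error returns below is
// taken while fileDeviceDeleter.id is still set, its destructor deletes
// the newly created file device again; on success the id is reset to -1
// later on, so nothing is torn down.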
7285 // If the file system is not a "virtual" one, the device argument should
7286 // point to a real file/device (if given at all).
7287 // get the partition
7288 KDiskDeviceManager* ddm = KDiskDeviceManager::Default();
7289 KPartition* partition = NULL;
7290 KPath normalizedDevice;
7291 bool newlyCreatedFileDevice = false;
7293 if (!(flags & B_MOUNT_VIRTUAL_DEVICE) && device != NULL) {
7294 // normalize the device path
7295 status = normalizedDevice.SetTo(device, true);
7296 if (status != B_OK)
7297 return status;
7299 // get a corresponding partition from the DDM
7300 partition = ddm->RegisterPartition(normalizedDevice.Path());
7301 if (partition == NULL) {
7302 // Partition not found: This either means the user supplied
7303 // an invalid path, or the path refers to an image file. We try
7304 // to let the DDM create a file device for the path.
7305 partition_id deviceID = ddm->CreateFileDevice(
7306 normalizedDevice.Path(), &newlyCreatedFileDevice);
7307 if (deviceID >= 0) {
7308 partition = ddm->RegisterPartition(deviceID);
7309 if (newlyCreatedFileDevice)
7310 fileDeviceDeleter.id = deviceID;
7314 if (!partition) {
7315 TRACE(("fs_mount(): Partition `%s' not found.\n",
7316 normalizedDevice.Path()));
7317 return B_ENTRY_NOT_FOUND;
7320 device = normalizedDevice.Path();
7321 // correct path to file device
7323 PartitionRegistrar partitionRegistrar(partition, true);
7325 // Write lock the partition's device. For the time being, we keep the lock
7326 until we're done mounting -- not nice, but it ensures that no one is
7327 // interfering.
7328 // TODO: Just mark the partition busy while mounting!
7329 KDiskDevice* diskDevice = NULL;
7330 if (partition) {
7331 diskDevice = ddm->WriteLockDevice(partition->Device()->ID());
7332 if (!diskDevice) {
7333 TRACE(("fs_mount(): Failed to lock disk device!\n"));
7334 return B_ERROR;
7338 DeviceWriteLocker writeLocker(diskDevice, true);
7339 // this takes over the write lock acquired before
7341 if (partition != NULL) {
7342 // make sure that the partition is not busy
7343 if (partition->IsBusy()) {
7344 TRACE(("fs_mount(): Partition is busy.\n"));
7345 return B_BUSY;
7348 // if no FS name had been supplied, we get it from the partition
7349 if (fsName == NULL) {
7350 KDiskSystem* diskSystem = partition->DiskSystem();
7351 if (!diskSystem) {
7352 TRACE(("fs_mount(): No FS name was given, and the DDM didn't "
7353 "recognize it.\n"));
7354 return B_BAD_VALUE;
7357 if (!diskSystem->IsFileSystem()) {
7358 TRACE(("fs_mount(): No FS name was given, and the DDM found a "
7359 "partitioning system.\n"));
7360 return B_BAD_VALUE;
7363 // The disk system name will not change, and the KDiskSystem
7364 // object will not go away while the disk device is locked (and
7365 // the partition has a reference to it), so this is safe.
7366 fsName = diskSystem->Name();
7370 mount = new(std::nothrow) (struct ::fs_mount);
7371 if (mount == NULL)
7372 return B_NO_MEMORY;
7374 mount->device_name = strdup(device);
7375 // "device" can be NULL
7377 status = mount->entry_cache.Init();
7378 if (status != B_OK)
7379 goto err1;
7381 // initialize structure
7382 mount->id = sNextMountID++;
7383 mount->partition = NULL;
7384 mount->root_vnode = NULL;
7385 mount->covers_vnode = NULL;
7386 mount->unmounting = false;
7387 mount->owns_file_device = false;
7388 mount->volume = NULL;
7390 // build up the volume(s)
7391 while (true) {
7392 char* layerFSName = get_file_system_name_for_layer(fsName, layer);
7393 if (layerFSName == NULL) {
7394 if (layer == 0) {
7395 status = B_NO_MEMORY;
7396 goto err1;
7399 break;
7401 MemoryDeleter layerFSNameDeleter(layerFSName);
7403 volume = (fs_volume*)malloc(sizeof(fs_volume));
7404 if (volume == NULL) {
7405 status = B_NO_MEMORY;
7406 goto err1;
7409 volume->id = mount->id;
7410 volume->partition = partition != NULL ? partition->ID() : -1;
7411 volume->layer = layer++;
7412 volume->private_volume = NULL;
7413 volume->ops = NULL;
7414 volume->sub_volume = NULL;
7415 volume->super_volume = NULL;
7416 volume->file_system = NULL;
7417 volume->file_system_name = NULL;
7419 volume->file_system_name = get_file_system_name(layerFSName);
7420 if (volume->file_system_name == NULL) {
7421 status = B_NO_MEMORY;
7422 free(volume);
7423 goto err1;
7426 volume->file_system = get_file_system(layerFSName);
7427 if (volume->file_system == NULL) {
7428 status = B_DEVICE_NOT_FOUND;
7429 free(volume->file_system_name);
7430 free(volume);
7431 goto err1;
7434 if (mount->volume == NULL)
7435 mount->volume = volume;
7436 else {
7437 volume->super_volume = mount->volume;
7438 mount->volume->sub_volume = volume;
7439 mount->volume = volume;
7443 // insert mount struct into list before we call FS's mount() function
7444 // so that vnodes can be created for this mount
7445 mutex_lock(&sMountMutex);
7446 sMountsTable->Insert(mount);
7447 mutex_unlock(&sMountMutex);
7449 ino_t rootID;
7451 if (!sRoot) {
7452 // we haven't mounted anything yet
7453 if (strcmp(path, "/") != 0) {
7454 status = B_ERROR;
7455 goto err2;
7458 status = mount->volume->file_system->mount(mount->volume, device, flags,
7459 args, &rootID);
7460 if (status != 0)
7461 goto err2;
7462 } else {
7463 status = path_to_vnode(path, true, &coveredNode, NULL, kernel);
7464 if (status != B_OK)
7465 goto err2;
7467 mount->covers_vnode = coveredNode;
7469 // make sure covered_vnode is a directory
7470 if (!S_ISDIR(coveredNode->Type())) {
7471 status = B_NOT_A_DIRECTORY;
7472 goto err3;
7475 if (coveredNode->IsCovered()) {
7476 // this is already a covered vnode
7477 status = B_BUSY;
7478 goto err3;
7481 // mount it/them
7482 fs_volume* volume = mount->volume;
7483 while (volume) {
7484 status = volume->file_system->mount(volume, device, flags, args,
7485 &rootID);
7486 if (status != B_OK) {
7487 if (volume->sub_volume)
7488 goto err4;
7489 goto err3;
7492 volume = volume->super_volume;
7495 volume = mount->volume;
7496 while (volume) {
7497 if (volume->ops->all_layers_mounted != NULL)
7498 volume->ops->all_layers_mounted(volume);
7499 volume = volume->super_volume;
7503 // the root node is supposed to be owned by the file system - it must
7504 // exist at this point
7505 mount->root_vnode = lookup_vnode(mount->id, rootID);
7506 if (mount->root_vnode == NULL || mount->root_vnode->ref_count != 1) {
7507 panic("fs_mount: file system does not own its root node!\n");
7508 status = B_ERROR;
7509 goto err4;
7512 // set up the links between the root vnode and the vnode it covers
7513 rw_lock_write_lock(&sVnodeLock);
7514 if (coveredNode != NULL) {
7515 if (coveredNode->IsCovered()) {
7516 // the vnode is covered now
7517 status = B_BUSY;
7518 rw_lock_write_unlock(&sVnodeLock);
7519 goto err4;
7522 mount->root_vnode->covers = coveredNode;
7523 mount->root_vnode->SetCovering(true);
7525 coveredNode->covered_by = mount->root_vnode;
7526 coveredNode->SetCovered(true);
7528 rw_lock_write_unlock(&sVnodeLock);
7530 if (!sRoot) {
7531 sRoot = mount->root_vnode;
7532 mutex_lock(&sIOContextRootLock);
7533 get_current_io_context(true)->root = sRoot;
7534 mutex_unlock(&sIOContextRootLock);
7535 inc_vnode_ref_count(sRoot);
7538 // supply the partition (if any) with the mount cookie and mark it mounted
7539 if (partition) {
7540 partition->SetMountCookie(mount->volume->private_volume);
7541 partition->SetVolumeID(mount->id);
7543 // keep a partition reference as long as the partition is mounted
7544 partitionRegistrar.Detach();
7545 mount->partition = partition;
7546 mount->owns_file_device = newlyCreatedFileDevice;
7547 fileDeviceDeleter.id = -1;
7550 notify_mount(mount->id,
7551 coveredNode != NULL ? coveredNode->device : -1,
7552 coveredNode ? coveredNode->id : -1);
7554 return mount->id;
7556 err4:
7557 FS_MOUNT_CALL_NO_PARAMS(mount, unmount);
7558 err3:
7559 if (coveredNode != NULL)
7560 put_vnode(coveredNode);
7561 err2:
7562 mutex_lock(&sMountMutex);
7563 sMountsTable->Remove(mount);
7564 mutex_unlock(&sMountMutex);
7565 err1:
7566 delete mount;
7568 return status;
7572 static status_t
7573 fs_unmount(char* path, dev_t mountID, uint32 flags, bool kernel)
7575 struct fs_mount* mount;
7576 status_t err;
7578 FUNCTION(("fs_unmount(path '%s', dev %" B_PRId32 ", kernel %d\n", path,
7579 mountID, kernel));
7581 struct vnode* pathVnode = NULL;
7582 if (path != NULL) {
7583 err = path_to_vnode(path, true, &pathVnode, NULL, kernel);
7584 if (err != B_OK)
7585 return B_ENTRY_NOT_FOUND;
7588 RecursiveLocker mountOpLocker(sMountOpLock);
7590 // this lock is not strictly necessary, but it is here to keep the
7591 // ASSERT in find_mount() working in the KDEBUG case.
7592 KDEBUG_ONLY(mutex_lock(&sMountMutex));
7593 mount = find_mount(path != NULL ? pathVnode->device : mountID);
7594 KDEBUG_ONLY(mutex_unlock(&sMountMutex));
7595 if (mount == NULL) {
7596 panic("fs_unmount: find_mount() failed on root vnode @%p of mount\n",
7597 pathVnode);
7600 if (path != NULL) {
7601 put_vnode(pathVnode);
7603 if (mount->root_vnode != pathVnode) {
7604 // not mountpoint
7605 return B_BAD_VALUE;
7609 // if the volume is associated with a partition, lock the device of the
7610 // partition as long as we are unmounting
7611 KDiskDeviceManager* ddm = KDiskDeviceManager::Default();
7612 KPartition* partition = mount->partition;
7613 KDiskDevice* diskDevice = NULL;
7614 if (partition != NULL) {
7615 if (partition->Device() == NULL) {
7616 dprintf("fs_unmount(): There is no device!\n");
7617 return B_ERROR;
7619 diskDevice = ddm->WriteLockDevice(partition->Device()->ID());
7620 if (!diskDevice) {
7621 TRACE(("fs_unmount(): Failed to lock disk device!\n"));
7622 return B_ERROR;
7625 DeviceWriteLocker writeLocker(diskDevice, true);
7627 // make sure that the partition is not busy
7628 if (partition != NULL) {
7629 if ((flags & B_UNMOUNT_BUSY_PARTITION) == 0 && partition->IsBusy()) {
7630 TRACE(("fs_unmount(): Partition is busy.\n"));
7631 return B_BUSY;
7635 // grab the vnode master mutex to keep someone from creating
7636 // a vnode while we're figuring out if we can continue
7637 WriteLocker vnodesWriteLocker(&sVnodeLock);
7639 bool disconnectedDescriptors = false;
7641 while (true) {
7642 bool busy = false;
7644 // cycle through the list of vnodes associated with this mount and
7645 // make sure all of them are not busy or have refs on them
7646 VnodeList::Iterator iterator = mount->vnodes.GetIterator();
7647 while (struct vnode* vnode = iterator.Next()) {
7648 if (vnode->IsBusy()) {
7649 busy = true;
7650 break;
7653 // check the vnode's ref count -- subtract additional references for
7654 // covering
7655 int32 refCount = vnode->ref_count;
7656 if (vnode->covers != NULL)
7657 refCount--;
7658 if (vnode->covered_by != NULL)
7659 refCount--;
7661 if (refCount != 0) {
7662 // there are still vnodes in use on this mount, so we cannot
7663 // unmount yet
7664 busy = true;
7665 break;
7669 if (!busy)
7670 break;
7672 if ((flags & B_FORCE_UNMOUNT) == 0)
7673 return B_BUSY;
7675 if (disconnectedDescriptors) {
7676 // wait a bit until the last access is finished, and then try again
7677 vnodesWriteLocker.Unlock();
7678 snooze(100000);
7679 // TODO: if there is some kind of bug that prevents the ref counts
7680 // from getting back to zero, this will fall into an endless loop...
7681 vnodesWriteLocker.Lock();
7682 continue;
7685 // the file system is still busy - but we're forced to unmount it,
7686 // so let's disconnect all open file descriptors
7688 mount->unmounting = true;
7689 // prevent new vnodes from being created
7691 vnodesWriteLocker.Unlock();
7693 disconnect_mount_or_vnode_fds(mount, NULL);
7694 disconnectedDescriptors = true;
7696 vnodesWriteLocker.Lock();
7699 // We can safely continue. Mark all of the vnodes busy and this mount
7700 // structure in unmounting state. Also undo the vnode covers/covered_by
7701 // links.
7702 mount->unmounting = true;
7704 VnodeList::Iterator iterator = mount->vnodes.GetIterator();
7705 while (struct vnode* vnode = iterator.Next()) {
7706 // Remove all covers/covered_by links from other mounts' nodes to this
7707 // vnode and adjust the node ref count accordingly. We will release the
7708 // references to the external vnodes below.
7709 if (Vnode* coveredNode = vnode->covers) {
7710 if (Vnode* coveringNode = vnode->covered_by) {
7711 // We have both covered and covering vnodes, so just remove us
7712 // from the chain.
7713 coveredNode->covered_by = coveringNode;
7714 coveringNode->covers = coveredNode;
7715 vnode->ref_count -= 2;
7717 vnode->covered_by = NULL;
7718 vnode->covers = NULL;
7719 vnode->SetCovering(false);
7720 vnode->SetCovered(false);
7721 } else {
7722 // We only have a covered vnode. Remove its link to us.
7723 coveredNode->covered_by = NULL;
7724 coveredNode->SetCovered(false);
7725 vnode->ref_count--;
7727 // If the other node is an external vnode, we keep its link
7728 // around so we can put the reference later on. Otherwise
7729 // we get rid of it right now.
7730 if (coveredNode->mount == mount) {
7731 vnode->covers = NULL;
7732 coveredNode->ref_count--;
7735 } else if (Vnode* coveringNode = vnode->covered_by) {
7736 // We only have a covering vnode. Remove its link to us.
7737 coveringNode->covers = NULL;
7738 coveringNode->SetCovering(false);
7739 vnode->ref_count--;
7741 // If the other node is an external vnode, we keep its link
7742 // around so we can put the reference later on. Otherwise
7743 // we get rid of it right now.
7744 if (coveringNode->mount == mount) {
7745 vnode->covered_by = NULL;
7746 coveringNode->ref_count--;
7750 vnode->SetBusy(true);
7751 vnode_to_be_freed(vnode);
7754 vnodesWriteLocker.Unlock();
7756 // Free all vnodes associated with this mount.
7757 // They will be removed from the mount list by free_vnode(), so
7758 // we don't have to do this.
7759 while (struct vnode* vnode = mount->vnodes.Head()) {
7760 // Put the references to external covered/covering vnodes we kept above.
7761 if (Vnode* coveredNode = vnode->covers)
7762 put_vnode(coveredNode);
7763 if (Vnode* coveringNode = vnode->covered_by)
7764 put_vnode(coveringNode);
7766 free_vnode(vnode, false);
7769 // remove the mount structure from the hash table
7770 mutex_lock(&sMountMutex);
7771 sMountsTable->Remove(mount);
7772 mutex_unlock(&sMountMutex);
7774 mountOpLocker.Unlock();
7776 FS_MOUNT_CALL_NO_PARAMS(mount, unmount);
7777 notify_unmount(mount->id);
7779 // dereference the partition and mark it unmounted
7780 if (partition) {
7781 partition->SetVolumeID(-1);
7782 partition->SetMountCookie(NULL);
7784 if (mount->owns_file_device)
7785 KDiskDeviceManager::Default()->DeleteFileDevice(partition->ID());
7786 partition->Unregister();
7789 delete mount;
7790 return B_OK;
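// Unmounting thus runs in two phases: a check loop that only succeeds
// once no vnode of the mount is busy or externally referenced (with
// B_FORCE_UNMOUNT, open descriptors are disconnected and the loop is
// retried), followed by a commit phase that marks every vnode busy,
// unhooks the covers/covered_by links, and frees the nodes.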
7794 static status_t
7795 fs_sync(dev_t device)
7797 struct fs_mount* mount;
7798 status_t status = get_mount(device, &mount);
7799 if (status != B_OK)
7800 return status;
7802 struct vnode marker;
7803 memset(&marker, 0, sizeof(marker));
7804 marker.SetBusy(true);
7805 marker.SetRemoved(true);
7807 // First, synchronize all file caches
7809 while (true) {
7810 WriteLocker locker(sVnodeLock);
7811 // Note: That's the easy way. Which is probably OK for sync(),
7812 // since it's a relatively rare call and doesn't need to allow for
7813 // a lot of concurrency. Using a read lock would be possible, but
7814 also more involved, since we would have to lock the individual nodes
7815 // and take care of the locking order, which we might not want to
7816 // do while holding fs_mount::rlock.
7818 // synchronize access to vnode list
7819 recursive_lock_lock(&mount->rlock);
7821 struct vnode* vnode;
7822 if (!marker.IsRemoved()) {
7823 vnode = mount->vnodes.GetNext(&marker);
7824 mount->vnodes.Remove(&marker);
7825 marker.SetRemoved(true);
7826 } else
7827 vnode = mount->vnodes.First();
7829 while (vnode != NULL && (vnode->cache == NULL
7830 || vnode->IsRemoved() || vnode->IsBusy())) {
7831 // TODO: we could track writes (and writable mapped vnodes)
7832 // and have a simple flag that we could test for here
7833 vnode = mount->vnodes.GetNext(vnode);
7836 if (vnode != NULL) {
7837 // insert marker vnode again
7838 mount->vnodes.Insert(mount->vnodes.GetNext(vnode), &marker);
7839 marker.SetRemoved(false);
7842 recursive_lock_unlock(&mount->rlock);
7844 if (vnode == NULL)
7845 break;
7847 vnode = lookup_vnode(mount->id, vnode->id);
7848 if (vnode == NULL || vnode->IsBusy())
7849 continue;
7851 if (vnode->ref_count == 0) {
7852 // this vnode has been unused before
7853 vnode_used(vnode);
7855 inc_vnode_ref_count(vnode);
7857 locker.Unlock();
7859 if (vnode->cache != NULL && !vnode->IsRemoved())
7860 vnode->cache->WriteModified();
7862 put_vnode(vnode);
7865 // And then, let the file systems do their synchronizing work
7867 if (HAS_FS_MOUNT_CALL(mount, sync))
7868 status = FS_MOUNT_CALL_NO_PARAMS(mount, sync);
7870 put_mount(mount);
7871 return status;
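// The marker vnode above is the usual pattern for iterating a list that
// has to be unlocked between steps: a dummy node is inserted right after
// the vnode just visited, the locks are dropped while that vnode's cache
// is written back, and the next pass resumes at GetNext(&marker). Only
// the marker's busy/removed flags are ever inspected, so a zeroed struct
// vnode on the stack is sufficient.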
7875 static status_t
7876 fs_read_info(dev_t device, struct fs_info* info)
7878 struct fs_mount* mount;
7879 status_t status = get_mount(device, &mount);
7880 if (status != B_OK)
7881 return status;
7883 memset(info, 0, sizeof(struct fs_info));
7885 if (HAS_FS_MOUNT_CALL(mount, read_fs_info))
7886 status = FS_MOUNT_CALL(mount, read_fs_info, info);
7888 // fill in info the file system doesn't (have to) know about
7889 if (status == B_OK) {
7890 info->dev = mount->id;
7891 info->root = mount->root_vnode->id;
7893 fs_volume* volume = mount->volume;
7894 while (volume->super_volume != NULL)
7895 volume = volume->super_volume;
7897 strlcpy(info->fsh_name, volume->file_system_name,
7898 sizeof(info->fsh_name));
7899 if (mount->device_name != NULL) {
7900 strlcpy(info->device_name, mount->device_name,
7901 sizeof(info->device_name));
7905 // if the call is not supported by the file system, there are still
7906 // the parts that we filled out ourselves
7908 put_mount(mount);
7909 return status;
7913 static status_t
7914 fs_write_info(dev_t device, const struct fs_info* info, int mask)
7916 struct fs_mount* mount;
7917 status_t status = get_mount(device, &mount);
7918 if (status != B_OK)
7919 return status;
7921 if (HAS_FS_MOUNT_CALL(mount, write_fs_info))
7922 status = FS_MOUNT_CALL(mount, write_fs_info, info, mask);
7923 else
7924 status = B_READ_ONLY_DEVICE;
7926 put_mount(mount);
7927 return status;
7931 static dev_t
7932 fs_next_device(int32* _cookie)
7934 struct fs_mount* mount = NULL;
7935 dev_t device = *_cookie;
7937 mutex_lock(&sMountMutex);
7939 // Since device IDs are assigned sequentially, this algorithm
7940 // works well enough. It makes sure that the device list
7941 // returned is sorted, and that no device is skipped when an
7942 // already visited device has been unmounted.
7944 while (device < sNextMountID) {
7945 mount = find_mount(device++);
7946 if (mount != NULL && mount->volume->private_volume != NULL)
7947 break;
7950 *_cookie = device;
7952 if (mount != NULL)
7953 device = mount->id;
7954 else
7955 device = B_BAD_VALUE;
7957 mutex_unlock(&sMountMutex);
7959 return device;
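// Iteration sketch: the cookie starts at 0 and always advances past the
// last ID tried, so a device unmounted mid-scan is simply skipped.
//
//	int32 cookie = 0;
//	dev_t device;
//	while ((device = fs_next_device(&cookie)) >= 0) {
//		// ... visit device ...
//	}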
7963 ssize_t
7964 fs_read_attr(int fd, const char *attribute, uint32 type, off_t pos,
7965 void *buffer, size_t readBytes)
7967 int attrFD = attr_open(fd, NULL, attribute, O_RDONLY, true);
7968 if (attrFD < 0)
7969 return attrFD;
7971 ssize_t bytesRead = _kern_read(attrFD, pos, buffer, readBytes);
7973 _kern_close(attrFD);
7975 return bytesRead;
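// Usage sketch; the attribute name, buffer size, and the B_STRING_TYPE
// constant are illustrative only. The type argument is not used by this
// implementation.
//
//	char buffer[256];
//	ssize_t length = fs_read_attr(fd, "BEOS:TYPE", B_STRING_TYPE, 0,
//		buffer, sizeof(buffer));
//	if (length < 0)
//		;	// error from attr_open() or _kern_read()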
7979 static status_t
7980 get_cwd(char* buffer, size_t size, bool kernel)
7982 // Get current working directory from io context
7983 struct io_context* context = get_current_io_context(kernel);
7984 status_t status;
7986 FUNCTION(("vfs_get_cwd: buf %p, size %ld\n", buffer, size));
7988 mutex_lock(&context->io_mutex);
7990 struct vnode* vnode = context->cwd;
7991 if (vnode)
7992 inc_vnode_ref_count(vnode);
7994 mutex_unlock(&context->io_mutex);
7996 if (vnode) {
7997 status = dir_vnode_to_path(vnode, buffer, size, kernel);
7998 put_vnode(vnode);
7999 } else
8000 status = B_ERROR;
8002 return status;
8006 static status_t
8007 set_cwd(int fd, char* path, bool kernel)
8009 struct io_context* context;
8010 struct vnode* vnode = NULL;
8011 struct vnode* oldDirectory;
8012 status_t status;
8014 FUNCTION(("set_cwd: path = \'%s\'\n", path));
8016 // Get vnode for passed path, and bail if it failed
8017 status = fd_and_path_to_vnode(fd, path, true, &vnode, NULL, kernel);
8018 if (status < 0)
8019 return status;
8021 if (!S_ISDIR(vnode->Type())) {
8022 // nope, can't cwd to here
8023 status = B_NOT_A_DIRECTORY;
8024 goto err;
8027 // We need to have the permission to enter the directory, too
8028 if (HAS_FS_CALL(vnode, access)) {
8029 status = FS_CALL(vnode, access, X_OK);
8030 if (status != B_OK)
8031 goto err;
8034 // Get current io context and lock
8035 context = get_current_io_context(kernel);
8036 mutex_lock(&context->io_mutex);
8038 // save the old current working directory first
8039 oldDirectory = context->cwd;
8040 context->cwd = vnode;
8042 mutex_unlock(&context->io_mutex);
8044 if (oldDirectory)
8045 put_vnode(oldDirectory);
8047 return B_NO_ERROR;
8049 err:
8050 put_vnode(vnode);
8051 return status;
8055 // #pragma mark - kernel mirrored syscalls
8058 dev_t
8059 _kern_mount(const char* path, const char* device, const char* fsName,
8060 uint32 flags, const char* args, size_t argsLength)
8062 KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8063 if (pathBuffer.InitCheck() != B_OK)
8064 return B_NO_MEMORY;
8066 return fs_mount(pathBuffer.LockBuffer(), device, fsName, flags, args, true);
8070 status_t
8071 _kern_unmount(const char* path, uint32 flags)
8073 KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8074 if (pathBuffer.InitCheck() != B_OK)
8075 return B_NO_MEMORY;
8077 return fs_unmount(pathBuffer.LockBuffer(), -1, flags, true);
8081 status_t
8082 _kern_read_fs_info(dev_t device, struct fs_info* info)
8084 if (info == NULL)
8085 return B_BAD_VALUE;
8087 return fs_read_info(device, info);
8091 status_t
8092 _kern_write_fs_info(dev_t device, const struct fs_info* info, int mask)
8094 if (info == NULL)
8095 return B_BAD_VALUE;
8097 return fs_write_info(device, info, mask);
8101 status_t
8102 _kern_sync(void)
8104 // Note: _kern_sync() is also called from _user_sync()
8105 int32 cookie = 0;
8106 dev_t device;
8107 while ((device = next_dev(&cookie)) >= 0) {
8108 status_t status = fs_sync(device);
8109 if (status != B_OK && status != B_BAD_VALUE) {
8110 dprintf("sync: device %" B_PRIdDEV " couldn't sync: %s\n", device,
8111 strerror(status));
8115 return B_OK;
8119 dev_t
8120 _kern_next_device(int32* _cookie)
8122 return fs_next_device(_cookie);
8126 status_t
8127 _kern_get_next_fd_info(team_id teamID, uint32* _cookie, fd_info* info,
8128 size_t infoSize)
8130 if (infoSize != sizeof(fd_info))
8131 return B_BAD_VALUE;
8133 // get the team
8134 Team* team = Team::Get(teamID);
8135 if (team == NULL)
8136 return B_BAD_TEAM_ID;
8137 BReference<Team> teamReference(team, true);
8139 // now that we have a team reference, its I/O context won't go away
8140 io_context* context = team->io_context;
8141 MutexLocker contextLocker(context->io_mutex);
8143 uint32 slot = *_cookie;
8145 struct file_descriptor* descriptor;
8146 while (slot < context->table_size
8147 && (descriptor = context->fds[slot]) == NULL) {
8148 slot++;
8151 if (slot >= context->table_size)
8152 return B_ENTRY_NOT_FOUND;
8154 info->number = slot;
8155 info->open_mode = descriptor->open_mode;
8157 struct vnode* vnode = fd_vnode(descriptor);
8158 if (vnode != NULL) {
8159 info->device = vnode->device;
8160 info->node = vnode->id;
8161 } else if (descriptor->u.mount != NULL) {
8162 info->device = descriptor->u.mount->id;
8163 info->node = -1;
8166 *_cookie = slot + 1;
8167 return B_OK;
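// Enumeration sketch: the cookie is simply the next FD slot to inspect,
// so callers loop until B_ENTRY_NOT_FOUND is returned.
//
//	uint32 cookie = 0;
//	fd_info info;
//	while (_kern_get_next_fd_info(teamID, &cookie, &info,
//			sizeof(info)) == B_OK) {
//		// ... use info.number, info.open_mode, info.device, info.node ...
//	}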
8172 _kern_open_entry_ref(dev_t device, ino_t inode, const char* name, int openMode,
8173 int perms)
8175 if ((openMode & O_CREAT) != 0) {
8176 return file_create_entry_ref(device, inode, name, openMode, perms,
8177 true);
8180 return file_open_entry_ref(device, inode, name, openMode, true);
8184 /*! \brief Opens a node specified by a FD + path pair.
8186 At least one of \a fd and \a path must be specified.
8187 If only \a fd is given, the function opens the node identified by this
8188 FD. If only a path is given, this path is opened. If both are given and
8189 the path is absolute, \a fd is ignored; a relative path is reckoned off
8190 of the directory (!) identified by \a fd.
8192 \param fd The FD. May be < 0.
8193 \param path The absolute or relative path. May be \c NULL.
8194 \param openMode The open mode.
8195 \return A FD referring to the newly opened node, or an error code,
8196 if an error occurs.
8199 _kern_open(int fd, const char* path, int openMode, int perms)
8201 KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8202 if (pathBuffer.InitCheck() != B_OK)
8203 return B_NO_MEMORY;
8205 if ((openMode & O_CREAT) != 0)
8206 return file_create(fd, pathBuffer.LockBuffer(), openMode, perms, true);
8208 return file_open(fd, pathBuffer.LockBuffer(), openMode, true);
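// Two calls illustrating the FD + path convention documented above (the
// paths and dirFD are examples only): a relative path is resolved against
// the directory FD, while an absolute path makes the FD irrelevant.
//
//	int fd1 = _kern_open(dirFD, "data/log.txt", O_RDWR | O_CREAT, 0644);
//	int fd2 = _kern_open(-1, "/boot/home/data/log.txt", O_RDONLY, 0);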
8212 /*! \brief Opens a directory specified by entry_ref or node_ref.
8214 The supplied name may be \c NULL, in which case directory identified
8215 by \a device and \a inode will be opened. Otherwise \a device and
8216 \a inode identify the parent directory of the directory to be opened
8217 and \a name its entry name.
8219 \param device If \a name is specified the ID of the device the parent
8220 directory of the directory to be opened resides on, otherwise
8221 the device of the directory itself.
8222 \param inode If \a name is specified the node ID of the parent
8223 directory of the directory to be opened, otherwise node ID of the
8224 directory itself.
8225 \param name The entry name of the directory to be opened. If \c NULL,
8226 the \a device + \a inode pair identify the node to be opened.
8227 \return The FD of the newly opened directory or an error code, if
8228 something went wrong.
8231 _kern_open_dir_entry_ref(dev_t device, ino_t inode, const char* name)
8233 return dir_open_entry_ref(device, inode, name, true);
8237 /*! \brief Opens a directory specified by a FD + path pair.
8239 At least one of \a fd and \a path must be specified.
8240 If only \a fd is given, the function opens the directory identified by this
8241 FD. If only a path is given, this path is opened. If both are given and
8242 the path is absolute, \a fd is ignored; a relative path is reckoned off
8243 of the directory (!) identified by \a fd.
8245 \param fd The FD. May be < 0.
8246 \param path The absolute or relative path. May be \c NULL.
8247 \return A FD referring to the newly opened directory, or an error code,
8248 if an error occurs.
8251 _kern_open_dir(int fd, const char* path)
8253 KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8254 if (pathBuffer.InitCheck() != B_OK)
8255 return B_NO_MEMORY;
8257 return dir_open(fd, pathBuffer.LockBuffer(), true);
8261 status_t
8262 _kern_fcntl(int fd, int op, size_t argument)
8264 return common_fcntl(fd, op, argument, true);
8268 status_t
8269 _kern_fsync(int fd)
8271 return common_sync(fd, true);
8275 status_t
8276 _kern_lock_node(int fd)
8278 return common_lock_node(fd, true);
8282 status_t
8283 _kern_unlock_node(int fd)
8285 return common_unlock_node(fd, true);
8289 status_t
8290 _kern_create_dir_entry_ref(dev_t device, ino_t inode, const char* name,
8291 int perms)
8293 return dir_create_entry_ref(device, inode, name, perms, true);
8297 /*! \brief Creates a directory specified by a FD + path pair.
8299 \a path must always be specified (it contains the name of the new directory
8300 at least). If only a path is given, this path identifies the location at
8301 which the directory shall be created. If both \a fd and \a path are given
8302 and the path is absolute, \a fd is ignored; a relative path is reckoned off
8303 of the directory (!) identified by \a fd.
8305 \param fd The FD. May be < 0.
8306 \param path The absolute or relative path. Must not be \c NULL.
8307 \param perms The access permissions the new directory shall have.
8308 \return \c B_OK, if the directory has been created successfully, another
8309 error code otherwise.
8311 status_t
8312 _kern_create_dir(int fd, const char* path, int perms)
8314 KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8315 if (pathBuffer.InitCheck() != B_OK)
8316 return B_NO_MEMORY;
8318 return dir_create(fd, pathBuffer.LockBuffer(), perms, true);
8322 status_t
8323 _kern_remove_dir(int fd, const char* path)
8325 KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8326 if (pathBuffer.InitCheck() != B_OK)
8327 return B_NO_MEMORY;
8329 return dir_remove(fd, pathBuffer.LockBuffer(), true);
8333 /*! \brief Reads the contents of a symlink referred to by a FD + path pair.
8335 At least one of \a fd and \a path must be specified.
8336 If only \a fd is given, the symlink to be read is the node
8337 identified by this FD. If only a path is given, this path identifies the
8338 symlink to be read. If both are given and the path is absolute, \a fd is
8339 ignored; a relative path is reckoned off of the directory (!) identified
8340 by \a fd.
8341 If this function fails with B_BUFFER_OVERFLOW, the \a _bufferSize pointer
8342 will still be updated to reflect the required buffer size.
8344 \param fd The FD. May be < 0.
8345 \param path The absolute or relative path. May be \c NULL.
8346 \param buffer The buffer into which the contents of the symlink shall be
8347 written.
8348 \param _bufferSize A pointer to the size of the supplied buffer.
8349 \return The length of the link on success or an appropriate error code
8351 status_t
8352 _kern_read_link(int fd, const char* path, char* buffer, size_t* _bufferSize)
8354 KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8355 if (pathBuffer.InitCheck() != B_OK)
8356 return B_NO_MEMORY;
8358 return common_read_link(fd, pathBuffer.LockBuffer(),
8359 buffer, _bufferSize, true);
8363 /*! \brief Creates a symlink specified by a FD + path pair.
8365 \a path must always be specified (it contains the name of the new symlink
8366 at least). If only a path is given, this path identifies the location at
8367 which the symlink shall be created. If both \a fd and \a path are given and
8368 the path is absolute, \a fd is ignored; a relative path is reckoned off
8369 of the directory (!) identified by \a fd.
8371 \param fd The FD. May be < 0.
8372 \param toPath The absolute or relative path. Must not be \c NULL.
8373 \param mode The access permissions the new symlink shall have.
8374 \return \c B_OK, if the symlink has been created successfully, another
8375 error code otherwise.
8377 status_t
8378 _kern_create_symlink(int fd, const char* path, const char* toPath, int mode)
8380 KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8381 if (pathBuffer.InitCheck() != B_OK)
8382 return B_NO_MEMORY;
8384 return common_create_symlink(fd, pathBuffer.LockBuffer(),
8385 toPath, mode, true);
8389 status_t
8390 _kern_create_link(int pathFD, const char* path, int toFD, const char* toPath,
8391 bool traverseLeafLink)
8393 KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8394 KPath toPathBuffer(toPath, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8395 if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
8396 return B_NO_MEMORY;
8398 return common_create_link(pathFD, pathBuffer.LockBuffer(), toFD,
8399 toPathBuffer.LockBuffer(), traverseLeafLink, true);
8403 /*! \brief Removes an entry specified by a FD + path pair from its directory.
8405 \a path must always be specified (it contains at least the name of the entry
8406 to be deleted). If only a path is given, this path identifies the entry
8407 directly. If both \a fd and \a path are given and the path is absolute,
8408 \a fd is ignored; a relative path is reckoned off of the directory (!)
8409 identified by \a fd.
8411 \param fd The FD. May be < 0.
8412 \param path The absolute or relative path. Must not be \c NULL.
8413 \return \c B_OK, if the entry has been removed successfully, another
8414 error code otherwise.
8416 status_t
8417 _kern_unlink(int fd, const char* path)
8419 KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8420 if (pathBuffer.InitCheck() != B_OK)
8421 return B_NO_MEMORY;
8423 return common_unlink(fd, pathBuffer.LockBuffer(), true);
8427 /*! \brief Moves an entry specified by a FD + path pair to an entry specified
8428 by another FD + path pair.
8430 \a oldPath and \a newPath must always be specified (they contain at least
8431 the name of the entry). If only a path is given, this path identifies the
8432 entry directly. If both a FD and a path are given and the path is absolute,
8433 the FD is ignored; a relative path is reckoned off of the directory (!)
8434 identified by the respective FD.
8436 \param oldFD The FD of the old location. May be < 0.
8437 \param oldPath The absolute or relative path of the old location. Must not
8438 be \c NULL.
8439 \param newFD The FD of the new location. May be < 0.
8440 \param newPath The absolute or relative path of the new location. Must not
8441 be \c NULL.
8442 \return \c B_OK, if the entry has been moved successfully, another
8443 error code otherwise.
8445 status_t
8446 _kern_rename(int oldFD, const char* oldPath, int newFD, const char* newPath)
8448 KPath oldPathBuffer(oldPath, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8449 KPath newPathBuffer(newPath, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8450 if (oldPathBuffer.InitCheck() != B_OK || newPathBuffer.InitCheck() != B_OK)
8451 return B_NO_MEMORY;
8453 return common_rename(oldFD, oldPathBuffer.LockBuffer(),
8454 newFD, newPathBuffer.LockBuffer(), true);
8458 status_t
8459 _kern_access(int fd, const char* path, int mode, bool effectiveUserGroup)
8461 KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8462 if (pathBuffer.InitCheck() != B_OK)
8463 return B_NO_MEMORY;
8465 return common_access(fd, pathBuffer.LockBuffer(), mode, effectiveUserGroup,
8466 true);
8470 /*! \brief Reads stat data of an entity specified by a FD + path pair.
8472 If only \a fd is given, the stat operation associated with the type
8473 of the FD (node, attr, attr dir etc.) is performed. If only \a path is
8474 given, this path identifies the entry for whose node to retrieve the
8475 stat data. If both \a fd and \a path are given and the path is absolute,
8476 \a fd is ignored; a relative path is reckoned off of the directory (!)
8477 identified by \a fd and specifies the entry whose stat data shall be
8478 retrieved.
8480 \param fd The FD. May be < 0.
8481 \param path The absolute or relative path. May be \c NULL.
8482 \param traverseLeafLink If \a path is given, \c true specifies that the
8483 function shall not stick to symlinks, but traverse them.
8484 \param stat The buffer the stat data shall be written into.
8485 \param statSize The size of the supplied stat buffer.
8486 \return \c B_OK, if the stat data has been read successfully, another
8487 error code otherwise.
8489 status_t
8490 _kern_read_stat(int fd, const char* path, bool traverseLeafLink,
8491 struct stat* stat, size_t statSize)
8493 struct stat completeStat;
8494 struct stat* originalStat = NULL;
8495 status_t status;
8497 if (statSize > sizeof(struct stat))
8498 return B_BAD_VALUE;
8500 // this supports different stat extensions
8501 if (statSize < sizeof(struct stat)) {
8502 originalStat = stat;
8503 stat = &completeStat;
8506 status = vfs_read_stat(fd, path, traverseLeafLink, stat, true);
8508 if (status == B_OK && originalStat != NULL)
8509 memcpy(originalStat, stat, statSize);
8511 return status;
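// Illustrative only -- a hedged sketch of the statSize compatibility
// mechanism above: a caller built against an older, shorter stat layout
// passes its own structure size and gets back just that prefix. The
// "old_stat" layout and the example function are hypothetical.
#if 0
struct old_stat {
	dev_t	st_dev;
	ino_t	st_ino;
	mode_t	st_mode;
	nlink_t	st_nlink;
	// ... the older layout ends before the newer fields
};

static status_t
example_read_old_stat(const char* path, struct old_stat* oldStat)
{
	// the kernel fills a complete struct stat internally and copies back
	// only sizeof(struct old_stat) bytes
	return _kern_read_stat(-1, path, true, (struct stat*)oldStat,
		sizeof(struct old_stat));
}
#endif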
8515 /*! \brief Writes stat data of an entity specified by a FD + path pair.
8517 If only \a fd is given, the stat operation associated with the type
8518 of the FD (node, attr, attr dir etc.) is performed. If only \a path is
8519 given, this path identifies the entry for whose node to write the
8520 stat data. If both \a fd and \a path are given and the path is absolute,
8521 \a fd is ignored; a relative path is reckoned off of the directory (!)
8522 identified by \a fd and specifies the entry whose stat data shall be
8523 written.
8525 \param fd The FD. May be < 0.
8526 \param path The absolute or relative path. May be \c NULL.
8527 \param traverseLeafLink If \a path is given, \c true specifies that the
8528 function shall not stick to symlinks, but traverse them.
8529 \param stat The buffer containing the stat data to be written.
8530 \param statSize The size of the supplied stat buffer.
8531 \param statMask A mask specifying which parts of the stat data shall be
8532 written.
8533 \return \c B_OK, if the stat data has been written successfully,
8534 another error code otherwise.
8536 status_t
8537 _kern_write_stat(int fd, const char* path, bool traverseLeafLink,
8538 const struct stat* stat, size_t statSize, int statMask)
8540 struct stat completeStat;
8542 if (statSize > sizeof(struct stat))
8543 return B_BAD_VALUE;
8545 // this supports different stat extensions
8546 if (statSize < sizeof(struct stat)) {
8547 memset((uint8*)&completeStat + statSize, 0,
8548 sizeof(struct stat) - statSize);
8549 memcpy(&completeStat, stat, statSize);
8550 stat = &completeStat;
8553 status_t status;
8555 if (path != NULL) {
8556 // path given: write the stat of the node referred to by (fd, path)
8557 KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8558 if (pathBuffer.InitCheck() != B_OK)
8559 return B_NO_MEMORY;
8561 status = common_path_write_stat(fd, pathBuffer.LockBuffer(),
8562 traverseLeafLink, stat, statMask, true);
8563 } else {
8564 // no path given: get the FD and use the FD operation
8565 struct file_descriptor* descriptor
8566 = get_fd(get_current_io_context(true), fd);
8567 if (descriptor == NULL)
8568 return B_FILE_ERROR;
8570 if (descriptor->ops->fd_write_stat)
8571 status = descriptor->ops->fd_write_stat(descriptor, stat, statMask);
8572 else
8573 status = B_UNSUPPORTED;
8575 put_fd(descriptor);
8578 return status;
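// Illustrative only -- a partial stat update via statMask: only the mode
// bits are written, the remaining fields of the supplied buffer are ignored.
// B_STAT_MODE comes from <NodeMonitor.h>; the example function is
// hypothetical.
#if 0
static status_t
example_chmod(const char* path, mode_t mode)
{
	struct stat stat;
	stat.st_mode = mode;
		// the other fields may stay uninitialized -- the mask excludes them
	return _kern_write_stat(-1, path, true, &stat, sizeof(stat),
		B_STAT_MODE);
}
#endif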
8582 int
8583 _kern_open_attr_dir(int fd, const char* path, bool traverseLeafLink)
8585 KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8586 if (pathBuffer.InitCheck() != B_OK)
8587 return B_NO_MEMORY;
8589 return attr_dir_open(fd, pathBuffer.LockBuffer(), traverseLeafLink, true);
8593 int
8594 _kern_open_attr(int fd, const char* path, const char* name, uint32 type,
8595 int openMode)
8597 KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8598 if (pathBuffer.InitCheck() != B_OK)
8599 return B_NO_MEMORY;
8601 if ((openMode & O_CREAT) != 0) {
8602 return attr_create(fd, pathBuffer.LockBuffer(), name, type, openMode,
8603 true);
8606 return attr_open(fd, pathBuffer.LockBuffer(), name, openMode, true);
8610 status_t
8611 _kern_remove_attr(int fd, const char* name)
8613 return attr_remove(fd, name, true);
8617 status_t
8618 _kern_rename_attr(int fromFile, const char* fromName, int toFile,
8619 const char* toName)
8621 return attr_rename(fromFile, fromName, toFile, toName, true);
8625 int
8626 _kern_open_index_dir(dev_t device)
8628 return index_dir_open(device, true);
8632 status_t
8633 _kern_create_index(dev_t device, const char* name, uint32 type, uint32 flags)
8635 return index_create(device, name, type, flags, true);
8639 status_t
8640 _kern_read_index_stat(dev_t device, const char* name, struct stat* stat)
8642 return index_name_read_stat(device, name, stat, true);
8646 status_t
8647 _kern_remove_index(dev_t device, const char* name)
8649 return index_remove(device, name, true);
8653 status_t
8654 _kern_getcwd(char* buffer, size_t size)
8656 TRACE(("_kern_getcwd: buf %p, %ld\n", buffer, size));
8658 // Call vfs to get current working directory
8659 return get_cwd(buffer, size, true);
8663 status_t
8664 _kern_setcwd(int fd, const char* path)
8666 KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8667 if (pathBuffer.InitCheck() != B_OK)
8668 return B_NO_MEMORY;
8670 return set_cwd(fd, pathBuffer.LockBuffer(), true);
8674 // #pragma mark - userland syscalls
8677 dev_t
8678 _user_mount(const char* userPath, const char* userDevice,
8679 const char* userFileSystem, uint32 flags, const char* userArgs,
8680 size_t argsLength)
8682 char fileSystem[B_FILE_NAME_LENGTH];
8683 KPath path, device;
8684 char* args = NULL;
8685 status_t status;
8687 if (!IS_USER_ADDRESS(userPath)
8688 || !IS_USER_ADDRESS(userFileSystem)
8689 || !IS_USER_ADDRESS(userDevice))
8690 return B_BAD_ADDRESS;
8692 if (path.InitCheck() != B_OK || device.InitCheck() != B_OK)
8693 return B_NO_MEMORY;
8695 if (user_strlcpy(path.LockBuffer(), userPath, B_PATH_NAME_LENGTH) < B_OK)
8696 return B_BAD_ADDRESS;
8698 if (userFileSystem != NULL
8699 && user_strlcpy(fileSystem, userFileSystem, sizeof(fileSystem)) < B_OK)
8700 return B_BAD_ADDRESS;
8702 if (userDevice != NULL
8703 && user_strlcpy(device.LockBuffer(), userDevice, B_PATH_NAME_LENGTH)
8704 < B_OK)
8705 return B_BAD_ADDRESS;
8707 if (userArgs != NULL && argsLength > 0) {
8708 // this is a safety restriction
8709 if (argsLength >= 65536)
8710 return B_NAME_TOO_LONG;
8712 args = (char*)malloc(argsLength + 1);
8713 if (args == NULL)
8714 return B_NO_MEMORY;
8716 if (user_strlcpy(args, userArgs, argsLength + 1) < B_OK) {
8717 free(args);
8718 return B_BAD_ADDRESS;
8721 path.UnlockBuffer();
8722 device.UnlockBuffer();
8724 status = fs_mount(path.LockBuffer(),
8725 userDevice != NULL ? device.Path() : NULL,
8726 userFileSystem != NULL ? fileSystem : NULL, flags, args, false);
8728 free(args);
8729 return status;
8733 status_t
8734 _user_unmount(const char* userPath, uint32 flags)
8736 KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
8737 if (pathBuffer.InitCheck() != B_OK)
8738 return B_NO_MEMORY;
8740 char* path = pathBuffer.LockBuffer();
8742 if (user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
8743 return B_BAD_ADDRESS;
8745 return fs_unmount(path, -1, flags & ~B_UNMOUNT_BUSY_PARTITION, false);
8749 status_t
8750 _user_read_fs_info(dev_t device, struct fs_info* userInfo)
8752 struct fs_info info;
8753 status_t status;
8755 if (userInfo == NULL)
8756 return B_BAD_VALUE;
8758 if (!IS_USER_ADDRESS(userInfo))
8759 return B_BAD_ADDRESS;
8761 status = fs_read_info(device, &info);
8762 if (status != B_OK)
8763 return status;
8765 if (user_memcpy(userInfo, &info, sizeof(struct fs_info)) != B_OK)
8766 return B_BAD_ADDRESS;
8768 return B_OK;
8772 status_t
8773 _user_write_fs_info(dev_t device, const struct fs_info* userInfo, int mask)
8775 struct fs_info info;
8777 if (userInfo == NULL)
8778 return B_BAD_VALUE;
8780 if (!IS_USER_ADDRESS(userInfo)
8781 || user_memcpy(&info, userInfo, sizeof(struct fs_info)) != B_OK)
8782 return B_BAD_ADDRESS;
8784 return fs_write_info(device, &info, mask);
8788 dev_t
8789 _user_next_device(int32* _userCookie)
8791 int32 cookie;
8792 dev_t device;
8794 if (!IS_USER_ADDRESS(_userCookie)
8795 || user_memcpy(&cookie, _userCookie, sizeof(int32)) != B_OK)
8796 return B_BAD_ADDRESS;
8798 device = fs_next_device(&cookie);
8800 if (device >= B_OK) {
8801 // update user cookie
8802 if (user_memcpy(_userCookie, &cookie, sizeof(int32)) != B_OK)
8803 return B_BAD_ADDRESS;
8806 return device;
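// Illustrative only -- the cookie-based iteration this syscall implements,
// seen from userland through the public next_dev()/fs_stat_dev() wrappers
// declared in <fs_info.h>; the example function is hypothetical.
#if 0
static void
example_list_volumes()
{
	int32 cookie = 0;
	dev_t device;
	while ((device = next_dev(&cookie)) >= 0) {
		fs_info info;
		if (fs_stat_dev(device, &info) == 0)
			printf("volume %s on %s\n", info.volume_name, info.device_name);
	}
}
#endif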
8810 status_t
8811 _user_sync(void)
8813 return _kern_sync();
8817 status_t
8818 _user_get_next_fd_info(team_id team, uint32* userCookie, fd_info* userInfo,
8819 size_t infoSize)
8821 struct fd_info info;
8822 uint32 cookie;
8824 // only root can do this (or should root's group be enough?)
8825 if (geteuid() != 0)
8826 return B_NOT_ALLOWED;
8828 if (infoSize != sizeof(fd_info))
8829 return B_BAD_VALUE;
8831 if (!IS_USER_ADDRESS(userCookie) || !IS_USER_ADDRESS(userInfo)
8832 || user_memcpy(&cookie, userCookie, sizeof(uint32)) != B_OK)
8833 return B_BAD_ADDRESS;
8835 status_t status = _kern_get_next_fd_info(team, &cookie, &info, infoSize);
8836 if (status != B_OK)
8837 return status;
8839 if (user_memcpy(userCookie, &cookie, sizeof(uint32)) != B_OK
8840 || user_memcpy(userInfo, &info, infoSize) != B_OK)
8841 return B_BAD_ADDRESS;
8843 return status;
8847 status_t
8848 _user_entry_ref_to_path(dev_t device, ino_t inode, const char* leaf,
8849 char* userPath, size_t pathLength)
8851 if (!IS_USER_ADDRESS(userPath))
8852 return B_BAD_ADDRESS;
8854 KPath path(B_PATH_NAME_LENGTH + 1);
8855 if (path.InitCheck() != B_OK)
8856 return B_NO_MEMORY;
8858 // copy the leaf name onto the stack
8859 char stackLeaf[B_FILE_NAME_LENGTH];
8860 if (leaf != NULL) {
8861 if (!IS_USER_ADDRESS(leaf))
8862 return B_BAD_ADDRESS;
8864 int length = user_strlcpy(stackLeaf, leaf, B_FILE_NAME_LENGTH);
8865 if (length < 0)
8866 return length;
8867 if (length >= B_FILE_NAME_LENGTH)
8868 return B_NAME_TOO_LONG;
8870 leaf = stackLeaf;
8873 status_t status = vfs_entry_ref_to_path(device, inode, leaf,
8874 false, path.LockBuffer(), path.BufferSize());
8875 if (status != B_OK)
8876 return status;
8878 path.UnlockBuffer();
8880 int length = user_strlcpy(userPath, path.Path(), pathLength);
8881 if (length < 0)
8882 return length;
8883 if (length >= (int)pathLength)
8884 return B_BUFFER_OVERFLOW;
8886 return B_OK;
8890 status_t
8891 _user_normalize_path(const char* userPath, bool traverseLink, char* buffer)
8893 if (userPath == NULL || buffer == NULL)
8894 return B_BAD_VALUE;
8895 if (!IS_USER_ADDRESS(userPath) || !IS_USER_ADDRESS(buffer))
8896 return B_BAD_ADDRESS;
8898 // copy path from userland
8899 KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
8900 if (pathBuffer.InitCheck() != B_OK)
8901 return B_NO_MEMORY;
8902 char* path = pathBuffer.LockBuffer();
8904 if (user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
8905 return B_BAD_ADDRESS;
8907 status_t error = normalize_path(path, pathBuffer.BufferSize(), traverseLink,
8908 false);
8909 if (error != B_OK)
8910 return error;
8912 // copy back to userland
8913 int len = user_strlcpy(buffer, path, B_PATH_NAME_LENGTH);
8914 if (len < 0)
8915 return len;
8916 if (len >= B_PATH_NAME_LENGTH)
8917 return B_BUFFER_OVERFLOW;
8919 return B_OK;
8923 int
8924 _user_open_entry_ref(dev_t device, ino_t inode, const char* userName,
8925 int openMode, int perms)
8927 char name[B_FILE_NAME_LENGTH];
8929 if (userName == NULL || device < 0 || inode < 0)
8930 return B_BAD_VALUE;
8931 if (!IS_USER_ADDRESS(userName)
8932 || user_strlcpy(name, userName, sizeof(name)) < B_OK)
8933 return B_BAD_ADDRESS;
8935 if ((openMode & O_CREAT) != 0) {
8936 return file_create_entry_ref(device, inode, name, openMode, perms,
8937 false);
8940 return file_open_entry_ref(device, inode, name, openMode, false);
8944 int
8945 _user_open(int fd, const char* userPath, int openMode, int perms)
8947 KPath path(B_PATH_NAME_LENGTH + 1);
8948 if (path.InitCheck() != B_OK)
8949 return B_NO_MEMORY;
8951 char* buffer = path.LockBuffer();
8953 if (!IS_USER_ADDRESS(userPath)
8954 || user_strlcpy(buffer, userPath, B_PATH_NAME_LENGTH) < B_OK)
8955 return B_BAD_ADDRESS;
8957 if ((openMode & O_CREAT) != 0)
8958 return file_create(fd, buffer, openMode, perms, false);
8960 return file_open(fd, buffer, openMode, false);
8964 int
8965 _user_open_dir_entry_ref(dev_t device, ino_t inode, const char* userName)
8967 if (userName != NULL) {
8968 char name[B_FILE_NAME_LENGTH];
8970 if (!IS_USER_ADDRESS(userName)
8971 || user_strlcpy(name, userName, sizeof(name)) < B_OK)
8972 return B_BAD_ADDRESS;
8974 return dir_open_entry_ref(device, inode, name, false);
8976 return dir_open_entry_ref(device, inode, NULL, false);
8980 int
8981 _user_open_dir(int fd, const char* userPath)
8983 if (userPath == NULL)
8984 return dir_open(fd, NULL, false);
8986 KPath path(B_PATH_NAME_LENGTH + 1);
8987 if (path.InitCheck() != B_OK)
8988 return B_NO_MEMORY;
8990 char* buffer = path.LockBuffer();
8992 if (!IS_USER_ADDRESS(userPath)
8993 || user_strlcpy(buffer, userPath, B_PATH_NAME_LENGTH) < B_OK)
8994 return B_BAD_ADDRESS;
8996 return dir_open(fd, buffer, false);
9000 /*! \brief Opens a directory's parent directory and returns the entry name
9001 of the former.
9003 Aside from also returning the directory's entry name, this method is
9004 equivalent to \code _user_open_dir(fd, "..") \endcode. It is exactly
9005 equivalent if \a userName is \c NULL.
9007 If a name buffer is supplied and the name does not fit the buffer, the
9008 function fails. A buffer of size \c B_FILE_NAME_LENGTH should be safe.
9010 \param fd A FD referring to a directory.
9011 \param userName Buffer the directory's entry name shall be written into.
9012 May be \c NULL.
9013 \param nameLength Size of the name buffer.
9014 \return The file descriptor of the opened parent directory, if everything
9015 went fine, an error code otherwise.
9017 int
9018 _user_open_parent_dir(int fd, char* userName, size_t nameLength)
9020 bool kernel = false;
9022 if (userName && !IS_USER_ADDRESS(userName))
9023 return B_BAD_ADDRESS;
9025 // open the parent dir
9026 int parentFD = dir_open(fd, (char*)"..", kernel);
9027 if (parentFD < 0)
9028 return parentFD;
9029 FDCloser fdCloser(parentFD, kernel);
9031 if (userName) {
9032 // get the vnodes
9033 struct vnode* parentVNode = get_vnode_from_fd(parentFD, kernel);
9034 struct vnode* dirVNode = get_vnode_from_fd(fd, kernel);
9035 VNodePutter parentVNodePutter(parentVNode);
9036 VNodePutter dirVNodePutter(dirVNode);
9037 if (!parentVNode || !dirVNode)
9038 return B_FILE_ERROR;
9040 // get the vnode name
9041 char _buffer[sizeof(struct dirent) + B_FILE_NAME_LENGTH];
9042 struct dirent* buffer = (struct dirent*)_buffer;
9043 status_t status = get_vnode_name(dirVNode, parentVNode, buffer,
9044 sizeof(_buffer), get_current_io_context(false));
9045 if (status != B_OK)
9046 return status;
9048 // copy the name to the userland buffer
9049 int len = user_strlcpy(userName, buffer->d_name, nameLength);
9050 if (len < 0)
9051 return len;
9052 if (len >= (int)nameLength)
9053 return B_BUFFER_OVERFLOW;
9056 return fdCloser.Detach();
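// Illustrative only -- how the call above can be chained to walk from a
// directory up to the root, collecting the entry names on the way; written
// against the _kern_open_parent_dir()/_kern_close() syscall stubs a
// userland program sees. The sketch takes over ownership of dirFD; the walk
// ends as soon as a call fails, which also stops it at the root.
#if 0
static void
example_walk_to_root(int dirFD)
{
	char name[B_FILE_NAME_LENGTH];
	int parentFD;
	while ((parentFD = _kern_open_parent_dir(dirFD, name,
			sizeof(name))) >= 0) {
		// "name" now holds the entry name of the directory we just left
		_kern_close(dirFD);
		dirFD = parentFD;
	}
	_kern_close(dirFD);
}
#endif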
9060 status_t
9061 _user_fcntl(int fd, int op, size_t argument)
9063 status_t status = common_fcntl(fd, op, argument, false);
9064 if (op == F_SETLKW)
9065 syscall_restart_handle_post(status);
9067 return status;
9071 status_t
9072 _user_fsync(int fd)
9074 return common_sync(fd, false);
9078 status_t
9079 _user_flock(int fd, int operation)
9081 FUNCTION(("_user_fcntl(fd = %d, op = %d)\n", fd, operation));
9083 // Check if the operation is valid
9084 switch (operation & ~LOCK_NB) {
9085 case LOCK_UN:
9086 case LOCK_SH:
9087 case LOCK_EX:
9088 break;
9090 default:
9091 return B_BAD_VALUE;
9094 struct file_descriptor* descriptor;
9095 struct vnode* vnode;
9096 descriptor = get_fd_and_vnode(fd, &vnode, false);
9097 if (descriptor == NULL)
9098 return B_FILE_ERROR;
9100 if (descriptor->type != FDTYPE_FILE) {
9101 put_fd(descriptor);
9102 return B_BAD_VALUE;
9105 struct flock flock;
9106 flock.l_start = 0;
9107 flock.l_len = OFF_MAX;
9108 flock.l_whence = SEEK_SET;
9109 flock.l_type = (operation & LOCK_SH) != 0 ? F_RDLCK : F_WRLCK;
9111 status_t status;
9112 if ((operation & LOCK_UN) != 0)
9113 status = release_advisory_lock(vnode, &flock);
9114 else {
9115 status = acquire_advisory_lock(vnode,
9116 thread_get_current_thread()->team->session_id, &flock,
9117 (operation & LOCK_NB) == 0);
9120 syscall_restart_handle_post(status);
9122 put_fd(descriptor);
9123 return status;
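// Illustrative only -- the mapping above from the caller's side, using the
// public flock() wrapper from <sys/file.h>: a non-blocking exclusive lock,
// which the code above turns into an F_WRLCK advisory lock over the whole
// file. The example function is hypothetical.
#if 0
static bool
example_try_exclusive_lock(int fd)
{
	if (flock(fd, LOCK_EX | LOCK_NB) != 0)
		return false;	// somebody else holds the lock

	// ... critical section ...

	flock(fd, LOCK_UN);
	return true;
}
#endif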
9127 status_t
9128 _user_lock_node(int fd)
9130 return common_lock_node(fd, false);
9134 status_t
9135 _user_unlock_node(int fd)
9137 return common_unlock_node(fd, false);
9141 status_t
9142 _user_create_dir_entry_ref(dev_t device, ino_t inode, const char* userName,
9143 int perms)
9145 char name[B_FILE_NAME_LENGTH];
9146 status_t status;
9148 if (!IS_USER_ADDRESS(userName))
9149 return B_BAD_ADDRESS;
9151 status = user_strlcpy(name, userName, sizeof(name));
9152 if (status < 0)
9153 return status;
9155 return dir_create_entry_ref(device, inode, name, perms, false);
9159 status_t
9160 _user_create_dir(int fd, const char* userPath, int perms)
9162 KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9163 if (pathBuffer.InitCheck() != B_OK)
9164 return B_NO_MEMORY;
9166 char* path = pathBuffer.LockBuffer();
9168 if (!IS_USER_ADDRESS(userPath)
9169 || user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9170 return B_BAD_ADDRESS;
9172 return dir_create(fd, path, perms, false);
9176 status_t
9177 _user_remove_dir(int fd, const char* userPath)
9179 KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9180 if (pathBuffer.InitCheck() != B_OK)
9181 return B_NO_MEMORY;
9183 char* path = pathBuffer.LockBuffer();
9185 if (userPath != NULL) {
9186 if (!IS_USER_ADDRESS(userPath)
9187 || user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9188 return B_BAD_ADDRESS;
9191 return dir_remove(fd, userPath ? path : NULL, false);
9195 status_t
9196 _user_read_link(int fd, const char* userPath, char* userBuffer,
9197 size_t* userBufferSize)
9199 KPath pathBuffer(B_PATH_NAME_LENGTH + 1), linkBuffer;
9200 if (pathBuffer.InitCheck() != B_OK || linkBuffer.InitCheck() != B_OK)
9201 return B_NO_MEMORY;
9203 size_t bufferSize;
9205 if (!IS_USER_ADDRESS(userBuffer) || !IS_USER_ADDRESS(userBufferSize)
9206 || user_memcpy(&bufferSize, userBufferSize, sizeof(size_t)) != B_OK)
9207 return B_BAD_ADDRESS;
9209 char* path = pathBuffer.LockBuffer();
9210 char* buffer = linkBuffer.LockBuffer();
9212 if (userPath) {
9213 if (!IS_USER_ADDRESS(userPath)
9214 || user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9215 return B_BAD_ADDRESS;
9217 if (bufferSize > B_PATH_NAME_LENGTH)
9218 bufferSize = B_PATH_NAME_LENGTH;
9221 status_t status = common_read_link(fd, userPath ? path : NULL, buffer,
9222 &bufferSize, false);
9224 // we also update the bufferSize in case of errors
9225 // (the real length will be returned in case of B_BUFFER_OVERFLOW)
9226 if (user_memcpy(userBufferSize, &bufferSize, sizeof(size_t)) != B_OK)
9227 return B_BAD_ADDRESS;
9229 if (status != B_OK)
9230 return status;
9232 if (user_memcpy(userBuffer, buffer, bufferSize) != B_OK)
9233 return B_BAD_ADDRESS;
9235 return B_OK;
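// Illustrative only -- the size-reporting contract noted above: on
// B_BUFFER_OVERFLOW the returned size is the real link length, so a caller
// can size a second attempt exactly. Written against the _kern_read_link()
// syscall stub; the example function is hypothetical.
#if 0
static ssize_t
example_link_length(const char* path)
{
	char dummy[1];
	size_t size = sizeof(dummy);
	status_t status = _kern_read_link(-1, path, dummy, &size);
	if (status != B_OK && status != B_BUFFER_OVERFLOW)
		return status;
	return (ssize_t)size;	// the length actually needed
}
#endif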
9239 status_t
9240 _user_create_symlink(int fd, const char* userPath, const char* userToPath,
9241 int mode)
9243 KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9244 KPath toPathBuffer(B_PATH_NAME_LENGTH + 1);
9245 if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
9246 return B_NO_MEMORY;
9248 char* path = pathBuffer.LockBuffer();
9249 char* toPath = toPathBuffer.LockBuffer();
9251 if (!IS_USER_ADDRESS(userPath)
9252 || !IS_USER_ADDRESS(userToPath)
9253 || user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK
9254 || user_strlcpy(toPath, userToPath, B_PATH_NAME_LENGTH) < B_OK)
9255 return B_BAD_ADDRESS;
9257 return common_create_symlink(fd, path, toPath, mode, false);
9261 status_t
9262 _user_create_link(int pathFD, const char* userPath, int toFD,
9263 const char* userToPath, bool traverseLeafLink)
9265 KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9266 KPath toPathBuffer(B_PATH_NAME_LENGTH + 1);
9267 if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
9268 return B_NO_MEMORY;
9270 char* path = pathBuffer.LockBuffer();
9271 char* toPath = toPathBuffer.LockBuffer();
9273 if (!IS_USER_ADDRESS(userPath)
9274 || !IS_USER_ADDRESS(userToPath)
9275 || user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK
9276 || user_strlcpy(toPath, userToPath, B_PATH_NAME_LENGTH) < B_OK)
9277 return B_BAD_ADDRESS;
9279 status_t status = check_path(toPath);
9280 if (status != B_OK)
9281 return status;
9283 return common_create_link(pathFD, path, toFD, toPath, traverseLeafLink,
9284 false);
9288 status_t
9289 _user_unlink(int fd, const char* userPath)
9291 KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9292 if (pathBuffer.InitCheck() != B_OK)
9293 return B_NO_MEMORY;
9295 char* path = pathBuffer.LockBuffer();
9297 if (!IS_USER_ADDRESS(userPath)
9298 || user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9299 return B_BAD_ADDRESS;
9301 return common_unlink(fd, path, false);
9305 status_t
9306 _user_rename(int oldFD, const char* userOldPath, int newFD,
9307 const char* userNewPath)
9309 KPath oldPathBuffer(B_PATH_NAME_LENGTH + 1);
9310 KPath newPathBuffer(B_PATH_NAME_LENGTH + 1);
9311 if (oldPathBuffer.InitCheck() != B_OK || newPathBuffer.InitCheck() != B_OK)
9312 return B_NO_MEMORY;
9314 char* oldPath = oldPathBuffer.LockBuffer();
9315 char* newPath = newPathBuffer.LockBuffer();
9317 if (!IS_USER_ADDRESS(userOldPath) || !IS_USER_ADDRESS(userNewPath)
9318 || user_strlcpy(oldPath, userOldPath, B_PATH_NAME_LENGTH) < B_OK
9319 || user_strlcpy(newPath, userNewPath, B_PATH_NAME_LENGTH) < B_OK)
9320 return B_BAD_ADDRESS;
9322 return common_rename(oldFD, oldPath, newFD, newPath, false);
9326 status_t
9327 _user_create_fifo(int fd, const char* userPath, mode_t perms)
9329 KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9330 if (pathBuffer.InitCheck() != B_OK)
9331 return B_NO_MEMORY;
9333 char* path = pathBuffer.LockBuffer();
9335 if (!IS_USER_ADDRESS(userPath)
9336 || user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK) {
9337 return B_BAD_ADDRESS;
9340 // split into directory vnode and filename path
9341 char filename[B_FILE_NAME_LENGTH];
9342 struct vnode* dir;
9343 status_t status = fd_and_path_to_dir_vnode(fd, path, &dir, filename, false);
9344 if (status != B_OK)
9345 return status;
9347 VNodePutter _(dir);
9349 // the underlying FS needs to support creating FIFOs
9350 if (!HAS_FS_CALL(dir, create_special_node))
9351 return B_UNSUPPORTED;
9353 // create the entry -- the FIFO sub node is set up automatically
9354 fs_vnode superVnode;
9355 ino_t nodeID;
9356 status = FS_CALL(dir, create_special_node, filename, NULL,
9357 S_IFIFO | (perms & S_IUMSK), 0, &superVnode, &nodeID);
9359 // create_special_node() acquired a reference for us that we don't need.
9360 if (status == B_OK)
9361 put_vnode(dir->mount->volume, nodeID);
9363 return status;
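// Illustrative only -- from userland this code is normally reached through
// POSIX mkfifo() (<sys/stat.h>), whose path ends up in the
// create_special_node() hook used above; the path here is hypothetical.
#if 0
static bool
example_make_fifo()
{
	return mkfifo("/tmp/example-fifo", 0600) == 0;
}
#endif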
9367 status_t
9368 _user_create_pipe(int* userFDs)
9370 // rootfs should support creating FIFOs, but let's be sure
9371 if (!HAS_FS_CALL(sRoot, create_special_node))
9372 return B_UNSUPPORTED;
9374 // create the node -- the FIFO sub node is set up automatically
9375 fs_vnode superVnode;
9376 ino_t nodeID;
9377 status_t status = FS_CALL(sRoot, create_special_node, NULL, NULL,
9378 S_IFIFO | S_IRUSR | S_IWUSR, 0, &superVnode, &nodeID);
9379 if (status != B_OK)
9380 return status;
9382 // We've got one reference to the node and need another one.
9383 struct vnode* vnode;
9384 status = get_vnode(sRoot->mount->id, nodeID, &vnode, true, false);
9385 if (status != B_OK) {
9386 // that should not happen
9387 dprintf("_user_create_pipe(): Failed to lookup vnode (%" B_PRIdDEV ", "
9388 "%" B_PRIdINO ")\n", sRoot->mount->id, sRoot->id);
9389 return status;
9392 // Everything looks good so far. Open two FDs, one for reading and one
9393 // for writing.
9394 int fds[2];
9395 fds[0] = open_vnode(vnode, O_RDONLY, false);
9396 fds[1] = open_vnode(vnode, O_WRONLY, false);
9398 FDCloser closer0(fds[0], false);
9399 FDCloser closer1(fds[1], false);
9401 status = (fds[0] >= 0 ? (fds[1] >= 0 ? B_OK : fds[1]) : fds[0]);
9403 // copy FDs to userland
9404 if (status == B_OK) {
9405 if (!IS_USER_ADDRESS(userFDs)
9406 || user_memcpy(userFDs, fds, sizeof(fds)) != B_OK) {
9407 status = B_BAD_ADDRESS;
9411 // keep FDs, if everything went fine
9412 if (status == B_OK) {
9413 closer0.Detach();
9414 closer1.Detach();
9417 return status;
9421 status_t
9422 _user_access(int fd, const char* userPath, int mode, bool effectiveUserGroup)
9424 KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9425 if (pathBuffer.InitCheck() != B_OK)
9426 return B_NO_MEMORY;
9428 char* path = pathBuffer.LockBuffer();
9430 if (!IS_USER_ADDRESS(userPath)
9431 || user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9432 return B_BAD_ADDRESS;
9434 return common_access(fd, path, mode, effectiveUserGroup, false);
9438 status_t
9439 _user_read_stat(int fd, const char* userPath, bool traverseLink,
9440 struct stat* userStat, size_t statSize)
9442 struct stat stat;
9443 status_t status;
9445 if (statSize > sizeof(struct stat))
9446 return B_BAD_VALUE;
9448 if (!IS_USER_ADDRESS(userStat))
9449 return B_BAD_ADDRESS;
9451 if (userPath != NULL) {
9452 // path given: get the stat of the node referred to by (fd, path)
9453 if (!IS_USER_ADDRESS(userPath))
9454 return B_BAD_ADDRESS;
9456 KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9457 if (pathBuffer.InitCheck() != B_OK)
9458 return B_NO_MEMORY;
9460 char* path = pathBuffer.LockBuffer();
9462 ssize_t length = user_strlcpy(path, userPath, B_PATH_NAME_LENGTH);
9463 if (length < B_OK)
9464 return length;
9465 if (length >= B_PATH_NAME_LENGTH)
9466 return B_NAME_TOO_LONG;
9468 status = common_path_read_stat(fd, path, traverseLink, &stat, false);
9469 } else {
9470 // no path given: get the FD and use the FD operation
9471 struct file_descriptor* descriptor
9472 = get_fd(get_current_io_context(false), fd);
9473 if (descriptor == NULL)
9474 return B_FILE_ERROR;
9476 if (descriptor->ops->fd_read_stat)
9477 status = descriptor->ops->fd_read_stat(descriptor, &stat);
9478 else
9479 status = B_UNSUPPORTED;
9481 put_fd(descriptor);
9484 if (status != B_OK)
9485 return status;
9487 return user_memcpy(userStat, &stat, statSize);
9491 status_t
9492 _user_write_stat(int fd, const char* userPath, bool traverseLeafLink,
9493 const struct stat* userStat, size_t statSize, int statMask)
9495 if (statSize > sizeof(struct stat))
9496 return B_BAD_VALUE;
9498 struct stat stat;
9500 if (!IS_USER_ADDRESS(userStat)
9501 || user_memcpy(&stat, userStat, statSize) < B_OK)
9502 return B_BAD_ADDRESS;
9504 // clear additional stat fields
9505 if (statSize < sizeof(struct stat))
9506 memset((uint8*)&stat + statSize, 0, sizeof(struct stat) - statSize);
9508 status_t status;
9510 if (userPath != NULL) {
9511 // path given: write the stat of the node referred to by (fd, path)
9512 if (!IS_USER_ADDRESS(userPath))
9513 return B_BAD_ADDRESS;
9515 KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9516 if (pathBuffer.InitCheck() != B_OK)
9517 return B_NO_MEMORY;
9519 char* path = pathBuffer.LockBuffer();
9521 ssize_t length = user_strlcpy(path, userPath, B_PATH_NAME_LENGTH);
9522 if (length < B_OK)
9523 return length;
9524 if (length >= B_PATH_NAME_LENGTH)
9525 return B_NAME_TOO_LONG;
9527 status = common_path_write_stat(fd, path, traverseLeafLink, &stat,
9528 statMask, false);
9529 } else {
9530 // no path given: get the FD and use the FD operation
9531 struct file_descriptor* descriptor
9532 = get_fd(get_current_io_context(false), fd);
9533 if (descriptor == NULL)
9534 return B_FILE_ERROR;
9536 if (descriptor->ops->fd_write_stat) {
9537 status = descriptor->ops->fd_write_stat(descriptor, &stat,
9538 statMask);
9539 } else
9540 status = B_UNSUPPORTED;
9542 put_fd(descriptor);
9545 return status;
9549 int
9550 _user_open_attr_dir(int fd, const char* userPath, bool traverseLeafLink)
9552 KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9553 if (pathBuffer.InitCheck() != B_OK)
9554 return B_NO_MEMORY;
9556 char* path = pathBuffer.LockBuffer();
9558 if (userPath != NULL) {
9559 if (!IS_USER_ADDRESS(userPath)
9560 || user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9561 return B_BAD_ADDRESS;
9564 return attr_dir_open(fd, userPath ? path : NULL, traverseLeafLink, false);
9568 ssize_t
9569 _user_read_attr(int fd, const char* attribute, off_t pos, void* userBuffer,
9570 size_t readBytes)
9572 int attr = attr_open(fd, NULL, attribute, O_RDONLY, false);
9573 if (attr < 0)
9574 return attr;
9576 ssize_t bytes = _user_read(attr, pos, userBuffer, readBytes);
9577 _user_close(attr);
9579 return bytes;
9583 ssize_t
9584 _user_write_attr(int fd, const char* attribute, uint32 type, off_t pos,
9585 const void* buffer, size_t writeBytes)
9587 // Try to support the BeOS-typical truncation semantics (pos == 0
9588 // truncates the attribute) as well as the position argument
9589 int attr = attr_create(fd, NULL, attribute, type,
9590 O_CREAT | O_WRONLY | (pos != 0 ? 0 : O_TRUNC), false);
9591 if (attr < 0)
9592 return attr;
9594 ssize_t bytes = _user_write(attr, pos, buffer, writeBytes);
9595 _user_close(attr);
9597 return bytes;
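// Illustrative only -- the BeOS-style semantics from the caller's side,
// using the public fs_write_attr() wrapper (<fs_attr.h>); the attribute
// name, its contents, and B_STRING_TYPE (<TypeConstants.h>) are just for
// demonstration.
#if 0
static void
example_write_attr(int fd)
{
	// pos == 0: the attribute is created/truncated first (O_TRUNC)
	fs_write_attr(fd, "example:comment", B_STRING_TYPE, 0, "hello", 6);

	// pos != 0: no truncation; the data is written at the given offset,
	// leaving "hello world" in the attribute
	fs_write_attr(fd, "example:comment", B_STRING_TYPE, 5, " world", 7);
}
#endif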
9601 status_t
9602 _user_stat_attr(int fd, const char* attribute, struct attr_info* userAttrInfo)
9604 int attr = attr_open(fd, NULL, attribute, O_RDONLY, false);
9605 if (attr < 0)
9606 return attr;
9608 struct file_descriptor* descriptor
9609 = get_fd(get_current_io_context(false), attr);
9610 if (descriptor == NULL) {
9611 _user_close(attr);
9612 return B_FILE_ERROR;
9615 struct stat stat;
9616 status_t status;
9617 if (descriptor->ops->fd_read_stat)
9618 status = descriptor->ops->fd_read_stat(descriptor, &stat);
9619 else
9620 status = B_UNSUPPORTED;
9622 put_fd(descriptor);
9623 _user_close(attr);
9625 if (status == B_OK) {
9626 attr_info info;
9627 info.type = stat.st_type;
9628 info.size = stat.st_size;
9630 if (user_memcpy(userAttrInfo, &info, sizeof(struct attr_info)) != B_OK)
9631 return B_BAD_ADDRESS;
9634 return status;
9638 int
9639 _user_open_attr(int fd, const char* userPath, const char* userName,
9640 uint32 type, int openMode)
9642 char name[B_FILE_NAME_LENGTH];
9644 if (!IS_USER_ADDRESS(userName)
9645 || user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
9646 return B_BAD_ADDRESS;
9648 KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9649 if (pathBuffer.InitCheck() != B_OK)
9650 return B_NO_MEMORY;
9652 char* path = pathBuffer.LockBuffer();
9654 if (userPath != NULL) {
9655 if (!IS_USER_ADDRESS(userPath)
9656 || user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9657 return B_BAD_ADDRESS;
9660 if ((openMode & O_CREAT) != 0) {
9661 return attr_create(fd, userPath ? path : NULL, name, type, openMode,
9662 false);
9665 return attr_open(fd, userPath ? path : NULL, name, openMode, false);
9669 status_t
9670 _user_remove_attr(int fd, const char* userName)
9672 char name[B_FILE_NAME_LENGTH];
9674 if (!IS_USER_ADDRESS(userName)
9675 || user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
9676 return B_BAD_ADDRESS;
9678 return attr_remove(fd, name, false);
9682 status_t
9683 _user_rename_attr(int fromFile, const char* userFromName, int toFile,
9684 const char* userToName)
9686 if (!IS_USER_ADDRESS(userFromName)
9687 || !IS_USER_ADDRESS(userToName))
9688 return B_BAD_ADDRESS;
9690 KPath fromNameBuffer(B_FILE_NAME_LENGTH);
9691 KPath toNameBuffer(B_FILE_NAME_LENGTH);
9692 if (fromNameBuffer.InitCheck() != B_OK || toNameBuffer.InitCheck() != B_OK)
9693 return B_NO_MEMORY;
9695 char* fromName = fromNameBuffer.LockBuffer();
9696 char* toName = toNameBuffer.LockBuffer();
9698 if (user_strlcpy(fromName, userFromName, B_FILE_NAME_LENGTH) < B_OK
9699 || user_strlcpy(toName, userToName, B_FILE_NAME_LENGTH) < B_OK)
9700 return B_BAD_ADDRESS;
9702 return attr_rename(fromFile, fromName, toFile, toName, false);
9706 int
9707 _user_open_index_dir(dev_t device)
9709 return index_dir_open(device, false);
9713 status_t
9714 _user_create_index(dev_t device, const char* userName, uint32 type,
9715 uint32 flags)
9717 char name[B_FILE_NAME_LENGTH];
9719 if (!IS_USER_ADDRESS(userName)
9720 || user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
9721 return B_BAD_ADDRESS;
9723 return index_create(device, name, type, flags, false);
9727 status_t
9728 _user_read_index_stat(dev_t device, const char* userName, struct stat* userStat)
9730 char name[B_FILE_NAME_LENGTH];
9731 struct stat stat;
9732 status_t status;
9734 if (!IS_USER_ADDRESS(userName)
9735 || !IS_USER_ADDRESS(userStat)
9736 || user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
9737 return B_BAD_ADDRESS;
9739 status = index_name_read_stat(device, name, &stat, false);
9740 if (status == B_OK) {
9741 if (user_memcpy(userStat, &stat, sizeof(stat)) != B_OK)
9742 return B_BAD_ADDRESS;
9745 return status;
9749 status_t
9750 _user_remove_index(dev_t device, const char* userName)
9752 char name[B_FILE_NAME_LENGTH];
9754 if (!IS_USER_ADDRESS(userName)
9755 || user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
9756 return B_BAD_ADDRESS;
9758 return index_remove(device, name, false);
9762 status_t
9763 _user_getcwd(char* userBuffer, size_t size)
9765 if (size == 0)
9766 return B_BAD_VALUE;
9767 if (!IS_USER_ADDRESS(userBuffer))
9768 return B_BAD_ADDRESS;
9770 if (size > kMaxPathLength)
9771 size = kMaxPathLength;
9773 KPath pathBuffer(size);
9774 if (pathBuffer.InitCheck() != B_OK)
9775 return B_NO_MEMORY;
9777 TRACE(("user_getcwd: buf %p, %ld\n", userBuffer, size));
9779 char* path = pathBuffer.LockBuffer();
9781 status_t status = get_cwd(path, size, false);
9782 if (status != B_OK)
9783 return status;
9785 // Copy back the result
9786 if (user_strlcpy(userBuffer, path, size) < B_OK)
9787 return B_BAD_ADDRESS;
9789 return status;
9793 status_t
9794 _user_setcwd(int fd, const char* userPath)
9796 TRACE(("user_setcwd: path = %p\n", userPath));
9798 KPath pathBuffer(B_PATH_NAME_LENGTH);
9799 if (pathBuffer.InitCheck() != B_OK)
9800 return B_NO_MEMORY;
9802 char* path = pathBuffer.LockBuffer();
9804 if (userPath != NULL) {
9805 if (!IS_USER_ADDRESS(userPath)
9806 || user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9807 return B_BAD_ADDRESS;
9810 return set_cwd(fd, userPath != NULL ? path : NULL, false);
9814 status_t
9815 _user_change_root(const char* userPath)
9817 // only root is allowed to chroot()
9818 if (geteuid() != 0)
9819 return B_NOT_ALLOWED;
9821 // alloc path buffer
9822 KPath pathBuffer(B_PATH_NAME_LENGTH);
9823 if (pathBuffer.InitCheck() != B_OK)
9824 return B_NO_MEMORY;
9826 // copy userland path to kernel
9827 char* path = pathBuffer.LockBuffer();
9828 if (userPath != NULL) {
9829 if (!IS_USER_ADDRESS(userPath)
9830 || user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9831 return B_BAD_ADDRESS;
9834 // get the vnode
9835 struct vnode* vnode;
9836 status_t status = path_to_vnode(path, true, &vnode, NULL, false);
9837 if (status != B_OK)
9838 return status;
9840 // set the new root
9841 struct io_context* context = get_current_io_context(false);
9842 mutex_lock(&sIOContextRootLock);
9843 struct vnode* oldRoot = context->root;
9844 context->root = vnode;
9845 mutex_unlock(&sIOContextRootLock);
9847 put_vnode(oldRoot);
9849 return B_OK;
9853 int
9854 _user_open_query(dev_t device, const char* userQuery, size_t queryLength,
9855 uint32 flags, port_id port, int32 token)
9857 char* query;
9859 if (device < 0 || userQuery == NULL || queryLength == 0)
9860 return B_BAD_VALUE;
9862 // this is a safety restriction
9863 if (queryLength >= 65536)
9864 return B_NAME_TOO_LONG;
9866 query = (char*)malloc(queryLength + 1);
9867 if (query == NULL)
9868 return B_NO_MEMORY;
9869 if (user_strlcpy(query, userQuery, queryLength + 1) < B_OK) {
9870 free(query);
9871 return B_BAD_ADDRESS;
9874 int fd = query_open(device, query, flags, port, token, false);
9876 free(query);
9877 return fd;
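// Illustrative only -- what the port/token pair above enables: a live query
// whose update notifications are delivered as messages to the given port.
// Sketched with the public wrappers from <fs_query.h>; the query string and
// the example function are hypothetical.
#if 0
static void
example_live_query(dev_t device, port_id port, int32 token)
{
	DIR* query = fs_open_live_query(device, "size > 1000000", B_LIVE_QUERY,
		port, token);
	if (query == NULL)
		return;

	struct dirent* entry;
	while ((entry = fs_read_query(query)) != NULL) {
		// iterate over the initial matches; later changes arrive on "port"
		printf("match: %s\n", entry->d_name);
	}

	fs_close_query(query);
}
#endif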
9881 #include "vfs_request_io.cpp"