/*
 * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
 * Copyright (C) 2007 The Regents of the University of California.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
 *
 * This file is part of the SPL, Solaris Porting Layer.
 * For details, see <http://zfsonlinux.org/>.
 *
 * The SPL is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2 of the License, or (at your
 * option) any later version.
 *
 * The SPL is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with the SPL.  If not, see <http://www.gnu.org/licenses/>.
 *
 * Solaris Porting Layer (SPL) Vnode Implementation.
 */
#include <sys/vnode.h>
#include <sys/kmem_cache.h>
#include <linux/falloc.h>
#include <linux/uaccess.h>
#ifdef HAVE_FDTABLE_HEADER
#include <linux/fdtable.h>
#endif /* HAVE_FDTABLE_HEADER */
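
/*
 * Note: rootdir is only ever used as a token for pointer comparisons
 * (see the ASSERT in vn_openat()); the bogus address must never be
 * dereferenced.
 */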
vnode_t *rootdir = (vnode_t *)0xabcd1234;
EXPORT_SYMBOL(rootdir);

static spl_kmem_cache_t *vn_cache;
static spl_kmem_cache_t *vn_file_cache;

static spinlock_t vn_file_lock;
static LIST_HEAD(vn_file_list);
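
/*
 * Compatibility wrapper for fallocate(): newer kernels expose it through
 * file_operations, older ones through inode_operations; when neither is
 * available -EOPNOTSUPP is returned.
 */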
spl_filp_fallocate(struct file *fp, int mode, loff_t offset, loff_t len)
	int error = -EOPNOTSUPP;
#ifdef HAVE_FILE_FALLOCATE
	if (fp->f_op->fallocate)
		error = fp->f_op->fallocate(fp, mode, offset, len);
#else
#ifdef HAVE_INODE_FALLOCATE
	if (fp->f_dentry && fp->f_dentry->d_inode &&
	    fp->f_dentry->d_inode->i_op->fallocate)
		error = fp->f_dentry->d_inode->i_op->fallocate(
		    fp->f_dentry->d_inode, mode, offset, len);
#endif /* HAVE_INODE_FALLOCATE */
#endif /* HAVE_FILE_FALLOCATE */
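
/*
 * Compatibility wrapper for vfs_fsync(): older kernels require the dentry
 * to be passed explicitly in addition to the file pointer.
 */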
spl_filp_fsync(struct file *fp, int sync)
#ifdef HAVE_2ARGS_VFS_FSYNC
	return (vfs_fsync(fp, sync));
#else
	return (vfs_fsync(fp, (fp)->f_dentry, sync));
#endif /* HAVE_2ARGS_VFS_FSYNC */
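
/*
 * Write from a kernel buffer.  When kernel_write() does not take a ppos
 * argument, fall back to vfs_write() with the thread's address limit
 * temporarily widened so the kernel buffer passes the user-pointer checks.
 */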
spl_kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
#if defined(HAVE_KERNEL_WRITE_PPOS)
	return (kernel_write(file, buf, count, pos));
#else
	mm_segment_t saved_fs;

	ret = vfs_write(file, (__force const char __user *)buf, count, pos);
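
/* Read into a kernel buffer; mirrors spl_kernel_write() above. */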
spl_kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
#if defined(HAVE_KERNEL_READ_PPOS)
	return (kernel_read(file, buf, count, pos));
#else
	mm_segment_t saved_fs;

	ret = vfs_read(file, (void __user *)buf, count, pos);
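
/*
 * Conversion helpers between Linux mode bits (S_IFREG, S_IFDIR, ...) and
 * the Solaris vtype_t enumeration (VREG, VDIR, ...).
 */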
vn_mode_to_vtype(mode_t mode)
} /* vn_mode_to_vtype() */
EXPORT_SYMBOL(vn_mode_to_vtype);

vn_vtype_to_mode(vtype_t vtype)
} /* vn_vtype_to_mode() */
EXPORT_SYMBOL(vn_vtype_to_mode);
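
/* Allocate and free vnode_t objects from the vn_cache kmem cache. */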
	vp = kmem_cache_alloc(vn_cache, flag);
EXPORT_SYMBOL(vn_alloc);

	kmem_cache_free(vn_cache, vp);
EXPORT_SYMBOL(vn_free);
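
/*
 * Open a file by kernel-space path and wrap the resulting struct file
 * in a newly allocated vnode_t.
 */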
vn_open(const char *path, uio_seg_t seg, int flags, int mode, vnode_t **vpp,
    int x1, void *x2)
	int rc, saved_umask = 0;

	ASSERT(flags & (FWRITE | FREAD));
	ASSERT(seg == UIO_SYSSPACE);

	if (!(flags & FCREAT) && (flags & FWRITE))
	/*
	 * Note for filp_open() the two low bits must be remapped to mean:
	 * 01 - read-only  -> 00 read-only
	 * 10 - write-only -> 01 write-only
	 * 11 - read-write -> 10 read-write
	 */

		saved_umask = xchg(&current->fs->umask, 0);

	fp = filp_open(path, flags, mode);

		(void) xchg(&current->fs->umask, saved_umask);

		return (-PTR_ERR(fp));
#if defined(HAVE_4ARGS_VFS_GETATTR)
	rc = vfs_getattr(&fp->f_path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
#elif defined(HAVE_2ARGS_VFS_GETATTR)
	rc = vfs_getattr(&fp->f_path, &stat);
#else
	rc = vfs_getattr(fp->f_path.mnt, fp->f_dentry, &stat);
#endif
	vp = vn_alloc(KM_SLEEP);
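
	/*
	 * Clearing __GFP_IO and __GFP_FS on the file's mapping keeps page
	 * cache allocations against it from recursing back into the
	 * filesystem during direct reclaim.
	 */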
	saved_gfp = mapping_gfp_mask(fp->f_mapping);
	mapping_set_gfp_mask(fp->f_mapping, saved_gfp & ~(__GFP_IO|__GFP_FS));

	mutex_enter(&vp->v_lock);
	vp->v_type = vn_mode_to_vtype(stat.mode);
	vp->v_gfp_mask = saved_gfp;
	mutex_exit(&vp->v_lock);

EXPORT_SYMBOL(vn_open);
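
/*
 * Open 'path' relative to the root directory.  Only rootdir is supported
 * as the base vnode, so the path is simply made absolute and passed to
 * vn_open().
 */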
vn_openat(const char *path, uio_seg_t seg, int flags, int mode,
    vnode_t **vpp, int x1, void *x2, vnode_t *vp, int fd)

	ASSERT(vp == rootdir);

	len = strlen(path) + 2;
	realpath = kmalloc(len, kmem_flags_convert(KM_SLEEP));

	(void) snprintf(realpath, len, "/%s", path);
	rc = vn_open(realpath, seg, flags, mode, vpp, x1, x2);

EXPORT_SYMBOL(vn_openat);
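
/*
 * Read from or write to an open vnode at the given offset using a kernel
 * buffer; only UIO_SYSSPACE transfers are supported.
 */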
vn_rdwr(uio_rw_t uio, vnode_t *vp, void *addr, ssize_t len, offset_t off,
    uio_seg_t seg, int ioflag, rlim64_t x2, void *x3, ssize_t *residp)
	struct file *fp = vp->v_file;

	ASSERT(uio == UIO_WRITE || uio == UIO_READ);
	ASSERT(seg == UIO_SYSSPACE);
	ASSERT((ioflag & ~FAPPEND) == 0);

	if (ioflag & FAPPEND)

		rc = spl_kernel_write(fp, addr, len, &offset);
		rc = spl_kernel_read(fp, addr, len, &offset);

EXPORT_SYMBOL(vn_rdwr);
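
/*
 * Close a vnode opened by vn_open(), restoring the mapping's original
 * GFP mask saved at open time.
 */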
vn_close(vnode_t *vp, int flags, int x1, int x2, void *x3, void *x4)

	mapping_set_gfp_mask(vp->v_file->f_mapping, vp->v_gfp_mask);
	rc = filp_close(vp->v_file, 0);

EXPORT_SYMBOL(vn_close);
/*
 * vn_seek() does not actually seek; it only performs bounds checking on
 * the proposed seek.  We perform minimal checking and allow vn_rdwr() to
 * catch anything more serious.
 */
vn_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, void *ct)
	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
EXPORT_SYMBOL(vn_seek);
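
/* Translate the result of vfs_getattr() into a Solaris-style vattr_t. */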
vn_getattr(vnode_t *vp, vattr_t *vap, int flags, void *x3, void *x4)

#if defined(HAVE_4ARGS_VFS_GETATTR)
	rc = vfs_getattr(&fp->f_path, &stat, STATX_BASIC_STATS,
	    AT_STATX_SYNC_AS_STAT);
#elif defined(HAVE_2ARGS_VFS_GETATTR)
	rc = vfs_getattr(&fp->f_path, &stat);
#else
	rc = vfs_getattr(fp->f_path.mnt, fp->f_dentry, &stat);
#endif
	vap->va_type = vn_mode_to_vtype(stat.mode);
	vap->va_mode = stat.mode;
	vap->va_uid = KUID_TO_SUID(stat.uid);
	vap->va_gid = KGID_TO_SGID(stat.gid);
	vap->va_nodeid = stat.ino;
	vap->va_nlink = stat.nlink;
	vap->va_size = stat.size;
	vap->va_blksize = stat.blksize;
	vap->va_atime = stat.atime;
	vap->va_mtime = stat.mtime;
	vap->va_ctime = stat.ctime;
	vap->va_rdev = stat.rdev;
	vap->va_nblocks = stat.blocks;

EXPORT_SYMBOL(vn_getattr);
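
/* Flush the vnode's backing file to stable storage via vfs_fsync(). */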
vn_fsync(vnode_t *vp, int flags, void *x3, void *x4)

	/*
	 * May enter XFS which generates a warning when PF_FSTRANS is set.
	 * To avoid this the flag is cleared over vfs_sync() and then reset.
	 */
	fstrans = __spl_pf_fstrans_check();
		current->flags &= ~(__SPL_PF_FSTRANS);

	error = -spl_filp_fsync(vp->v_file, datasync);
		current->flags |= __SPL_PF_FSTRANS;

EXPORT_SYMBOL(vn_fsync);
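
/*
 * Implement F_FREESP (hole punching) for a vnode, preferring the kernel's
 * fallocate() interface when FALLOC_FL_PUNCH_HOLE is available and falling
 * back to truncate_range() on older kernels.
 */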
int vn_space(vnode_t *vp, int cmd, struct flock *bfp, int flag,
    offset_t offset, void *x6, void *x7)
	int error = EOPNOTSUPP;
#ifdef FALLOC_FL_PUNCH_HOLE
	int fstrans;
#endif

	if (cmd != F_FREESP || bfp->l_whence != 0)

	ASSERT(bfp->l_start >= 0 && bfp->l_len > 0);
#ifdef FALLOC_FL_PUNCH_HOLE
	/*
	 * May enter XFS which generates a warning when PF_FSTRANS is set.
	 * To avoid this the flag is cleared over vfs_sync() and then reset.
	 */
	fstrans = __spl_pf_fstrans_check();
		current->flags &= ~(__SPL_PF_FSTRANS);

	/*
	 * When supported by the underlying file system preferentially
	 * use the fallocate() callback to punch the hole.
	 */
	error = -spl_filp_fallocate(vp->v_file,
	    FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
	    bfp->l_start, bfp->l_len);

		current->flags |= __SPL_PF_FSTRANS;
#ifdef HAVE_INODE_TRUNCATE_RANGE
	if (vp->v_file->f_dentry && vp->v_file->f_dentry->d_inode &&
	    vp->v_file->f_dentry->d_inode->i_op &&
	    vp->v_file->f_dentry->d_inode->i_op->truncate_range) {
		off_t end = bfp->l_start + bfp->l_len;
		/*
		 * Judging from the code in shmem_truncate_range(),
		 * it seems the kernel expects the end offset to be
		 * inclusive and aligned to the end of a page.
		 */
		if (end % PAGE_SIZE != 0) {
			end &= ~(off_t)(PAGE_SIZE - 1);
			if (end <= bfp->l_start)

		vp->v_file->f_dentry->d_inode->i_op->truncate_range(
		    vp->v_file->f_dentry->d_inode, bfp->l_start, end);

EXPORT_SYMBOL(vn_space);
/* Function must be called while holding the vn_file_lock */
file_find(int fd, struct task_struct *task)
	list_for_each_entry(fp, &vn_file_list, f_list) {
		if (fd == fp->f_fd && fp->f_task == task) {
			ASSERT(atomic_read(&fp->f_ref) != 0);
	/* Already open, just take an extra reference */
	spin_lock(&vn_file_lock);

	fp = file_find(fd, current);
		/*
		 * areleasef() can cause us to see a stale reference when
		 * userspace has reused a file descriptor before areleasef()
		 * has run.  fput() the stale reference and replace it.  We
		 * retain the original reference count such that the concurrent
		 * areleasef() will decrement its reference and terminate.
		 */
		if (lfp != fp->f_file) {
			fp->f_vnode->v_file = lfp;

		atomic_inc(&fp->f_ref);
		spin_unlock(&vn_file_lock);

	spin_unlock(&vn_file_lock);
	/* File was not yet opened, create the object and set it up */
	fp = kmem_cache_alloc(vn_file_cache, KM_SLEEP);

	mutex_enter(&fp->f_lock);

	fp->f_task = current;
	atomic_inc(&fp->f_ref);

	vp = vn_alloc(KM_SLEEP);
#if defined(HAVE_4ARGS_VFS_GETATTR)
	rc = vfs_getattr(&lfp->f_path, &stat, STATX_TYPE,
	    AT_STATX_SYNC_AS_STAT);
#elif defined(HAVE_2ARGS_VFS_GETATTR)
	rc = vfs_getattr(&lfp->f_path, &stat);
#else
	rc = vfs_getattr(lfp->f_path.mnt, lfp->f_dentry, &stat);
#endif
	mutex_enter(&vp->v_lock);
	vp->v_type = vn_mode_to_vtype(stat.mode);
	mutex_exit(&vp->v_lock);

	/* Put it on the tracking list */
	spin_lock(&vn_file_lock);
	list_add(&fp->f_list, &vn_file_list);
	spin_unlock(&vn_file_lock);

	mutex_exit(&fp->f_lock);

	mutex_exit(&fp->f_lock);
	kmem_cache_free(vn_file_cache, fp);
static void releasef_locked(file_t *fp)
	/* Unlinked from list, no refs, safe to free outside mutex */
	vn_free(fp->f_vnode);
	kmem_cache_free(vn_file_cache, fp);
	areleasef(fd, P_FINFO(current));
EXPORT_SYMBOL(releasef);
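
/*
 * Drop a reference on a tracked file descriptor; once the count reaches
 * zero the entry is unlinked from vn_file_list and freed.
 */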
vn_areleasef(int fd, uf_info_t *fip)
	struct task_struct *task = (struct task_struct *)fip;

	spin_lock(&vn_file_lock);
	fp = file_find(fd, task);
		atomic_dec(&fp->f_ref);
		if (atomic_read(&fp->f_ref) > 0) {
			spin_unlock(&vn_file_lock);

		list_del(&fp->f_list);

	spin_unlock(&vn_file_lock);
EXPORT_SYMBOL(areleasef);
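
/*
 * Local equivalent of the kernel's set_fs_pwd(); the prototype and the
 * fs_struct locking differ across kernel versions, hence the
 * compatibility #ifdefs below.
 */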
#ifdef HAVE_SET_FS_PWD_WITH_CONST
vn_set_fs_pwd(struct fs_struct *fs, const struct path *path)
#else
vn_set_fs_pwd(struct fs_struct *fs, struct path *path)
#endif /* HAVE_SET_FS_PWD_WITH_CONST */
#ifdef HAVE_FS_STRUCT_SPINLOCK
	spin_lock(&fs->lock);
	spin_unlock(&fs->lock);
#else
	write_lock(&fs->lock);
	write_unlock(&fs->lock);
#endif /* HAVE_FS_STRUCT_SPINLOCK */
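
/* Change the calling task's working directory to 'filename'. */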
vn_set_pwd(const char *filename)
	mm_segment_t saved_fs;

	/*
	 * user_path_dir() and __user_walk() both expect 'filename' to be
	 * a user space address so we must briefly increase the data segment
	 * size to ensure strncpy_from_user() does not fail with -EFAULT.
	 */

	rc = user_path_dir(filename, &path);

	rc = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS);

	vn_set_fs_pwd(current->fs, &path);

EXPORT_SYMBOL(vn_set_pwd);
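
/*
 * kmem cache constructor/destructor callbacks which initialize and
 * destroy the locks embedded in vnode_t and file_t objects.
 */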
vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
	struct vnode *vp = buf;

	mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
} /* vn_cache_constructor() */
vn_cache_destructor(void *buf, void *cdrarg)
	struct vnode *vp = buf;

	mutex_destroy(&vp->v_lock);
} /* vn_cache_destructor() */
vn_file_cache_constructor(void *buf, void *cdrarg, int kmflags)
	atomic_set(&fp->f_ref, 0);
	mutex_init(&fp->f_lock, NULL, MUTEX_DEFAULT, NULL);
	INIT_LIST_HEAD(&fp->f_list);
} /* vn_file_cache_constructor() */
vn_file_cache_destructor(void *buf, void *cdrarg)
	mutex_destroy(&fp->f_lock);
} /* vn_file_cache_destructor() */
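
/* spl_vn_init(): create the vnode and file kmem caches. */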
	spin_lock_init(&vn_file_lock);

	vn_cache = kmem_cache_create("spl_vn_cache",
	    sizeof (struct vnode), 64, vn_cache_constructor,
	    vn_cache_destructor, NULL, NULL, NULL, 0);

	vn_file_cache = kmem_cache_create("spl_vn_file_cache",
	    sizeof (file_t), 64, vn_file_cache_constructor,
	    vn_file_cache_destructor, NULL, NULL, NULL, 0);
} /* spl_vn_init() */
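
/*
 * spl_vn_fini(): release any file references which were leaked by
 * consumers and destroy the kmem caches.
 */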
	file_t *fp, *next_fp;

	spin_lock(&vn_file_lock);

	list_for_each_entry_safe(fp, next_fp, &vn_file_list, f_list) {
		list_del(&fp->f_list);

	spin_unlock(&vn_file_lock);

		printk(KERN_WARNING "WARNING: %d vnode files leaked\n", leaked);

	kmem_cache_destroy(vn_file_cache);
	kmem_cache_destroy(vn_cache);
} /* spl_vn_fini() */