external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vnops.c (netbsd-mini2440.git)
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 /* Portions Copyright 2007 Jeremy Teo */
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/time.h>
31 #include <sys/systm.h>
32 #include <sys/sysmacros.h>
33 #include <sys/resource.h>
34 #include <sys/vfs.h>
35 #include <sys/vnode.h>
36 #include <sys/file.h>
37 #include <sys/stat.h>
38 #include <sys/kmem.h>
39 #include <sys/taskq.h>
40 #include <sys/uio.h>
41 #include <sys/atomic.h>
42 #include <sys/namei.h>
43 #include <sys/mman.h>
44 #include <sys/cmn_err.h>
45 #include <sys/errno.h>
46 #include <sys/unistd.h>
47 #include <sys/zfs_dir.h>
48 #include <sys/zfs_ioctl.h>
49 #include <sys/fs/zfs.h>
50 #include <sys/dmu.h>
51 #include <sys/spa.h>
52 #include <sys/txg.h>
53 #include <sys/dbuf.h>
54 #include <sys/zap.h>
55 #include <sys/dirent.h>
56 #include <sys/policy.h>
57 #include <sys/sunddi.h>
58 #include <sys/filio.h>
59 #include <sys/zfs_ctldir.h>
60 #include <sys/zfs_fuid.h>
61 #include <sys/zfs_vfsops.h>
62 #include <sys/dnlc.h>
63 #include <sys/zfs_rlock.h>
64 #include <sys/extdirent.h>
65 #include <sys/kidmap.h>
66 #include <sys/buf.h>
67 #include <sys/sched.h>
68 #include <sys/acl.h>
69 #include <sys/extattr.h>
71 #ifdef __NetBSD__
72 #include <miscfs/genfs/genfs.h>
73 #endif
76 * Programming rules.
78 * Each vnode op performs some logical unit of work. To do this, the ZPL must
79 * properly lock its in-core state, create a DMU transaction, do the work,
80 * record this work in the intent log (ZIL), commit the DMU transaction,
81 * and wait for the intent log to commit if it is a synchronous operation.
82 * Moreover, the vnode ops must work in both normal and log replay context.
83 * The ordering of events is important to avoid deadlocks and references
84 * to freed memory. The example below illustrates the following Big Rules:
86 * (1) A check must be made in each zfs thread for a mounted file system.
87 * This is done, while avoiding races, using ZFS_ENTER(zfsvfs).
88 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
89 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
90 * can return EIO from the calling function.
92 * (2) VN_RELE() should always be the last thing except for zil_commit()
93 * (if necessary) and ZFS_EXIT(). This is for 3 reasons:
94 * First, if it's the last reference, the vnode/znode
95 * can be freed, so the zp may point to freed memory. Second, the last
96 * reference will call zfs_zinactive(), which may induce a lot of work --
97 * pushing cached pages (which acquires range locks) and syncing out
98 * cached atime changes. Third, zfs_zinactive() may require a new tx,
99 * which could deadlock the system if you were already holding one.
100 * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
102 * (3) All range locks must be grabbed before calling dmu_tx_assign(),
103 * as they can span dmu_tx_assign() calls.
105 * (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign().
106 * In normal operation, this will be TXG_NOWAIT. During ZIL replay,
107 * it will be a specific txg. Either way, dmu_tx_assign() never blocks.
108 * This is critical because we don't want to block while holding locks.
109 * Note, in particular, that if a lock is sometimes acquired before
110 * the tx assigns, and sometimes after (e.g. z_lock), then failing to
111 * use a non-blocking assign can deadlock the system. The scenario:
113 * Thread A has grabbed a lock before calling dmu_tx_assign().
114 * Thread B is in an already-assigned tx, and blocks for this lock.
115 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
116 * forever, because the previous txg can't quiesce until B's tx commits.
118 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
119 * then drop all locks, call dmu_tx_wait(), and try again.
121 * (5) If the operation succeeded, generate the intent log entry for it
122 * before dropping locks. This ensures that the ordering of events
123 * in the intent log matches the order in which they actually occurred.
125 * (6) At the end of each vnode op, the DMU tx must always commit,
126 * regardless of whether there were any errors.
128 * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid)
129 * to ensure that synchronous semantics are provided when necessary.
131 * In general, this is how things should be ordered in each vnode op:
133 * ZFS_ENTER(zfsvfs); // exit if unmounted
134 * top:
135 * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD())
136 * rw_enter(...); // grab any other locks you need
137 * tx = dmu_tx_create(...); // get DMU tx
138 * dmu_tx_hold_*(); // hold each object you might modify
139 * error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign
140 * if (error) {
141 * rw_exit(...); // drop locks
142 * zfs_dirent_unlock(...); // unlock directory entry
143 * VN_RELE(...); // release held vnodes
144 * if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
145 * dmu_tx_wait(tx);
146 * dmu_tx_abort(tx);
147 * goto top;
149 * dmu_tx_abort(tx); // abort DMU tx
150 * ZFS_EXIT(zfsvfs); // finished in zfs
151 * return (error); // really out of space
153 * error = do_real_work(); // do whatever this VOP does
154 * if (error == 0)
155 * zfs_log_*(...); // on success, make ZIL entry
156 * dmu_tx_commit(tx); // commit DMU tx -- error or not
157 * rw_exit(...); // drop locks
158 * zfs_dirent_unlock(dl, 0); // unlock directory entry
159 * VN_RELE(...); // release held vnodes
160 * zil_commit(zilog, seq, foid); // synchronous when necessary
161 * ZFS_EXIT(zfsvfs); // finished in zfs
162 * return (error); // done, report error
165 /* ARGSUSED */
166 static int
167 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
169 znode_t *zp = VTOZ(*vpp);
171 if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) &&
172 ((flag & FAPPEND) == 0)) {
173 return (EPERM);
176 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
177 ZTOV(zp)->v_type == VREG &&
178 !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
179 zp->z_phys->zp_size > 0)
180 if (fs_vscan(*vpp, cr, 0) != 0)
181 return (EACCES);
183 /* Keep a count of the synchronous opens in the znode */
184 if (flag & (FSYNC | FDSYNC))
185 atomic_inc_32(&zp->z_sync_cnt);
187 return (0);
190 /* ARGSUSED */
191 static int
192 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
193 caller_context_t *ct)
195 znode_t *zp = VTOZ(vp);
197 dprintf("zfs_close called\n");
198 /* Decrement the synchronous opens in the znode */
199 if ((flag & (FSYNC | FDSYNC)) && (count == 1))
200 atomic_dec_32(&zp->z_sync_cnt);
203 * Clean up any locks held by this process on the vp.
205 cleanlocks(vp, ddi_get_pid(), 0);
206 cleanshares(vp, ddi_get_pid());
208 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
209 ZTOV(zp)->v_type == VREG &&
210 !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
211 zp->z_phys->zp_size > 0)
212 VERIFY(fs_vscan(vp, cr, 1) == 0);
214 return (0);
217 #ifdef PORT_NETBSD
219 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
220 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
222 static int
223 zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
225 znode_t *zp = VTOZ(vp);
226 uint64_t noff = (uint64_t)*off; /* new offset */
227 uint64_t file_sz;
228 int error;
229 boolean_t hole;
231 file_sz = zp->z_phys->zp_size;
232 if (noff >= file_sz) {
233 return (ENXIO);
236 if (cmd == _FIO_SEEK_HOLE)
237 hole = B_TRUE;
238 else
239 hole = B_FALSE;
241 error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
243 /* end of file? */
244 if ((error == ESRCH) || (noff > file_sz)) {
246 * Handle the virtual hole at the end of file.
248 if (hole) {
249 *off = file_sz;
250 return (0);
252 return (ENXIO);
255 if (noff < *off)
256 return (error);
257 *off = noff;
258 return (error);
260 #endif /* PORT_NETBSD */
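/*
 * Illustrative sketch only (not part of this file): how a userland
 * caller could drive the hole-seeking ioctls handled by zfs_holey()
 * via zfs_ioctl() below.  The helper name and error handling are
 * hypothetical; _FIO_SEEK_HOLE/_FIO_SEEK_DATA come from <sys/filio.h>.
 */
#if 0
#include <sys/ioctl.h>
#include <sys/filio.h>
#include <errno.h>

static int
seek_next_hole(int fd, offset_t *off)
{
	/* "off" is in/out: position to search from, start of hole out. */
	if (ioctl(fd, _FIO_SEEK_HOLE, off) == -1)
		return (errno);		/* ENXIO: no hole past *off */
	return (0);			/* *off now points at the hole */
}
#endif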
262 static int
263 zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
264 int *rvalp, caller_context_t *ct)
266 offset_t off;
267 int error;
268 zfsvfs_t *zfsvfs;
269 znode_t *zp;
271 switch (com) {
272 case _FIOFFS:
273 return (0);
276 * The following two ioctls are used by bfu. We fake them out
277 * here; this is necessary to avoid bfu errors.
279 case _FIOGDIO:
280 case _FIOSDIO:
281 return (0);
282 #ifdef PORT_NETBSD /* XXX NetBSD Do we support holes in files ? */
283 case _FIO_SEEK_DATA:
284 case _FIO_SEEK_HOLE:
285 if (ddi_copyin((void *)data, &off, sizeof (off), flag))
286 return (EFAULT);
288 zp = VTOZ(vp);
289 zfsvfs = zp->z_zfsvfs;
290 ZFS_ENTER(zfsvfs);
291 ZFS_VERIFY_ZP(zp);
293 /* offset parameter is in/out */
294 error = zfs_holey(vp, com, &off);
295 ZFS_EXIT(zfsvfs);
296 if (error)
297 return (error);
298 if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
299 return (EFAULT);
300 return (0);
301 #endif
304 return (ENOTTY);
307 #ifdef PORT_NETBSD
309 * When a file is memory mapped, we must keep the IO data synchronized
310 * between the DMU cache and the memory mapped pages. What this means:
312 * On Write: If we find a memory mapped page, we write to *both*
313 * the page and the dmu buffer.
315 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
316 * the file is memory mapped.
318 static int
319 mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx)
321 znode_t *zp = VTOZ(vp);
322 objset_t *os = zp->z_zfsvfs->z_os;
323 vm_object_t obj;
324 vm_page_t m;
325 struct sf_buf *sf;
326 int64_t start, off;
327 int len = nbytes;
328 int error = 0;
329 uint64_t dirbytes;
331 ASSERT(vp->v_mount != NULL);
332 obj = vp->v_object;
333 ASSERT(obj != NULL);
335 start = uio->uio_loffset;
336 off = start & PAGEOFFSET;
337 dirbytes = 0;
338 VM_OBJECT_LOCK(obj);
339 for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
340 uint64_t bytes = MIN(PAGESIZE - off, len);
341 uint64_t fsize;
343 again:
344 if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
345 vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
346 uint64_t woff;
347 caddr_t va;
349 if (vm_page_sleep_if_busy(m, FALSE, "zfsmwb"))
350 goto again;
351 fsize = obj->un_pager.vnp.vnp_size;
352 vm_page_busy(m);
353 vm_page_lock_queues();
354 vm_page_undirty(m);
355 vm_page_unlock_queues();
356 VM_OBJECT_UNLOCK(obj);
357 if (dirbytes > 0) {
358 error = dmu_write_uio(os, zp->z_id, uio,
359 dirbytes, tx);
360 dirbytes = 0;
362 if (error == 0) {
363 sched_pin();
364 sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
365 va = (caddr_t)sf_buf_kva(sf);
366 woff = uio->uio_loffset - off;
367 error = uiomove(va + off, bytes, UIO_WRITE, uio);
369 * The uiomove() above could have been partially
370 * successful, which is why we call dmu_write()
371 * below unconditionally. The page was marked
372 * non-dirty above, and we would lose the changes
373 * without doing so. If the uiomove() failed
374 * entirely, we simply write what we had
375 * before one more time.
377 dmu_write(os, zp->z_id, woff,
378 MIN(PAGESIZE, fsize - woff), va, tx);
379 sf_buf_free(sf);
380 sched_unpin();
382 VM_OBJECT_LOCK(obj);
383 vm_page_wakeup(m);
384 } else {
385 if (__predict_false(obj->cache != NULL)) {
386 vm_page_cache_free(obj, OFF_TO_IDX(start),
387 OFF_TO_IDX(start) + 1);
389 dirbytes += bytes;
391 len -= bytes;
392 off = 0;
393 if (error)
394 break;
396 VM_OBJECT_UNLOCK(obj);
397 if (error == 0 && dirbytes > 0)
398 error = dmu_write_uio(os, zp->z_id, uio, dirbytes, tx);
399 return (error);
403 * When a file is memory mapped, we must keep the IO data synchronized
404 * between the DMU cache and the memory mapped pages. What this means:
406 * On Read: We "read" preferentially from memory mapped pages;
407 * otherwise we fall back to the dmu buffer.
409 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
410 * the file is memory mapped.
412 static int
413 mappedread(vnode_t *vp, int nbytes, uio_t *uio)
415 znode_t *zp = VTOZ(vp);
416 objset_t *os = zp->z_zfsvfs->z_os;
417 vm_object_t obj;
418 vm_page_t m;
419 struct sf_buf *sf;
420 int64_t start, off;
421 caddr_t va;
422 int len = nbytes;
423 int error = 0;
424 uint64_t dirbytes;
426 ASSERT(vp->v_mount != NULL);
427 obj = vp->v_object;
428 ASSERT(obj != NULL);
430 start = uio->uio_loffset;
431 off = start & PAGEOFFSET;
432 dirbytes = 0;
433 VM_OBJECT_LOCK(obj);
434 for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
435 uint64_t bytes = MIN(PAGESIZE - off, len);
437 again:
438 if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
439 vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
440 if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
441 goto again;
442 vm_page_busy(m);
443 VM_OBJECT_UNLOCK(obj);
444 if (dirbytes > 0) {
445 error = dmu_read_uio(os, zp->z_id, uio,
446 dirbytes);
447 dirbytes = 0;
449 if (error == 0) {
450 sched_pin();
451 sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
452 va = (caddr_t)sf_buf_kva(sf);
453 error = uiomove(va + off, bytes, UIO_READ, uio);
454 sf_buf_free(sf);
455 sched_unpin();
457 VM_OBJECT_LOCK(obj);
458 vm_page_wakeup(m);
459 } else if (m != NULL && uio->uio_segflg == UIO_NOCOPY) {
461 * The code below is here to make sendfile(2) work
462 * correctly with ZFS. As pointed out by ups@,
463 * sendfile(2) should be changed to use VOP_GETPAGES(),
464 * but that would pessimize sendfile/UFS performance,
465 * which is why this special case is handled in ZFS code.
467 if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
468 goto again;
469 vm_page_busy(m);
470 VM_OBJECT_UNLOCK(obj);
471 if (dirbytes > 0) {
472 error = dmu_read_uio(os, zp->z_id, uio,
473 dirbytes);
474 dirbytes = 0;
476 if (error == 0) {
477 sched_pin();
478 sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
479 va = (caddr_t)sf_buf_kva(sf);
480 error = dmu_read(os, zp->z_id, start + off,
481 bytes, (void *)(va + off));
482 sf_buf_free(sf);
483 sched_unpin();
485 VM_OBJECT_LOCK(obj);
486 vm_page_wakeup(m);
487 if (error == 0)
488 uio->uio_resid -= bytes;
489 } else {
490 dirbytes += bytes;
492 len -= bytes;
493 off = 0;
494 if (error)
495 break;
497 VM_OBJECT_UNLOCK(obj);
498 if (error == 0 && dirbytes > 0)
499 error = dmu_read_uio(os, zp->z_id, uio, dirbytes);
500 return (error);
502 #endif /* PORT_NETBSD */
503 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
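/*
 * Illustrative sketch only (hypothetical helper, not in the original):
 * how zfs_read() below walks a request in zfs_read_chunk_size pieces.
 * P2PHASE() trims the first chunk so that every subsequent chunk
 * starts on a chunk-size boundary.
 */
#if 0
static void
chunk_walk(offset_t off, ssize_t resid)
{
	while (resid > 0) {
		ssize_t nbytes = MIN(resid, zfs_read_chunk_size -
		    P2PHASE(off, zfs_read_chunk_size));
		/* e.g. off = 0x180000 with the default 1MB chunk gives
		 * a first nbytes of 512KB, then full 1MB chunks. */
		off += nbytes;
		resid -= nbytes;
	}
}
#endif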
506 * Read bytes from specified file into supplied buffer.
508 * IN: vp - vnode of file to be read from.
509 * uio - structure supplying read location, range info,
510 * and return buffer.
511 * ioflag - SYNC flags; used to provide FRSYNC semantics.
512 * cr - credentials of caller.
513 * ct - caller context
515 * OUT: uio - updated offset and range, buffer filled.
517 * RETURN: 0 if success
518 * error code if failure
520 * Side Effects:
521 * vp - atime updated if byte count > 0
523 /* ARGSUSED */
524 static int
525 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
527 znode_t *zp = VTOZ(vp);
528 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
529 objset_t *os;
530 ssize_t n, nbytes;
531 int error;
532 rl_t *rl;
534 dprintf("zfs_read called\n");
536 ZFS_ENTER(zfsvfs);
537 ZFS_VERIFY_ZP(zp);
538 os = zfsvfs->z_os;
540 if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) {
541 ZFS_EXIT(zfsvfs);
542 return (EACCES);
546 * Validate file offset
548 if (uio->uio_loffset < (offset_t)0) {
549 ZFS_EXIT(zfsvfs);
550 return (EINVAL);
554 * Fasttrack empty reads
556 if (uio->uio_resid == 0) {
557 ZFS_EXIT(zfsvfs);
558 return (0);
562 * Check for mandatory locks
564 if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
565 if (error = chklock(vp, FREAD,
566 uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
567 ZFS_EXIT(zfsvfs);
568 return (error);
573 * If we're in FRSYNC mode, sync out this znode before reading it.
575 if (ioflag & FRSYNC)
576 zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
579 * Lock the range against changes.
581 rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
584 * If we are reading past end-of-file we can skip
585 * to the end; but we might still need to set atime.
587 if (uio->uio_loffset >= zp->z_phys->zp_size) {
588 error = 0;
589 goto out;
592 ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
593 n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
595 while (n > 0) {
596 nbytes = MIN(n, zfs_read_chunk_size -
597 P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
599 // if (vn_has_cached_data(vp))
600 // error = mappedread(vp, nbytes, uio);
601 // else
602 error = dmu_read_uio(os, zp->z_id, uio, nbytes);
603 if (error) {
604 /* convert checksum errors into IO errors */
605 if (error == ECKSUM)
606 error = EIO;
607 break;
610 n -= nbytes;
613 out:
614 zfs_range_unlock(rl);
616 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
617 ZFS_EXIT(zfsvfs);
618 return (error);
622 * Fault in the pages of the first n bytes specified by the uio structure.
623 * 1 byte in each page is touched and the uio struct is unmodified.
624 * Any error will exit this routine, as this is only a best-effort
625 * attempt to get the pages resident. This is a copy of ufs_trans_touch().
627 static void
628 zfs_prefault_write(ssize_t n, struct uio *uio)
630 struct iovec *iov;
631 ulong_t cnt, incr;
632 caddr_t p;
634 if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace))
635 return;
637 iov = uio->uio_iov;
639 while (n) {
640 cnt = MIN(iov->iov_len, n);
641 if (cnt == 0) {
642 /* empty iov entry */
643 iov++;
644 continue;
646 n -= cnt;
648 * touch each page in this segment.
650 p = iov->iov_base;
651 while (cnt) {
652 if (fubyte(p) == -1)
653 return;
654 incr = MIN(cnt, PAGESIZE);
655 p += incr;
656 cnt -= incr;
659 * touch the last byte in case it straddles a page.
661 p--;
662 if (fubyte(p) == -1)
663 return;
664 iov++;
669 * Write the bytes to a file.
671 * IN: vp - vnode of file to be written to.
672 * uio - structure supplying write location, range info,
673 * and data buffer.
674 * ioflag - IO_APPEND flag set if in append mode.
675 * cr - credentials of caller.
676 * ct - caller context (NFS/CIFS fem monitor only)
678 * OUT: uio - updated offset and range.
680 * RETURN: 0 if success
681 * error code if failure
683 * Timestamps:
684 * vp - ctime|mtime updated if byte count > 0
686 /* ARGSUSED */
687 static int
688 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
690 znode_t *zp = VTOZ(vp);
691 rlim64_t limit = MAXOFFSET_T;
692 ssize_t start_resid = uio->uio_resid;
693 ssize_t tx_bytes;
694 uint64_t end_size;
695 dmu_tx_t *tx;
696 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
697 zilog_t *zilog;
698 offset_t woff;
699 ssize_t n, nbytes;
700 rl_t *rl;
701 int max_blksz = zfsvfs->z_max_blksz;
702 uint64_t pflags;
703 int error;
705 dprintf("zfs_write called\n");
708 * Fasttrack empty write
710 n = start_resid;
711 if (n == 0)
712 return (0);
714 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
715 limit = MAXOFFSET_T;
717 ZFS_ENTER(zfsvfs);
718 ZFS_VERIFY_ZP(zp);
721 * If immutable or not appending then return EPERM
723 pflags = zp->z_phys->zp_flags;
724 if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
725 ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
726 (uio->uio_loffset < zp->z_phys->zp_size))) {
727 ZFS_EXIT(zfsvfs);
728 return (EPERM);
731 zilog = zfsvfs->z_log;
734 * Pre-fault the pages to ensure slow (e.g. NFS) pages
735 * don't hold up txg.
737 zfs_prefault_write(n, uio);
740 * If in append mode, set the io offset pointer to eof.
742 if (ioflag & IO_APPEND) {
744 * Range lock for a file append:
745 * The value for the start of range will be determined by
746 * zfs_range_lock() (to guarantee append semantics).
747 * If this write will cause the block size to increase,
748 * zfs_range_lock() will lock the entire file, so we must
749 * later reduce the range after we grow the block size.
751 rl = zfs_range_lock(zp, 0, n, RL_APPEND);
752 if (rl->r_len == UINT64_MAX) {
753 /* overlocked, zp_size can't change */
754 woff = uio->uio_loffset = zp->z_phys->zp_size;
755 } else {
756 woff = uio->uio_loffset = rl->r_off;
758 } else {
759 woff = uio->uio_loffset;
761 * Validate file offset
763 if (woff < 0) {
764 ZFS_EXIT(zfsvfs);
765 return (EINVAL);
769 * If we need to grow the block size then zfs_range_lock()
770 * will lock a wider range than we request here.
771 * Later after growing the block size we reduce the range.
773 rl = zfs_range_lock(zp, woff, n, RL_WRITER);
776 if (woff >= limit) {
777 zfs_range_unlock(rl);
778 ZFS_EXIT(zfsvfs);
779 return (EFBIG);
782 if ((woff + n) > limit || woff > (limit - n))
783 n = limit - woff;
786 * Check for mandatory locks
788 if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
789 (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
790 zfs_range_unlock(rl);
791 ZFS_EXIT(zfsvfs);
792 return (error);
794 end_size = MAX(zp->z_phys->zp_size, woff + n);
797 * Write the file in reasonable size chunks. Each chunk is written
798 * in a separate transaction; this keeps the intent log records small
799 * and allows us to do more fine-grained space accounting.
801 while (n > 0) {
803 * Start a transaction.
805 woff = uio->uio_loffset;
806 tx = dmu_tx_create(zfsvfs->z_os);
807 dmu_tx_hold_bonus(tx, zp->z_id);
808 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
809 error = dmu_tx_assign(tx, zfsvfs->z_assign);
810 if (error) {
811 if (error == ERESTART &&
812 zfsvfs->z_assign == TXG_NOWAIT) {
813 dmu_tx_wait(tx);
814 dmu_tx_abort(tx);
815 continue;
817 dmu_tx_abort(tx);
818 break;
822 * If zfs_range_lock() over-locked we grow the blocksize
823 * and then reduce the lock range. This will only happen
824 * on the first iteration since zfs_range_reduce() will
825 * shrink down r_len to the appropriate size.
827 if (rl->r_len == UINT64_MAX) {
828 uint64_t new_blksz;
830 if (zp->z_blksz > max_blksz) {
831 ASSERT(!ISP2(zp->z_blksz));
832 new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
833 } else {
834 new_blksz = MIN(end_size, max_blksz);
836 zfs_grow_blocksize(zp, new_blksz, tx);
837 zfs_range_reduce(rl, woff, n);
841 * XXX - should we really limit each write to z_max_blksz?
842 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
844 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
846 if (woff + nbytes > zp->z_phys->zp_size)
847 uvm_vnp_setsize(vp, woff + nbytes);
849 rw_enter(&zp->z_map_lock, RW_READER);
851 tx_bytes = uio->uio_resid;
852 if (vn_has_cached_data(vp)) {
853 rw_exit(&zp->z_map_lock);
854 // error = mappedwrite(vp, nbytes, uio, tx);
855 } else {
856 error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
857 uio, nbytes, tx);
858 rw_exit(&zp->z_map_lock);
860 tx_bytes -= uio->uio_resid;
863 * If we made no progress, we're done. If we made even
864 * partial progress, update the znode and ZIL accordingly.
866 if (tx_bytes == 0) {
867 dmu_tx_commit(tx);
868 ASSERT(error != 0);
869 break;
873 * Clear Set-UID/Set-GID bits on successful write if not
874 * privileged and at least one of the execute bits is set.
876 * It would be nice to do this after all writes have
877 * been done, but that would still expose the ISUID/ISGID
878 * to another app after the partial write is committed.
880 * Note: we don't call zfs_fuid_map_id() here because
881 * user 0 is not an ephemeral uid.
883 mutex_enter(&zp->z_acl_lock);
884 if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
885 (S_IXUSR >> 6))) != 0 &&
886 (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
887 secpolicy_vnode_setid_retain(cr, (zp->z_phys->zp_mode & S_ISUID) != 0 && zp->z_phys->zp_uid == 0) != 0) {
888 zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
890 mutex_exit(&zp->z_acl_lock);
893 * Update time stamp. NOTE: This marks the bonus buffer as
894 * dirty, so we don't have to do it again for zp_size.
896 zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
899 * Update the file size (zp_size) if it has changed;
900 * account for possible concurrent updates.
902 while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
903 (void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
904 uio->uio_loffset);
905 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
906 dmu_tx_commit(tx);
908 if (error != 0)
909 break;
910 ASSERT(tx_bytes == nbytes);
911 n -= nbytes;
914 zfs_range_unlock(rl);
917 * If we're in replay mode, or we made no progress, return error.
918 * Otherwise, it's at least a partial write, so it's successful.
920 if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) {
921 ZFS_EXIT(zfsvfs);
922 return (error);
925 if (ioflag & (FSYNC | FDSYNC))
926 zil_commit(zilog, zp->z_last_itx, zp->z_id);
928 ZFS_EXIT(zfsvfs);
930 return (0);
933 void
934 zfs_get_done(dmu_buf_t *db, void *vzgd)
936 zgd_t *zgd = (zgd_t *)vzgd;
937 rl_t *rl = zgd->zgd_rl;
938 vnode_t *vp = ZTOV(rl->r_zp);
939 int vfslocked;
941 dmu_buf_rele(db, vzgd);
942 zfs_range_unlock(rl);
944 * Release the vnode asynchronously as we currently have the
945 * txg stopped from syncing.
947 vrele(vp);
948 zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
949 kmem_free(zgd, sizeof (zgd_t));
953 * Get data to generate a TX_WRITE intent log record.
956 zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
958 zfsvfs_t *zfsvfs = arg;
959 objset_t *os = zfsvfs->z_os;
960 znode_t *zp;
961 uint64_t off = lr->lr_offset;
962 dmu_buf_t *db;
963 rl_t *rl;
964 zgd_t *zgd;
965 int dlen = lr->lr_length; /* length of user data */
966 int error = 0;
968 ASSERT(zio);
969 ASSERT(dlen != 0);
972 * Nothing to do if the file has been removed
974 if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
975 return (ENOENT);
976 if (zp->z_unlinked) {
978 * Release the vnode asynchronously as we currently have the
979 * txg stopped from syncing.
981 vrele(ZTOV(zp));
983 return (ENOENT);
987 * Write records come in two flavors: immediate and indirect.
988 * For small writes it's cheaper to store the data with the
989 * log record (immediate); for large writes it's cheaper to
990 * sync the data and get a pointer to it (indirect) so that
991 * we don't have to write the data twice.
993 if (buf != NULL) { /* immediate write */
994 rl = zfs_range_lock(zp, off, dlen, RL_READER);
995 /* test for truncation needs to be done while range locked */
996 if (off >= zp->z_phys->zp_size) {
997 error = ENOENT;
998 goto out;
1000 VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf));
1001 } else { /* indirect write */
1002 uint64_t boff; /* block starting offset */
1005 * Have to lock the whole block to ensure when it's
1006 * written out and its checksum is being calculated
1007 * that no one can change the data. We need to re-check
1008 * blocksize after we get the lock in case it's changed!
1010 for (;;) {
1011 if (ISP2(zp->z_blksz)) {
1012 boff = P2ALIGN_TYPED(off, zp->z_blksz,
1013 uint64_t);
1014 } else {
1015 boff = 0;
1017 dlen = zp->z_blksz;
1018 rl = zfs_range_lock(zp, boff, dlen, RL_READER);
1019 if (zp->z_blksz == dlen)
1020 break;
1021 zfs_range_unlock(rl);
1023 /* test for truncation needs to be done while range locked */
1024 if (off >= zp->z_phys->zp_size) {
1025 error = ENOENT;
1026 goto out;
1028 zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
1029 zgd->zgd_rl = rl;
1030 zgd->zgd_zilog = zfsvfs->z_log;
1031 zgd->zgd_bp = &lr->lr_blkptr;
1032 VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
1033 ASSERT(boff == db->db_offset);
1034 lr->lr_blkoff = off - boff;
1035 error = dmu_sync(zio, db, &lr->lr_blkptr,
1036 lr->lr_common.lrc_txg, zfs_get_done, zgd);
1037 ASSERT((error && error != EINPROGRESS) ||
1038 lr->lr_length <= zp->z_blksz);
1039 if (error == 0)
1040 zil_add_block(zfsvfs->z_log, &lr->lr_blkptr);
1042 * If we get EINPROGRESS, then we need to wait for a
1043 * write IO initiated by dmu_sync() to complete before
1044 * we can release this dbuf. We will finish everything
1045 * up in the zfs_get_done() callback.
1047 if (error == EINPROGRESS)
1048 return (0);
1049 dmu_buf_rele(db, zgd);
1050 kmem_free(zgd, sizeof (zgd_t));
1052 out:
1053 zfs_range_unlock(rl);
1055 * Release the vnode asynchronously as we currently have the
1056 * txg stopped from syncing.
1058 vrele(ZTOV(zp));
1059 return (error);
1062 /*ARGSUSED*/
1063 static int
1064 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1065 caller_context_t *ct)
1067 znode_t *zp = VTOZ(vp);
1068 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1069 int error;
1071 ZFS_ENTER(zfsvfs);
1072 ZFS_VERIFY_ZP(zp);
1074 if (flag & V_ACE_MASK)
1075 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1076 else
1077 error = zfs_zaccess_rwx(zp, mode, flag, cr);
1079 ZFS_EXIT(zfsvfs);
1080 return (error);
1084 * Lookup an entry in a directory, or an extended attribute directory.
1085 * If it exists, return a held vnode reference for it.
1087 * IN: dvp - vnode of directory to search.
1088 * nm - name of entry to lookup.
1089 * pnp - full pathname to lookup [UNUSED].
1090 * flags - LOOKUP_XATTR set if looking for an attribute.
1091 * rdir - root directory vnode [UNUSED].
1092 * cr - credentials of caller.
1093 * ct - caller context
1094 * direntflags - directory lookup flags
1095 * realpnp - returned pathname.
1097 * OUT: vpp - vnode of located entry, NULL if not found.
1099 * RETURN: 0 if success
1100 * error code if failure
1102 * Timestamps:
1103 * NA
1105 /* ARGSUSED */
1106 static int
1107 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
1108 int nameiop, cred_t *cr, int flags)
1110 znode_t *zdp = VTOZ(dvp);
1111 zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1112 int error;
1113 int *direntflags = NULL;
1114 void *realpnp = NULL;
1116 ZFS_ENTER(zfsvfs);
1117 ZFS_VERIFY_ZP(zdp);
1119 *vpp = NULL;
1120 dprintf("zfs_lookup called %s\n", nm);
1121 if (flags & LOOKUP_XATTR) {
1122 #ifdef TODO
1124 * If the xattr property is off, refuse the lookup request.
1126 if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1127 ZFS_EXIT(zfsvfs);
1128 return (EINVAL);
1130 #endif
1133 * We don't allow recursive attributes.
1134 * Maybe someday we will.
1136 if (zdp->z_phys->zp_flags & ZFS_XATTR) {
1137 ZFS_EXIT(zfsvfs);
1138 return (EINVAL);
1141 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1142 ZFS_EXIT(zfsvfs);
1143 return (error);
1147 * Do we have permission to get into attribute directory?
1149 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1150 B_FALSE, cr)) {
1151 VN_RELE(*vpp);
1152 *vpp = NULL;
1155 ZFS_EXIT(zfsvfs);
1156 return (error);
1159 if (dvp->v_type != VDIR) {
1160 ZFS_EXIT(zfsvfs);
1161 return (ENOTDIR);
1165 * Check accessibility of directory.
1167 if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1168 ZFS_EXIT(zfsvfs);
1169 return (error);
1173 * Before tediously performing a linear scan of the directory,
1174 * check the name cache to see if the directory/name pair
1175 * we are looking for is known already.
1178 if ((error = cache_lookup(dvp, vpp, cnp)) >= 0) {
1179 ZFS_EXIT(zfsvfs);
1180 return (error);
1183 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1184 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1185 ZFS_EXIT(zfsvfs);
1186 return (EILSEQ);
1189 error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
1190 if (error == 0) {
1192 * Convert device special files
1194 if (IS_DEVVP(*vpp)) {
1195 vnode_t *svp;
1197 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1198 VN_RELE(*vpp);
1199 if (svp == NULL)
1200 error = ENOSYS;
1201 else
1202 *vpp = svp;
1206 ZFS_EXIT(zfsvfs);
1208 /* Translate errors and add SAVENAME when needed. */
1209 if (cnp->cn_flags & ISLASTCN) {
1210 switch (nameiop) {
1211 case CREATE:
1212 case RENAME:
1213 if (error == ENOENT) {
1214 error = EJUSTRETURN;
1215 cnp->cn_flags |= SAVENAME;
1216 break;
1218 /* FALLTHROUGH */
1219 case DELETE:
1220 if (error == 0)
1221 cnp->cn_flags |= SAVENAME;
1222 break;
1226 if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
1227 int ltype = 0;
1229 if (cnp->cn_flags & ISDOTDOT) {
1230 ltype = VOP_ISLOCKED(dvp);
1231 VOP_UNLOCK(dvp, 0);
1233 error = vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
1234 if (cnp->cn_flags & ISDOTDOT)
1235 vn_lock(dvp, ltype | LK_RETRY);
1236 if (error != 0) {
1237 VN_RELE(*vpp);
1238 *vpp = NULL;
1239 return (error);
1244 * Insert name into cache if appropriate.
1246 if ((cnp->cn_flags & MAKEENTRY) == 0){
1247 return (error);
1249 switch (error) {
1250 case 0:
1251 cache_enter(dvp, *vpp, cnp);
1252 break;
1253 case ENOENT:
1254 if (nameiop != CREATE)
1255 cache_enter(dvp, *vpp, cnp);
1256 break;
1257 default:
1258 break;
1260 return (error);
1264 * Attempt to create a new entry in a directory. If the entry
1265 * already exists, truncate the file if permissible, else return
1266 * an error. Return the vp of the created or trunc'd file.
1268 * IN: dvp - vnode of directory to put new file entry in.
1269 * name - name of new file entry.
1270 * vap - attributes of new file.
1271 * excl - flag indicating exclusive or non-exclusive mode.
1272 * mode - mode to open file with.
1273 * cr - credentials of caller.
1274 * flag - large file flag [UNUSED].
1275 * ct - caller context
1276 * vsecp - ACL to be set
1278 * OUT: vpp - vnode of created or trunc'd entry.
1280 * RETURN: 0 if success
1281 * error code if failure
1283 * Timestamps:
1284 * dvp - ctime|mtime updated if new entry created
1285 * vp - ctime|mtime always, atime if new
1288 /* ARGSUSED */
1289 static int
1290 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
1291 vnode_t **vpp, cred_t *cr)
1293 znode_t *zp, *dzp = VTOZ(dvp);
1294 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1295 zilog_t *zilog;
1296 objset_t *os;
1297 zfs_dirlock_t *dl;
1298 dmu_tx_t *tx;
1299 int error;
1300 zfs_acl_t *aclp = NULL;
1301 zfs_fuid_info_t *fuidp = NULL;
1302 void *vsecp = NULL;
1303 int flag = 0;
1305 dprintf("zfs_create called\n");
1307 * If we have an ephemeral id, ACL, or XVATTR then
1308 * make sure the file system is at the proper version
1311 if (zfsvfs->z_use_fuids == B_FALSE &&
1312 (vsecp || (vap->va_mask & AT_XVATTR) ||
1313 IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))))
1314 return (EINVAL);
1316 ZFS_ENTER(zfsvfs);
1317 ZFS_VERIFY_ZP(dzp);
1318 os = zfsvfs->z_os;
1319 zilog = zfsvfs->z_log;
1321 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1322 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1323 ZFS_EXIT(zfsvfs);
1324 return (EILSEQ);
1327 if (vap->va_mask & AT_XVATTR) {
1328 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1329 crgetuid(cr), cr, vap->va_type)) != 0) {
1330 ZFS_EXIT(zfsvfs);
1331 return (error);
1334 top:
1335 *vpp = NULL;
1337 if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
1338 vap->va_mode &= ~S_ISVTX;
1340 if (*name == '\0') {
1342 * Null component name refers to the directory itself.
1344 VN_HOLD(dvp);
1345 zp = dzp;
1346 dl = NULL;
1347 error = 0;
1348 } else {
1349 /* possible VN_HOLD(zp) */
1350 int zflg = 0;
1352 if (flag & FIGNORECASE)
1353 zflg |= ZCILOOK;
1355 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1356 NULL, NULL);
1357 if (error) {
1358 if (strcmp(name, "..") == 0)
1359 error = EISDIR;
1360 ZFS_EXIT(zfsvfs);
1361 if (aclp)
1362 zfs_acl_free(aclp);
1363 return (error);
1366 if (vsecp && aclp == NULL) {
1367 error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp);
1368 if (error) {
1369 ZFS_EXIT(zfsvfs);
1370 if (dl)
1371 zfs_dirent_unlock(dl, 0);
1372 return (error);
1376 if (zp == NULL) {
1377 uint64_t txtype;
1380 * Create a new file object and update the directory
1381 * to reference it.
1383 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1384 goto out;
1388 * We only support the creation of regular files in
1389 * extended attribute directories.
1391 if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
1392 (vap->va_type != VREG)) {
1393 error = EINVAL;
1394 goto out;
1397 tx = dmu_tx_create(os);
1398 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1399 if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) ||
1400 IS_EPHEMERAL(crgetgid(cr))) {
1401 if (zfsvfs->z_fuid_obj == 0) {
1402 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1403 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1404 FUID_SIZE_ESTIMATE(zfsvfs));
1405 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ,
1406 FALSE, NULL);
1407 } else {
1408 dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
1409 dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
1410 FUID_SIZE_ESTIMATE(zfsvfs));
1413 dmu_tx_hold_bonus(tx, dzp->z_id);
1414 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1415 if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) {
1416 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1417 0, SPA_MAXBLOCKSIZE);
1419 error = dmu_tx_assign(tx, zfsvfs->z_assign);
1420 if (error) {
1421 zfs_dirent_unlock(dl, 0);
1422 if (error == ERESTART &&
1423 zfsvfs->z_assign == TXG_NOWAIT) {
1424 dmu_tx_wait(tx);
1425 dmu_tx_abort(tx);
1426 goto top;
1428 dmu_tx_abort(tx);
1429 ZFS_EXIT(zfsvfs);
1430 if (aclp)
1431 zfs_acl_free(aclp);
1432 return (error);
1434 zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp);
1435 (void) zfs_link_create(dl, zp, tx, ZNEW);
1436 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1437 if (flag & FIGNORECASE)
1438 txtype |= TX_CI;
1439 zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1440 vsecp, fuidp, vap);
1441 if (fuidp)
1442 zfs_fuid_info_free(fuidp);
1443 dmu_tx_commit(tx);
1444 } else {
1445 int aflags = (flag & FAPPEND) ? V_APPEND : 0;
1448 * A directory entry already exists for this name.
1451 * Can't truncate an existing file if in exclusive mode.
1453 if (excl == EXCL) {
1454 error = EEXIST;
1455 goto out;
1458 * Can't open a directory for writing.
1460 if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
1461 error = EISDIR;
1462 goto out;
1465 * Verify requested access to file.
1467 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
1468 goto out;
1471 mutex_enter(&dzp->z_lock);
1472 dzp->z_seq++;
1473 mutex_exit(&dzp->z_lock);
1476 * Truncate regular files if requested.
1478 if ((ZTOV(zp)->v_type == VREG) &&
1479 (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
1480 /* we can't hold any locks when calling zfs_freesp() */
1481 zfs_dirent_unlock(dl, 0);
1482 dl = NULL;
1483 error = zfs_freesp(zp, 0, 0, mode, TRUE);
1484 if (error == 0) {
1485 vnevent_create(ZTOV(zp), NULL);
1489 out:
1490 if (dl)
1491 zfs_dirent_unlock(dl, 0);
1493 if (error) {
1494 if (zp)
1495 VN_RELE(ZTOV(zp));
1496 } else {
1497 *vpp = ZTOV(zp);
1499 * If vnode is for a device return a specfs vnode instead.
1501 if (IS_DEVVP(*vpp)) {
1502 struct vnode *svp;
1504 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1505 VN_RELE(*vpp);
1506 if (svp == NULL) {
1507 error = ENOSYS;
1509 *vpp = svp;
1512 if (aclp)
1513 zfs_acl_free(aclp);
1515 ZFS_EXIT(zfsvfs);
1516 return (error);
1520 * Remove an entry from a directory.
1522 * IN: dvp - vnode of directory to remove entry from.
1523 * name - name of entry to remove.
1524 * cr - credentials of caller.
1525 * ct - caller context
1526 * flags - case flags
1528 * RETURN: 0 if success
1529 * error code if failure
1531 * Timestamps:
1532 * dvp - ctime|mtime
1533 * vp - ctime (if nlink > 0)
1535 /*ARGSUSED*/
1536 static int
1537 zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
1538 int flags)
1540 znode_t *zp, *dzp = VTOZ(dvp);
1541 znode_t *xzp = NULL;
1542 vnode_t *vp;
1543 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1544 zilog_t *zilog;
1545 uint64_t acl_obj, xattr_obj;
1546 zfs_dirlock_t *dl;
1547 dmu_tx_t *tx;
1548 boolean_t may_delete_now, delete_now = FALSE;
1549 boolean_t unlinked, toobig = FALSE;
1550 uint64_t txtype;
1551 pathname_t *realnmp = NULL;
1552 pathname_t realnm;
1553 int error;
1554 int zflg = ZEXISTS;
1556 dprintf("zfs_remove called\n");
1558 ZFS_ENTER(zfsvfs);
1559 ZFS_VERIFY_ZP(dzp);
1560 zilog = zfsvfs->z_log;
1562 if (flags & FIGNORECASE) {
1563 zflg |= ZCILOOK;
1564 pn_alloc(&realnm);
1565 realnmp = &realnm;
1568 top:
1570 * Attempt to lock directory; fail if entry doesn't exist.
1572 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1573 NULL, realnmp)) {
1574 if (realnmp)
1575 pn_free(realnmp);
1576 ZFS_EXIT(zfsvfs);
1577 return (error);
1580 vp = ZTOV(zp);
1582 if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1583 goto out;
1587 * Need to use rmdir for removing directories.
1589 if (vp->v_type == VDIR) {
1590 error = EPERM;
1591 goto out;
1594 vnevent_remove(vp, dvp, name, ct);
1596 if (realnmp)
1597 dnlc_remove(dvp, realnmp->pn_buf);
1598 else
1599 dnlc_remove(dvp, name);
1601 may_delete_now = FALSE;
1604 * We may delete the znode now, or we may put it in the unlinked set;
1605 * it depends on whether we're the last link, and on whether there are
1606 * other holds on the vnode. So we dmu_tx_hold() the right things to
1607 * allow for either case.
1609 tx = dmu_tx_create(zfsvfs->z_os);
1610 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1611 dmu_tx_hold_bonus(tx, zp->z_id);
1612 if (may_delete_now) {
1613 toobig =
1614 zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
1615 /* if the file is too big, only hold_free a token amount */
1616 dmu_tx_hold_free(tx, zp->z_id, 0,
1617 (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1620 /* are there any extended attributes? */
1621 if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
1622 /* XXX - do we need this if we are deleting? */
1623 dmu_tx_hold_bonus(tx, xattr_obj);
1626 /* are there any additional acls */
1627 if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
1628 may_delete_now)
1629 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1631 /* charge as an update -- would be nice not to charge at all */
1632 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1634 error = dmu_tx_assign(tx, zfsvfs->z_assign);
1635 if (error) {
1636 zfs_dirent_unlock(dl, 0);
1637 VN_RELE(vp);
1638 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1639 dmu_tx_wait(tx);
1640 dmu_tx_abort(tx);
1641 goto top;
1643 if (realnmp)
1644 pn_free(realnmp);
1645 dmu_tx_abort(tx);
1646 ZFS_EXIT(zfsvfs);
1647 return (error);
1651 * Remove the directory entry.
1653 error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1655 if (error) {
1656 dmu_tx_commit(tx);
1657 goto out;
1660 if (0 && unlinked) {
1661 KASSERT(0); /* NetBSD: must not happen now */
1662 VI_LOCK(vp);
1663 delete_now = may_delete_now && !toobig &&
1664 vp->v_count == 1 && !vn_has_cached_data(vp) &&
1665 zp->z_phys->zp_xattr == xattr_obj &&
1666 zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
1667 VI_UNLOCK(vp);
1670 if (delete_now) {
1671 KASSERT(0); /* NetBSD: must not happen now */
1672 if (zp->z_phys->zp_xattr) {
1673 error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
1674 ASSERT3U(error, ==, 0);
1675 ASSERT3U(xzp->z_phys->zp_links, ==, 2);
1676 dmu_buf_will_dirty(xzp->z_dbuf, tx);
1677 mutex_enter(&xzp->z_lock);
1678 xzp->z_unlinked = 1;
1679 xzp->z_phys->zp_links = 0;
1680 mutex_exit(&xzp->z_lock);
1681 zfs_unlinked_add(xzp, tx);
1682 zp->z_phys->zp_xattr = 0; /* probably unnecessary */
1684 mutex_enter(&zp->z_lock);
1685 VI_LOCK(vp);
1686 vp->v_count--;
1687 ASSERT3U(vp->v_count, ==, 0);
1688 VI_UNLOCK(vp);
1689 mutex_exit(&zp->z_lock);
1690 zfs_znode_delete(zp, tx);
1691 } else if (unlinked) {
1692 zfs_unlinked_add(zp, tx);
1695 txtype = TX_REMOVE;
1696 if (flags & FIGNORECASE)
1697 txtype |= TX_CI;
1698 zfs_log_remove(zilog, tx, txtype, dzp, name);
1700 dmu_tx_commit(tx);
1701 out:
1702 if (realnmp)
1703 pn_free(realnmp);
1705 zfs_dirent_unlock(dl, 0);
1707 if (!delete_now) {
1708 VN_RELE(vp);
1709 } else if (xzp) {
1710 /* this rele is delayed to prevent nesting transactions */
1711 VN_RELE(ZTOV(xzp));
1714 ZFS_EXIT(zfsvfs);
1715 return (error);
1719 * Create a new directory and insert it into dvp using the name
1720 * provided. Return a pointer to the inserted directory.
1722 * IN: dvp - vnode of directory to add subdir to.
1723 * dirname - name of new directory.
1724 * vap - attributes of new directory.
1725 * cr - credentials of caller.
1726 * ct - caller context
1727 * vsecp - ACL to be set
1729 * OUT: vpp - vnode of created directory.
1731 * RETURN: 0 if success
1732 * error code if failure
1734 * Timestamps:
1735 * dvp - ctime|mtime updated
1736 * vp - ctime|mtime|atime updated
1738 /*ARGSUSED*/
1739 static int
1740 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
1741 caller_context_t *ct, int flags, vsecattr_t *vsecp)
1743 znode_t *zp, *dzp = VTOZ(dvp);
1744 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1745 zilog_t *zilog;
1746 zfs_dirlock_t *dl;
1747 uint64_t txtype;
1748 dmu_tx_t *tx;
1749 int error;
1750 zfs_acl_t *aclp = NULL;
1751 zfs_fuid_info_t *fuidp = NULL;
1752 int zf = ZNEW;
1754 ASSERT(vap->va_type == VDIR);
1757 * If we have an ephemeral id, ACL, or XVATTR then
1758 * make sure the file system is at the proper version
1761 if (zfsvfs->z_use_fuids == B_FALSE &&
1762 (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(crgetuid(cr))||
1763 IS_EPHEMERAL(crgetgid(cr))))
1764 return (EINVAL);
1766 ZFS_ENTER(zfsvfs);
1767 ZFS_VERIFY_ZP(dzp);
1768 zilog = zfsvfs->z_log;
1770 if (dzp->z_phys->zp_flags & ZFS_XATTR) {
1771 ZFS_EXIT(zfsvfs);
1772 return (EINVAL);
1775 if (zfsvfs->z_utf8 && u8_validate(dirname,
1776 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1777 ZFS_EXIT(zfsvfs);
1778 return (EILSEQ);
1780 if (flags & FIGNORECASE)
1781 zf |= ZCILOOK;
1783 if (vap->va_mask & AT_XVATTR)
1784 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1785 crgetuid(cr), cr, vap->va_type)) != 0) {
1786 ZFS_EXIT(zfsvfs);
1787 return (error);
1791 * First make sure the new directory doesn't exist.
1793 top:
1794 *vpp = NULL;
1796 if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
1797 NULL, NULL)) {
1798 ZFS_EXIT(zfsvfs);
1799 return (error);
1802 if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
1803 zfs_dirent_unlock(dl, 0);
1804 ZFS_EXIT(zfsvfs);
1805 return (error);
1808 if (vsecp && aclp == NULL) {
1809 error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp);
1810 if (error) {
1811 zfs_dirent_unlock(dl, 0);
1812 ZFS_EXIT(zfsvfs);
1813 return (error);
1817 * Add a new entry to the directory.
1819 tx = dmu_tx_create(zfsvfs->z_os);
1820 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1821 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1822 if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) ||
1823 IS_EPHEMERAL(crgetgid(cr))) {
1824 if (zfsvfs->z_fuid_obj == 0) {
1825 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1826 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1827 FUID_SIZE_ESTIMATE(zfsvfs));
1828 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
1829 } else {
1830 dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
1831 dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
1832 FUID_SIZE_ESTIMATE(zfsvfs));
1835 if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp)
1836 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1837 0, SPA_MAXBLOCKSIZE);
1838 error = dmu_tx_assign(tx, zfsvfs->z_assign);
1839 if (error) {
1840 zfs_dirent_unlock(dl, 0);
1841 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1842 dmu_tx_wait(tx);
1843 dmu_tx_abort(tx);
1844 goto top;
1846 dmu_tx_abort(tx);
1847 ZFS_EXIT(zfsvfs);
1848 if (aclp)
1849 zfs_acl_free(aclp);
1850 return (error);
1854 * Create new node.
1856 zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp);
1858 if (aclp)
1859 zfs_acl_free(aclp);
1862 * Now put new name in parent dir.
1864 (void) zfs_link_create(dl, zp, tx, ZNEW);
1866 *vpp = ZTOV(zp);
1868 txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
1869 if (flags & FIGNORECASE)
1870 txtype |= TX_CI;
1871 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, fuidp, vap);
1873 if (fuidp)
1874 zfs_fuid_info_free(fuidp);
1875 dmu_tx_commit(tx);
1877 zfs_dirent_unlock(dl, 0);
1879 ZFS_EXIT(zfsvfs);
1880 return (0);
1884 * Remove a directory subdir entry. If the current working
1885 * directory is the same as the subdir to be removed, the
1886 * remove will fail.
1888 * IN: dvp - vnode of directory to remove from.
1889 * name - name of directory to be removed.
1890 * cwd - vnode of current working directory.
1891 * cr - credentials of caller.
1892 * ct - caller context
1893 * flags - case flags
1895 * RETURN: 0 if success
1896 * error code if failure
1898 * Timestamps:
1899 * dvp - ctime|mtime updated
1901 /*ARGSUSED*/
1902 static int
1903 zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
1904 caller_context_t *ct, int flags)
1906 znode_t *dzp = VTOZ(dvp);
1907 znode_t *zp;
1908 vnode_t *vp;
1909 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1910 zilog_t *zilog;
1911 zfs_dirlock_t *dl;
1912 dmu_tx_t *tx;
1913 int error;
1914 int zflg = ZEXISTS;
1916 ZFS_ENTER(zfsvfs);
1917 ZFS_VERIFY_ZP(dzp);
1918 zilog = zfsvfs->z_log;
1920 if (flags & FIGNORECASE)
1921 zflg |= ZCILOOK;
1922 top:
1923 zp = NULL;
1926 * Attempt to lock directory; fail if entry doesn't exist.
1928 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1929 NULL, NULL)) {
1930 ZFS_EXIT(zfsvfs);
1931 return (error);
1934 vp = ZTOV(zp);
1936 if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1937 goto out;
1940 if (vp->v_type != VDIR) {
1941 error = ENOTDIR;
1942 goto out;
1945 if (vp == cwd) {
1946 error = EINVAL;
1947 goto out;
1950 vnevent_rmdir(vp, dvp, name, ct);
1953 * Grab a lock on the parent pointer to make sure we play well
1954 * with the treewalk and directory rename code.
1956 rw_enter(&zp->z_parent_lock, RW_WRITER);
1958 tx = dmu_tx_create(zfsvfs->z_os);
1959 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1960 dmu_tx_hold_bonus(tx, zp->z_id);
1961 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1962 error = dmu_tx_assign(tx, zfsvfs->z_assign);
1963 if (error) {
1964 rw_exit(&zp->z_parent_lock);
1965 rw_exit(&zp->z_name_lock);
1966 zfs_dirent_unlock(dl, 0);
1967 VN_RELE(vp);
1968 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1969 dmu_tx_wait(tx);
1970 dmu_tx_abort(tx);
1971 goto top;
1973 dmu_tx_abort(tx);
1974 ZFS_EXIT(zfsvfs);
1975 return (error);
1978 /* Purge cache entries, while still holding locks. */
1979 cache_purge(dvp);
1980 cache_purge(vp);
1982 error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
1984 if (error == 0) {
1985 uint64_t txtype = TX_RMDIR;
1986 if (flags & FIGNORECASE)
1987 txtype |= TX_CI;
1988 zfs_log_remove(zilog, tx, txtype, dzp, name);
1991 dmu_tx_commit(tx);
1993 rw_exit(&zp->z_parent_lock);
1994 rw_exit(&zp->z_name_lock);
1995 out:
1996 zfs_dirent_unlock(dl, 0);
1998 VN_RELE(vp);
2000 ZFS_EXIT(zfsvfs);
2001 return (error);
2005 * Read as many directory entries as will fit into the provided
2006 * buffer from the given directory cursor position (specified in
2007 * the uio structure).
2009 * IN: vp - vnode of directory to read.
2010 * uio - structure supplying read location, range info,
2011 * and return buffer.
2012 * cr - credentials of caller.
2013 * ct - caller context
2014 * flags - case flags
2016 * OUT: uio - updated offset and range, buffer filled.
2017 * eofp - set to true if end-of-file detected.
2019 * RETURN: 0 if success
2020 * error code if failure
2022 * Timestamps:
2023 * vp - atime updated
2025 * Note that the low 4 bits of the cookie returned by zap are always zero.
2026 * This allows us to use the low range for "special" directory entries:
2027 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
2028 * we use the offset 2 for the '.zfs' directory.
2030 /* ARGSUSED */
2031 static int
2032 zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
2034 znode_t *zp = VTOZ(vp);
2035 iovec_t *iovp;
2036 edirent_t *eodp;
2037 dirent64_t *odp;
2038 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2039 objset_t *os;
2040 caddr_t outbuf;
2041 size_t bufsize;
2042 zap_cursor_t zc;
2043 zap_attribute_t zap;
2044 uint_t bytes_wanted;
2045 uint64_t offset; /* must be unsigned; checks for < 1 */
2046 int local_eof;
2047 int outcount;
2048 int error;
2049 uint8_t prefetch;
2050 boolean_t check_sysattrs;
2051 uint8_t type;
2052 int ncooks;
2053 u_long *cooks = NULL;
2054 int flags = 0;
2056 dprintf("zfs_readdir called\n");
2058 ZFS_ENTER(zfsvfs);
2059 ZFS_VERIFY_ZP(zp);
2062 * If we are not given an eof variable,
2063 * use a local one.
2065 if (eofp == NULL)
2066 eofp = &local_eof;
2069 * Check for valid iov_len.
2071 if (uio->uio_iov->iov_len <= 0) {
2072 ZFS_EXIT(zfsvfs);
2073 return (EINVAL);
2077 * Quit if directory has been removed (posix)
2079 if ((*eofp = zp->z_unlinked) != 0) {
2080 ZFS_EXIT(zfsvfs);
2081 return (0);
2084 error = 0;
2085 os = zfsvfs->z_os;
2086 offset = uio->uio_loffset;
2087 prefetch = zp->z_zn_prefetch;
2090 * Initialize the iterator cursor.
2092 if (offset <= 3) {
2094 * Start iteration from the beginning of the directory.
2096 zap_cursor_init(&zc, os, zp->z_id);
2097 } else {
2099 * The offset is a serialized cursor.
2101 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2105 * Get space to change directory entries into fs independent format.
2107 iovp = uio->uio_iov;
2108 bytes_wanted = iovp->iov_len;
2109 if (!VMSPACE_IS_KERNEL_P(uio->uio_vmspace) || uio->uio_iovcnt != 1) {
2110 bufsize = bytes_wanted;
2111 outbuf = kmem_alloc(bufsize, KM_SLEEP);
2112 memset(outbuf, 0, bufsize);
2113 odp = (struct dirent64 *)outbuf;
2114 } else {
2115 bufsize = bytes_wanted;
2116 odp = (struct dirent64 *)iovp->iov_base;
2118 eodp = (struct edirent *)odp;
2120 if (ncookies != NULL) {
2122 * Minimum entry size is the dirent size plus 1 byte for a file name.
2124 ncooks = uio->uio_resid / _DIRENT_MINSIZE(odp);
2125 // sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
2126 cooks = kmem_alloc(ncooks * sizeof(u_long), KM_SLEEP);
2128 memset(cooks, 0, ncooks * sizeof(u_long));
2129 *cookies = cooks;
2130 *ncookies = ncooks;
2134 * If this VFS supports the system attribute view interface; and
2135 * we're looking at an extended attribute directory; and we care
2136 * about normalization conflicts on this vfs; then we must check
2137 * for normalization conflicts with the sysattr name space.
2139 #ifdef TODO
2140 check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2141 (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2142 (flags & V_RDDIR_ENTFLAGS);
2143 #else
2144 check_sysattrs = 0;
2145 #endif
2148 * Transform to file-system independent format
2150 outcount = 0;
2151 while (outcount < bytes_wanted) {
2152 ino64_t objnum;
2153 ushort_t reclen;
2154 off64_t *next;
2157 * Special case `.', `..', and `.zfs'.
2159 if (offset == 0) {
2160 (void) strcpy(zap.za_name, ".");
2161 zap.za_normalization_conflict = 0;
2162 objnum = zp->z_id;
2163 type = DT_DIR;
2164 } else if (offset == 1) {
2165 (void) strcpy(zap.za_name, "..");
2166 zap.za_normalization_conflict = 0;
2167 objnum = zp->z_phys->zp_parent;
2168 type = DT_DIR;
2169 } else if (offset == 2 && zfs_show_ctldir(zp)) {
2170 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2171 zap.za_normalization_conflict = 0;
2172 objnum = ZFSCTL_INO_ROOT;
2173 type = DT_DIR;
2174 } else {
2176 * Grab next entry.
2178 if (error = zap_cursor_retrieve(&zc, &zap)) {
2179 if ((*eofp = (error == ENOENT)) != 0)
2180 break;
2181 else
2182 goto update;
2185 if (zap.za_integer_length != 8 ||
2186 zap.za_num_integers != 1) {
2187 cmn_err(CE_WARN, "zap_readdir: bad directory "
2188 "entry, obj = %lld, offset = %lld\n",
2189 (u_longlong_t)zp->z_id,
2190 (u_longlong_t)offset);
2191 error = ENXIO;
2192 goto update;
2195 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2197 * Mac OS X can extract the object type here, e.g.:
2198 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2200 type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2202 if (check_sysattrs && !zap.za_normalization_conflict) {
2203 #ifdef TODO
2204 zap.za_normalization_conflict =
2205 xattr_sysattr_casechk(zap.za_name);
2206 #else
2207 panic("%s:%u: TODO", __func__, __LINE__);
2208 #endif
2212 if (flags & V_RDDIR_ENTFLAGS)
2213 reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2214 else
2215 reclen = _DIRENT_RECLEN(odp, strlen(zap.za_name));
2218 * Will this entry fit in the buffer?
2220 if (outcount + reclen > bufsize) {
2222 * Did we manage to fit anything in the buffer?
2224 if (!outcount) {
2225 error = EINVAL;
2226 goto update;
2228 break;
2230 if (flags & V_RDDIR_ENTFLAGS) {
2232 * Add extended flag entry:
2234 eodp->ed_ino = objnum;
2235 eodp->ed_reclen = reclen;
2236 /* NOTE: ed_off is the offset for the *next* entry */
2237 next = &(eodp->ed_off);
2238 eodp->ed_eflags = zap.za_normalization_conflict ?
2239 ED_CASE_CONFLICT : 0;
2240 (void) strncpy(eodp->ed_name, zap.za_name,
2241 EDIRENT_NAMELEN(reclen));
2242 eodp = (edirent_t *)((intptr_t)eodp + reclen);
2243 } else {
2244 /*
2245 * Add normal entry:
2246 */
2247 odp->d_ino = objnum;
2248 odp->d_reclen = reclen;
2249 odp->d_namlen = strlen(zap.za_name);
2250 (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
2251 odp->d_type = type;
2252 odp = (dirent64_t *)((intptr_t)odp + reclen);
2253 }
2254 outcount += reclen;
2256 KASSERT(outcount <= bufsize);
2258 /* Prefetch znode */
2259 if (prefetch)
2260 dmu_prefetch(os, objnum, 0, 0);
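/*
 * A zero-length dmu_prefetch() starts an asynchronous read of the
 * block holding this entry's dnode, so a later lookup of the name
 * should not have to block on metadata I/O.
 */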
2262 /*
2263 * Move to the next entry, fill in the previous offset.
2264 */
2265 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2266 zap_cursor_advance(&zc);
2267 offset = zap_cursor_serialize(&zc);
2268 } else {
2269 offset += 1;
2270 }
2272 if (cooks != NULL) {
2273 *cooks++ = offset;
2274 ncooks--;
2275 KASSERT(ncooks >= 0);
2276 }
2277 }
2278 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2280 /* Subtract unused cookies */
2281 if (ncookies != NULL)
2282 *ncookies -= ncooks;
2284 if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace) && uio->uio_iovcnt == 1) {
2285 iovp->iov_base += outcount;
2286 iovp->iov_len -= outcount;
2287 uio->uio_resid -= outcount;
2288 } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2289 /*
2290 * Reset the pointer.
2291 */
2292 offset = uio->uio_loffset;
2293 }
2295 update:
2296 zap_cursor_fini(&zc);
2297 if (!VMSPACE_IS_KERNEL_P(uio->uio_vmspace) || uio->uio_iovcnt != 1)
2298 kmem_free(outbuf, bufsize);
2300 if (error == ENOENT)
2301 error = 0;
2303 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2305 uio->uio_loffset = offset;
2306 ZFS_EXIT(zfsvfs);
2307 if (error != 0 && cookies != NULL) {
2308 kmem_free(*cookies, ncooks * sizeof(u_long));
2309 *cookies = NULL;
2310 *ncookies = 0;
2311 }
2312 return (error);
2313 }
2315 ulong_t zfs_fsync_sync_cnt = 4;
2317 static int
2318 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2319 {
2320 znode_t *zp = VTOZ(vp);
2321 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2322 int error;
2324 error = 0;
2326 dprintf("zfs_fsync called vp %p -- zfsvfs %p\n", vp, zfsvfs);
2327 (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2329 ZFS_ENTER(zfsvfs);
2330 ZFS_VERIFY_ZP(zp);
2331 /*
2332 * NetBSD: if the sync is from reclaim or from ioflush,
2333 * push dirty atime now. No need to lock: in the reclaim
2334 * case, everything is single threaded and for ioflush this
2335 * is a lazy writeback.
2336 *
2337 * XXXNETBSD: in the ioflush case, we don't want to push anything
2338 * to disk immediately. We just want to queue the update so it
2339 * will happen "soon". Check that this is the case, otherwise
2340 * zfs will perform poorly.
2341 */
2342 if (zp->z_atime_dirty && zp->z_unlinked == 0 &&
2343 (syncflag & (FSYNC_RECLAIM | FSYNC_LAZY)) != 0) {
2344 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
2346 dmu_tx_hold_bonus(tx, zp->z_id);
2347 error = dmu_tx_assign(tx, TXG_WAIT);
2348 if (error) {
2349 dmu_tx_abort(tx);
2350 } else {
2351 dmu_buf_will_dirty(zp->z_dbuf, tx);
2352 mutex_enter(&zp->z_lock);
2353 zp->z_atime_dirty = 0;
2354 mutex_exit(&zp->z_lock);
2355 dmu_tx_commit(tx);
2356 }
2357 }
2358 zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
2359 ZFS_EXIT(zfsvfs);
2360 return (0);
2361 }
2364 /*
2365 * Get the requested file attributes and place them in the provided
2366 * vattr structure.
2368 * IN: vp - vnode of file.
2369 * vap - va_mask identifies requested attributes.
2370 * If AT_XVATTR set, then optional attrs are requested
2371 * flags - ATTR_NOACLCHECK (CIFS server context)
2372 * cr - credentials of caller.
2373 * ct - caller context
2375 * OUT: vap - attribute values.
2377 * RETURN: 0 (always succeeds)
2378 */
2379 /* ARGSUSED */
2380 static int
2381 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2382 caller_context_t *ct)
2383 {
2384 znode_t *zp = VTOZ(vp);
2385 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2386 znode_phys_t *pzp;
2387 int error = 0;
2388 uint32_t blksize;
2389 u_longlong_t nblocks;
2390 uint64_t links;
2391 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
2392 xoptattr_t *xoap = NULL;
2393 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2395 dprintf("zfs_getattr called\n");
2397 ZFS_ENTER(zfsvfs);
2398 ZFS_VERIFY_ZP(zp);
2399 pzp = zp->z_phys;
2401 mutex_enter(&zp->z_lock);
2403 /*
2404 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2405 * Also, if we are the owner don't bother, since owner should
2406 * always be allowed to read basic attributes of file.
2407 */
2408 if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) &&
2409 (pzp->zp_uid != crgetuid(cr))) {
2410 if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2411 skipaclchk, cr)) {
2412 mutex_exit(&zp->z_lock);
2413 ZFS_EXIT(zfsvfs);
2414 return (error);
2415 }
2416 }
2418 /*
2419 * Return all attributes. It's cheaper to provide the answer
2420 * than to determine whether we were asked the question.
2421 */
2423 vap->va_type = IFTOVT(pzp->zp_mode);
2424 vap->va_mode = pzp->zp_mode & ~S_IFMT;
2425 zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2426 vap->va_nodeid = zp->z_id;
2427 if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2428 links = pzp->zp_links + 1;
2429 else
2430 links = pzp->zp_links;
2431 vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */
2432 vap->va_size = pzp->zp_size;
2433 vap->va_fsid = vp->v_mount->mnt_stat.f_fsidx.__fsid_val[0];
2434 // vap->va_fsid = 0;
2435 vap->va_rdev = zfs_cmpldev(pzp->zp_rdev);
2436 vap->va_seq = zp->z_seq;
2437 vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */
2439 /*
2440 * Add in any requested optional attributes and the create time.
2441 * Also set the corresponding bits in the returned attribute bitmap.
2442 */
2443 if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2444 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2445 xoap->xoa_archive =
2446 ((pzp->zp_flags & ZFS_ARCHIVE) != 0);
2447 XVA_SET_RTN(xvap, XAT_ARCHIVE);
2448 }
2450 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2451 xoap->xoa_readonly =
2452 ((pzp->zp_flags & ZFS_READONLY) != 0);
2453 XVA_SET_RTN(xvap, XAT_READONLY);
2454 }
2456 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2457 xoap->xoa_system =
2458 ((pzp->zp_flags & ZFS_SYSTEM) != 0);
2459 XVA_SET_RTN(xvap, XAT_SYSTEM);
2460 }
2462 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2463 xoap->xoa_hidden =
2464 ((pzp->zp_flags & ZFS_HIDDEN) != 0);
2465 XVA_SET_RTN(xvap, XAT_HIDDEN);
2466 }
2468 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2469 xoap->xoa_nounlink =
2470 ((pzp->zp_flags & ZFS_NOUNLINK) != 0);
2471 XVA_SET_RTN(xvap, XAT_NOUNLINK);
2472 }
2474 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2475 xoap->xoa_immutable =
2476 ((pzp->zp_flags & ZFS_IMMUTABLE) != 0);
2477 XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2478 }
2480 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2481 xoap->xoa_appendonly =
2482 ((pzp->zp_flags & ZFS_APPENDONLY) != 0);
2483 XVA_SET_RTN(xvap, XAT_APPENDONLY);
2484 }
2486 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2487 xoap->xoa_nodump =
2488 ((pzp->zp_flags & ZFS_NODUMP) != 0);
2489 XVA_SET_RTN(xvap, XAT_NODUMP);
2490 }
2492 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2493 xoap->xoa_opaque =
2494 ((pzp->zp_flags & ZFS_OPAQUE) != 0);
2495 XVA_SET_RTN(xvap, XAT_OPAQUE);
2496 }
2498 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2499 xoap->xoa_av_quarantined =
2500 ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0);
2501 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2502 }
2504 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2505 xoap->xoa_av_modified =
2506 ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0);
2507 XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2508 }
2510 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2511 vp->v_type == VREG &&
2512 (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) {
2513 size_t len;
2514 dmu_object_info_t doi;
2516 /*
2517 * Only VREG files have anti-virus scanstamps, so we
2518 * won't conflict with symlinks in the bonus buffer.
2519 */
2520 dmu_object_info_from_db(zp->z_dbuf, &doi);
2521 len = sizeof (xoap->xoa_av_scanstamp) +
2522 sizeof (znode_phys_t);
2523 if (len <= doi.doi_bonus_size) {
2524 /*
2525 * pzp points to the start of the
2526 * znode_phys_t. pzp + 1 points to the
2527 * first byte after the znode_phys_t.
2528 */
2529 (void) memcpy(xoap->xoa_av_scanstamp,
2530 pzp + 1,
2531 sizeof (xoap->xoa_av_scanstamp));
2532 XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
2533 }
2534 }
2536 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2537 ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime);
2538 XVA_SET_RTN(xvap, XAT_CREATETIME);
2539 }
2540 }
2542 ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
2543 ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
2544 ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
2545 ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime);
2547 mutex_exit(&zp->z_lock);
2549 dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks);
2550 vap->va_blksize = blksize;
2551 vap->va_bytes = nblocks << 9; /* nblocks * 512 */
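/*
 * va_bytes counts 512-byte blocks actually allocated on disk; for
 * sparse or compressed files this can be well below va_size.
 */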
2553 if (zp->z_blksz == 0) {
2554 /*
2555 * Block size hasn't been set; suggest maximal I/O transfers.
2556 */
2557 vap->va_blksize = zfsvfs->z_max_blksz;
2558 }
2560 ZFS_EXIT(zfsvfs);
2561 return (0);
2562 }
2564 /*
2565 * Set the file attributes to the values contained in the
2566 * vattr structure.
2568 * IN: vp - vnode of file to be modified.
2569 * vap - new attribute values.
2570 * If AT_XVATTR set, then optional attrs are being set
2571 * flags - ATTR_UTIME set if non-default time values provided.
2572 * - ATTR_NOACLCHECK (CIFS context only).
2573 * cr - credentials of caller.
2574 * ct - caller context
2576 * RETURN: 0 if success
2577 * error code if failure
2579 * Timestamps:
2580 * vp - ctime updated, mtime updated if size changed.
2581 */
2582 /* ARGSUSED */
2583 static int
2584 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2585 caller_context_t *ct)
2586 {
2587 znode_t *zp = VTOZ(vp);
2588 znode_phys_t *pzp;
2589 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2590 zilog_t *zilog;
2591 dmu_tx_t *tx;
2592 vattr_t oldva;
2593 uint_t mask = vap->va_mask;
2594 uint_t saved_mask;
2595 int trim_mask = 0;
2596 uint64_t new_mode;
2597 znode_t *attrzp;
2598 int need_policy = FALSE;
2599 int err;
2600 zfs_fuid_info_t *fuidp = NULL;
2601 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
2602 xoptattr_t *xoap;
2603 zfs_acl_t *aclp = NULL;
2604 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2606 dprintf("zfs_setattr called\n");
2608 if (mask == 0)
2609 return (0);
2611 if (mask & AT_NOSET)
2612 return (EINVAL);
2614 ZFS_ENTER(zfsvfs);
2615 ZFS_VERIFY_ZP(zp);
2617 pzp = zp->z_phys;
2618 zilog = zfsvfs->z_log;
2620 /*
2621 * Make sure that if we have ephemeral uid/gid or xvattr specified
2622 * that the file system is at the proper version level
2623 */
2625 if (zfsvfs->z_use_fuids == B_FALSE &&
2626 (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2627 ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2628 (mask & AT_XVATTR))) {
2629 ZFS_EXIT(zfsvfs);
2630 return (EINVAL);
2631 }
2633 if (mask & AT_SIZE && vp->v_type == VDIR) {
2634 ZFS_EXIT(zfsvfs);
2635 return (EISDIR);
2636 }
2638 if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2639 ZFS_EXIT(zfsvfs);
2640 return (EINVAL);
2641 }
2643 /*
2644 * If this is an xvattr_t, then get a pointer to the structure of
2645 * optional attributes. If this is NULL, then we have a vattr_t.
2646 */
2647 xoap = xva_getxoptattr(xvap);
2649 /*
2650 * Immutable files can only alter immutable bit and atime
2651 */
2652 if ((pzp->zp_flags & ZFS_IMMUTABLE) &&
2653 ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2654 ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2655 ZFS_EXIT(zfsvfs);
2656 return (EPERM);
2657 }
2659 if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) {
2660 ZFS_EXIT(zfsvfs);
2661 return (EPERM);
2662 }
2664 /*
2665 * Verify timestamps don't overflow 32 bits.
2666 * ZFS can handle large timestamps, but 32bit syscalls can't
2667 * handle times greater than 2039. This check should be removed
2668 * once large timestamps are fully supported.
2669 */
2670 if (mask & (AT_ATIME | AT_MTIME)) {
2671 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2672 ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2673 ZFS_EXIT(zfsvfs);
2674 return (EOVERFLOW);
2675 }
2676 }
2678 top:
2679 attrzp = NULL;
2681 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2682 ZFS_EXIT(zfsvfs);
2683 return (EROFS);
2684 }
2686 /*
2687 * First validate permissions
2688 */
2689 if (mask & AT_SIZE) {
2690 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
2691 if (err) {
2692 ZFS_EXIT(zfsvfs);
2693 return (err);
2694 }
2695 /*
2696 * XXX - Note, we are not providing any open
2697 * mode flags here (like FNDELAY), so we may
2698 * block if there are locks present... this
2699 * should be addressed in openat().
2700 */
2701 /* XXX - would it be OK to generate a log record here? */
2702 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2703 if (err) {
2704 ZFS_EXIT(zfsvfs);
2705 return (err);
2706 }
2707 }
2709 if (mask & (AT_ATIME|AT_MTIME) ||
2710 ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2711 XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2712 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2713 XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2714 XVA_ISSET_REQ(xvap, XAT_SYSTEM))))
2715 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2716 skipaclchk, cr);
2718 if (mask & (AT_UID|AT_GID)) {
2719 int idmask = (mask & (AT_UID|AT_GID));
2720 int take_owner;
2721 int take_group;
2723 /*
2724 * NOTE: even if a new mode is being set,
2725 * we may clear S_ISUID/S_ISGID bits.
2726 */
2728 if (!(mask & AT_MODE))
2729 vap->va_mode = pzp->zp_mode;
2731 /*
2732 * Take ownership or chgrp to group we are a member of
2733 */
2735 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
2736 take_group = (mask & AT_GID) &&
2737 zfs_groupmember(zfsvfs, vap->va_gid, cr);
2739 /*
2740 * If both AT_UID and AT_GID are set then take_owner and
2741 * take_group must both be set in order to allow taking
2742 * ownership.
2744 * Otherwise, send the check through secpolicy_vnode_setattr()
2745 */
2748 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
2749 ((idmask == AT_UID) && take_owner) ||
2750 ((idmask == AT_GID) && take_group)) {
2751 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2752 skipaclchk, cr) == 0) {
2753 /*
2754 * Remove setuid/setgid for non-privileged users
2755 */
2756 secpolicy_setid_clear(vap, cr);
2757 trim_mask = (mask & (AT_UID|AT_GID));
2758 } else {
2759 need_policy = TRUE;
2760 }
2761 } else {
2762 need_policy = TRUE;
2763 }
2764 }
2766 mutex_enter(&zp->z_lock);
2767 oldva.va_mode = pzp->zp_mode;
2768 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2769 if (mask & AT_XVATTR) {
2770 if ((need_policy == FALSE) &&
2771 (XVA_ISSET_REQ(xvap, XAT_APPENDONLY) &&
2772 xoap->xoa_appendonly !=
2773 ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) ||
2774 (XVA_ISSET_REQ(xvap, XAT_NOUNLINK) &&
2775 xoap->xoa_nounlink !=
2776 ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) ||
2777 (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE) &&
2778 xoap->xoa_immutable !=
2779 ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) ||
2780 (XVA_ISSET_REQ(xvap, XAT_NODUMP) &&
2781 xoap->xoa_nodump !=
2782 ((pzp->zp_flags & ZFS_NODUMP) != 0)) ||
2783 (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED) &&
2784 xoap->xoa_av_modified !=
2785 ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) ||
2786 ((XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED) &&
2787 ((vp->v_type != VREG && xoap->xoa_av_quarantined) ||
2788 xoap->xoa_av_quarantined !=
2789 ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)))) ||
2790 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
2791 (XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2792 need_policy = TRUE;
2793 }
2794 }
2795 mutex_exit(&zp->z_lock);
2797 if (mask & AT_MODE) {
2798 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
2799 err = secpolicy_setid_setsticky_clear(vp, vap,
2800 &oldva, cr);
2801 if (err) {
2802 ZFS_EXIT(zfsvfs);
2803 return (err);
2804 }
2805 trim_mask |= AT_MODE;
2806 } else {
2807 need_policy = TRUE;
2808 }
2809 }
2811 if (need_policy) {
2812 /*
2813 * If trim_mask is set then take ownership
2814 * has been granted or write_acl is present and user
2815 * has the ability to modify mode. In that case remove
2816 * UID|GID and or MODE from mask so that
2817 * secpolicy_vnode_setattr() doesn't revoke it.
2818 */
2820 if (trim_mask) {
2821 saved_mask = vap->va_mask;
2822 vap->va_mask &= ~trim_mask;
2823 }
2824 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
2825 (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
2826 if (err) {
2827 ZFS_EXIT(zfsvfs);
2828 return (err);
2829 }
2831 if (trim_mask)
2832 vap->va_mask |= saved_mask;
2833 }
2834 /*
2835 * secpolicy_vnode_setattr, or take ownership may have
2836 * changed va_mask
2837 */
2838 mask = vap->va_mask;
2840 tx = dmu_tx_create(zfsvfs->z_os);
2841 dmu_tx_hold_bonus(tx, zp->z_id);
2842 if (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2843 ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid))) {
2844 if (zfsvfs->z_fuid_obj == 0) {
2845 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
2846 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2847 FUID_SIZE_ESTIMATE(zfsvfs));
2848 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
2849 } else {
2850 dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
2851 dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
2852 FUID_SIZE_ESTIMATE(zfsvfs));
2853 }
2854 }
2856 if (mask & AT_MODE) {
2857 uint64_t pmode = pzp->zp_mode;
2859 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
2861 if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) {
2862 dmu_tx_abort(tx);
2863 ZFS_EXIT(zfsvfs);
2864 return (err);
2865 }
2866 if (pzp->zp_acl.z_acl_extern_obj) {
2867 /* Are we upgrading ACL from old V0 format to new V1 */
2868 if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
2869 pzp->zp_acl.z_acl_version ==
2870 ZFS_ACL_VERSION_INITIAL) {
2871 dmu_tx_hold_free(tx,
2872 pzp->zp_acl.z_acl_extern_obj, 0,
2873 DMU_OBJECT_END);
2874 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2875 0, aclp->z_acl_bytes);
2876 } else {
2877 dmu_tx_hold_write(tx,
2878 pzp->zp_acl.z_acl_extern_obj, 0,
2879 aclp->z_acl_bytes);
2880 }
2881 } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2882 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2883 0, aclp->z_acl_bytes);
2884 }
2885 }
2887 if ((mask & (AT_UID | AT_GID)) && pzp->zp_xattr != 0) {
2888 err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp);
2889 if (err) {
2890 dmu_tx_abort(tx);
2891 ZFS_EXIT(zfsvfs);
2892 if (aclp)
2893 zfs_acl_free(aclp);
2894 return (err);
2895 }
2896 dmu_tx_hold_bonus(tx, attrzp->z_id);
2897 }
2899 err = dmu_tx_assign(tx, zfsvfs->z_assign);
2900 if (err) {
2901 if (attrzp)
2902 VN_RELE(ZTOV(attrzp));
2904 if (aclp) {
2905 zfs_acl_free(aclp);
2906 aclp = NULL;
2907 }
2909 if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
2910 dmu_tx_wait(tx);
2911 dmu_tx_abort(tx);
2912 goto top;
2913 }
2914 dmu_tx_abort(tx);
2915 ZFS_EXIT(zfsvfs);
2916 return (err);
2917 }
2919 dmu_buf_will_dirty(zp->z_dbuf, tx);
2921 /*
2922 * Set each attribute requested.
2923 * We group settings according to the locks they need to acquire.
2925 * Note: you cannot set ctime directly, although it will be
2926 * updated as a side-effect of calling this function.
2927 */
2929 mutex_enter(&zp->z_lock);
2931 if (mask & AT_MODE) {
2932 mutex_enter(&zp->z_acl_lock);
2933 zp->z_phys->zp_mode = new_mode;
2934 err = zfs_aclset_common(zp, aclp, cr, &fuidp, tx);
2935 ASSERT3U(err, ==, 0);
2936 mutex_exit(&zp->z_acl_lock);
2937 }
2939 if (attrzp)
2940 mutex_enter(&attrzp->z_lock);
2942 if (mask & AT_UID) {
2943 pzp->zp_uid = zfs_fuid_create(zfsvfs,
2944 vap->va_uid, cr, ZFS_OWNER, tx, &fuidp);
2945 if (attrzp) {
2946 attrzp->z_phys->zp_uid = zfs_fuid_create(zfsvfs,
2947 vap->va_uid, cr, ZFS_OWNER, tx, &fuidp);
2948 }
2949 }
2951 if (mask & AT_GID) {
2952 pzp->zp_gid = zfs_fuid_create(zfsvfs, vap->va_gid,
2953 cr, ZFS_GROUP, tx, &fuidp);
2954 if (attrzp)
2955 attrzp->z_phys->zp_gid = zfs_fuid_create(zfsvfs,
2956 vap->va_gid, cr, ZFS_GROUP, tx, &fuidp);
2957 }
2959 if (aclp)
2960 zfs_acl_free(aclp);
2962 if (attrzp)
2963 mutex_exit(&attrzp->z_lock);
2965 if (mask & AT_ATIME)
2966 ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
2968 if (mask & AT_MTIME)
2969 ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
2971 /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
2972 if (mask & AT_SIZE)
2973 zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
2974 else if (mask != 0)
2975 zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
2976 /*
2977 * Do this after setting timestamps to prevent timestamp
2978 * update from toggling bit
2979 */
2981 if (xoap && (mask & AT_XVATTR)) {
2982 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
2983 size_t len;
2984 dmu_object_info_t doi;
2986 ASSERT(vp->v_type == VREG);
2988 /* Grow the bonus buffer if necessary. */
2989 dmu_object_info_from_db(zp->z_dbuf, &doi);
2990 len = sizeof (xoap->xoa_av_scanstamp) +
2991 sizeof (znode_phys_t);
2992 if (len > doi.doi_bonus_size)
2993 VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0);
2994 }
2995 zfs_xvattr_set(zp, xvap);
2996 }
2998 if (mask != 0)
2999 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3001 if (fuidp)
3002 zfs_fuid_info_free(fuidp);
3003 mutex_exit(&zp->z_lock);
3005 if (attrzp)
3006 VN_RELE(ZTOV(attrzp));
3008 dmu_tx_commit(tx);
3010 ZFS_EXIT(zfsvfs);
3011 return (err);
3012 }
3014 typedef struct zfs_zlock {
3015 krwlock_t *zl_rwlock; /* lock we acquired */
3016 znode_t *zl_znode; /* znode we held */
3017 struct zfs_zlock *zl_next; /* next in list */
3018 } zfs_zlock_t;
3020 /*
3021 * Drop locks and release vnodes that were held by zfs_rename_lock().
3022 */
3023 static void
3024 zfs_rename_unlock(zfs_zlock_t **zlpp)
3025 {
3026 zfs_zlock_t *zl;
3028 while ((zl = *zlpp) != NULL) {
3029 if (zl->zl_znode != NULL)
3030 VN_RELE(ZTOV(zl->zl_znode));
3031 rw_exit(zl->zl_rwlock);
3032 *zlpp = zl->zl_next;
3033 kmem_free(zl, sizeof (*zl));
3034 }
3035 }
3037 /*
3038 * Search back through the directory tree, using the ".." entries.
3039 * Lock each directory in the chain to prevent concurrent renames.
3040 * Fail any attempt to move a directory into one of its own descendants.
3041 * XXX - z_parent_lock can overlap with map or grow locks
3042 */
3043 static int
3044 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
3045 {
3046 zfs_zlock_t *zl;
3047 znode_t *zp = tdzp;
3048 uint64_t rootid = zp->z_zfsvfs->z_root;
3049 uint64_t *oidp = &zp->z_id;
3050 krwlock_t *rwlp = &szp->z_parent_lock;
3051 krw_t rw = RW_WRITER;
3053 /*
3054 * First pass write-locks szp and compares to zp->z_id.
3055 * Later passes read-lock zp and compare to zp->z_parent.
3056 */
3057 do {
3058 if (!rw_tryenter(rwlp, rw)) {
3059 /*
3060 * Another thread is renaming in this path.
3061 * Note that if we are a WRITER, we don't have any
3062 * parent_locks held yet.
3063 */
3064 if (rw == RW_READER && zp->z_id > szp->z_id) {
3065 /*
3066 * Drop our locks and restart
3067 */
3068 zfs_rename_unlock(&zl);
3069 *zlpp = NULL;
3070 zp = tdzp;
3071 oidp = &zp->z_id;
3072 rwlp = &szp->z_parent_lock;
3073 rw = RW_WRITER;
3074 continue;
3075 } else {
3076 /*
3077 * Wait for other thread to drop its locks
3078 */
3079 rw_enter(rwlp, rw);
3080 }
3081 }
3083 zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
3084 zl->zl_rwlock = rwlp;
3085 zl->zl_znode = NULL;
3086 zl->zl_next = *zlpp;
3087 *zlpp = zl;
3089 if (*oidp == szp->z_id) /* We're a descendant of szp */
3090 return (EINVAL);
3092 if (*oidp == rootid) /* We've hit the top */
3093 return (0);
3095 if (rw == RW_READER) { /* i.e. not the first pass */
3096 int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
3097 if (error)
3098 return (error);
3099 zl->zl_znode = zp;
3100 }
3101 oidp = &zp->z_phys->zp_parent;
3102 rwlp = &zp->z_parent_lock;
3103 rw = RW_READER;
3105 } while (zp->z_id != sdzp->z_id);
3107 return (0);
3108 }
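/*
 * Example: for the /usr/a/b -> /usr/a/b/c/d move rejected above, tdzp
 * is c and the ".." walk reaches b (szp) before the root or sdzp, so
 * the descendant check returns EINVAL and the cycle is refused.
 */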
3110 /*
3111 * Move an entry from the provided source directory to the target
3112 * directory. Change the entry name as indicated.
3114 * IN: sdvp - Source directory containing the "old entry".
3115 * snm - Old entry name.
3116 * tdvp - Target directory to contain the "new entry".
3117 * tnm - New entry name.
3118 * cr - credentials of caller.
3119 * ct - caller context
3120 * flags - case flags
3122 * RETURN: 0 if success
3123 * error code if failure
3125 * Timestamps:
3126 * sdvp,tdvp - ctime|mtime updated
3127 */
3128 /* XXX NetBSD: There is a significant problem with dirent locking during a
3129 * rename of files that are in the same dir, where zfs_dirent_lock is called
3130 * twice on the same lock, which panics a LOCKDEBUG kernel. Locking twice is
3131 * not needed. The proper solution is to add a new flag to zfs_dirent_lock
3132 * that disables the rw_enter in it. Renaming of files in the same dir is
3133 * considered broken on LOCKDEBUG kernels on NetBSD for now.
3134 */
3135 /*ARGSUSED*/
3136 static int
3137 zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
3138 caller_context_t *ct, int flags)
3139 {
3140 znode_t *tdzp, *szp, *tzp;
3141 znode_t *sdzp = VTOZ(sdvp);
3142 zfsvfs_t *zfsvfs = sdzp->z_zfsvfs;
3143 zilog_t *zilog;
3144 vnode_t *realvp;
3145 zfs_dirlock_t *sdl, *tdl;
3146 dmu_tx_t *tx;
3147 zfs_zlock_t *zl;
3148 int cmp, serr, terr;
3149 int error = 0;
3150 int zflg = 0;
3151 int samedir = 0;
3153 tdl = NULL;
3154 sdl = NULL;
3156 dprintf("zfs_rename called\n");
3158 ZFS_ENTER(zfsvfs);
3159 ZFS_VERIFY_ZP(sdzp);
3160 zilog = zfsvfs->z_log;
3162 /*
3163 * Make sure we have the real vp for the target directory.
3164 */
3165 if (VOP_REALVP(tdvp, &realvp, ct) == 0)
3166 tdvp = realvp;
3168 if (tdvp->v_vfsp != sdvp->v_vfsp) {
3169 ZFS_EXIT(zfsvfs);
3170 return (EXDEV);
3171 }
3173 tdzp = VTOZ(tdvp);
3174 ZFS_VERIFY_ZP(tdzp);
3175 if (zfsvfs->z_utf8 && u8_validate(tnm,
3176 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3177 ZFS_EXIT(zfsvfs);
3178 return (EILSEQ);
3179 }
3181 if (flags & FIGNORECASE)
3182 zflg |= ZCILOOK;
3184 top:
3185 szp = NULL;
3186 tzp = NULL;
3187 zl = NULL;
3189 /*
3190 * This is to prevent the creation of links into attribute space
3191 * by renaming a linked file into/out of an attribute directory.
3192 * See the comment in zfs_link() for why this is considered bad.
3193 */
3194 if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
3195 (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
3196 ZFS_EXIT(zfsvfs);
3197 return (EINVAL);
3198 }
3200 /*
3201 * Lock source and target directory entries. To prevent deadlock,
3202 * a lock ordering must be defined. We lock the directory with
3203 * the smallest object id first, or if it's a tie, the one with
3204 * the lexically first name.
3205 */
3206 if (sdzp->z_id < tdzp->z_id) {
3207 cmp = -1;
3208 } else if (sdzp->z_id > tdzp->z_id) {
3209 cmp = 1;
3210 } else {
3211 /*
3212 * First compare the two name arguments without
3213 * considering any case folding.
3214 */
3215 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
3217 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
3218 ASSERT(error == 0 || !zfsvfs->z_utf8);
3219 if (cmp == 0) {
3220 /*
3221 * POSIX: "If the old argument and the new argument
3222 * both refer to links to the same existing file,
3223 * the rename() function shall return successfully
3224 * and perform no other action."
3225 */
3226 ZFS_EXIT(zfsvfs);
3227 return (0);
3228 }
3229 /*
3230 * If the file system is case-folding, then we may
3231 * have some more checking to do. A case-folding file
3232 * system is either supporting mixed case sensitivity
3233 * access or is completely case-insensitive. Note
3234 * that the file system is always case preserving.
3236 * In mixed sensitivity mode case sensitive behavior
3237 * is the default. FIGNORECASE must be used to
3238 * explicitly request case insensitive behavior.
3240 * If the source and target names provided differ only
3241 * by case (e.g., a request to rename 'tim' to 'Tim'),
3242 * we will treat this as a special case in the
3243 * case-insensitive mode: as long as the source name
3244 * is an exact match, we will allow this to proceed as
3245 * a name-change request.
3246 */
3247 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
3248 (zfsvfs->z_case == ZFS_CASE_MIXED &&
3249 flags & FIGNORECASE)) &&
3250 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
3251 &error) == 0) {
3252 /*
3253 * case preserving rename request, require exact
3254 * name matches
3255 */
3256 zflg |= ZCIEXACT;
3257 zflg &= ~ZCILOOK;
3258 }
3259 }
3261 if (cmp < 0) {
3263 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
3264 ZEXISTS | zflg, NULL, NULL);
3265 if ((serr == 0) && (sdzp == tdzp)) {
3266 /*
3267 * If renaming within the one directory we must
3268 * be careful not to recursively acquire locks.
3269 */
3270 zflg |= ZSAMEDIR;
3271 }
3272 terr = zfs_dirent_lock(&tdl,
3273 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
3274 } else {
3275 terr = zfs_dirent_lock(&tdl,
3276 tdzp, tnm, &tzp, zflg, NULL, NULL);
3278 if ((terr == 0) && (sdzp == tdzp)) {
3279 /*
3280 * If renaming within the one directory we must
3281 * be careful not to recursively acquire locks.
3282 */
3283 zflg |= ZSAMEDIR;
3284 }
3285 serr = zfs_dirent_lock(&sdl,
3286 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
3287 NULL, NULL);
3288 }
3290 if (serr) {
3291 /*
3292 * Source entry invalid or not there.
3293 */
3294 if (!terr) {
3295 zfs_dirent_unlock(tdl, 0);
3296 if (tzp)
3297 VN_RELE(ZTOV(tzp));
3298 }
3299 if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
3300 serr = EINVAL;
3301 ZFS_EXIT(zfsvfs);
3302 return (serr);
3303 }
3304 if (terr) {
3305 if (sdl != NULL)
3306 zfs_dirent_unlock(sdl, 0);
3307 VN_RELE(ZTOV(szp));
3308 if (strcmp(tnm, "..") == 0)
3309 terr = EINVAL;
3310 ZFS_EXIT(zfsvfs);
3311 return (terr);
3312 }
3314 /*
3315 * Must have write access at the source to remove the old entry
3316 * and write access at the target to create the new entry.
3317 * Note that if target and source are the same, this can be
3318 * done in a single check.
3319 */
3321 if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
3322 goto out;
3324 if (ZTOV(szp)->v_type == VDIR) {
3325 /*
3326 * Check to make sure rename is valid.
3327 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3328 */
3329 if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
3330 goto out;
3331 }
3333 /*
3334 * Does target exist?
3335 */
3336 if (tzp) {
3337 /*
3338 * Source and target must be the same type.
3339 */
3340 if (ZTOV(szp)->v_type == VDIR) {
3341 if (ZTOV(tzp)->v_type != VDIR) {
3342 error = ENOTDIR;
3343 goto out;
3344 }
3345 } else {
3346 if (ZTOV(tzp)->v_type == VDIR) {
3347 error = EISDIR;
3348 goto out;
3349 }
3350 }
3351 /*
3352 * POSIX dictates that when the source and target
3353 * entries refer to the same file object, rename
3354 * must do nothing and exit without error.
3355 */
3356 if (szp->z_id == tzp->z_id) {
3357 error = 0;
3358 goto out;
3359 }
3360 }
3362 vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
3363 if (tzp)
3364 vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
3366 /*
3367 * notify the target directory if it is not the same
3368 * as source directory.
3369 */
3370 if (tdvp != sdvp) {
3371 vnevent_rename_dest_dir(tdvp, ct);
3372 }
3374 tx = dmu_tx_create(zfsvfs->z_os);
3375 dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */
3376 dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */
3377 dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3378 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3379 if (sdzp != tdzp)
3380 dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */
3381 if (tzp)
3382 dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */
3383 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3384 error = dmu_tx_assign(tx, zfsvfs->z_assign);
3385 if (error) {
3386 if (zl != NULL)
3387 zfs_rename_unlock(&zl);
3388 zfs_dirent_unlock(sdl, zflg);
3389 zfs_dirent_unlock(tdl, 0);
3390 VN_RELE(ZTOV(szp));
3391 if (tzp)
3392 VN_RELE(ZTOV(tzp));
3393 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
3394 dmu_tx_wait(tx);
3395 dmu_tx_abort(tx);
3396 goto top;
3397 }
3398 dmu_tx_abort(tx);
3399 ZFS_EXIT(zfsvfs);
3400 return (error);
3401 }
3403 if (tzp) /* Attempt to remove the existing target */
3404 error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
3406 if (error == 0) {
3407 error = zfs_link_create(tdl, szp, tx, ZRENAMING);
3408 if (error == 0) {
3409 szp->z_phys->zp_flags |= ZFS_AV_MODIFIED;
3411 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
3412 ASSERT(error == 0);
3414 zfs_log_rename(zilog, tx,
3415 TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0),
3416 sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);
3418 /* Update path information for the target vnode */
3419 vn_renamepath(tdvp, ZTOV(szp), tnm, strlen(tnm));
3420 }
3421 if (error == 0) {
3422 /* Purge cache entries, while still holding locks. */
3423 cache_purge(sdvp);
3424 cache_purge(tdvp);
3425 }
3426 }
3428 dmu_tx_commit(tx);
3429 out:
3430 if (zl != NULL)
3431 zfs_rename_unlock(&zl);
3433 zfs_dirent_unlock(sdl, zflg);
3434 zfs_dirent_unlock(tdl, 0);
3436 VN_RELE(ZTOV(szp));
3437 if (tzp)
3438 VN_RELE(ZTOV(tzp));
3440 ZFS_EXIT(zfsvfs);
3442 return (error);
3443 }
3445 /*
3446 * Insert the indicated symbolic reference entry into the directory.
3448 * IN: dvp - Directory to contain new symbolic link.
3449 * link - Name for new symlink entry.
3450 * vap - Attributes of new entry.
3451 * target - Target path of new symlink.
3452 * cr - credentials of caller.
3453 * ct - caller context
3454 * flags - case flags
3456 * RETURN: 0 if success
3457 * error code if failure
3459 * Timestamps:
3460 * dvp - ctime|mtime updated
3461 */
3462 /*ARGSUSED*/
3463 static int
3464 zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
3465 cred_t *cr)
3466 {
3467 znode_t *zp, *dzp = VTOZ(dvp);
3468 zfs_dirlock_t *dl;
3469 dmu_tx_t *tx;
3470 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
3471 zilog_t *zilog;
3472 int len = strlen(link);
3473 int error;
3474 int zflg = ZNEW;
3475 zfs_fuid_info_t *fuidp = NULL;
3476 int flags = 0;
3478 ASSERT(vap->va_type == VLNK);
3480 ZFS_ENTER(zfsvfs);
3481 ZFS_VERIFY_ZP(dzp);
3482 zilog = zfsvfs->z_log;
3484 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3485 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3486 ZFS_EXIT(zfsvfs);
3487 return (EILSEQ);
3488 }
3489 if (flags & FIGNORECASE)
3490 zflg |= ZCILOOK;
3491 top:
3492 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
3493 ZFS_EXIT(zfsvfs);
3494 return (error);
3495 }
3497 if (len > MAXPATHLEN) {
3498 ZFS_EXIT(zfsvfs);
3499 return (ENAMETOOLONG);
3500 }
3502 /*
3503 * Attempt to lock directory; fail if entry already exists.
3504 */
3505 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
3506 if (error) {
3507 ZFS_EXIT(zfsvfs);
3508 return (error);
3509 }
3511 tx = dmu_tx_create(zfsvfs->z_os);
3512 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3513 dmu_tx_hold_bonus(tx, dzp->z_id);
3514 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3515 if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
3516 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
3517 if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) {
3518 if (zfsvfs->z_fuid_obj == 0) {
3519 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
3520 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3521 FUID_SIZE_ESTIMATE(zfsvfs));
3522 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
3523 } else {
3524 dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
3525 dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
3526 FUID_SIZE_ESTIMATE(zfsvfs));
3527 }
3528 }
3529 error = dmu_tx_assign(tx, zfsvfs->z_assign);
3530 if (error) {
3531 zfs_dirent_unlock(dl, 0);
3532 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
3533 dmu_tx_wait(tx);
3534 dmu_tx_abort(tx);
3535 goto top;
3536 }
3537 dmu_tx_abort(tx);
3538 ZFS_EXIT(zfsvfs);
3539 return (error);
3540 }
3542 dmu_buf_will_dirty(dzp->z_dbuf, tx);
3544 /*
3545 * Create a new object for the symlink.
3546 * Put the link content into bonus buffer if it will fit;
3547 * otherwise, store it just like any other file data.
3548 */
3549 if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
3550 zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, NULL, &fuidp);
3551 if (len != 0)
3552 bcopy(link, zp->z_phys + 1, len);
3553 } else {
3554 dmu_buf_t *dbp;
3556 zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, NULL, &fuidp);
3557 /*
3558 * Nothing can access the znode yet so no locking needed
3559 * for growing the znode's blocksize.
3560 */
3561 zfs_grow_blocksize(zp, len, tx);
3563 VERIFY(0 == dmu_buf_hold(zfsvfs->z_os,
3564 zp->z_id, 0, FTAG, &dbp));
3565 dmu_buf_will_dirty(dbp, tx);
3567 ASSERT3U(len, <=, dbp->db_size);
3568 bcopy(link, dbp->db_data, len);
3569 dmu_buf_rele(dbp, FTAG);
3570 }
3571 zp->z_phys->zp_size = len;
3573 /*
3574 * Insert the new object into the directory.
3575 */
3576 (void) zfs_link_create(dl, zp, tx, ZNEW);
3577 out:
3578 if (error == 0) {
3579 uint64_t txtype = TX_SYMLINK;
3580 if (flags & FIGNORECASE)
3581 txtype |= TX_CI;
3582 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3583 *vpp = ZTOV(zp);
3584 }
3585 if (fuidp)
3586 zfs_fuid_info_free(fuidp);
3588 dmu_tx_commit(tx);
3590 zfs_dirent_unlock(dl, 0);
3592 ZFS_EXIT(zfsvfs);
3593 return (error);
3594 }
3596 /*
3597 * Return, in the buffer contained in the provided uio structure,
3598 * the symbolic path referred to by vp.
3600 * IN: vp - vnode of symbolic link.
3601 * uio - structure to contain the link path.
3602 * cr - credentials of caller.
3603 * ct - caller context
3605 * OUT: uio - structure to contain the link path.
3607 * RETURN: 0 if success
3608 * error code if failure
3610 * Timestamps:
3611 * vp - atime updated
3612 */
3613 /* ARGSUSED */
3614 static int
3615 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
3616 {
3617 znode_t *zp = VTOZ(vp);
3618 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3619 size_t bufsz;
3620 int error;
3622 ZFS_ENTER(zfsvfs);
3623 ZFS_VERIFY_ZP(zp);
3625 bufsz = (size_t)zp->z_phys->zp_size;
3626 if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
3627 error = uiomove(zp->z_phys + 1,
3628 MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
3629 } else {
3630 dmu_buf_t *dbp;
3631 error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp);
3632 if (error) {
3633 ZFS_EXIT(zfsvfs);
3634 return (error);
3635 }
3636 error = uiomove(dbp->db_data,
3637 MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
3638 dmu_buf_rele(dbp, FTAG);
3639 }
3641 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
3642 ZFS_EXIT(zfsvfs);
3643 return (error);
3644 }
3646 /*
3647 * Insert a new entry into directory tdvp referencing svp.
3649 * IN: tdvp - Directory to contain new entry.
3650 * svp - vnode of new entry.
3651 * name - name of new entry.
3652 * cr - credentials of caller.
3653 * ct - caller context
3655 * RETURN: 0 if success
3656 * error code if failure
3658 * Timestamps:
3659 * tdvp - ctime|mtime updated
3660 * svp - ctime updated
3661 */
3662 /* ARGSUSED */
3663 static int
3664 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
3665 caller_context_t *ct, int flags)
3666 {
3667 znode_t *dzp = VTOZ(tdvp);
3668 znode_t *tzp, *szp;
3669 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
3670 zilog_t *zilog;
3671 zfs_dirlock_t *dl;
3672 dmu_tx_t *tx;
3673 vnode_t *realvp;
3674 int error;
3675 int zf = ZNEW;
3676 uid_t owner;
3678 ASSERT(tdvp->v_type == VDIR);
3680 ZFS_ENTER(zfsvfs);
3681 ZFS_VERIFY_ZP(dzp);
3682 zilog = zfsvfs->z_log;
3684 if (VOP_REALVP(svp, &realvp, ct) == 0)
3685 svp = realvp;
3687 if (svp->v_vfsp != tdvp->v_vfsp) {
3688 ZFS_EXIT(zfsvfs);
3689 return (EXDEV);
3690 }
3691 szp = VTOZ(svp);
3692 ZFS_VERIFY_ZP(szp);
3694 if (zfsvfs->z_utf8 && u8_validate(name,
3695 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3696 ZFS_EXIT(zfsvfs);
3697 return (EILSEQ);
3698 }
3699 if (flags & FIGNORECASE)
3700 zf |= ZCILOOK;
3702 top:
3703 /*
3704 * We do not support links between attributes and non-attributes
3705 * because of the potential security risk of creating links
3706 * into "normal" file space in order to circumvent restrictions
3707 * imposed in attribute space.
3708 */
3709 if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
3710 (dzp->z_phys->zp_flags & ZFS_XATTR)) {
3711 ZFS_EXIT(zfsvfs);
3712 return (EINVAL);
3713 }
3715 /*
3716 * POSIX dictates that we return EPERM here.
3717 * Better choices include ENOTSUP or EISDIR.
3718 */
3719 if (svp->v_type == VDIR) {
3720 ZFS_EXIT(zfsvfs);
3721 return (EPERM);
3722 }
3724 owner = zfs_fuid_map_id(zfsvfs, szp->z_phys->zp_uid, cr, ZFS_OWNER);
3725 if (owner != crgetuid(cr) &&
3726 secpolicy_basic_link(cr) != 0) {
3727 ZFS_EXIT(zfsvfs);
3728 return (EPERM);
3729 }
3731 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
3732 ZFS_EXIT(zfsvfs);
3733 return (error);
3734 }
3736 /*
3737 * Attempt to lock directory; fail if entry already exists.
3738 */
3739 error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
3740 if (error) {
3741 ZFS_EXIT(zfsvfs);
3742 return (error);
3743 }
3745 tx = dmu_tx_create(zfsvfs->z_os);
3746 dmu_tx_hold_bonus(tx, szp->z_id);
3747 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3748 error = dmu_tx_assign(tx, zfsvfs->z_assign);
3749 if (error) {
3750 zfs_dirent_unlock(dl, 0);
3751 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
3752 dmu_tx_wait(tx);
3753 dmu_tx_abort(tx);
3754 goto top;
3755 }
3756 dmu_tx_abort(tx);
3757 ZFS_EXIT(zfsvfs);
3758 return (error);
3759 }
3761 error = zfs_link_create(dl, szp, tx, 0);
3763 if (error == 0) {
3764 uint64_t txtype = TX_LINK;
3765 if (flags & FIGNORECASE)
3766 txtype |= TX_CI;
3767 zfs_log_link(zilog, tx, txtype, dzp, szp, name);
3768 }
3770 dmu_tx_commit(tx);
3772 zfs_dirent_unlock(dl, 0);
3774 if (error == 0) {
3775 vnevent_link(svp, ct);
3776 }
3778 ZFS_EXIT(zfsvfs);
3779 return (error);
3780 }
3782 /*ARGSUSED*/
3784 /* CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); */
3785 /* CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); */
3787 /*ARGSUSED*/
3788 static int
3789 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
3790 {
3791 znode_t *zp = VTOZ(vp);
3792 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3793 uint32_t gen;
3794 uint64_t object = zp->z_id;
3795 zfid_short_t *zfid;
3796 int size, i;
3798 ZFS_ENTER(zfsvfs);
3799 ZFS_VERIFY_ZP(zp);
3800 gen = (uint32_t)zp->z_gen;
3802 size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
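/*
 * z_parent differs from zfsvfs for snapshots automounted under .zfs;
 * those use the long fid so that the objset id stored below keeps
 * file handles unique across a file system and its snapshots.
 */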
3803 fidp->fid_len = size;
3805 zfid = (zfid_short_t *)fidp;
3807 zfid->zf_len = size;
3809 for (i = 0; i < sizeof (zfid->zf_object); i++)
3810 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
3812 /* Must have a non-zero generation number to distinguish from .zfs */
3813 if (gen == 0)
3814 gen = 1;
3815 for (i = 0; i < sizeof (zfid->zf_gen); i++)
3816 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
3818 if (size == LONG_FID_LEN) {
3819 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
3820 zfid_long_t *zlfid;
3822 zlfid = (zfid_long_t *)fidp;
3824 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
3825 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
3827 /* XXX - this should be the generation number for the objset */
3828 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
3829 zlfid->zf_setgen[i] = 0;
3830 }
3832 ZFS_EXIT(zfsvfs);
3833 return (0);
3834 }
3836 static int
3837 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
3838 caller_context_t *ct)
3839 {
3840 znode_t *zp, *xzp;
3841 zfsvfs_t *zfsvfs;
3842 zfs_dirlock_t *dl;
3843 int error;
3845 switch (cmd) {
3846 case _PC_LINK_MAX:
3847 *valp = INT_MAX;
3848 return (0);
3850 case _PC_FILESIZEBITS:
3851 *valp = 64;
3852 return (0);
3854 #if 0
3855 case _PC_XATTR_EXISTS:
3856 zp = VTOZ(vp);
3857 zfsvfs = zp->z_zfsvfs;
3858 ZFS_ENTER(zfsvfs);
3859 ZFS_VERIFY_ZP(zp);
3860 *valp = 0;
3861 error = zfs_dirent_lock(&dl, zp, "", &xzp,
3862 ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
3863 if (error == 0) {
3864 zfs_dirent_unlock(dl, 0);
3865 if (!zfs_dirempty(xzp))
3866 *valp = 1;
3867 VN_RELE(ZTOV(xzp));
3868 } else if (error == ENOENT) {
3869 /*
3870 * If there aren't extended attributes, it's the
3871 * same as having zero of them.
3872 */
3873 error = 0;
3874 }
3875 ZFS_EXIT(zfsvfs);
3876 return (error);
3877 #endif
3879 case _PC_ACL_EXTENDED:
3880 *valp = 0; /* TODO */
3881 return (0);
3883 case _PC_MIN_HOLE_SIZE:
3884 *valp = (int)SPA_MINBLOCKSIZE;
3885 return (0);
3887 default:
3888 return (EOPNOTSUPP);
3889 }
3890 }
3892 static int
3893 zfs_netbsd_open(struct vop_open_args *ap)
3894 {
3895 vnode_t *vp = ap->a_vp;
3896 znode_t *zp = VTOZ(vp);
3897 int error;
3899 error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
3901 return (error);
3902 }
3904 static int
3905 zfs_netbsd_close(struct vop_close_args *ap)
3906 {
3908 return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred, NULL));
3909 }
3911 static int
3912 zfs_netbsd_ioctl(struct vop_ioctl_args *ap)
3913 {
3915 return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
3916 ap->a_fflag, ap->a_cred, NULL, NULL));
3917 }
3920 static int
3921 zfs_netbsd_read(struct vop_read_args *ap)
3922 {
3924 return (zfs_read(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
3925 }
3927 static int
3928 zfs_netbsd_write(struct vop_write_args *ap)
3929 {
3931 return (zfs_write(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
3932 }
3934 static int
3935 zfs_netbsd_access(struct vop_access_args *ap)
3936 {
3938 /*
3939 * ZFS itself only knows about VREAD, VWRITE and VEXEC; the rest
3940 * we have to handle by calling vaccess().
3941 */
3942 if ((ap->a_mode & ~(VREAD|VWRITE|VEXEC)) != 0) {
3943 vnode_t *vp = ap->a_vp;
3944 znode_t *zp = VTOZ(vp);
3945 znode_phys_t *zphys = zp->z_phys;
3947 return (vaccess(vp->v_type, zphys->zp_mode, zphys->zp_uid,
3948 zphys->zp_gid, ap->a_mode, ap->a_cred));
3949 }
3951 return (zfs_access(ap->a_vp, ap->a_mode, 0, ap->a_cred, NULL));
3952 }
3954 static int
3955 zfs_netbsd_lookup(struct vop_lookup_args *ap)
3956 {
3957 struct componentname *cnp = ap->a_cnp;
3958 char nm[NAME_MAX + 1];
3959 int err;
3961 ASSERT(cnp->cn_namelen < sizeof(nm));
3962 strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
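/*
 * cn_nameptr points into the full remaining pathname and is not
 * NUL-terminated at the component boundary, while zfs_lookup()
 * expects a NUL-terminated name; hence the bounded copy above.
 */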
3964 err = zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
3965 cnp->cn_cred, 0);
3967 return err;
3968 }
3970 static int
3971 zfs_netbsd_create(struct vop_create_args *ap)
3972 {
3973 struct componentname *cnp = ap->a_cnp;
3974 vattr_t *vap = ap->a_vap;
3975 int mode;
3977 ASSERT(cnp->cn_flags & SAVENAME);
3979 vattr_init_mask(vap);
3980 mode = vap->va_mode & ALLPERMS;
3982 return (zfs_create(ap->a_dvp, (char *)cnp->cn_nameptr, vap, !EXCL, mode,
3983 ap->a_vpp, cnp->cn_cred));
3984 }
3986 static int
3987 zfs_netbsd_remove(struct vop_remove_args *ap)
3988 {
3990 ASSERT(ap->a_cnp->cn_flags & SAVENAME);
3992 return (zfs_remove(ap->a_dvp, (char *)ap->a_cnp->cn_nameptr,
3993 ap->a_cnp->cn_cred, NULL, 0));
3994 }
3996 static int
3997 zfs_netbsd_mkdir(struct vop_mkdir_args *ap)
3998 {
3999 vattr_t *vap = ap->a_vap;
4001 ASSERT(ap->a_cnp->cn_flags & SAVENAME);
4003 vattr_init_mask(vap);
4005 return (zfs_mkdir(ap->a_dvp, (char *)ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
4006 ap->a_cnp->cn_cred, NULL, 0, NULL));
4007 }
4009 static int
4010 zfs_netbsd_rmdir(struct vop_rmdir_args *ap)
4011 {
4012 struct componentname *cnp = ap->a_cnp;
4014 ASSERT(cnp->cn_flags & SAVENAME);
4016 return (zfs_rmdir(ap->a_dvp, (char *)cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0));
4017 }
4019 static int
4020 zfs_netbsd_readdir(struct vop_readdir_args *ap)
4021 {
4023 return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
4024 ap->a_ncookies, (u_long **)ap->a_cookies));
4025 }
4027 static int
4028 zfs_netbsd_fsync(struct vop_fsync_args *ap)
4029 {
4031 return (zfs_fsync(ap->a_vp, ap->a_flags, ap->a_cred, NULL));
4032 }
4034 static int
4035 zfs_netbsd_getattr(struct vop_getattr_args *ap)
4036 {
4037 vattr_t *vap = ap->a_vap;
4038 xvattr_t xvap;
4039 u_long fflags = 0;
4040 int error;
4042 xva_init(&xvap);
4043 xvap.xva_vattr = *vap;
4044 xvap.xva_vattr.va_mask |= AT_XVATTR;
4046 /* Convert chflags into ZFS-type flags. */
4047 /* XXX: what about SF_SETTABLE? */
4048 XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
4049 XVA_SET_REQ(&xvap, XAT_APPENDONLY);
4050 XVA_SET_REQ(&xvap, XAT_NOUNLINK);
4051 XVA_SET_REQ(&xvap, XAT_NODUMP);
4052 error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
4053 if (error != 0)
4054 return (error);
4056 /* Convert ZFS xattr into chflags. */
4057 #define FLAG_CHECK(fflag, xflag, xfield) do { \
4058 if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \
4059 fflags |= (fflag); \
4060 } while (0)
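/*
 * For example, the first FLAG_CHECK below adds SF_IMMUTABLE to fflags
 * only when ZFS returned XAT_IMMUTABLE and the attribute is set.
 */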
4061 FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
4062 xvap.xva_xoptattrs.xoa_immutable);
4063 FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
4064 xvap.xva_xoptattrs.xoa_appendonly);
4065 FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
4066 xvap.xva_xoptattrs.xoa_nounlink);
4067 FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
4068 xvap.xva_xoptattrs.xoa_nodump);
4069 #undef FLAG_CHECK
4070 *vap = xvap.xva_vattr;
4071 vap->va_flags = fflags;
4072 return (0);
4073 }
4075 static int
4076 zfs_netbsd_setattr(struct vop_setattr_args *ap)
4077 {
4078 vnode_t *vp = ap->a_vp;
4079 vattr_t *vap = ap->a_vap;
4080 cred_t *cred = ap->a_cred;
4081 xvattr_t xvap;
4082 u_long fflags;
4083 uint64_t zflags;
4085 vattr_init_mask(vap);
4086 vap->va_mask &= ~AT_NOSET;
4088 xva_init(&xvap);
4089 xvap.xva_vattr = *vap;
4091 zflags = VTOZ(vp)->z_phys->zp_flags;
4093 if (vap->va_flags != VNOVAL) {
4094 int error;
4096 fflags = vap->va_flags;
4097 if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_NODUMP)) != 0)
4098 return (EOPNOTSUPP);
4099 /*
4100 * Callers may only modify the file flags on objects they
4101 * have VADMIN rights for.
4102 */
4103 if ((error = VOP_ACCESS(vp, VWRITE, cred)) != 0)
4104 return (error);
4105 /*
4106 * Unprivileged processes are not permitted to unset system
4107 * flags, or modify flags if any system flags are set.
4108 * Privileged non-jail processes may not modify system flags
4109 * if securelevel > 0 and any existing system flags are set.
4110 * Privileged jail processes behave like privileged non-jail
4111 * processes if the security.jail.chflags_allowed sysctl is
4112 * non-zero; otherwise, they behave like unprivileged
4113 * processes.
4114 */
4115 if (kauth_authorize_system(cred, KAUTH_SYSTEM_CHSYSFLAGS, 0,
4116 NULL, NULL, NULL) != 0) {
4118 if (zflags &
4119 (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
4120 return (EPERM);
4121 }
4122 if (fflags &
4123 (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
4124 return (EPERM);
4125 }
4126 }
4128 #define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \
4129 if (((fflags & (fflag)) && !(zflags & (zflag))) || \
4130 ((zflags & (zflag)) && !(fflags & (fflag)))) { \
4131 XVA_SET_REQ(&xvap, (xflag)); \
4132 (xfield) = ((fflags & (fflag)) != 0); \
4134 } while (0)
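/*
 * FLAG_CHANGE requests an attribute update only when the chflags bit
 * and the current ZFS flag disagree, so flags that are not actually
 * changing never dirty the znode.
 */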
4135 /* Convert chflags into ZFS-type flags. */
4136 /* XXX: what about SF_SETTABLE? */
4137 FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
4138 xvap.xva_xoptattrs.xoa_immutable);
4139 FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
4140 xvap.xva_xoptattrs.xoa_appendonly);
4141 FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
4142 xvap.xva_xoptattrs.xoa_nounlink);
4143 FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
4144 xvap.xva_xoptattrs.xoa_nodump);
4145 #undef FLAG_CHANGE
4146 }
4147 return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
4148 }
4150 static int
4151 zfs_netbsd_rename(ap)
4152 struct vop_rename_args /* {
4153 struct vnode *a_fdvp;
4154 struct vnode *a_fvp;
4155 struct componentname *a_fcnp;
4156 struct vnode *a_tdvp;
4157 struct vnode *a_tvp;
4158 struct componentname *a_tcnp;
4159 } */ *ap;
4160 {
4161 vnode_t *fdvp = ap->a_fdvp;
4162 vnode_t *fvp = ap->a_fvp;
4163 vnode_t *tdvp = ap->a_tdvp;
4164 vnode_t *tvp = ap->a_tvp;
4165 int error;
4167 ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
4168 ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
4170 error = zfs_rename(fdvp, (char *)ap->a_fcnp->cn_nameptr, tdvp,
4171 (char *)ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0);
4173 if (tdvp == tvp)
4174 VN_RELE(tdvp);
4175 else
4176 VN_URELE(tdvp);
4177 if (tvp)
4178 VN_URELE(tvp);
4179 VN_RELE(fdvp);
4180 VN_RELE(fvp);
4182 return (error);
4183 }
4185 static int
4186 zfs_netbsd_symlink(struct vop_symlink_args *ap)
4187 {
4188 struct componentname *cnp = ap->a_cnp;
4189 vattr_t *vap = ap->a_vap;
4191 ASSERT(cnp->cn_flags & SAVENAME);
4193 vap->va_type = VLNK; /* NetBSD: the syscall only sets va_mode. */
4194 vattr_init_mask(vap);
4196 return (zfs_symlink(ap->a_dvp, ap->a_vpp, (char *)cnp->cn_nameptr, vap,
4197 ap->a_target, cnp->cn_cred));
4198 }
4200 static int
4201 zfs_netbsd_readlink(struct vop_readlink_args *ap)
4202 {
4204 return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
4205 }
4207 static int
4208 zfs_netbsd_link(struct vop_link_args *ap)
4209 {
4210 struct componentname *cnp = ap->a_cnp;
4212 ASSERT(cnp->cn_flags & SAVENAME);
4214 return (zfs_link(ap->a_dvp, ap->a_vp, (char *)cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
4215 }
4217 static int
4218 zfs_netbsd_inactive(struct vop_inactive_args *ap)
4219 {
4220 vnode_t *vp = ap->a_vp;
4221 znode_t *zp = VTOZ(vp);
4223 /*
4224 * NetBSD: nothing to do here, other than indicate if the
4225 * vnode should be reclaimed. No need to lock, if we race
4226 * vrele() will call us again.
4227 */
4228 *ap->a_recycle = (zp->z_unlinked != 0);
4229 VOP_UNLOCK(vp, 0);
4230 return (0);
4231 }
4233 /*
4234 * Destroy znode from taskq thread without ZFS_OBJ_MUTEX held.
4235 */
4236 static void
4237 zfs_reclaim_deferred(void *arg, int pending)
4238 {
4239 znode_t *zp = arg;
4240 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4241 uint64_t z_id = zp->z_id;
4243 /*
4244 * Don't allow a zfs_zget() while we're trying to release this znode.
4245 */
4246 ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
4248 /* No need to call ZFS_OBJ_HOLD_EXIT; zfs_zinactive does that for us. */
4249 zfs_zinactive(zp);
4250 }
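/*
 * Dispatching through the system taskq lets ZFS_OBJ_HOLD_ENTER run
 * from a thread that cannot already hold the object mutex, avoiding
 * the self-deadlock zfs_netbsd_reclaim() works around below.
 */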
4253 static int
4254 zfs_netbsd_reclaim(struct vop_reclaim_args *ap)
4255 {
4256 vnode_t *vp = ap->a_vp;
4257 znode_t *zp = VTOZ(vp);
4258 zfsvfs_t *zfsvfs;
4259 int locked;
4261 locked = 0;
4263 ASSERT(zp != NULL);
4264 KASSERT(!vn_has_cached_data(vp));
4266 zfsvfs = zp->z_zfsvfs;
4268 mutex_enter(&zp->z_lock);
4269 ASSERT(zp->z_phys);
4271 // dprintf("destroying znode %p -- vnode %p -- zp->z_buf = %p\n", zp, ZTOV(zp), zp->z_dbuf);
4272 // rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4273 genfs_node_destroy(vp);
4274 cache_purge(vp);
4276 if (zp->z_dbuf == NULL) {
4277 /*
4278 * The fs has been unmounted, or we did a
4279 * suspend/resume and this file no longer exists.
4280 */
4281 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4282 mutex_exit(&zp->z_lock);
4283 zfs_znode_free(zp);
4284 return (0);
4285 }
4286 mutex_exit(&zp->z_lock);
4288 mutex_enter(&zp->z_lock);
4289 if (!zp->z_unlinked) {
4290 /*
4291 * XXX Hack: because ZFS_OBJ_MUTEX is held we can't call zfs_zinactive
4292 * now; defer zfs_zinactive to another thread which doesn't hold this mutex.
4293 */
4294 locked = MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)) ? 2 :
4295 ZFS_OBJ_HOLD_TRYENTER(zfsvfs, zp->z_id);
4296 if (locked == 0) {
4297 /*
4298 * Lock can't be obtained due to deadlock possibility,
4299 * so defer znode destruction.
4300 */
4301 taskq_dispatch(system_taskq, zfs_reclaim_deferred, zp, 0);
4302 } else {
4303 zfs_znode_dmu_fini(zp);
4304 /* If the ZFS_OBJ_MUTEX was already held before we tried to take it,
4305 * it belongs to our caller, so we must not release it here. */
4306 if (locked == 1)
4307 ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
4308 zfs_znode_free(zp);
4309 }
4310 } else
4311 mutex_exit(&zp->z_lock);
4313 ZTOV(zp) = NULL;
4314 vp->v_data = NULL; /* v_data must be NULL for a cleaned vnode. */
4316 return (0);
4317 }
4319 static int
4320 zfs_netbsd_fid(struct vop_fid_args *ap)
4321 {
4323 return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
4324 }
4326 static int
4327 zfs_netbsd_pathconf(struct vop_pathconf_args *ap)
4328 {
4329 ulong_t val;
4330 int error;
4332 error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->l_cred, NULL);
4333 if (error == 0)
4334 *ap->a_retval = val;
4335 else if (error == EOPNOTSUPP) {
4336 switch (ap->a_name) {
4337 case _PC_NAME_MAX:
4338 *ap->a_retval = NAME_MAX;
4339 return (0);
4340 case _PC_PATH_MAX:
4341 *ap->a_retval = PATH_MAX;
4342 return (0);
4343 case _PC_LINK_MAX:
4344 *ap->a_retval = LINK_MAX;
4345 return (0);
4346 case _PC_MAX_CANON:
4347 *ap->a_retval = MAX_CANON;
4348 return (0);
4349 case _PC_MAX_INPUT:
4350 *ap->a_retval = MAX_INPUT;
4351 return (0);
4352 case _PC_PIPE_BUF:
4353 *ap->a_retval = PIPE_BUF;
4354 return (0);
4355 case _PC_CHOWN_RESTRICTED:
4356 *ap->a_retval = 1;
4357 return (0);
4358 case _PC_VDISABLE:
4359 *ap->a_retval = _POSIX_VDISABLE;
4360 return (0);
4361 default:
4362 return (EINVAL);
4363 }
4364 /* NOTREACHED */
4365 }
4366 return (error);
4367 }
4369 static int
4370 zfs_netbsd_lock(struct vop_lock_args *ap)
4371 {
4372 struct vnode *vp = ap->a_vp;
4373 int flags = ap->a_flags;
4375 if ((flags & LK_INTERLOCK) != 0) {
4376 mutex_exit(&vp->v_interlock);
4377 }
4379 return 0;
4380 }
4382 static int
4383 zfs_netbsd_unlock(void *v)
4384 {
4386 return 0;
4387 }
4389 static int
4390 zfs_netbsd_getpages(void *v)
4391 {
4392 struct vnode *vp = ((struct vop_getpages_args *)v)->a_vp;
4393 voff_t offset = ((struct vop_getpages_args *)v)->a_offset;
4394 struct vm_page **m = ((struct vop_getpages_args *)v)->a_m;
4395 int *count = ((struct vop_getpages_args *)v)->a_count;
4396 int centeridx = ((struct vop_getpages_args *)v)->a_centeridx;
4397 vm_prot_t access_type = ((struct vop_getpages_args *)v)->a_access_type;
4398 int advice = ((struct vop_getpages_args *)v)->a_advice;
4399 int flags = ((struct vop_getpages_args *)v)->a_flags;
4401 int error;
4403 error = 0;
4405 KASSERT(!vn_has_cached_data(vp));
4406 mutex_exit(&vp->v_interlock);
4408 return error;
4409 }
4412 static int
4413 zfs_netbsd_putpages(void *v)
4414 {
4415 struct vnode *vp = ((struct vop_putpages_args *)v)->a_vp;
4416 voff_t offlo = ((struct vop_putpages_args *)v)->a_offlo;
4417 voff_t offhi = ((struct vop_putpages_args *)v)->a_offhi;
4418 int flags = ((struct vop_putpages_args *)v)->a_flags;
4419 znode_t *zp = VTOZ(vp);
4421 int error;
4423 dprintf("putpages entry %p -- zfsvfs %p\n", vp, zp->z_zfsvfs);
4424 error = genfs_putpages(v);
4425 dprintf("putpages exit %p -- zfsvfs %p\n", vp, zp->z_zfsvfs);
4427 return error;
4428 }
4430 #define zfs_netbsd_seek genfs_seek
4431 #define zfs_netbsd_mmap genfs_mmap
4432 #define zfs_netbsd_getpages genfs_compat_getpages
4433 //#define zfs_netbsd_putpages genfs_putpages
4434 #define zfs_netbsd_islocked genfs_islocked
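/*
 * The genfs defines above plug generic implementations into the
 * vnodeop table below; note that the zfs_netbsd_getpages define also
 * shadows the local zfs_netbsd_getpages() function, so the table
 * entry resolves to genfs_compat_getpages.
 */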
4436 int (**zfs_vnodeop_p)(void *);
4437 const struct vnodeopv_entry_desc zfs_vnodeop_entries[] = {
4438 { &vop_default_desc, vn_default_error },
4439 { &vop_lookup_desc, zfs_netbsd_lookup },
4440 { &vop_create_desc, zfs_netbsd_create },
4441 { &vop_open_desc, zfs_netbsd_open },
4442 { &vop_close_desc, zfs_netbsd_close },
4443 { &vop_access_desc, zfs_netbsd_access },
4444 { &vop_getattr_desc, zfs_netbsd_getattr },
4445 { &vop_setattr_desc, zfs_netbsd_setattr },
4446 { &vop_read_desc, zfs_netbsd_read },
4447 { &vop_write_desc, zfs_netbsd_write },
4448 { &vop_ioctl_desc, zfs_netbsd_ioctl },
4449 { &vop_fsync_desc, zfs_netbsd_fsync },
4450 { &vop_remove_desc, zfs_netbsd_remove },
4451 { &vop_link_desc, zfs_netbsd_link },
4452 { &vop_lock_desc, zfs_netbsd_lock },
4453 { &vop_unlock_desc, zfs_netbsd_unlock },
4454 { &vop_rename_desc, zfs_netbsd_rename },
4455 { &vop_mkdir_desc, zfs_netbsd_mkdir },
4456 { &vop_rmdir_desc, zfs_netbsd_rmdir },
4457 { &vop_symlink_desc, zfs_netbsd_symlink },
4458 { &vop_readdir_desc, zfs_netbsd_readdir },
4459 { &vop_readlink_desc, zfs_netbsd_readlink },
4460 { &vop_inactive_desc, zfs_netbsd_inactive },
4461 { &vop_reclaim_desc, zfs_netbsd_reclaim },
4462 { &vop_pathconf_desc, zfs_netbsd_pathconf },
4463 { &vop_seek_desc, zfs_netbsd_seek },
4464 { &vop_getpages_desc, zfs_netbsd_getpages },
4465 { &vop_putpages_desc, zfs_netbsd_putpages },
4466 { &vop_mmap_desc, zfs_netbsd_mmap },
4467 { &vop_islocked_desc, zfs_netbsd_islocked },
4468 #ifdef notyet
4469 { &vop_advlock_desc, zfs_netbsd_advlock },
4470 { &vop_fcntl_desc, zfs_netbsd_fcntl },
4471 { &vop_bmap_desc, zfs_netbsd_bmap },
4472 { &vop_strategy_desc, zfs_netbsd_strategy },
4473 { &vop_print_desc, zfs_netbsd_print },
4474 { &vop_bwrite_desc, zfs_netbsd_bwrite },
4475 #endif
4476 { NULL, NULL }
4477 };
4479 const struct vnodeopv_desc zfs_vnodeop_opv_desc =
4480 { &zfs_vnodeop_p, zfs_vnodeop_entries };
4482 #if 0
4483 struct vop_vector zfs_vnodeops;
4484 struct vop_vector zfs_fifoops;
4488 struct vop_vector zfs_vnodeops = {
4489 .vop_default = &default_vnodeops,
4490 .vop_inactive = zfs_netbsd_inactive,
4491 .vop_reclaim = zfs_netbsd_reclaim,
4492 .vop_access = zfs_netbsd_access,
4493 .vop_lookup = zfs_netbsd_lookup,
4494 .vop_getattr = zfs_netbsd_getattr,
4495 .vop_setattr = zfs_netbsd_setattr,
4496 .vop_create = zfs_netbsd_create,
4497 .vop_mknod = zfs_netbsd_create,
4498 .vop_mkdir = zfs_netbsd_mkdir,
4499 .vop_readdir = zfs_netbsd_readdir,
4500 .vop_fsync = zfs_netbsd_fsync,
4501 .vop_open = zfs_netbsd_open,
4502 .vop_close = zfs_netbsd_close,
4503 .vop_rmdir = zfs_netbsd_rmdir,
4504 .vop_ioctl = zfs_netbsd_ioctl,
4505 .vop_link = zfs_netbsd_link,
4506 .vop_lock = zfs_netbsd_lock,
4507 .vop_unlock = zfs_netbsd_unlock,
4508 .vop_symlink = zfs_netbsd_symlink,
4509 .vop_readlink = zfs_netbsd_readlink,
4510 .vop_read = zfs_netbsd_read,
4511 .vop_write = zfs_netbsd_write,
4512 .vop_remove = zfs_netbsd_remove,
4513 .vop_rename = zfs_netbsd_rename,
4514 .vop_pathconf = zfs_netbsd_pathconf,
4515 .vop_bmap = VOP_EOPNOTSUPP,
4516 .vop_fid = zfs_netbsd_fid,
4517 .vop_getextattr = zfs_getextattr,
4518 .vop_deleteextattr = zfs_deleteextattr,
4519 .vop_setextattr = zfs_setextattr,
4520 .vop_listextattr = zfs_listextattr,
4521 #ifdef notyet
4522 .vop_getacl = zfs_netbsd_getacl,
4523 .vop_setacl = zfs_netbsd_setacl,
4524 .vop_aclcheck = zfs_netbsd_aclcheck,
4525 #endif
4526 };
4528 struct vop_vector zfs_fifoops = {
4529 .vop_default = &fifo_specops,
4530 .vop_fsync = VOP_PANIC,
4531 .vop_access = zfs_netbsd_access,
4532 .vop_getattr = zfs_netbsd_getattr,
4533 .vop_inactive = zfs_netbsd_inactive,
4534 .vop_read = VOP_PANIC,
4535 .vop_reclaim = zfs_netbsd_reclaim,
4536 .vop_setattr = zfs_netbsd_setattr,
4537 .vop_write = VOP_PANIC,
4538 .vop_fid = zfs_netbsd_fid,
4539 #ifdef notyet
4540 .vop_getacl = zfs_netbsd_getacl,
4541 .vop_setacl = zfs_netbsd_setacl,
4542 .vop_aclcheck = zfs_netbsd_aclcheck,
4543 #endif
4544 };
4545 #endif