Patrick Welche <prlw1@cam.ac.uk>
[netbsd-mini2440.git] / external / cddl / osnet / dist / uts / common / fs / zfs / zfs_znode.c
blob3a869696c316e720565e45d2966d67aa8cf2bf9c
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 /* Portions Copyright 2007 Jeremy Teo */
28 #ifdef _KERNEL
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/time.h>
32 #include <sys/systm.h>
33 #include <sys/sysmacros.h>
34 #include <sys/resource.h>
35 #include <sys/mntent.h>
36 #include <sys/u8_textprep.h>
37 #include <sys/dsl_dataset.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/file.h>
41 #include <sys/kmem.h>
42 #include <sys/errno.h>
43 #include <sys/unistd.h>
44 #include <sys/atomic.h>
45 #include <sys/zfs_dir.h>
46 #include <sys/zfs_acl.h>
47 #include <sys/zfs_ioctl.h>
48 #include <sys/zfs_rlock.h>
49 #include <sys/zfs_fuid.h>
50 #include <sys/fs/zfs.h>
51 #include <sys/kidmap.h>
52 #endif /* _KERNEL */
54 #include <sys/dmu.h>
55 #include <sys/refcount.h>
56 #include <sys/stat.h>
57 #include <sys/zap.h>
58 #include <sys/zfs_znode.h>
60 #include "zfs_prop.h"
#if defined(_KERNEL) && defined(__NetBSD__)
#include <miscfs/specfs/specdev.h>

/*
 * genfs glue for the NetBSD page cache: writes go through the
 * compatibility gop_write so UBC pages are flushed via VOP_PUTPAGES.
 */
static const struct genfs_ops zfs_genfsops = {
	.gop_write = genfs_compat_gop_write,
};
#endif
70 extern int (**zfs_vnodeop_p)(void *);
71 extern int (**zfs_fifoop_p)(void *);
72 extern int (**zfs_specop_p)(void *);
75 * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
76 * turned on when DEBUG is also defined.
78 #ifdef DEBUG
79 #define ZNODE_STATS
80 #endif /* DEBUG */
82 #ifdef ZNODE_STATS
83 #define ZNODE_STAT_ADD(stat) ((stat)++)
84 #else
85 #define ZNODE_STAT_ADD(stat) /* nothing */
86 #endif /* ZNODE_STATS */
88 #define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3))
89 #define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1))
92 * Functions needed for userland (ie: libzpool) are not put under
93 * #ifdef_KERNEL; the rest of the functions have dependencies
94 * (such as VFS logic) that will not compile easily in userland.
96 #ifdef _KERNEL
97 static kmem_cache_t *znode_cache = NULL;
99 /*ARGSUSED*/
100 static void
101 znode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
104 * We should never drop all dbuf refs without first clearing
105 * the eviction callback.
107 panic("evicting znode %p\n", user_ptr);
110 /*ARGSUSED*/
111 static int
112 zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
114 znode_t *zp = arg;
116 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
118 list_link_init(&zp->z_link_node);
120 mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
121 rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL);
122 rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
123 rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
124 mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
126 mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
127 avl_create(&zp->z_range_avl, zfs_range_compare,
128 sizeof (rl_t), offsetof(rl_t, r_node));
130 zp->z_dbuf = NULL;
131 zp->z_dirlocks = NULL;
132 return (0);
135 /*ARGSUSED*/
136 static void
137 zfs_znode_cache_destructor(void *buf, void *arg)
139 znode_t *zp = arg;
141 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
142 ASSERT(ZTOV(zp) == NULL);
144 ASSERT(!list_link_active(&zp->z_link_node));
145 mutex_destroy(&zp->z_lock);
146 rw_destroy(&zp->z_map_lock);
147 rw_destroy(&zp->z_parent_lock);
148 rw_destroy(&zp->z_name_lock);
149 mutex_destroy(&zp->z_acl_lock);
150 avl_destroy(&zp->z_range_avl);
151 mutex_destroy(&zp->z_range_lock);
153 ASSERT(zp->z_dbuf == NULL);
154 ASSERT(zp->z_dirlocks == NULL);
#ifdef ZNODE_STATS
/*
 * Counters recording why zfs_znode_move() declined or deferred a move;
 * bumped via ZNODE_STAT_ADD, which compiles away without ZNODE_STATS.
 */
static struct {
	uint64_t zms_zfsvfs_invalid;
	uint64_t zms_zfsvfs_unmounted;
	uint64_t zms_zfsvfs_recheck_invalid;
	uint64_t zms_obj_held;
	uint64_t zms_vnode_locked;
	uint64_t zms_not_only_dnlc;
} znode_move_stats;
#endif	/* ZNODE_STATS */
168 static void
169 zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
171 vnode_t *vp;
173 /* Copy fields. */
174 nzp->z_zfsvfs = ozp->z_zfsvfs;
176 /* Swap vnodes. */
177 vp = nzp->z_vnode;
178 nzp->z_vnode = ozp->z_vnode;
179 ozp->z_vnode = vp; /* let destructor free the overwritten vnode */
180 ZTOV(ozp)->v_data = ozp;
181 ZTOV(nzp)->v_data = nzp;
183 nzp->z_id = ozp->z_id;
184 ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */
185 ASSERT(avl_numnodes(&ozp->z_range_avl) == 0);
186 nzp->z_unlinked = ozp->z_unlinked;
187 nzp->z_atime_dirty = ozp->z_atime_dirty;
188 nzp->z_zn_prefetch = ozp->z_zn_prefetch;
189 nzp->z_blksz = ozp->z_blksz;
190 nzp->z_seq = ozp->z_seq;
191 nzp->z_mapcnt = ozp->z_mapcnt;
192 nzp->z_last_itx = ozp->z_last_itx;
193 nzp->z_gen = ozp->z_gen;
194 nzp->z_sync_cnt = ozp->z_sync_cnt;
195 nzp->z_phys = ozp->z_phys;
196 nzp->z_dbuf = ozp->z_dbuf;
198 /* Update back pointers. */
199 (void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys,
200 znode_evict_error);
203 * Invalidate the original znode by clearing fields that provide a
204 * pointer back to the znode. Set the low bit of the vfs pointer to
205 * ensure that zfs_znode_move() recognizes the znode as invalid in any
206 * subsequent callback.
208 ozp->z_dbuf = NULL;
209 POINTER_INVALIDATE(&ozp->z_zfsvfs);
213 * Wrapper function for ZFS_ENTER that returns 0 if successful and otherwise
214 * returns a non-zero error code.
216 static int
217 zfs_enter(zfsvfs_t *zfsvfs)
219 ZFS_ENTER(zfsvfs);
220 return (0);
223 #ifndef __NetBSD__
224 /*ARGSUSED*/
225 static kmem_cbrc_t
226 zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
228 znode_t *ozp = buf, *nzp = newbuf;
229 zfsvfs_t *zfsvfs;
230 vnode_t *vp;
233 * The znode is on the file system's list of known znodes if the vfs
234 * pointer is valid. We set the low bit of the vfs pointer when freeing
235 * the znode to invalidate it, and the memory patterns written by kmem
236 * (baddcafe and deadbeef) set at least one of the two low bits. A newly
237 * created znode sets the vfs pointer last of all to indicate that the
238 * znode is known and in a valid state to be moved by this function.
240 zfsvfs = ozp->z_zfsvfs;
241 if (!POINTER_IS_VALID(zfsvfs)) {
242 ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid);
243 return (KMEM_CBRC_DONT_KNOW);
247 * Ensure that the filesystem is not unmounted during the move.
249 if (zfs_enter(zfsvfs) != 0) { /* ZFS_ENTER */
250 ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
251 return (KMEM_CBRC_DONT_KNOW);
254 mutex_enter(&zfsvfs->z_znodes_lock);
256 * Recheck the vfs pointer in case the znode was removed just before
257 * acquiring the lock.
259 if (zfsvfs != ozp->z_zfsvfs) {
260 mutex_exit(&zfsvfs->z_znodes_lock);
261 ZFS_EXIT(zfsvfs);
262 ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck_invalid);
263 return (KMEM_CBRC_DONT_KNOW);
267 * At this point we know that as long as we hold z_znodes_lock, the
268 * znode cannot be freed and fields within the znode can be safely
269 * accessed. Now, prevent a race with zfs_zget().
271 if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) {
272 mutex_exit(&zfsvfs->z_znodes_lock);
273 ZFS_EXIT(zfsvfs);
274 ZNODE_STAT_ADD(znode_move_stats.zms_obj_held);
275 return (KMEM_CBRC_LATER);
278 vp = ZTOV(ozp);
279 if (mutex_tryenter(&vp->v_lock) == 0) {
280 ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
281 mutex_exit(&zfsvfs->z_znodes_lock);
282 ZFS_EXIT(zfsvfs);
283 ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked);
284 return (KMEM_CBRC_LATER);
287 /* Only move znodes that are referenced _only_ by the DNLC. */
288 if (vp->v_count != 1 || !vn_in_dnlc(vp)) {
289 mutex_exit(&vp->v_lock);
290 ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
291 mutex_exit(&zfsvfs->z_znodes_lock);
292 ZFS_EXIT(zfsvfs);
293 ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc);
294 return (KMEM_CBRC_LATER);
298 * The znode is known and in a valid state to move. We're holding the
299 * locks needed to execute the critical section.
301 zfs_znode_move_impl(ozp, nzp);
302 mutex_exit(&vp->v_lock);
303 ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
305 list_link_replace(&ozp->z_link_node, &nzp->z_link_node);
306 mutex_exit(&zfsvfs->z_znodes_lock);
307 ZFS_EXIT(zfsvfs);
309 return (KMEM_CBRC_YES);
311 #endif /* !__NetBSD__ */
313 void
314 zfs_znode_init(void)
317 * Initialize zcache
319 ASSERT(znode_cache == NULL);
320 znode_cache = kmem_cache_create("zfs_znode_cache",
321 sizeof (znode_t), 0, zfs_znode_cache_constructor,
322 zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
325 void
326 zfs_znode_fini(void)
330 * Cleanup zcache
332 if (znode_cache)
333 kmem_cache_destroy(znode_cache);
334 znode_cache = NULL;
337 #ifndef __NetBSD__
338 struct vnodeops *zfs_dvnodeops;
339 struct vnodeops *zfs_fvnodeops;
340 struct vnodeops *zfs_symvnodeops;
341 struct vnodeops *zfs_xdvnodeops;
342 struct vnodeops *zfs_evnodeops;
343 #endif
344 void
345 zfs_remove_op_tables()
347 #ifndef __NetBSD__
349 * Remove vfs ops
351 ASSERT(zfsfstype);
352 (void) vfs_freevfsops_by_type(zfsfstype);
353 zfsfstype = 0;
356 * Remove vnode ops
358 if (zfs_dvnodeops)
359 vn_freevnodeops(zfs_dvnodeops);
360 if (zfs_fvnodeops)
361 vn_freevnodeops(zfs_fvnodeops);
362 if (zfs_symvnodeops)
363 vn_freevnodeops(zfs_symvnodeops);
364 if (zfs_xdvnodeops)
365 vn_freevnodeops(zfs_xdvnodeops);
366 if (zfs_evnodeops)
367 vn_freevnodeops(zfs_evnodeops);
369 zfs_dvnodeops = NULL;
370 zfs_fvnodeops = NULL;
371 zfs_symvnodeops = NULL;
372 zfs_xdvnodeops = NULL;
373 zfs_evnodeops = NULL;
374 #endif
376 #ifndef __NetBSD__
377 extern const fs_operation_def_t zfs_dvnodeops_template[];
378 extern const fs_operation_def_t zfs_fvnodeops_template[];
379 extern const fs_operation_def_t zfs_xdvnodeops_template[];
380 extern const fs_operation_def_t zfs_symvnodeops_template[];
381 extern const fs_operation_def_t zfs_evnodeops_template[];
382 #endif
384 zfs_create_op_tables()
386 #ifndef __NetBSD__
387 int error;
390 * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
391 * due to a failure to remove the the 2nd modlinkage (zfs_modldrv).
392 * In this case we just return as the ops vectors are already set up.
394 if (zfs_dvnodeops)
395 return (0);
397 error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
398 &zfs_dvnodeops);
399 if (error)
400 return (error);
402 error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
403 &zfs_fvnodeops);
404 if (error)
405 return (error);
407 error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
408 &zfs_symvnodeops);
409 if (error)
410 return (error);
412 error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
413 &zfs_xdvnodeops);
414 if (error)
415 return (error);
417 error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
418 &zfs_evnodeops);
420 return (error);
421 #endif
422 return 0;
426 * zfs_init_fs - Initialize the zfsvfs struct and the file system
427 * incore "master" object. Verify version compatibility.
430 zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp)
432 extern int zfsfstype;
434 objset_t *os = zfsvfs->z_os;
435 int i, error;
436 uint64_t fsid_guid;
437 uint64_t zval;
439 *zpp = NULL;
441 error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
442 if (error) {
443 return (error);
444 } else if (zfsvfs->z_version > ZPL_VERSION) {
445 (void) printf("Mismatched versions: File system "
446 "is version %llu on-disk format, which is "
447 "incompatible with this software version %lld!",
448 (u_longlong_t)zfsvfs->z_version, ZPL_VERSION);
449 return (ENOTSUP);
452 if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0)
453 return (error);
454 zfsvfs->z_norm = (int)zval;
455 if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0)
456 return (error);
457 zfsvfs->z_utf8 = (zval != 0);
458 if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0)
459 return (error);
460 zfsvfs->z_case = (uint_t)zval;
462 * Fold case on file systems that are always or sometimes case
463 * insensitive.
465 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
466 zfsvfs->z_case == ZFS_CASE_MIXED)
467 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
470 * The fsid is 64 bits, composed of an 8-bit fs type, which
471 * separates our fsid from any other filesystem types, and a
472 * 56-bit objset unique ID. The objset unique ID is unique to
473 * all objsets open on this system, provided by unique_create().
474 * The 8-bit fs type must be put in the low bits of fsid[1]
475 * because that's where other Solaris filesystems put it.
477 fsid_guid = dmu_objset_fsid_guid(os);
478 ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
479 zfsvfs->z_vfs->mnt_stat.f_fsidx.__fsid_val[0] = fsid_guid;
480 zfsvfs->z_vfs->mnt_stat.f_fsidx.__fsid_val[1] = ((fsid_guid>>32) << 8) |
481 zfsfstype & 0xFF;
482 zfsvfs->z_vfs->mnt_stat.f_fsid = fsid_guid;
484 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
485 &zfsvfs->z_root);
486 if (error)
487 return (error);
488 ASSERT(zfsvfs->z_root != 0);
490 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
491 &zfsvfs->z_unlinkedobj);
492 if (error)
493 return (error);
496 * Initialize zget mutex's
498 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
499 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
501 error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp);
502 if (error) {
504 * On error, we destroy the mutexes here since it's not
505 * possible for the caller to determine if the mutexes were
506 * initialized properly.
508 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
509 mutex_destroy(&zfsvfs->z_hold_mtx[i]);
510 return (error);
512 ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root);
513 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
514 &zfsvfs->z_fuid_obj);
515 if (error == ENOENT)
516 error = 0;
518 return (0);
522 * define a couple of values we need available
523 * for both 64 and 32 bit environments.
525 #ifndef NBITSMINOR64
526 #define NBITSMINOR64 32
527 #endif
528 #ifndef MAXMAJ64
529 #define MAXMAJ64 0xffffffffUL
530 #endif
531 #ifndef MAXMIN64
532 #define MAXMIN64 0xffffffffUL
533 #endif
536 * Create special expldev for ZFS private use.
537 * Can't use standard expldev since it doesn't do
538 * what we want. The standard expldev() takes a
539 * dev32_t in LP64 and expands it to a long dev_t.
540 * We need an interface that takes a dev32_t in ILP32
541 * and expands it to a long dev_t.
543 static uint64_t
544 zfs_expldev(dev_t dev)
546 return ((uint64_t)major(dev) << NBITSMINOR64) |
547 (minor_t)minor(dev);
551 * Special cmpldev for ZFS private use.
552 * Can't use standard cmpldev since it takes
553 * a long dev_t and compresses it to dev32_t in
554 * LP64. We need to do a compaction of a long dev_t
555 * to a dev32_t in ILP32.
557 dev_t
558 zfs_cmpldev(uint64_t dev)
560 minor_t minor = (minor_t)dev & MAXMIN64;
561 major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
563 return makedev(minor, major);
566 static void
567 zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db)
569 znode_t *nzp;
571 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
572 ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
574 mutex_enter(&zp->z_lock);
576 ASSERT(zp->z_dbuf == NULL);
577 zp->z_dbuf = db;
578 nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error);
581 * there should be no
582 * concurrent zgets on this object.
584 if (nzp != NULL)
585 panic("existing znode %p for dbuf %p", (void *)nzp, (void *)db);
588 * Slap on VROOT if we are the root znode
590 if (zp->z_id == zfsvfs->z_root)
591 ZTOV(zp)->v_flag |= VROOT;
593 mutex_exit(&zp->z_lock);
596 void
597 zfs_znode_dmu_fini(znode_t *zp)
599 dmu_buf_t *db = zp->z_dbuf;
600 ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
601 zp->z_unlinked ||
602 RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock));
603 ASSERT(zp->z_dbuf != NULL);
604 zp->z_dbuf = NULL;
605 VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL));
606 dmu_buf_rele(db, NULL);
610 * Construct a new znode/vnode and intialize.
612 * This does not do a call to dmu_set_user() that is
613 * up to the caller to do, in case you don't want to
614 * return the znode
617 static znode_t *
618 zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
620 znode_t *zp;
621 vnode_t *vp;
622 int error;
624 zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
625 for (;;) {
627 error = getnewvnode(VT_ZFS, zfsvfs->z_parent->z_vfs,
628 zfs_vnodeop_p, &zp->z_vnode);
629 if (__predict_true(error == 0))
630 break;
631 printf("WARNING: zfs_znode_alloc: unable to get vnode, "
632 "error=%d\n", error);
633 (void)kpause("zfsnewvn", false, hz, NULL);
636 ASSERT(zp->z_dirlocks == NULL);
637 ASSERT(zp->z_dbuf == NULL);
638 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
641 * Defer setting z_zfsvfs until the znode is ready to be a candidate for
642 * the zfs_znode_move() callback.
644 zp->z_phys = NULL;
645 zp->z_unlinked = 0;
646 zp->z_atime_dirty = 0;
647 zp->z_mapcnt = 0;
648 zp->z_last_itx = 0;
649 zp->z_id = db->db_object;
650 zp->z_blksz = blksz;
651 zp->z_seq = 0x7A4653;
652 zp->z_sync_cnt = 0;
654 vp = ZTOV(zp);
656 zfs_znode_dmu_init(zfsvfs, zp, db);
658 zp->z_gen = zp->z_phys->zp_gen;
660 vp->v_vfsp = zfsvfs->z_parent->z_vfs;
661 vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
662 vp->v_data = zp;
663 switch (vp->v_type) {
664 case VDIR:
665 zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
666 break;
667 case VBLK:
668 case VCHR:
669 /* XXX NetBSD vp->v_op = zfs_specop_p; */
670 spec_node_init(vp, zfs_cmpldev(zp->z_phys->zp_rdev));
671 break;
672 case VFIFO:
673 /* XXX NetBSD vp->v_op = zfs_fifoop_p; */
674 break;
677 dprintf("zfs_znode_alloc znode %p -- vnode %p\n", zp, vp);
678 dprintf("zfs_znode_alloc z_id %ld\n", zp->z_id);
679 //cpu_Debugger();
681 uvm_vnp_setsize(vp, zp->z_phys->zp_size);
683 mutex_enter(&zfsvfs->z_znodes_lock);
684 list_insert_tail(&zfsvfs->z_all_znodes, zp);
685 membar_producer();
687 * Everything else must be valid before assigning z_zfsvfs makes the
688 * znode eligible for zfs_znode_move().
690 zp->z_zfsvfs = zfsvfs;
691 mutex_exit(&zfsvfs->z_znodes_lock);
693 return (zp);
697 * Create a new DMU object to hold a zfs znode.
699 * IN: dzp - parent directory for new znode
700 * vap - file attributes for new znode
701 * tx - dmu transaction id for zap operations
702 * cr - credentials of caller
703 * flag - flags:
704 * IS_ROOT_NODE - new object will be root
705 * IS_XATTR - new object is an attribute
706 * IS_REPLAY - intent log replay
707 * bonuslen - length of bonus buffer
708 * setaclp - File/Dir initial ACL
709 * fuidp - Tracks fuid allocation.
711 * OUT: zpp - allocated znode
714 void
715 zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
716 uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_t *setaclp,
717 zfs_fuid_info_t **fuidp)
719 dmu_buf_t *db;
720 znode_phys_t *pzp;
721 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
722 timestruc_t now;
723 uint64_t gen, obj;
724 int err;
726 ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
728 if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */
729 obj = vap->va_nodeid;
730 flag |= IS_REPLAY;
731 now = vap->va_ctime; /* see zfs_replay_create() */
732 gen = vap->va_nblocks; /* ditto */
733 } else {
734 obj = 0;
735 gethrestime(&now);
736 gen = dmu_tx_get_txg(tx);
740 * Create a new DMU object.
743 * There's currently no mechanism for pre-reading the blocks that will
744 * be to needed allocate a new object, so we accept the small chance
745 * that there will be an i/o error and we will fail one of the
746 * assertions below.
748 if (vap->va_type == VDIR) {
749 if (flag & IS_REPLAY) {
750 err = zap_create_claim_norm(zfsvfs->z_os, obj,
751 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
752 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
753 ASSERT3U(err, ==, 0);
754 } else {
755 obj = zap_create_norm(zfsvfs->z_os,
756 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
757 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
759 } else {
760 if (flag & IS_REPLAY) {
761 err = dmu_object_claim(zfsvfs->z_os, obj,
762 DMU_OT_PLAIN_FILE_CONTENTS, 0,
763 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
764 ASSERT3U(err, ==, 0);
765 } else {
766 obj = dmu_object_alloc(zfsvfs->z_os,
767 DMU_OT_PLAIN_FILE_CONTENTS, 0,
768 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
771 VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db));
772 dmu_buf_will_dirty(db, tx);
775 * Initialize the znode physical data to zero.
777 ASSERT(db->db_size >= sizeof (znode_phys_t));
778 bzero(db->db_data, db->db_size);
779 pzp = db->db_data;
782 * If this is the root, fix up the half-initialized parent pointer
783 * to reference the just-allocated physical data area.
785 if (flag & IS_ROOT_NODE) {
786 dzp->z_dbuf = db;
787 dzp->z_phys = pzp;
788 dzp->z_id = obj;
792 * If parent is an xattr, so am I.
794 if (dzp->z_phys->zp_flags & ZFS_XATTR)
795 flag |= IS_XATTR;
797 if (vap->va_type == VBLK || vap->va_type == VCHR) {
798 pzp->zp_rdev = zfs_expldev(vap->va_rdev);
801 if (zfsvfs->z_use_fuids)
802 pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
804 if (vap->va_type == VDIR) {
805 pzp->zp_size = 2; /* contents ("." and "..") */
806 pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
809 pzp->zp_parent = dzp->z_id;
810 if (flag & IS_XATTR)
811 pzp->zp_flags |= ZFS_XATTR;
813 pzp->zp_gen = gen;
815 ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
816 ZFS_TIME_ENCODE(&now, pzp->zp_ctime);
818 if (vap->va_mask & AT_ATIME) {
819 ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
820 } else {
821 ZFS_TIME_ENCODE(&now, pzp->zp_atime);
824 if (vap->va_mask & AT_MTIME) {
825 ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
826 } else {
827 ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
830 pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
831 if (!(flag & IS_ROOT_NODE)) {
832 dprintf("zfs_mknode parent vp %p - zp %p\n", ZTOV(dzp), dzp);
833 dprintf("Going to lock %p with %ld\n", ZFS_OBJ_MUTEX(zfsvfs, obj), obj);
835 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
836 *zpp = zfs_znode_alloc(zfsvfs, db, 0);
838 genfs_node_init(ZTOV(*zpp), &zfs_genfsops);
840 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
841 } else {
843 * If we are creating the root node, the "parent" we
844 * passed in is the znode for the root.
846 *zpp = dzp;
848 zfs_perm_init(*zpp, dzp, flag, vap, tx, cr, setaclp, fuidp);
851 void
852 zfs_xvattr_set(znode_t *zp, xvattr_t *xvap)
854 xoptattr_t *xoap;
856 xoap = xva_getxoptattr(xvap);
857 ASSERT(xoap);
859 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
860 ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime);
861 XVA_SET_RTN(xvap, XAT_CREATETIME);
863 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
864 ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly);
865 XVA_SET_RTN(xvap, XAT_READONLY);
867 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
868 ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden);
869 XVA_SET_RTN(xvap, XAT_HIDDEN);
871 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
872 ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system);
873 XVA_SET_RTN(xvap, XAT_SYSTEM);
875 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
876 ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive);
877 XVA_SET_RTN(xvap, XAT_ARCHIVE);
879 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
880 ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable);
881 XVA_SET_RTN(xvap, XAT_IMMUTABLE);
883 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
884 ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink);
885 XVA_SET_RTN(xvap, XAT_NOUNLINK);
887 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
888 ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly);
889 XVA_SET_RTN(xvap, XAT_APPENDONLY);
891 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
892 ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump);
893 XVA_SET_RTN(xvap, XAT_NODUMP);
895 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
896 ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque);
897 XVA_SET_RTN(xvap, XAT_OPAQUE);
899 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
900 ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
901 xoap->xoa_av_quarantined);
902 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
904 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
905 ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified);
906 XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
908 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
909 (void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp,
910 sizeof (xoap->xoa_av_scanstamp));
911 zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP;
912 XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
917 zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
919 dmu_object_info_t doi;
920 dmu_buf_t *db;
921 znode_t *zp;
922 vnode_t *vp;
923 int err, first = 1;
925 *zpp = NULL;
926 again:
927 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
929 err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
930 if (err) {
931 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
932 return (err);
935 dmu_object_info_from_db(db, &doi);
936 if (doi.doi_bonus_type != DMU_OT_ZNODE ||
937 doi.doi_bonus_size < sizeof (znode_phys_t)) {
938 dmu_buf_rele(db, NULL);
939 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
940 return (EINVAL);
943 zp = dmu_buf_get_user(db);
944 if (zp != NULL) {
945 mutex_enter(&zp->z_lock);
948 * Since we do immediate eviction of the z_dbuf, we
949 * should never find a dbuf with a znode that doesn't
950 * know about the dbuf.
952 ASSERT3P(zp->z_dbuf, ==, db);
953 ASSERT3U(zp->z_id, ==, obj_num);
954 if (zp->z_unlinked) {
955 err = ENOENT;
956 } else {
957 if ((vp = ZTOV(zp)) != NULL) {
958 mutex_enter(&vp->v_interlock);
959 mutex_exit(&zp->z_lock);
960 if (vget(vp, LK_INTERLOCK) != 0) {
961 dmu_buf_rele(db, NULL);
962 mutex_exit(&vp->v_interlock);
963 goto again;
965 mutex_enter(&zp->z_lock);
966 } else {
967 if (first) {
968 ZFS_LOG(1, "dying znode detected (zp=%p)", zp);
969 first = 0;
972 * znode is dying so we can't reuse it, we must
973 * wait until destruction is completed.
975 dmu_buf_rele(db, NULL);
976 mutex_exit(&zp->z_lock);
977 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
978 kpause("zcollide", 0, 1, NULL);
979 goto again;
981 *zpp = zp;
982 err = 0;
985 dmu_buf_rele(db, NULL);
986 mutex_exit(&zp->z_lock);
987 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
988 return (err);
992 * Not found create new znode/vnode
994 zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size);
995 vp = ZTOV(zp);
997 genfs_node_init(vp, &zfs_genfsops);
999 VOP_UNLOCK(vp, 0);
1001 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1002 *zpp = zp;
1003 return (0);
1007 zfs_rezget(znode_t *zp)
1009 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1010 dmu_object_info_t doi;
1011 dmu_buf_t *db;
1012 uint64_t obj_num = zp->z_id;
1013 int err;
1015 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
1017 err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
1018 if (err) {
1019 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1020 return (err);
1023 dmu_object_info_from_db(db, &doi);
1024 if (doi.doi_bonus_type != DMU_OT_ZNODE ||
1025 doi.doi_bonus_size < sizeof (znode_phys_t)) {
1026 dmu_buf_rele(db, NULL);
1027 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1028 return (EINVAL);
1031 if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) {
1032 dmu_buf_rele(db, NULL);
1033 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1034 return (EIO);
1037 zfs_znode_dmu_init(zfsvfs, zp, db);
1038 zp->z_unlinked = (zp->z_phys->zp_links == 0);
1039 zp->z_blksz = doi.doi_data_block_size;
1041 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1043 return (0);
1046 void
1047 zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
1049 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1050 objset_t *os = zfsvfs->z_os;
1051 uint64_t obj = zp->z_id;
1052 uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
1053 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
1054 if (acl_obj)
1055 VERIFY(0 == dmu_object_free(os, acl_obj, tx));
1056 VERIFY(0 == dmu_object_free(os, obj, tx));
1057 zfs_znode_dmu_fini(zp);
1058 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
1059 zfs_znode_free(zp);
1063 * zfs_zinactive must be called with ZFS_OBJ_HOLD_ENTER held. And this lock
1064 * will be released in zfs_zinactive.
1066 void
1067 zfs_zinactive(znode_t *zp)
1069 vnode_t *vp = ZTOV(zp);
1070 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1072 ASSERT(zp->z_dbuf && zp->z_phys);
1074 //printf("zfs_zinactive vp %p - zp %p\n", vp, zp);
1075 //printf("Going to lock %p with %ld\n", ZFS_OBJ_MUTEX(zfsvfs, z_id), z_id);
1077 mutex_enter(&zp->z_lock);
1079 * If this was the last reference to a file with no links,
1080 * remove the file from the file system.
1082 if (zp->z_unlinked) {
1083 mutex_exit(&zp->z_lock);
1084 ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
1085 zfs_rmnode(zp);
1086 return;
1089 mutex_exit(&zp->z_lock);
1090 ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
1091 zfs_znode_free(zp);
1094 void
1095 zfs_znode_free(znode_t *zp)
1097 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1098 ASSERT(ZTOV(zp) == NULL);
1100 dprintf("destroying znode %p\n", zp);
1101 //cpu_Debugger();
1102 mutex_enter(&zfsvfs->z_znodes_lock);
1103 POINTER_INVALIDATE(&zp->z_zfsvfs);
1104 list_remove(&zfsvfs->z_all_znodes, zp);
1105 mutex_exit(&zfsvfs->z_znodes_lock);
1107 kmem_cache_free(znode_cache, zp);
1109 VFS_RELE(zfsvfs->z_vfs);
1112 void
1113 zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
1115 timestruc_t now;
1117 ASSERT(MUTEX_HELD(&zp->z_lock));
1119 gethrestime(&now);
1121 if (tx) {
1122 dmu_buf_will_dirty(zp->z_dbuf, tx);
1123 zp->z_atime_dirty = 0;
1124 zp->z_seq++;
1125 } else {
1126 zp->z_atime_dirty = 1;
1129 if (flag & AT_ATIME)
1130 ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);
1132 if (flag & AT_MTIME) {
1133 ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);
1134 if (zp->z_zfsvfs->z_use_fuids)
1135 zp->z_phys->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED);
1138 if (flag & AT_CTIME) {
1139 ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
1140 if (zp->z_zfsvfs->z_use_fuids)
1141 zp->z_phys->zp_flags |= ZFS_ARCHIVE;
1146 * Update the requested znode timestamps with the current time.
1147 * If we are in a transaction, then go ahead and mark the znode
1148 * dirty in the transaction so the timestamps will go to disk.
1149 * Otherwise, we will get pushed next time the znode is updated
1150 * in a transaction, or when this znode eventually goes inactive.
1152 * Why is this OK?
1153 * 1 - Only the ACCESS time is ever updated outside of a transaction.
1154 * 2 - Multiple consecutive updates will be collapsed into a single
1155 * znode update by the transaction grouping semantics of the DMU.
1157 void
1158 zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
1160 mutex_enter(&zp->z_lock);
1161 zfs_time_stamper_locked(zp, flag, tx);
1162 mutex_exit(&zp->z_lock);
1166 * Grow the block size for a file.
1168 * IN: zp - znode of file to free data in.
1169 * size - requested block size
1170 * tx - open transaction.
1172 * NOTE: this function assumes that the znode is write locked.
1174 void
1175 zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
1177 int error;
1178 u_longlong_t dummy;
1180 if (size <= zp->z_blksz)
1181 return;
1183 * If the file size is already greater than the current blocksize,
1184 * we will not grow. If there is more than one block in a file,
1185 * the blocksize cannot change.
1187 if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
1188 return;
1190 error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
1191 size, 0, tx);
1192 if (error == ENOTSUP)
1193 return;
1194 ASSERT3U(error, ==, 0);
1196 /* What blocksize did we actually get? */
1197 dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
1201 * Increase the file length
1203 * IN: zp - znode of file to free data in.
1204 * end - new end-of-file
1206 * RETURN: 0 if success
1207 * error code if failure
1209 static int
1210 zfs_extend(znode_t *zp, uint64_t end)
1212 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1213 dmu_tx_t *tx;
1214 rl_t *rl;
1215 uint64_t newblksz;
1216 int error;
1219 * We will change zp_size, lock the whole file.
1221 rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
1224 * Nothing to do if file already at desired length.
1226 if (end <= zp->z_phys->zp_size) {
1227 zfs_range_unlock(rl);
1228 return (0);
1230 top:
1231 tx = dmu_tx_create(zfsvfs->z_os);
1232 dmu_tx_hold_bonus(tx, zp->z_id);
1233 if (end > zp->z_blksz &&
1234 (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
1236 * We are growing the file past the current block size.
1238 if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
1239 ASSERT(!ISP2(zp->z_blksz));
1240 newblksz = MIN(end, SPA_MAXBLOCKSIZE);
1241 } else {
1242 newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
1244 dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
1245 } else {
1246 newblksz = 0;
1249 error = dmu_tx_assign(tx, zfsvfs->z_assign);
1250 if (error) {
1251 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1252 dmu_tx_wait(tx);
1253 dmu_tx_abort(tx);
1254 goto top;
1256 dmu_tx_abort(tx);
1257 zfs_range_unlock(rl);
1258 return (error);
1260 dmu_buf_will_dirty(zp->z_dbuf, tx);
1262 if (newblksz)
1263 zfs_grow_blocksize(zp, newblksz, tx);
1265 zp->z_phys->zp_size = end;
1267 zfs_range_unlock(rl);
1269 dmu_tx_commit(tx);
1271 rw_enter(&zp->z_map_lock, RW_WRITER);
1272 uvm_vnp_setsize(ZTOV(zp), end);
1273 rw_exit(&zp->z_map_lock);
1275 return (0);
1279 * Free space in a file.
1281 * IN: zp - znode of file to free data in.
1282 * off - start of section to free.
1283 * len - length of section to free.
1285 * RETURN: 0 if success
1286 * error code if failure
1288 static int
1289 zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
1291 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1292 rl_t *rl;
1293 int error;
1296 * Lock the range being freed.
1298 rl = zfs_range_lock(zp, off, len, RL_WRITER);
1301 * Nothing to do if file already at desired length.
1303 if (off >= zp->z_phys->zp_size) {
1304 zfs_range_unlock(rl);
1305 return (0);
1308 if (off + len > zp->z_phys->zp_size)
1309 len = zp->z_phys->zp_size - off;
1311 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
1313 if (error == 0) {
1315 * In NetBSD we cannot free block in the middle of a file,
1316 * but only at the end of a file.
1318 rw_enter(&zp->z_map_lock, RW_WRITER);
1319 uvm_vnp_setsize(ZTOV(zp), off);
1320 rw_exit(&zp->z_map_lock);
1323 zfs_range_unlock(rl);
1325 return (error);
1329 * Truncate a file
1331 * IN: zp - znode of file to free data in.
1332 * end - new end-of-file.
1334 * RETURN: 0 if success
1335 * error code if failure
1337 static int
1338 zfs_trunc(znode_t *zp, uint64_t end)
1340 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1341 vnode_t *vp = ZTOV(zp);
1342 dmu_tx_t *tx;
1343 rl_t *rl;
1344 int error;
1347 * We will change zp_size, lock the whole file.
1349 rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
1352 * Nothing to do if file already at desired length.
1354 if (end >= zp->z_phys->zp_size) {
1355 zfs_range_unlock(rl);
1356 return (0);
1359 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1);
1360 if (error) {
1361 zfs_range_unlock(rl);
1362 return (error);
1364 top:
1365 tx = dmu_tx_create(zfsvfs->z_os);
1366 dmu_tx_hold_bonus(tx, zp->z_id);
1367 error = dmu_tx_assign(tx, zfsvfs->z_assign);
1368 if (error) {
1369 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1370 dmu_tx_wait(tx);
1371 dmu_tx_abort(tx);
1372 goto top;
1374 dmu_tx_abort(tx);
1375 zfs_range_unlock(rl);
1376 return (error);
1378 dmu_buf_will_dirty(zp->z_dbuf, tx);
1380 zp->z_phys->zp_size = end;
1382 dmu_tx_commit(tx);
1384 zfs_range_unlock(rl);
1387 * Clear any mapped pages in the truncated region. This has to
1388 * happen outside of the transaction to avoid the possibility of
1389 * a deadlock with someone trying to push a page that we are
1390 * about to invalidate.
1392 rw_enter(&zp->z_map_lock, RW_WRITER);
1393 uvm_vnp_setsize(vp, end);
1394 rw_exit(&zp->z_map_lock);
1396 return (0);
1400 * Free space in a file
1402 * IN: zp - znode of file to free data in.
1403 * off - start of range
1404 * len - end of range (0 => EOF)
1405 * flag - current file open mode flags.
1406 * log - TRUE if this action should be logged
1408 * RETURN: 0 if success
1409 * error code if failure
1412 zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
1414 vnode_t *vp = ZTOV(zp);
1415 dmu_tx_t *tx;
1416 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1417 zilog_t *zilog = zfsvfs->z_log;
1418 int error;
1420 if (off > zp->z_phys->zp_size) {
1421 error = zfs_extend(zp, off+len);
1422 if (error == 0 && log)
1423 goto log;
1424 else
1425 return (error);
1428 if (len == 0) {
1429 error = zfs_trunc(zp, off);
1430 } else {
1431 if ((error = zfs_free_range(zp, off, len)) == 0 &&
1432 off + len > zp->z_phys->zp_size)
1433 error = zfs_extend(zp, off+len);
1435 if (error || !log)
1436 return (error);
1437 log:
1438 tx = dmu_tx_create(zfsvfs->z_os);
1439 dmu_tx_hold_bonus(tx, zp->z_id);
1440 error = dmu_tx_assign(tx, zfsvfs->z_assign);
1441 if (error) {
1442 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1443 dmu_tx_wait(tx);
1444 dmu_tx_abort(tx);
1445 goto log;
1447 dmu_tx_abort(tx);
1448 return (error);
1451 zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
1452 zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
1454 dmu_tx_commit(tx);
1455 return (0);
1458 void
1459 zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
1461 zfsvfs_t zfsvfs;
1462 uint64_t moid, doid, version;
1463 uint64_t sense = ZFS_CASE_SENSITIVE;
1464 uint64_t norm = 0;
1465 nvpair_t *elem;
1466 int error;
1467 znode_t *rootzp = NULL;
1468 vnode_t *vp;
1469 vattr_t vattr;
1470 znode_t *zp;
1473 * First attempt to create master node.
1476 * In an empty objset, there are no blocks to read and thus
1477 * there can be no i/o errors (which we assert below).
1479 moid = MASTER_NODE_OBJ;
1480 error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
1481 DMU_OT_NONE, 0, tx);
1482 ASSERT(error == 0);
1485 * Set starting attributes.
1487 if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
1488 version = ZPL_VERSION;
1489 else
1490 version = ZPL_VERSION_FUID - 1;
1491 error = zap_update(os, moid, ZPL_VERSION_STR,
1492 8, 1, &version, tx);
1493 elem = NULL;
1494 while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
1495 /* For the moment we expect all zpl props to be uint64_ts */
1496 uint64_t val;
1497 char *name;
1499 ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
1500 VERIFY(nvpair_value_uint64(elem, &val) == 0);
1501 name = nvpair_name(elem);
1502 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
1503 version = val;
1504 error = zap_update(os, moid, ZPL_VERSION_STR,
1505 8, 1, &version, tx);
1506 } else {
1507 error = zap_update(os, moid, name, 8, 1, &val, tx);
1509 ASSERT(error == 0);
1510 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
1511 norm = val;
1512 else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
1513 sense = val;
1515 ASSERT(version != 0);
1518 * Create a delete queue.
1520 doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
1522 error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx);
1523 ASSERT(error == 0);
1526 * Create root znode. Create minimal znode/vnode/zfsvfs
1527 * to allow zfs_mknode to work.
1529 VATTR_NULL(&vattr);
1530 vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
1531 vattr.va_type = VDIR;
1532 vattr.va_mode = S_IFDIR|0755;
1533 vattr.va_uid = crgetuid(cr);
1534 vattr.va_gid = crgetgid(cr);
1536 rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
1537 rootzp->z_unlinked = 0;
1538 rootzp->z_atime_dirty = 0;
1540 for (;;) {
1541 error = getnewvnode(VT_ZFS, NULL, zfs_vnodeop_p,
1542 &rootzp->z_vnode);
1543 if (error == 0)
1544 break;
1545 printf("WARNING: zfs_create_fs: unable to get vnode, "
1546 "error=%d\n", error);
1547 kpause("zfsvn", false, hz, NULL);
1550 vp = ZTOV(rootzp);
1551 vp->v_type = VDIR;
1553 bzero(&zfsvfs, sizeof (zfsvfs_t));
1555 zfsvfs.z_os = os;
1556 zfsvfs.z_assign = TXG_NOWAIT;
1557 zfsvfs.z_parent = &zfsvfs;
1558 zfsvfs.z_version = version;
1559 zfsvfs.z_use_fuids = USE_FUIDS(version, os);
1560 zfsvfs.z_norm = norm;
1562 * Fold case on file systems that are always or sometimes case
1563 * insensitive.
1565 if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
1566 zfsvfs.z_norm |= U8_TEXTPREP_TOUPPER;
1568 mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1569 list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
1570 offsetof(znode_t, z_link_node));
1572 ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
1573 rootzp->z_zfsvfs = &zfsvfs;
1574 zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, NULL, NULL);
1575 ASSERT3P(zp, ==, rootzp);
1576 error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
1577 ASSERT(error == 0);
1578 POINTER_INVALIDATE(&rootzp->z_zfsvfs);
1580 dmu_buf_rele(rootzp->z_dbuf, NULL);
1581 rootzp->z_dbuf = NULL;
1582 ungetnewvnode(vp);
1583 kmem_cache_free(znode_cache, rootzp);
1586 #endif /* _KERNEL */
1588 * Given an object number, return its parent object number and whether
1589 * or not the object is an extended attribute directory.
1591 static int
1592 zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir)
1594 dmu_buf_t *db;
1595 dmu_object_info_t doi;
1596 znode_phys_t *zp;
1597 int error;
1599 if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0)
1600 return (error);
1602 dmu_object_info_from_db(db, &doi);
1603 if (doi.doi_bonus_type != DMU_OT_ZNODE ||
1604 doi.doi_bonus_size < sizeof (znode_phys_t)) {
1605 dmu_buf_rele(db, FTAG);
1606 return (EINVAL);
1609 zp = db->db_data;
1610 *pobjp = zp->zp_parent;
1611 *is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) &&
1612 S_ISDIR(zp->zp_mode);
1613 dmu_buf_rele(db, FTAG);
1615 return (0);
1619 zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
1621 char *path = buf + len - 1;
1622 int error;
1624 *path = '\0';
1626 for (;;) {
1627 uint64_t pobj;
1628 char component[MAXNAMELEN + 2];
1629 size_t complen;
1630 int is_xattrdir;
1632 if ((error = zfs_obj_to_pobj(osp, obj, &pobj,
1633 &is_xattrdir)) != 0)
1634 break;
1636 if (pobj == obj) {
1637 if (path[0] != '/')
1638 *--path = '/';
1639 break;
1642 component[0] = '/';
1643 if (is_xattrdir) {
1644 (void) sprintf(component + 1, "<xattrdir>");
1645 } else {
1646 error = zap_value_search(osp, pobj, obj,
1647 ZFS_DIRENT_OBJ(-1ULL), component + 1);
1648 if (error != 0)
1649 break;
1652 complen = strlen(component);
1653 path -= complen;
1654 ASSERT(path >= buf);
1655 bcopy(component, path, complen);
1656 obj = pobj;
1659 if (error == 0)
1660 (void) memmove(buf, path, buf + len - path);
1661 return (error);