/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2013 Martin Matuska. All rights reserved.
 * Copyright (c) 2014 Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright (c) 2016 Actifio, Inc. All rights reserved.
 * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
 */
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_deleg.h>
#include <sys/dmu_impl.h>
#include <sys/spa_impl.h>
#include <sys/metaslab.h>
#include <sys/sunddi.h>
#include <sys/zfeature.h>
#include <sys/policy.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_znode.h>
#include "zfs_namecheck.h"
/*
 * Filesystem and Snapshot Limits
 * ------------------------------
 *
 * These limits are used to restrict the number of filesystems and/or snapshots
 * that can be created at a given level in the tree or below. A typical
 * use-case is with a delegated dataset where the administrator wants to ensure
 * that a user within the zone is not creating too many additional filesystems
 * or snapshots, even though they're not exceeding their space quota.
 *
 * The filesystem and snapshot counts are stored as extensible properties. This
 * capability is controlled by a feature flag and must be enabled to be used.
 * Once enabled, the feature is not active until the first limit is set. At
 * that point, future operations to create/destroy filesystems or snapshots
 * will validate and update the counts.
 *
 * Because the count properties will not exist before the feature is active,
 * the counts are updated when a limit is first set on an uninitialized
 * dsl_dir node in the tree (The filesystem/snapshot count on a node includes
 * all of the nested filesystems/snapshots. Thus, a new leaf node has a
 * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and
 * snapshot count properties on a node indicate uninitialized counts on that
 * node.) When first setting a limit on an uninitialized node, the code starts
 * at the filesystem with the new limit and descends into all sub-filesystems
 * to add the count properties.
 *
 * In practice this is lightweight since a limit is typically set when the
 * filesystem is created and thus has no children. Once valid, changing the
 * limit value won't require a re-traversal since the counts are already valid.
 * When recursively fixing the counts, if a node with a limit is encountered
 * during the descent, the counts are known to be valid and there is no need to
 * descend into that filesystem's children. The counts on filesystems above the
 * one with the new limit will still be uninitialized, unless a limit is
 * eventually set on one of those filesystems. The counts are always
 * recursively updated when a limit is set on a dataset, unless there is
 * already a limit. When a new limit value is set on a filesystem with an
 * existing limit, it is possible for the new limit to be less than the
 * current count at that level since a user who can change the limit is also
 * allowed to exceed the limit.
 *
 * Once the feature is active, then whenever a filesystem or snapshot is
 * created, the code recurses up the tree, validating the new count against the
 * limit at each initialized level. In practice, most levels will not have a
 * limit set. If there is a limit at any initialized level up the tree, the
 * check must pass or the creation will fail. Likewise, when a filesystem or
 * snapshot is destroyed, the counts are recursively adjusted all the way up
 * the initialized nodes in the tree. Renaming a filesystem into a different
 * point in the tree will first validate, then update the counts on each
 * branch up to the common ancestor. A receive will also validate the counts
 * and then update them.
 *
 * An exception to the above behavior is that the limit is not enforced if the
 * user has permission to modify the limit. This is primarily so that
 * recursive snapshots in the global zone always work. We want to prevent a
 * denial-of-service in which a lower level delegated dataset could max out
 * its limit and thus block recursive snapshots from being taken in the global
 * zone. Because of this, it is possible for the snapshot count to be over the
 * limit and snapshots taken in the global zone could cause a lower level
 * dataset to hit or exceed its limit. The administrator taking the global
 * zone recursive snapshot should be aware of this side-effect and behave
 * accordingly. For consistency, the filesystem limit is also not enforced if
 * the user can modify the limit.
 *
 * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check()
 * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in
 * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by
 * dsl_dir_init_fs_ss_count().
 */
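
/*
 * Illustrative sketch (not part of the implementation): under the model
 * above, creating one new filesystem under a dsl_dir would be validated
 * and accounted roughly as follows.  The two calls use the signatures
 * defined later in this file; the surrounding driver code is hypothetical.
 *
 *	// validate against every initialized limit from dd up to the root
 *	err = dsl_fs_ss_limit_check(dd, 1, ZFS_PROP_FILESYSTEM_LIMIT,
 *	    NULL, cr, proc);
 *	if (err == 0) {
 *		// ... create the filesystem in syncing context ...
 *		// then roll the new count up through all initialized parents
 *		dsl_fs_ss_count_adjust(dd, 1, DD_FIELD_FILESYSTEM_COUNT, tx);
 *	}
 */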
static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);

typedef struct ddulrt_arg {
	dsl_dir_t	*ddulrta_dd;
} ddulrt_arg_t;

static void
dsl_dir_evict_async(void *dbu)
{
	dsl_dir_t *dd = dbu;
	int t;
	dsl_pool_t *dp __maybe_unused = dd->dd_pool;

	for (t = 0; t < TXG_SIZE; t++) {
		ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
		ASSERT(dd->dd_tempreserved[t] == 0);
		ASSERT(dd->dd_space_towrite[t] == 0);
	}

	if (dd->dd_parent)
		dsl_dir_async_rele(dd->dd_parent, dd);

	spa_async_close(dd->dd_pool->dp_spa, dd);

	if (dsl_deadlist_is_open(&dd->dd_livelist))
		dsl_dir_livelist_close(dd);

	cv_destroy(&dd->dd_activity_cv);
	mutex_destroy(&dd->dd_activity_lock);
	mutex_destroy(&dd->dd_lock);
	kmem_free(dd, sizeof (dsl_dir_t));
}
int
dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
    const char *tail, void *tag, dsl_dir_t **ddp)
{
	dmu_buf_t *dbuf;
	dsl_dir_t *dd;
	dmu_object_info_t doi;
	int err;

	ASSERT(dsl_pool_config_held(dp));

	err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
	if (err != 0)
		return (err);
	dd = dmu_buf_get_user(dbuf);

	dmu_object_info_from_db(dbuf, &doi);
	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR);
	ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));

	if (dd == NULL) {
		dsl_dir_t *winner;

		dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
		dd->dd_object = ddobj;
		dd->dd_dbuf = dbuf;
		dd->dd_pool = dp;

		mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&dd->dd_activity_lock, NULL, MUTEX_DEFAULT, NULL);
		cv_init(&dd->dd_activity_cv, NULL, CV_DEFAULT, NULL);

		if (dsl_dir_is_zapified(dd)) {
			err = zap_lookup(dp->dp_meta_objset,
			    ddobj, DD_FIELD_CRYPTO_KEY_OBJ,
			    sizeof (uint64_t), 1, &dd->dd_crypto_obj);
			if (err == 0) {
				/* check for on-disk format errata */
				if (dsl_dir_incompatible_encryption_version(
				    dd)) {
					dp->dp_spa->spa_errata =
					    ZPOOL_ERRATA_ZOL_6845_ENCRYPTION;
				}
			} else if (err != ENOENT) {
				goto errout;
			}
		}

		dsl_dir_snap_cmtime_update(dd);

		if (dsl_dir_phys(dd)->dd_parent_obj) {
			err = dsl_dir_hold_obj(dp,
			    dsl_dir_phys(dd)->dd_parent_obj, NULL, dd,
			    &dd->dd_parent);
			if (err != 0)
				goto errout;
			if (tail) {
				uint64_t foundobj;

				err = zap_lookup(dp->dp_meta_objset,
				    dsl_dir_phys(dd->dd_parent)->
				    dd_child_dir_zapobj, tail,
				    sizeof (foundobj), 1, &foundobj);
				ASSERT(err || foundobj == ddobj);
				(void) strlcpy(dd->dd_myname, tail,
				    sizeof (dd->dd_myname));
			} else {
				err = zap_value_search(dp->dp_meta_objset,
				    dsl_dir_phys(dd->dd_parent)->
				    dd_child_dir_zapobj,
				    ddobj, 0, dd->dd_myname);
			}
			if (err != 0)
				goto errout;
		} else {
			(void) strlcpy(dd->dd_myname, spa_name(dp->dp_spa),
			    sizeof (dd->dd_myname));
		}

		if (dsl_dir_is_clone(dd)) {
			dmu_buf_t *origin_bonus;
			dsl_dataset_phys_t *origin_phys;

			/*
			 * We can't open the origin dataset, because
			 * that would require opening this dsl_dir.
			 * Just look at its phys directly instead.
			 */
			err = dmu_bonus_hold(dp->dp_meta_objset,
			    dsl_dir_phys(dd)->dd_origin_obj, FTAG,
			    &origin_bonus);
			if (err != 0)
				goto errout;
			origin_phys = origin_bonus->db_data;
			dd->dd_origin_txg =
			    origin_phys->ds_creation_txg;
			dmu_buf_rele(origin_bonus, FTAG);
			if (dsl_dir_is_zapified(dd)) {
				uint64_t obj;
				err = zap_lookup(dp->dp_meta_objset,
				    dd->dd_object, DD_FIELD_LIVELIST,
				    sizeof (uint64_t), 1, &obj);
				if (err == 0)
					dsl_dir_livelist_open(dd, obj);
				else if (err != ENOENT)
					goto errout;
			}
		}

		dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async,
		    &dd->dd_dbuf);
		winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu);
		if (winner != NULL) {
			if (dd->dd_parent)
				dsl_dir_rele(dd->dd_parent, dd);
			if (dsl_deadlist_is_open(&dd->dd_livelist))
				dsl_dir_livelist_close(dd);
			cv_destroy(&dd->dd_activity_cv);
			mutex_destroy(&dd->dd_activity_lock);
			mutex_destroy(&dd->dd_lock);
			kmem_free(dd, sizeof (dsl_dir_t));
			dd = winner;
		} else {
			spa_open_ref(dp->dp_spa, dd);
		}
	}

	/*
	 * The dsl_dir_t has both open-to-close and instantiate-to-evict
	 * holds on the spa.  We need the open-to-close holds because
	 * otherwise the spa_refcnt wouldn't change when we open a
	 * dir which the spa also has open, so we could incorrectly
	 * think it was OK to unload/export/destroy the pool.  We need
	 * the instantiate-to-evict hold because the dsl_dir_t has a
	 * pointer to the dd_pool, which has a pointer to the spa_t.
	 */
	spa_open_ref(dp->dp_spa, tag);
	ASSERT3P(dd->dd_pool, ==, dp);
	ASSERT3U(dd->dd_object, ==, ddobj);
	ASSERT3P(dd->dd_dbuf, ==, dbuf);
	*ddp = dd;
	return (0);

errout:
	if (dd->dd_parent)
		dsl_dir_rele(dd->dd_parent, dd);
	if (dsl_deadlist_is_open(&dd->dd_livelist))
		dsl_dir_livelist_close(dd);
	cv_destroy(&dd->dd_activity_cv);
	mutex_destroy(&dd->dd_activity_lock);
	mutex_destroy(&dd->dd_lock);
	kmem_free(dd, sizeof (dsl_dir_t));
	dmu_buf_rele(dbuf, tag);
	return (err);
}
void
dsl_dir_rele(dsl_dir_t *dd, void *tag)
{
	dprintf_dd(dd, "%s\n", "");
	spa_close(dd->dd_pool->dp_spa, tag);
	dmu_buf_rele(dd->dd_dbuf, tag);
}

/*
 * Remove a reference to the given dsl dir that is being asynchronously
 * released.  Async releases occur from a taskq performing eviction of
 * dsl datasets and dirs.  This process is identical to a normal release
 * with the exception of using the async API for releasing the reference on
 * the spa.
 */
void
dsl_dir_async_rele(dsl_dir_t *dd, void *tag)
{
	dprintf_dd(dd, "%s\n", "");
	spa_async_close(dd->dd_pool->dp_spa, tag);
	dmu_buf_rele(dd->dd_dbuf, tag);
}
/* buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes */
void
dsl_dir_name(dsl_dir_t *dd, char *buf)
{
	if (dd->dd_parent) {
		dsl_dir_name(dd->dd_parent, buf);
		VERIFY3U(strlcat(buf, "/", ZFS_MAX_DATASET_NAME_LEN), <,
		    ZFS_MAX_DATASET_NAME_LEN);
	} else {
		buf[0] = '\0';
	}
	if (!MUTEX_HELD(&dd->dd_lock)) {
		/*
		 * recursive mutex so that we can use
		 * dprintf_dd() with dd_lock held
		 */
		mutex_enter(&dd->dd_lock);
		VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
		    <, ZFS_MAX_DATASET_NAME_LEN);
		mutex_exit(&dd->dd_lock);
	} else {
		VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
		    <, ZFS_MAX_DATASET_NAME_LEN);
	}
}

/* Calculate name length, avoiding all the strcat calls of dsl_dir_name */
int
dsl_dir_namelen(dsl_dir_t *dd)
{
	int result = 0;

	if (dd->dd_parent) {
		/* parent's name + 1 for the "/" */
		result = dsl_dir_namelen(dd->dd_parent) + 1;
	}

	if (!MUTEX_HELD(&dd->dd_lock)) {
		/* see dsl_dir_name */
		mutex_enter(&dd->dd_lock);
		result += strlen(dd->dd_myname);
		mutex_exit(&dd->dd_lock);
	} else {
		result += strlen(dd->dd_myname);
	}

	return (result);
}
static int
getcomponent(const char *path, char *component, const char **nextp)
{
	char *p;

	if ((path == NULL) || (path[0] == '\0'))
		return (SET_ERROR(ENOENT));
	/* This would be a good place to reserve some namespace... */
	p = strpbrk(path, "/@");
	if (p && (p[1] == '/' || p[1] == '@')) {
		/* two separators in a row */
		return (SET_ERROR(EINVAL));
	}
	if (p == NULL || p == path) {
		/*
		 * if the first thing is an @ or /, it had better be an
		 * @ and it had better not have any more ats or slashes,
		 * and it had better have something after the @.
		 */
		if (p != NULL &&
		    (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
			return (SET_ERROR(EINVAL));
		if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN)
			return (SET_ERROR(ENAMETOOLONG));
		(void) strlcpy(component, path, ZFS_MAX_DATASET_NAME_LEN);
		p = NULL;
	} else if (p[0] == '/') {
		if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
			return (SET_ERROR(ENAMETOOLONG));
		(void) strncpy(component, path, p - path);
		component[p - path] = '\0';
		p++;
	} else if (p[0] == '@') {
		/*
		 * if the next separator is an @, there better not be
		 * any more slashes.
		 */
		if (strchr(path, '/'))
			return (SET_ERROR(EINVAL));
		if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
			return (SET_ERROR(ENAMETOOLONG));
		(void) strncpy(component, path, p - path);
		component[p - path] = '\0';
	} else {
		panic("invalid p=%p", (void *)p);
	}
	*nextp = p;
	return (0);
}
/*
 * Return the dsl_dir_t, and possibly the last component which couldn't
 * be found in *tail.  The name must be in the specified dsl_pool_t.  This
 * thread must hold the dp_config_rwlock for the pool.  Returns NULL if the
 * path is bogus, or if tail==NULL and we couldn't parse the whole name.
 * (*tail)[0] == '@' means that the last component is a snapshot.
 */
int
dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
    dsl_dir_t **ddp, const char **tailp)
{
	char *buf;
	const char *spaname, *next, *nextnext = NULL;
	int err;
	dsl_dir_t *dd;
	uint64_t ddobj;

	buf = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
	err = getcomponent(name, buf, &next);
	if (err != 0)
		goto error;

	/* Make sure the name is in the specified pool. */
	spaname = spa_name(dp->dp_spa);
	if (strcmp(buf, spaname) != 0) {
		err = SET_ERROR(EXDEV);
		goto error;
	}

	ASSERT(dsl_pool_config_held(dp));

	err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
	if (err != 0)
		goto error;

	while (next != NULL) {
		dsl_dir_t *child_dd;
		err = getcomponent(next, buf, &nextnext);
		if (err != 0)
			break;
		ASSERT(next[0] != '\0');
		if (next[0] == '@')
			break;
		dprintf("looking up %s in obj%lld\n",
		    buf, (longlong_t)dsl_dir_phys(dd)->dd_child_dir_zapobj);

		err = zap_lookup(dp->dp_meta_objset,
		    dsl_dir_phys(dd)->dd_child_dir_zapobj,
		    buf, sizeof (ddobj), 1, &ddobj);
		if (err != 0) {
			if (err == ENOENT)
				err = 0;
			break;
		}

		err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd);
		if (err != 0)
			break;
		dsl_dir_rele(dd, tag);
		dd = child_dd;
		next = nextnext;
	}

	if (err != 0) {
		dsl_dir_rele(dd, tag);
		goto error;
	}

	/*
	 * It's an error if there's more than one component left, or
	 * tailp==NULL and there's any component left.
	 */
	if (next != NULL &&
	    (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
		/* bad path name */
		dsl_dir_rele(dd, tag);
		dprintf("next=%p (%s) tail=%p\n", next, next ? next : "", tailp);
		err = SET_ERROR(ENOENT);
	}
	if (tailp != NULL)
		*tailp = next;
	if (err == 0)
		*ddp = dd;
error:
	kmem_free(buf, ZFS_MAX_DATASET_NAME_LEN);
	return (err);
}
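
/*
 * Illustrative sketch (not part of the implementation): a typical caller
 * takes the pool configuration lock, resolves a name, and releases the
 * hold.  The dataset name here is a made-up example.
 *
 *	dsl_dir_t *dd;
 *	const char *tail;
 *
 *	dsl_pool_config_enter(dp, FTAG);
 *	err = dsl_dir_hold(dp, "pool/fs@snap", FTAG, &dd, &tail);
 *	// on success, tail points at "@snap", the unparsed snapshot part
 *	if (err == 0)
 *		dsl_dir_rele(dd, FTAG);
 *	dsl_pool_config_exit(dp, FTAG);
 */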
/*
 * If the counts are already initialized for this filesystem and its
 * descendants then do nothing, otherwise initialize the counts.
 *
 * The counts on this filesystem, and those below, may be uninitialized due to
 * either the use of a pre-existing pool which did not support the
 * filesystem/snapshot limit feature, or one in which the feature had not yet
 * been enabled.
 *
 * Recursively descend the filesystem tree and update the filesystem/snapshot
 * counts on each filesystem below, then update the cumulative count on the
 * current filesystem. If the filesystem already has a count set on it,
 * then we know that its counts, and the counts on the filesystems below it,
 * are already correct, so we don't have to update this filesystem.
 */
static void
dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx)
{
	uint64_t my_fs_cnt = 0;
	uint64_t my_ss_cnt = 0;
	dsl_pool_t *dp = dd->dd_pool;
	objset_t *os = dp->dp_meta_objset;
	zap_cursor_t *zc;
	zap_attribute_t *za;
	dsl_dataset_t *ds;

	ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT));
	ASSERT(dsl_pool_config_held(dp));
	ASSERT(dmu_tx_is_syncing(tx));

	dsl_dir_zapify(dd, tx);

	/*
	 * If the filesystem count has already been initialized then we
	 * don't need to recurse down any further.
	 */
	if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0)
		return;

	zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
	za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);

	/* Iterate my child dirs */
	for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj);
	    zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) {
		dsl_dir_t *chld_dd;
		uint64_t count;

		VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG,
		    &chld_dd));

		/*
		 * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets.
		 */
		if (chld_dd->dd_myname[0] == '$') {
			dsl_dir_rele(chld_dd, FTAG);
			continue;
		}

		my_fs_cnt++;	/* count this child */

		dsl_dir_init_fs_ss_count(chld_dd, tx);

		VERIFY0(zap_lookup(os, chld_dd->dd_object,
		    DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count));
		my_fs_cnt += count;
		VERIFY0(zap_lookup(os, chld_dd->dd_object,
		    DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count));
		my_ss_cnt += count;

		dsl_dir_rele(chld_dd, FTAG);
	}
	zap_cursor_fini(zc);

	/* Count my snapshots (we counted children's snapshots above) */
	VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
	    dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds));

	for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj);
	    zap_cursor_retrieve(zc, za) == 0;
	    zap_cursor_advance(zc)) {
		/* Don't count temporary snapshots */
		if (za->za_name[0] != '%')
			my_ss_cnt++;
	}
	zap_cursor_fini(zc);

	dsl_dataset_rele(ds, FTAG);

	kmem_free(zc, sizeof (zap_cursor_t));
	kmem_free(za, sizeof (zap_attribute_t));

	/* we're in a sync task, update counts */
	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
	    sizeof (my_fs_cnt), 1, &my_fs_cnt, tx));
	VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
	    sizeof (my_ss_cnt), 1, &my_ss_cnt, tx));
}
static int
dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx)
{
	char *ddname = (char *)arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	dsl_dir_t *dd;
	int error;

	error = dsl_dataset_hold(dp, ddname, FTAG, &ds);
	if (error != 0)
		return (error);

	if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
		dsl_dataset_rele(ds, FTAG);
		return (SET_ERROR(ENOTSUP));
	}

	dd = ds->ds_dir;
	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) &&
	    dsl_dir_is_zapified(dd) &&
	    zap_contains(dp->dp_meta_objset, dd->dd_object,
	    DD_FIELD_FILESYSTEM_COUNT) == 0) {
		dsl_dataset_rele(ds, FTAG);
		return (SET_ERROR(EALREADY));
	}

	dsl_dataset_rele(ds, FTAG);
	return (0);
}

static void
dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx)
{
	char *ddname = (char *)arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	spa_t *spa;

	VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds));

	spa = dsl_dataset_get_spa(ds);

	if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) {
		/*
		 * Since the feature was not active and we're now setting a
		 * limit, increment the feature-active counter so that the
		 * feature becomes active for the first time.
		 *
		 * We are already in a sync task so we can update the MOS.
		 */
		spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx);
	}

	/*
	 * Since we are now setting a non-UINT64_MAX limit on the filesystem,
	 * we need to ensure the counts are correct. Descend down the tree from
	 * this point and update all of the counts to be accurate.
	 */
	dsl_dir_init_fs_ss_count(ds->ds_dir, tx);

	dsl_dataset_rele(ds, FTAG);
}

/*
 * Make sure the feature is enabled and activate it if necessary.
 * Since we're setting a limit, ensure the on-disk counts are valid.
 * This is only called by the ioctl path when setting a limit value.
 *
 * We do not need to validate the new limit, since users who can change the
 * limit are also allowed to exceed the limit.
 */
int
dsl_dir_activate_fs_ss_limit(const char *ddname)
{
	int error;

	error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check,
	    dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0,
	    ZFS_SPACE_CHECK_RESERVED);

	if (error == EALREADY)
		error = 0;

	return (error);
}
/*
 * Used to determine if the filesystem_limit or snapshot_limit should be
 * enforced. We allow the limit to be exceeded if the user has permission to
 * write the property value. We pass in the creds that we got in the open
 * context since we will always be the GZ root in syncing context. We also have
 * to handle the case where we are allowed to change the limit on the current
 * dataset, but there may be another limit in the tree above.
 *
 * We can never modify these two properties within a non-global zone. In
 * addition, the other checks are modeled on zfs_secpolicy_write_perms. We
 * can't use that function since we are already holding the dp_config_rwlock.
 * In addition, we already have the dd and dealing with snapshots is simplified
 * in this code.
 */

typedef enum {
	ENFORCE_ALWAYS,
	ENFORCE_NEVER,
	ENFORCE_ABOVE
} enforce_res_t;

static enforce_res_t
dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop,
    cred_t *cr, proc_t *proc)
{
	enforce_res_t enforce = ENFORCE_ALWAYS;
	uint64_t obj;
	dsl_dataset_t *ds;
	uint64_t zoned;
	const char *zonedstr;

	ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
	    prop == ZFS_PROP_SNAPSHOT_LIMIT);

	if (crgetzoneid(cr) != GLOBAL_ZONEID)
		return (ENFORCE_ALWAYS);

	/*
	 * We are checking the saved credentials of the user process, which is
	 * not the current process.  Note that we can't use secpolicy_zfs(),
	 * because it only works if the cred is that of the current process (on
	 * Linux).
	 */
	if (secpolicy_zfs_proc(cr, proc) == 0)
		return (ENFORCE_NEVER);

	if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0)
		return (ENFORCE_ALWAYS);

	ASSERT(dsl_pool_config_held(dd->dd_pool));

	if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0)
		return (ENFORCE_ALWAYS);

	zonedstr = zfs_prop_to_name(ZFS_PROP_ZONED);
	if (dsl_prop_get_ds(ds, zonedstr, 8, 1, &zoned, NULL) || zoned) {
		/* Only root can access zoned fs's from the GZ */
		enforce = ENFORCE_ALWAYS;
	} else if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0) {
		enforce = ENFORCE_ABOVE;
	}

	dsl_dataset_rele(ds, FTAG);
	return (enforce);
}
/*
 * Check if adding additional child filesystem(s) would exceed any filesystem
 * limits or adding additional snapshot(s) would exceed any snapshot limits.
 * The prop argument indicates which limit to check.
 *
 * Note that all filesystem limits up to the root (or the highest
 * initialized) filesystem or the given ancestor must be satisfied.
 */
int
dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop,
    dsl_dir_t *ancestor, cred_t *cr, proc_t *proc)
{
	objset_t *os = dd->dd_pool->dp_meta_objset;
	uint64_t limit, count;
	const char *count_prop;
	enforce_res_t enforce;
	int err = 0;

	ASSERT(dsl_pool_config_held(dd->dd_pool));
	ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
	    prop == ZFS_PROP_SNAPSHOT_LIMIT);

	/*
	 * If we're allowed to change the limit, don't enforce the limit
	 * e.g. this can happen if a snapshot is taken by an administrative
	 * user in the global zone (i.e. a recursive snapshot by root).
	 * However, we must handle the case of delegated permissions where we
	 * are allowed to change the limit on the current dataset, but there
	 * is another limit in the tree above.
	 */
	enforce = dsl_enforce_ds_ss_limits(dd, prop, cr, proc);
	if (enforce == ENFORCE_NEVER)
		return (0);

	/*
	 * e.g. if renaming a dataset with no snapshots, count adjustment
	 * is 0.
	 */
	if (delta == 0)
		return (0);

	if (prop == ZFS_PROP_SNAPSHOT_LIMIT) {
		/*
		 * We don't enforce the limit for temporary snapshots. This is
		 * indicated by a NULL cred_t argument.
		 */
		if (cr == NULL)
			return (0);

		count_prop = DD_FIELD_SNAPSHOT_COUNT;
	} else {
		count_prop = DD_FIELD_FILESYSTEM_COUNT;
	}

	/*
	 * If an ancestor has been provided, stop checking the limit once we
	 * hit that dir. We need this during rename so that we don't overcount
	 * the check once we recurse up to the common ancestor.
	 */
	if (ancestor == dd)
		return (0);

	/*
	 * If we hit an uninitialized node while recursing up the tree, we can
	 * stop since we know there is no limit here (or above). The counts are
	 * not valid on this node and we know we won't touch this node's counts.
	 */
	if (!dsl_dir_is_zapified(dd))
		return (0);
	err = zap_lookup(os, dd->dd_object,
	    count_prop, sizeof (count), 1, &count);
	if (err == ENOENT)
		return (0);
	if (err != 0)
		return (err);

	err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL,
	    B_FALSE);
	if (err != 0)
		return (err);

	/* Is there a limit which we've hit? */
	if (enforce == ENFORCE_ALWAYS && (count + delta) > limit)
		return (SET_ERROR(EDQUOT));

	if (dd->dd_parent != NULL)
		err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop,
		    ancestor, cr, proc);

	return (err);
}
/*
 * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all
 * parents. When a new filesystem/snapshot is created, increment the count on
 * all parents, and when a filesystem/snapshot is destroyed, decrement the
 * count.
 */
void
dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop,
    dmu_tx_t *tx)
{
	int err;
	objset_t *os = dd->dd_pool->dp_meta_objset;
	uint64_t count;

	ASSERT(dsl_pool_config_held(dd->dd_pool));
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 ||
	    strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0);

	/*
	 * We don't do accounting for hidden ($FREE, $MOS & $ORIGIN) objsets.
	 */
	if (dd->dd_myname[0] == '$' && strcmp(prop,
	    DD_FIELD_FILESYSTEM_COUNT) == 0) {
		return;
	}

	/*
	 * e.g. if renaming a dataset with no snapshots, count adjustment is 0
	 */
	if (delta == 0)
		return;

	/*
	 * If we hit an uninitialized node while recursing up the tree, we can
	 * stop since we know the counts are not valid on this node and we
	 * know we shouldn't touch this node's counts. An uninitialized count
	 * on the node indicates that either the feature has not yet been
	 * activated or there are no limits on this part of the tree.
	 */
	if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object,
	    prop, sizeof (count), 1, &count)) == ENOENT)
		return;
	VERIFY0(err);

	count += delta;
	/* Use a signed verify to make sure we're not neg. */
	VERIFY3S(count, >=, 0);

	VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count,
	    tx));

	/* Roll up this additional count into our ancestors */
	if (dd->dd_parent != NULL)
		dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx);
}
uint64_t
dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
    dmu_tx_t *tx)
{
	objset_t *mos = dp->dp_meta_objset;
	uint64_t ddobj;
	dsl_dir_phys_t *ddphys;
	dmu_buf_t *dbuf;

	ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
	    DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
	if (pds) {
		VERIFY0(zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj,
		    name, sizeof (uint64_t), 1, &ddobj, tx));
	} else {
		/* it's the root dir */
		VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
	}
	VERIFY0(dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
	dmu_buf_will_dirty(dbuf, tx);
	ddphys = dbuf->db_data;

	ddphys->dd_creation_time = gethrestime_sec();
	if (pds) {
		ddphys->dd_parent_obj = pds->dd_object;

		/* update the filesystem counts */
		dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx);
	}
	ddphys->dd_props_zapobj = zap_create(mos,
	    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
	ddphys->dd_child_dir_zapobj = zap_create(mos,
	    DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
	if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
		ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;

	dmu_buf_rele(dbuf, FTAG);

	return (ddobj);
}

boolean_t
dsl_dir_is_clone(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_origin_obj &&
	    (dd->dd_pool->dp_origin_snap == NULL ||
	    dsl_dir_phys(dd)->dd_origin_obj !=
	    dd->dd_pool->dp_origin_snap->ds_object));
}
uint64_t
dsl_dir_get_used(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_used_bytes);
}

uint64_t
dsl_dir_get_compressed(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_compressed_bytes);
}

uint64_t
dsl_dir_get_quota(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_quota);
}

uint64_t
dsl_dir_get_reservation(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_reserved);
}

uint64_t
dsl_dir_get_compressratio(dsl_dir_t *dd)
{
	/* a fixed point number, 100x the ratio */
	return (dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 100 :
	    (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 /
	    dsl_dir_phys(dd)->dd_compressed_bytes));
}

uint64_t
dsl_dir_get_logicalused(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_uncompressed_bytes);
}

uint64_t
dsl_dir_get_usedsnap(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]);
}

uint64_t
dsl_dir_get_usedds(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]);
}

uint64_t
dsl_dir_get_usedrefreserv(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]);
}

uint64_t
dsl_dir_get_usedchild(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] +
	    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]);
}

void
dsl_dir_get_origin(dsl_dir_t *dd, char *buf)
{
	dsl_dataset_t *ds;

	VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
	    dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds));

	dsl_dataset_name(ds, buf);

	dsl_dataset_rele(ds, FTAG);
}
int
dsl_dir_get_filesystem_count(dsl_dir_t *dd, uint64_t *count)
{
	if (dsl_dir_is_zapified(dd)) {
		objset_t *os = dd->dd_pool->dp_meta_objset;
		return (zap_lookup(os, dd->dd_object,
		    DD_FIELD_FILESYSTEM_COUNT, sizeof (*count), 1, count));
	} else {
		return (SET_ERROR(ENOENT));
	}
}

int
dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count)
{
	if (dsl_dir_is_zapified(dd)) {
		objset_t *os = dd->dd_pool->dp_meta_objset;
		return (zap_lookup(os, dd->dd_object,
		    DD_FIELD_SNAPSHOT_COUNT, sizeof (*count), 1, count));
	} else {
		return (SET_ERROR(ENOENT));
	}
}

void
dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
{
	uint64_t count;

	mutex_enter(&dd->dd_lock);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA,
	    dsl_dir_get_quota(dd));
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
	    dsl_dir_get_reservation(dd));
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED,
	    dsl_dir_get_logicalused(dd));
	if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
		    dsl_dir_get_usedsnap(dd));
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
		    dsl_dir_get_usedds(dd));
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
		    dsl_dir_get_usedrefreserv(dd));
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
		    dsl_dir_get_usedchild(dd));
	}
	mutex_exit(&dd->dd_lock);

	if (dsl_dir_get_filesystem_count(dd, &count) == 0) {
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_FILESYSTEM_COUNT,
		    count);
	}
	if (dsl_dir_get_snapshot_count(dd, &count) == 0) {
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOT_COUNT,
		    count);
	}

	if (dsl_dir_is_clone(dd)) {
		char buf[ZFS_MAX_DATASET_NAME_LEN];
		dsl_dir_get_origin(dd, buf);
		dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
	}
}
void
dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dd->dd_pool;

	ASSERT(dsl_dir_phys(dd));

	if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) {
		/* up the hold count until we can be written out */
		dmu_buf_add_ref(dd->dd_dbuf, dd);
	}
}

static int64_t
parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
{
	uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved);
	uint64_t new_accounted =
	    MAX(used + delta, dsl_dir_phys(dd)->dd_reserved);
	return (new_accounted - old_accounted);
}
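
/*
 * Worked example (illustrative numbers): with dd_reserved = 100 and
 * used = 60, a delta of +30 keeps used+delta (90) under the reservation,
 * so parent_delta() returns MAX(90, 100) - MAX(60, 100) = 0 and the
 * parent's accounting is unchanged.  A delta of +50 instead returns
 * MAX(110, 100) - MAX(60, 100) = 10: only the 10 bytes that exceed the
 * reservation are newly charged to the parent.
 */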
void
dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));

	mutex_enter(&dd->dd_lock);
	ASSERT0(dd->dd_tempreserved[tx->tx_txg & TXG_MASK]);
	dprintf_dd(dd, "txg=%llu towrite=%lluK\n", (u_longlong_t)tx->tx_txg,
	    (u_longlong_t)dd->dd_space_towrite[tx->tx_txg & TXG_MASK] / 1024);
	dd->dd_space_towrite[tx->tx_txg & TXG_MASK] = 0;
	mutex_exit(&dd->dd_lock);

	/* release the hold from dsl_dir_dirty */
	dmu_buf_rele(dd->dd_dbuf, dd);
}

static uint64_t
dsl_dir_space_towrite(dsl_dir_t *dd)
{
	uint64_t space = 0;

	ASSERT(MUTEX_HELD(&dd->dd_lock));

	for (int i = 0; i < TXG_SIZE; i++) {
		space += dd->dd_space_towrite[i & TXG_MASK];
		ASSERT3U(dd->dd_space_towrite[i & TXG_MASK], >=, 0);
	}
	return (space);
}
/*
 * How much space would dd have available if ancestor had delta applied
 * to it?  If ondiskonly is set, we're only interested in what's
 * on-disk, not estimated pending changes.
 */
uint64_t
dsl_dir_space_available(dsl_dir_t *dd,
    dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
{
	uint64_t parentspace, myspace, quota, used;

	/*
	 * If there are no restrictions otherwise, assume we have
	 * unlimited space available.
	 */
	quota = UINT64_MAX;
	parentspace = UINT64_MAX;

	if (dd->dd_parent != NULL) {
		parentspace = dsl_dir_space_available(dd->dd_parent,
		    ancestor, delta, ondiskonly);
	}

	mutex_enter(&dd->dd_lock);
	if (dsl_dir_phys(dd)->dd_quota != 0)
		quota = dsl_dir_phys(dd)->dd_quota;
	used = dsl_dir_phys(dd)->dd_used_bytes;
	if (!ondiskonly)
		used += dsl_dir_space_towrite(dd);

	if (dd->dd_parent == NULL) {
		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool,
		    ZFS_SPACE_CHECK_NORMAL);
		quota = MIN(quota, poolsize);
	}

	if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) {
		/*
		 * We have some space reserved, in addition to what our
		 * parent provided.
		 */
		parentspace += dsl_dir_phys(dd)->dd_reserved - used;
	}

	if (dd == ancestor) {
		ASSERT(delta <= 0);
		ASSERT(used >= -delta);
		used += delta;
		if (parentspace != UINT64_MAX)
			parentspace -= delta;
	}

	if (used > quota) {
		/* over quota */
		myspace = 0;
	} else {
		/*
		 * the lesser of the space provided by our parent and
		 * the space left in our quota
		 */
		myspace = MIN(parentspace, quota - used);
	}

	mutex_exit(&dd->dd_lock);

	return (myspace);
}
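
/*
 * Worked example (illustrative numbers): if a dir has quota = 100,
 * used = 40, and its parent can offer parentspace = 50, then
 * myspace = MIN(50, 100 - 40) = 50; the parent is the binding
 * constraint even though 60 bytes of quota remain at this level.
 */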
struct tempreserve {
	list_node_t	tr_node;
	dsl_dir_t	*tr_ds;
	uint64_t	tr_size;
};

static int
dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
    boolean_t ignorequota, list_t *tr_list,
    dmu_tx_t *tx, boolean_t first)
{
	uint64_t txg;
	uint64_t quota;
	struct tempreserve *tr;
	int retval;
	uint64_t ref_rsrv;

top_of_function:
	txg = tx->tx_txg;
	retval = EDQUOT;
	ref_rsrv = 0;

	ASSERT3U(txg, !=, 0);
	ASSERT3S(asize, >, 0);

	mutex_enter(&dd->dd_lock);

	/*
	 * Check against the dsl_dir's quota.  We don't add in the delta
	 * when checking for over-quota because they get one free hit.
	 */
	uint64_t est_inflight = dsl_dir_space_towrite(dd);
	for (int i = 0; i < TXG_SIZE; i++)
		est_inflight += dd->dd_tempreserved[i];
	uint64_t used_on_disk = dsl_dir_phys(dd)->dd_used_bytes;

	/*
	 * On the first iteration, fetch the dataset's used-on-disk and
	 * refreservation values. Also, if checkrefquota is set, test if
	 * allocating this space would exceed the dataset's refquota.
	 */
	if (first && tx->tx_objset) {
		int error;
		dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;

		error = dsl_dataset_check_quota(ds, !netfree,
		    asize, est_inflight, &used_on_disk, &ref_rsrv);
		if (error != 0) {
			mutex_exit(&dd->dd_lock);
			DMU_TX_STAT_BUMP(dmu_tx_quota);
			return (error);
		}
	}

	/*
	 * If this transaction will result in a net free of space,
	 * we want to let it through.
	 */
	if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0)
		quota = UINT64_MAX;
	else
		quota = dsl_dir_phys(dd)->dd_quota;

	/*
	 * Adjust the quota against the actual pool size at the root
	 * minus any outstanding deferred frees.
	 * To ensure that it's possible to remove files from a full
	 * pool without inducing transient overcommits, we throttle
	 * netfree transactions against a quota that is slightly larger,
	 * but still within the pool's allocation slop.  In cases where
	 * we're very close to full, this will allow a steady trickle of
	 * removes to get through.
	 */
	if (dd->dd_parent == NULL) {
		uint64_t avail = dsl_pool_unreserved_space(dd->dd_pool,
		    (netfree) ?
		    ZFS_SPACE_CHECK_RESERVED : ZFS_SPACE_CHECK_NORMAL);

		if (avail < quota) {
			quota = avail;
			retval = SET_ERROR(ENOSPC);
		}
	}

	/*
	 * If they are requesting more space, and our current estimate
	 * is over quota, they get to try again unless the actual
	 * on-disk is over quota and there are no pending changes
	 * or deferred frees (which may free up space for us).
	 */
	if (used_on_disk + est_inflight >= quota) {
		if (est_inflight > 0 || used_on_disk < quota) {
			retval = SET_ERROR(ERESTART);
		} else {
			ASSERT3U(used_on_disk, >=, quota);

			if (retval == ENOSPC && (used_on_disk - quota) <
			    dsl_pool_deferred_space(dd->dd_pool)) {
				retval = SET_ERROR(ERESTART);
			}
		}

		dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
		    "quota=%lluK tr=%lluK err=%d\n",
		    (u_longlong_t)used_on_disk>>10,
		    (u_longlong_t)est_inflight>>10,
		    (u_longlong_t)quota>>10, (u_longlong_t)asize>>10, retval);
		mutex_exit(&dd->dd_lock);
		DMU_TX_STAT_BUMP(dmu_tx_quota);
		return (retval);
	}

	/* We need to up our estimated delta before dropping dd_lock */
	dd->dd_tempreserved[txg & TXG_MASK] += asize;

	uint64_t parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
	    asize - ref_rsrv);
	mutex_exit(&dd->dd_lock);

	tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
	tr->tr_ds = dd;
	tr->tr_size = asize;
	list_insert_tail(tr_list, tr);

	/* see if it's OK with our parent */
	if (dd->dd_parent != NULL && parent_rsrv != 0) {
		/*
		 * Recurse on our parent without recursion. This has been
		 * observed to be potentially large stack usage even within
		 * the test suite. Largest seen stack was 7632 bytes on linux.
		 */
		dd = dd->dd_parent;
		asize = parent_rsrv;
		ignorequota = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
		first = B_FALSE;
		goto top_of_function;
	} else {
		return (0);
	}
}
/*
 * Reserve space in this dsl_dir, to be used in this tx's txg.
 * After the space has been dirtied (and dsl_dir_willuse_space()
 * has been called), the reservation should be canceled, using
 * dsl_dir_tempreserve_clear().
 */
int
dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
    boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx)
{
	int err;
	list_t *tr_list;

	if (asize == 0) {
		*tr_cookiep = NULL;
		return (0);
	}

	tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
	list_create(tr_list, sizeof (struct tempreserve),
	    offsetof(struct tempreserve, tr_node));
	ASSERT3S(asize, >, 0);

	err = arc_tempreserve_space(dd->dd_pool->dp_spa, lsize, tx->tx_txg);
	if (err == 0) {
		struct tempreserve *tr;

		tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
		tr->tr_size = lsize;
		list_insert_tail(tr_list, tr);
	} else {
		if (err == EAGAIN) {
			/*
			 * If arc_memory_throttle() detected that pageout
			 * is running and we are low on memory, we delay new
			 * non-pageout transactions to give pageout an
			 * advantage.
			 *
			 * It is unfortunate to be delaying while the caller's
			 * locks are held.
			 */
			txg_delay(dd->dd_pool, tx->tx_txg,
			    MSEC2NSEC(10), MSEC2NSEC(10));
			err = SET_ERROR(ERESTART);
		}
	}

	if (err == 0) {
		err = dsl_dir_tempreserve_impl(dd, asize, netfree,
		    B_FALSE, tr_list, tx, B_TRUE);
	}

	if (err != 0)
		dsl_dir_tempreserve_clear(tr_list, tx);
	else
		*tr_cookiep = tr_list;

	return (err);
}
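
/*
 * Illustrative sketch (not part of the implementation): a minimal outline
 * of the expected reserve -> dirty -> clear protocol, roughly the sequence
 * the DMU transaction code drives; the surrounding driver is hypothetical.
 *
 *	void *tr_cookie;
 *
 *	err = dsl_dir_tempreserve_space(dd, lsize, asize, B_FALSE,
 *	    &tr_cookie, tx);
 *	if (err == 0) {
 *		dsl_dir_willuse_space(dd, asize, tx);	// account the write
 *		// ... dirty the data in this txg ...
 *		dsl_dir_tempreserve_clear(tr_cookie, tx);
 *	}
 */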
/*
 * Clear a temporary reservation that we previously made with
 * dsl_dir_tempreserve_space().
 */
void
dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
{
	int txgidx = tx->tx_txg & TXG_MASK;
	list_t *tr_list = tr_cookie;
	struct tempreserve *tr;

	ASSERT3U(tx->tx_txg, !=, 0);

	if (tr_cookie == NULL)
		return;

	while ((tr = list_head(tr_list)) != NULL) {
		if (tr->tr_ds) {
			mutex_enter(&tr->tr_ds->dd_lock);
			ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
			    tr->tr_size);
			tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
			mutex_exit(&tr->tr_ds->dd_lock);
		} else {
			arc_tempreserve_clear(tr->tr_size);
		}
		list_remove(tr_list, tr);
		kmem_free(tr, sizeof (struct tempreserve));
	}

	kmem_free(tr_list, sizeof (list_t));
}
/*
 * This should be called from open context when we think we're going to write
 * or free space, for example when dirtying data. Be conservative; it's okay
 * to write less space or free more, but we don't want to write more or free
 * less than the amount specified.
 *
 * NOTE: The behavior of this function is identical to the Illumos / FreeBSD
 * version however it has been adjusted to use an iterative rather than
 * recursive algorithm to minimize stack usage.
 */
void
dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
{
	int64_t parent_space;
	uint64_t est_used;

	do {
		mutex_enter(&dd->dd_lock);
		if (space > 0)
			dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;

		est_used = dsl_dir_space_towrite(dd) +
		    dsl_dir_phys(dd)->dd_used_bytes;
		parent_space = parent_delta(dd, est_used, space);
		mutex_exit(&dd->dd_lock);

		/* Make sure that we clean up dd_space_to* */
		dsl_dir_dirty(dd, tx);

		dd = dd->dd_parent;
		space = parent_space;
	} while (space && dd);
}
/* call from syncing context when we actually write/free space for this dd */
void
dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
    int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
{
	int64_t accounted_delta;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(type < DD_USED_NUM);

	dmu_buf_will_dirty(dd->dd_dbuf, tx);

	/*
	 * dsl_dataset_set_refreservation_sync_impl() calls this with
	 * dd_lock held, so that it can atomically update
	 * ds->ds_reserved and the dsl_dir accounting, so that
	 * dsl_dataset_check_quota() can see dataset and dir accounting
	 * consistently.
	 */
	boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
	if (needlock)
		mutex_enter(&dd->dd_lock);
	dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
	accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used);
	ASSERT(used >= 0 || ddp->dd_used_bytes >= -used);
	ASSERT(compressed >= 0 || ddp->dd_compressed_bytes >= -compressed);
	ASSERT(uncompressed >= 0 ||
	    ddp->dd_uncompressed_bytes >= -uncompressed);
	ddp->dd_used_bytes += used;
	ddp->dd_uncompressed_bytes += uncompressed;
	ddp->dd_compressed_bytes += compressed;

	if (ddp->dd_flags & DD_FLAG_USED_BREAKDOWN) {
		ASSERT(used >= 0 || ddp->dd_used_breakdown[type] >= -used);
		ddp->dd_used_breakdown[type] += used;
#ifdef ZFS_DEBUG
		{
			dd_used_t t;
			uint64_t u = 0;
			for (t = 0; t < DD_USED_NUM; t++)
				u += ddp->dd_used_breakdown[t];
			ASSERT3U(u, ==, ddp->dd_used_bytes);
		}
#endif
	}
	if (needlock)
		mutex_exit(&dd->dd_lock);

	if (dd->dd_parent != NULL) {
		dsl_dir_diduse_transfer_space(dd->dd_parent,
		    accounted_delta, compressed, uncompressed,
		    used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
	}
}
void
dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
    dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(oldtype < DD_USED_NUM);
	ASSERT(newtype < DD_USED_NUM);

	dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
	if (delta == 0 ||
	    !(ddp->dd_flags & DD_FLAG_USED_BREAKDOWN))
		return;

	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	mutex_enter(&dd->dd_lock);
	ASSERT(delta > 0 ?
	    ddp->dd_used_breakdown[oldtype] >= delta :
	    ddp->dd_used_breakdown[newtype] >= -delta);
	ASSERT(ddp->dd_used_bytes >= ABS(delta));
	ddp->dd_used_breakdown[oldtype] -= delta;
	ddp->dd_used_breakdown[newtype] += delta;
	mutex_exit(&dd->dd_lock);
}
void
dsl_dir_diduse_transfer_space(dsl_dir_t *dd, int64_t used,
    int64_t compressed, int64_t uncompressed, int64_t tonew,
    dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
{
	int64_t accounted_delta;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(oldtype < DD_USED_NUM);
	ASSERT(newtype < DD_USED_NUM);

	dmu_buf_will_dirty(dd->dd_dbuf, tx);

	mutex_enter(&dd->dd_lock);
	dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
	accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used);
	ASSERT(used >= 0 || ddp->dd_used_bytes >= -used);
	ASSERT(compressed >= 0 || ddp->dd_compressed_bytes >= -compressed);
	ASSERT(uncompressed >= 0 ||
	    ddp->dd_uncompressed_bytes >= -uncompressed);
	ddp->dd_used_bytes += used;
	ddp->dd_uncompressed_bytes += uncompressed;
	ddp->dd_compressed_bytes += compressed;

	if (ddp->dd_flags & DD_FLAG_USED_BREAKDOWN) {
		ASSERT(tonew - used <= 0 ||
		    ddp->dd_used_breakdown[oldtype] >= tonew - used);
		ASSERT(tonew >= 0 ||
		    ddp->dd_used_breakdown[newtype] >= -tonew);
		ddp->dd_used_breakdown[oldtype] -= tonew - used;
		ddp->dd_used_breakdown[newtype] += tonew;
#ifdef ZFS_DEBUG
		{
			dd_used_t t;
			uint64_t u = 0;
			for (t = 0; t < DD_USED_NUM; t++)
				u += ddp->dd_used_breakdown[t];
			ASSERT3U(u, ==, ddp->dd_used_bytes);
		}
#endif
	}
	mutex_exit(&dd->dd_lock);

	if (dd->dd_parent != NULL) {
		dsl_dir_diduse_transfer_space(dd->dd_parent,
		    accounted_delta, compressed, uncompressed,
		    used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
	}
}
typedef struct dsl_dir_set_qr_arg {
	const char *ddsqra_name;
	zprop_source_t ddsqra_source;
	uint64_t ddsqra_value;
} dsl_dir_set_qr_arg_t;

static int
dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx)
{
	dsl_dir_set_qr_arg_t *ddsqra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	int error;
	uint64_t towrite, newval;

	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
	if (error != 0)
		return (error);

	error = dsl_prop_predict(ds->ds_dir, "quota",
	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
	if (error != 0) {
		dsl_dataset_rele(ds, FTAG);
		return (error);
	}

	if (newval == 0) {
		dsl_dataset_rele(ds, FTAG);
		return (0);
	}

	mutex_enter(&ds->ds_dir->dd_lock);
	/*
	 * If we are doing the preliminary check in open context, and
	 * there are pending changes, then don't fail it, since the
	 * pending changes could under-estimate the amount of space to be
	 * freed up.
	 */
	towrite = dsl_dir_space_towrite(ds->ds_dir);
	if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
	    (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved ||
	    newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) {
		error = SET_ERROR(ENOSPC);
	}
	mutex_exit(&ds->ds_dir->dd_lock);
	dsl_dataset_rele(ds, FTAG);
	return (error);
}

static void
dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dir_set_qr_arg_t *ddsqra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	uint64_t newval;

	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));

	if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
		dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA),
		    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
		    &ddsqra->ddsqra_value, tx);

		VERIFY0(dsl_prop_get_int_ds(ds,
		    zfs_prop_to_name(ZFS_PROP_QUOTA), &newval));
	} else {
		newval = ddsqra->ddsqra_value;
		spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
		    zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval);
	}

	dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
	mutex_enter(&ds->ds_dir->dd_lock);
	dsl_dir_phys(ds->ds_dir)->dd_quota = newval;
	mutex_exit(&ds->ds_dir->dd_lock);
	dsl_dataset_rele(ds, FTAG);
}

int
dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
{
	dsl_dir_set_qr_arg_t ddsqra;

	ddsqra.ddsqra_name = ddname;
	ddsqra.ddsqra_source = source;
	ddsqra.ddsqra_value = quota;

	return (dsl_sync_task(ddname, dsl_dir_set_quota_check,
	    dsl_dir_set_quota_sync, &ddsqra, 0,
	    ZFS_SPACE_CHECK_EXTRA_RESERVED));
}
static int
dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx)
{
	dsl_dir_set_qr_arg_t *ddsqra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	dsl_dir_t *dd;
	uint64_t newval, used, avail;
	int error;

	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
	if (error != 0)
		return (error);
	dd = ds->ds_dir;

	/*
	 * If we are doing the preliminary check in open context, the
	 * space estimates may be inaccurate.
	 */
	if (!dmu_tx_is_syncing(tx)) {
		dsl_dataset_rele(ds, FTAG);
		return (0);
	}

	error = dsl_prop_predict(ds->ds_dir,
	    zfs_prop_to_name(ZFS_PROP_RESERVATION),
	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
	if (error != 0) {
		dsl_dataset_rele(ds, FTAG);
		return (error);
	}

	mutex_enter(&dd->dd_lock);
	used = dsl_dir_phys(dd)->dd_used_bytes;
	mutex_exit(&dd->dd_lock);

	if (dd->dd_parent) {
		avail = dsl_dir_space_available(dd->dd_parent,
		    NULL, 0, FALSE);
	} else {
		avail = dsl_pool_adjustedsize(dd->dd_pool,
		    ZFS_SPACE_CHECK_NORMAL) - used;
	}

	if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) {
		uint64_t delta = MAX(used, newval) -
		    MAX(used, dsl_dir_phys(dd)->dd_reserved);

		if (delta > avail ||
		    (dsl_dir_phys(dd)->dd_quota > 0 &&
		    newval > dsl_dir_phys(dd)->dd_quota))
			error = SET_ERROR(ENOSPC);
	}

	dsl_dataset_rele(ds, FTAG);
	return (error);
}

void
dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
{
	uint64_t used;
	int64_t delta;

	dmu_buf_will_dirty(dd->dd_dbuf, tx);

	mutex_enter(&dd->dd_lock);
	used = dsl_dir_phys(dd)->dd_used_bytes;
	delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved);
	dsl_dir_phys(dd)->dd_reserved = value;

	if (dd->dd_parent != NULL) {
		/* Roll up this additional usage into our ancestors */
		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
		    delta, 0, 0, tx);
	}
	mutex_exit(&dd->dd_lock);
}

static void
dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dir_set_qr_arg_t *ddsqra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	uint64_t newval;

	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));

	if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
		dsl_prop_set_sync_impl(ds,
		    zfs_prop_to_name(ZFS_PROP_RESERVATION),
		    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
		    &ddsqra->ddsqra_value, tx);

		VERIFY0(dsl_prop_get_int_ds(ds,
		    zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval));
	} else {
		newval = ddsqra->ddsqra_value;
		spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
		    zfs_prop_to_name(ZFS_PROP_RESERVATION),
		    (longlong_t)newval);
	}

	dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx);
	dsl_dataset_rele(ds, FTAG);
}

int
dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
    uint64_t reservation)
{
	dsl_dir_set_qr_arg_t ddsqra;

	ddsqra.ddsqra_name = ddname;
	ddsqra.ddsqra_source = source;
	ddsqra.ddsqra_value = reservation;

	return (dsl_sync_task(ddname, dsl_dir_set_reservation_check,
	    dsl_dir_set_reservation_sync, &ddsqra, 0,
	    ZFS_SPACE_CHECK_EXTRA_RESERVED));
}
static dsl_dir_t *
closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
{
	for (; ds1; ds1 = ds1->dd_parent) {
		dsl_dir_t *dd;
		for (dd = ds2; dd; dd = dd->dd_parent) {
			if (ds1 == dd)
				return (dd);
		}
	}
	return (NULL);
}

/*
 * If delta is applied to dd, how much of that delta would be applied to
 * ancestor?  Syncing context only.
 */
static int64_t
would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
{
	if (dd == ancestor)
		return (delta);

	mutex_enter(&dd->dd_lock);
	delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta);
	mutex_exit(&dd->dd_lock);
	return (would_change(dd->dd_parent, delta, ancestor));
}

typedef struct dsl_dir_rename_arg {
	const char *ddra_oldname;
	const char *ddra_newname;
	cred_t *ddra_cred;
	proc_t *ddra_proc;
} dsl_dir_rename_arg_t;

typedef struct dsl_valid_rename_arg {
	int char_delta;
	int nest_delta;
} dsl_valid_rename_arg_t;
static int
dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
{
	dsl_valid_rename_arg_t *dvra = arg;
	char namebuf[ZFS_MAX_DATASET_NAME_LEN];

	dsl_dataset_name(ds, namebuf);

	ASSERT3U(strnlen(namebuf, ZFS_MAX_DATASET_NAME_LEN),
	    <, ZFS_MAX_DATASET_NAME_LEN);
	int namelen = strlen(namebuf) + dvra->char_delta;
	int depth = get_dataset_depth(namebuf) + dvra->nest_delta;

	if (namelen >= ZFS_MAX_DATASET_NAME_LEN)
		return (SET_ERROR(ENAMETOOLONG));
	if (dvra->nest_delta > 0 && depth >= zfs_max_dataset_nesting)
		return (SET_ERROR(ENAMETOOLONG));

	return (0);
}
static int
dsl_dir_rename_check(void *arg, dmu_tx_t *tx)
{
	dsl_dir_rename_arg_t *ddra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dir_t *dd, *newparent;
	dsl_valid_rename_arg_t dvra;
	dsl_dataset_t *parentds;
	objset_t *parentos;
	const char *mynewname;
	int error;

	/* target dir should exist */
	error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL);
	if (error != 0)
		return (error);

	/* new parent should exist */
	error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG,
	    &newparent, &mynewname);
	if (error != 0) {
		dsl_dir_rele(dd, FTAG);
		return (error);
	}

	/* can't rename to different pool */
	if (dd->dd_pool != newparent->dd_pool) {
		dsl_dir_rele(newparent, FTAG);
		dsl_dir_rele(dd, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/* new name should not already exist */
	if (mynewname == NULL) {
		dsl_dir_rele(newparent, FTAG);
		dsl_dir_rele(dd, FTAG);
		return (SET_ERROR(EEXIST));
	}

	/* can't rename below anything but filesystems (eg. no ZVOLs) */
	error = dsl_dataset_hold_obj(newparent->dd_pool,
	    dsl_dir_phys(newparent)->dd_head_dataset_obj, FTAG, &parentds);
	if (error != 0) {
		dsl_dir_rele(newparent, FTAG);
		dsl_dir_rele(dd, FTAG);
		return (error);
	}
	error = dmu_objset_from_ds(parentds, &parentos);
	if (error != 0) {
		dsl_dataset_rele(parentds, FTAG);
		dsl_dir_rele(newparent, FTAG);
		dsl_dir_rele(dd, FTAG);
		return (error);
	}
	if (dmu_objset_type(parentos) != DMU_OST_ZFS) {
		dsl_dataset_rele(parentds, FTAG);
		dsl_dir_rele(newparent, FTAG);
		dsl_dir_rele(dd, FTAG);
		return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
	}
	dsl_dataset_rele(parentds, FTAG);

	ASSERT3U(strnlen(ddra->ddra_newname, ZFS_MAX_DATASET_NAME_LEN),
	    <, ZFS_MAX_DATASET_NAME_LEN);
	ASSERT3U(strnlen(ddra->ddra_oldname, ZFS_MAX_DATASET_NAME_LEN),
	    <, ZFS_MAX_DATASET_NAME_LEN);
	dvra.char_delta = strlen(ddra->ddra_newname)
	    - strlen(ddra->ddra_oldname);
	dvra.nest_delta = get_dataset_depth(ddra->ddra_newname)
	    - get_dataset_depth(ddra->ddra_oldname);

	/* if the name length is growing, validate child name lengths */
	if (dvra.char_delta > 0 || dvra.nest_delta > 0) {
		error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename,
		    &dvra, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
		if (error != 0) {
			dsl_dir_rele(newparent, FTAG);
			dsl_dir_rele(dd, FTAG);
			return (error);
		}
	}

	if (dmu_tx_is_syncing(tx)) {
		if (spa_feature_is_active(dp->dp_spa,
		    SPA_FEATURE_FS_SS_LIMIT)) {
			/*
			 * Although this is the check function and we don't
			 * normally make on-disk changes in check functions,
			 * we need to do that here.
			 *
			 * Ensure this portion of the tree's counts have been
			 * initialized in case the new parent has limits set.
			 */
			dsl_dir_init_fs_ss_count(dd, tx);
		}
	}

	if (newparent != dd->dd_parent) {
		/* is there enough space? */
		uint64_t myspace =
		    MAX(dsl_dir_phys(dd)->dd_used_bytes,
		    dsl_dir_phys(dd)->dd_reserved);
		objset_t *os = dd->dd_pool->dp_meta_objset;
		uint64_t fs_cnt = 0;
		uint64_t ss_cnt = 0;

		if (dsl_dir_is_zapified(dd)) {
			int err;

			err = zap_lookup(os, dd->dd_object,
			    DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
			    &fs_cnt);
			if (err != ENOENT && err != 0) {
				dsl_dir_rele(newparent, FTAG);
				dsl_dir_rele(dd, FTAG);
				return (err);
			}

			/*
			 * have to add 1 for the filesystem itself that we're
			 * moving
			 */
			fs_cnt++;

			err = zap_lookup(os, dd->dd_object,
			    DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
			    &ss_cnt);
			if (err != ENOENT && err != 0) {
				dsl_dir_rele(newparent, FTAG);
				dsl_dir_rele(dd, FTAG);
				return (err);
			}
		}

		/* check for encryption errors */
		error = dsl_dir_rename_crypt_check(dd, newparent);
		if (error != 0) {
			dsl_dir_rele(newparent, FTAG);
			dsl_dir_rele(dd, FTAG);
			return (SET_ERROR(EACCES));
		}

		/* no rename into our descendant */
		if (closest_common_ancestor(dd, newparent) == dd) {
			dsl_dir_rele(newparent, FTAG);
			dsl_dir_rele(dd, FTAG);
			return (SET_ERROR(EINVAL));
		}

		error = dsl_dir_transfer_possible(dd->dd_parent,
		    newparent, fs_cnt, ss_cnt, myspace,
		    ddra->ddra_cred, ddra->ddra_proc);
		if (error != 0) {
			dsl_dir_rele(newparent, FTAG);
			dsl_dir_rele(dd, FTAG);
			return (error);
		}
	}

	dsl_dir_rele(newparent, FTAG);
	dsl_dir_rele(dd, FTAG);
	return (0);
}
static void
dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dir_rename_arg_t *ddra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dir_t *dd, *newparent;
	const char *mynewname;
	objset_t *mos = dp->dp_meta_objset;

	VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL));
	VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent,
	    &mynewname));

	/* Log this before we change the name. */
	spa_history_log_internal_dd(dd, "rename", tx,
	    "-> %s", ddra->ddra_newname);

	if (newparent != dd->dd_parent) {
		objset_t *os = dd->dd_pool->dp_meta_objset;
		uint64_t fs_cnt = 0;
		uint64_t ss_cnt = 0;

		/*
		 * We already made sure the dd counts were initialized in the
		 * check function.
		 */
		if (spa_feature_is_active(dp->dp_spa,
		    SPA_FEATURE_FS_SS_LIMIT)) {
			VERIFY0(zap_lookup(os, dd->dd_object,
			    DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
			    &fs_cnt));
			/* add 1 for the filesystem itself that we're moving */
			fs_cnt++;

			VERIFY0(zap_lookup(os, dd->dd_object,
			    DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
			    &ss_cnt));
		}

		dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt,
		    DD_FIELD_FILESYSTEM_COUNT, tx);
		dsl_fs_ss_count_adjust(newparent, fs_cnt,
		    DD_FIELD_FILESYSTEM_COUNT, tx);

		dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt,
		    DD_FIELD_SNAPSHOT_COUNT, tx);
		dsl_fs_ss_count_adjust(newparent, ss_cnt,
		    DD_FIELD_SNAPSHOT_COUNT, tx);

		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
		    -dsl_dir_phys(dd)->dd_used_bytes,
		    -dsl_dir_phys(dd)->dd_compressed_bytes,
		    -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
		dsl_dir_diduse_space(newparent, DD_USED_CHILD,
		    dsl_dir_phys(dd)->dd_used_bytes,
		    dsl_dir_phys(dd)->dd_compressed_bytes,
		    dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);

		if (dsl_dir_phys(dd)->dd_reserved >
		    dsl_dir_phys(dd)->dd_used_bytes) {
			uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved -
			    dsl_dir_phys(dd)->dd_used_bytes;

			dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
			    -unused_rsrv, 0, 0, tx);
			dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV,
			    unused_rsrv, 0, 0, tx);
		}
	}

	dmu_buf_will_dirty(dd->dd_dbuf, tx);

	/* remove from old parent zapobj */
	VERIFY0(zap_remove(mos,
	    dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
	    dd->dd_myname, tx));

	(void) strlcpy(dd->dd_myname, mynewname,
	    sizeof (dd->dd_myname));
	dsl_dir_rele(dd->dd_parent, dd);
	dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object;
	VERIFY0(dsl_dir_hold_obj(dp,
	    newparent->dd_object, NULL, dd, &dd->dd_parent));

	/* add to new parent zapobj */
	VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj,
	    dd->dd_myname, 8, 1, &dd->dd_object, tx));

	/* TODO: A rename callback to avoid these layering violations. */
	zfsvfs_update_fromname(ddra->ddra_oldname, ddra->ddra_newname);
	zvol_rename_minors(dp->dp_spa, ddra->ddra_oldname,
	    ddra->ddra_newname, B_TRUE);

	dsl_prop_notify_all(dd);

	dsl_dir_rele(newparent, FTAG);
	dsl_dir_rele(dd, FTAG);
}
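
/*
 * Entry point for renaming a dsl_dir: packages the old and new names
 * (plus the caller's credentials and process, used for limit checks)
 * into a dsl_dir_rename_arg_t and runs the check/sync pair above as a
 * synctask.
 */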
int
dsl_dir_rename(const char *oldname, const char *newname)
{
	dsl_dir_rename_arg_t ddra;

	ddra.ddra_oldname = oldname;
	ddra.ddra_newname = newname;
	ddra.ddra_cred = CRED();
	ddra.ddra_proc = curproc;

	return (dsl_sync_task(oldname,
	    dsl_dir_rename_check, dsl_dir_rename_sync, &ddra,
	    3, ZFS_SPACE_CHECK_RESERVED));
}
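
/*
 * Determine whether a subtree using 'space' bytes and containing 'fs_cnt'
 * filesystems and 'ss_cnt' snapshots can move from 'sdd' to 'tdd': the
 * move must fit in the space available at their closest common ancestor
 * and must not exceed any filesystem or snapshot limits on the
 * destination.
 */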
int
dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
    uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space,
    cred_t *cr, proc_t *proc)
{
	dsl_dir_t *ancestor;
	int64_t adelta;
	uint64_t avail;
	int err;

	ancestor = closest_common_ancestor(sdd, tdd);
	adelta = would_change(sdd, -space, ancestor);
	avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
	if (avail < space)
		return (SET_ERROR(ENOSPC));

	err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT,
	    ancestor, cr, proc);
	if (err != 0)
		return (err);
	err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT,
	    ancestor, cr, proc);
	if (err != 0)
		return (err);

	return (0);
}
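
/*
 * Snapshot creation/modification time for this dsl_dir, read and written
 * under dd_lock so callers always see a consistent timestamp.
 */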
inode_timespec_t
dsl_dir_snap_cmtime(dsl_dir_t *dd)
{
	inode_timespec_t t;

	mutex_enter(&dd->dd_lock);
	t = dd->dd_snap_cmtime;
	mutex_exit(&dd->dd_lock);

	return (t);
}

void
dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
{
	inode_timespec_t t;

	gethrestime(&t);
	mutex_enter(&dd->dd_lock);
	dd->dd_snap_cmtime = t;
	mutex_exit(&dd->dd_lock);
}
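
/*
 * Convert the dsl_dir's on-disk object into a ZAP so that extensible
 * fields (e.g. the filesystem/snapshot counts or a livelist object) can
 * be stored alongside the fixed dsl_dir_phys_t.
 */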
void
dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx)
{
	objset_t *mos = dd->dd_pool->dp_meta_objset;
	dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx);
}
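
/*
 * A zapified dsl_dir's object type reads back as DMU_OTN_ZAP_METADATA
 * rather than DMU_OT_DSL_DIR, so the object info tells us whether the
 * conversion above has already happened.
 */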
boolean_t
dsl_dir_is_zapified(dsl_dir_t *dd)
{
	dmu_object_info_t doi;

	dmu_object_info_from_db(dd->dd_dbuf, &doi);
	return (doi.doi_type == DMU_OTN_ZAP_METADATA);
}
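
/*
 * Open the dsl_dir's livelist (a deadlist tracking blocks born and freed
 * in a clone) and create the in-memory bplists used to batch pending
 * allocations and frees.
 */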
void
dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj)
{
	objset_t *mos = dd->dd_pool->dp_meta_objset;
	ASSERT(spa_feature_is_active(dd->dd_pool->dp_spa,
	    SPA_FEATURE_LIVELIST));
	dsl_deadlist_open(&dd->dd_livelist, mos, obj);
	bplist_create(&dd->dd_pending_allocs);
	bplist_create(&dd->dd_pending_frees);
}

void
dsl_dir_livelist_close(dsl_dir_t *dd)
{
	dsl_deadlist_close(&dd->dd_livelist);
	bplist_destroy(&dd->dd_pending_allocs);
	bplist_destroy(&dd->dd_pending_frees);
}
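
/*
 * Detach and remove the dsl_dir's livelist, first cancelling any condense
 * of it that may be in flight. When 'total' is set, the livelist's blocks
 * are freed and the feature refcount is dropped; otherwise the on-disk
 * object is preserved.
 */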
void
dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total)
{
	uint64_t obj;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	spa_t *spa = dp->dp_spa;
	livelist_condense_entry_t to_condense = spa->spa_to_condense;

	if (!dsl_deadlist_is_open(&dd->dd_livelist))
		return;

	/*
	 * If the livelist being removed is set to be condensed, stop the
	 * condense zthr and indicate the cancellation in the spa_to_condense
	 * struct in case the condense no-wait synctask has already started
	 */
	zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
	if (ll_condense_thread != NULL &&
	    (to_condense.ds != NULL) && (to_condense.ds->ds_dir == dd)) {
		/*
		 * We use zthr_wait_cycle_done instead of zthr_cancel
		 * because we don't want to destroy the zthr, just have
		 * it skip its current task.
		 */
		spa->spa_to_condense.cancelled = B_TRUE;
		zthr_wait_cycle_done(ll_condense_thread);
		/*
		 * If we've returned from zthr_wait_cycle_done without
		 * clearing the to_condense data structure it's either
		 * because the no-wait synctask has started (which is
		 * indicated by 'syncing' field of to_condense) and we
		 * can expect it to clear to_condense on its own.
		 * Otherwise, we returned before the zthr ran. The
		 * checkfunc will now fail as cancelled == B_TRUE so we
		 * can safely NULL out ds, allowing a different dir's
		 * livelist to be condensed.
		 *
		 * We can be sure that the to_condense struct will not
		 * be repopulated at this stage because both this
		 * function and dsl_livelist_try_condense execute in
		 * syncing context.
		 */
		if ((spa->spa_to_condense.ds != NULL) &&
		    !spa->spa_to_condense.syncing) {
			dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf,
			    spa);
			spa->spa_to_condense.ds = NULL;
		}
	}

	dsl_dir_livelist_close(dd);
	VERIFY0(zap_lookup(dp->dp_meta_objset, dd->dd_object,
	    DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj));
	VERIFY0(zap_remove(dp->dp_meta_objset, dd->dd_object,
	    DD_FIELD_LIVELIST, tx));
	if (total) {
		dsl_deadlist_free(dp->dp_meta_objset, obj, tx);
		spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
	}
}
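
/*
 * Check whether the given activity is still in progress for this dsl_dir;
 * called with dd_activity_lock held. For ZFS_WAIT_DELETEQ this means
 * looking up the ZPL unlinked set and reporting whether it still has
 * entries, which is only meaningful in the kernel on a mounted, writable
 * dataset.
 */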
static int
dsl_dir_activity_in_progress(dsl_dir_t *dd, dsl_dataset_t *ds,
    zfs_wait_activity_t activity, boolean_t *in_progress)
{
	int error = 0;

	ASSERT(MUTEX_HELD(&dd->dd_activity_lock));

	switch (activity) {
	case ZFS_WAIT_DELETEQ: {
#ifdef _KERNEL
		objset_t *os;
		error = dmu_objset_from_ds(ds, &os);
		if (error != 0)
			break;

		mutex_enter(&os->os_user_ptr_lock);
		void *user = dmu_objset_get_user(os);
		mutex_exit(&os->os_user_ptr_lock);
		if (dmu_objset_type(os) != DMU_OST_ZFS ||
		    user == NULL || zfs_get_vfs_flag_unmounted(os)) {
			*in_progress = B_FALSE;
			return (0);
		}

		uint64_t readonly = B_FALSE;
		error = zfs_get_temporary_prop(ds, ZFS_PROP_READONLY,
		    &readonly, NULL);

		if (error != 0)
			break;

		if (readonly || !spa_writeable(dd->dd_pool->dp_spa)) {
			*in_progress = B_FALSE;
			return (0);
		}

		uint64_t count, unlinked_obj;
		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
		    &unlinked_obj);
		if (error != 0) {
			dsl_dataset_rele(ds, FTAG);
			break;
		}
		error = zap_count(os, unlinked_obj, &count);

		if (error == 0)
			*in_progress = (count != 0);
		break;
#else
		/*
		 * The delete queue is ZPL specific, and libzpool doesn't have
		 * it. It doesn't make sense to wait for it.
		 */
		*in_progress = B_FALSE;
		break;
#endif
	}
	default:
		panic("unrecognized value for activity %d", activity);
	}

	return (error);
}
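
/*
 * Wait (interruptibly) until the given activity is no longer in progress,
 * the waiters are cancelled, or a signal is delivered. Expects to be
 * called with dd_activity_lock held; sets *waited if it actually slept.
 */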
int
dsl_dir_wait(dsl_dir_t *dd, dsl_dataset_t *ds, zfs_wait_activity_t activity,
    boolean_t *waited)
{
	int error = 0;
	boolean_t in_progress;
	dsl_pool_t *dp = dd->dd_pool;
	for (;;) {
		dsl_pool_config_enter(dp, FTAG);
		error = dsl_dir_activity_in_progress(dd, ds, activity,
		    &in_progress);
		dsl_pool_config_exit(dp, FTAG);
		if (error != 0 || !in_progress)
			break;

		*waited = B_TRUE;

		if (cv_wait_sig(&dd->dd_activity_cv, &dd->dd_activity_lock) ==
		    0 || dd->dd_activity_cancelled) {
			error = SET_ERROR(EINTR);
			break;
		}
	}

	return (error);
}
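
/*
 * Mark the activity waits on this dsl_dir as cancelled, wake every thread
 * blocked in dsl_dir_wait(), and wait for the waiters to drain.
 */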
void
dsl_dir_cancel_waiters(dsl_dir_t *dd)
{
	mutex_enter(&dd->dd_activity_lock);
	dd->dd_activity_cancelled = B_TRUE;
	cv_broadcast(&dd->dd_activity_cv);
	while (dd->dd_activity_waiters > 0)
		cv_wait(&dd->dd_activity_cv, &dd->dd_activity_lock);
	mutex_exit(&dd->dd_activity_lock);
}

#if defined(_KERNEL)
EXPORT_SYMBOL(dsl_dir_set_quota);
EXPORT_SYMBOL(dsl_dir_set_reservation);