/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc.
 * Copyright (c) 2024, Klara, Inc.
 */
#include <sys/zfs_context.h>
#include <sys/zap_impl.h>
#include <sys/zap_leaf.h>
#include <sys/btree.h>
#include <sys/dmu_objset.h>
#include <sys/spa_impl.h>
#include <sys/sunddi.h>
/*
 * The maximum size (in bytes) of a microzap before it is converted to a
 * fatzap. It will be rounded up to the next multiple of 512 (SPA_MINBLOCKSIZE).
 *
 * By definition, a microzap must fit into a single block, so this has
 * traditionally been SPA_OLD_MAXBLOCKSIZE, and is set to that by default.
 * Setting this higher requires both the large_blocks feature (to even create
 * blocks that large) and the large_microzap feature (to enable the stream
 * machinery to understand not to try to split a microzap block).
 *
 * If large_microzap is enabled, this value will be clamped to
 * spa_maxblocksize(). If not, it will be clamped to SPA_OLD_MAXBLOCKSIZE.
 */
static int zap_micro_max_size = SPA_OLD_MAXBLOCKSIZE;
uint64_t
zap_get_micro_max_size(spa_t *spa)
{
	uint64_t maxsz = P2ROUNDUP(zap_micro_max_size, SPA_MINBLOCKSIZE);
	if (maxsz <= SPA_OLD_MAXBLOCKSIZE)
		return (maxsz);
	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_MICROZAP))
		return (MIN(maxsz, spa_maxblocksize(spa)));
	return (SPA_OLD_MAXBLOCKSIZE);
}
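/*
 * For illustration only (not compiled; values assume the common defaults):
 * with zap_micro_max_size left at SPA_OLD_MAXBLOCKSIZE (128K), the limit is
 * 128K regardless of features.  If an administrator raises the tunable to,
 * say, 1M, the effective limit becomes MIN(1M, spa_maxblocksize(spa)) on
 * pools with large_microzap enabled, and falls back to 128K otherwise.
 */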
static int mzap_upgrade(zap_t **zapp,
    const void *tag, dmu_tx_t *tx, zap_flags_t flags);
uint64_t
zap_getflags(zap_t *zap)
{
	if (zap->zap_ismicro)
		return (0);
	return (zap_f_phys(zap)->zap_flags);
}

int
zap_hashbits(zap_t *zap)
{
	if (zap_getflags(zap) & ZAP_FLAG_HASH64)
		return (48);
	else
		return (28);
}

uint32_t
zap_maxcd(zap_t *zap)
{
	if (zap_getflags(zap) & ZAP_FLAG_HASH64)
		return ((1<<16)-1);
	else
		return (-1U);
}
uint64_t
zap_hash(zap_name_t *zn)
{
	zap_t *zap = zn->zn_zap;
	uint64_t h = 0;

	if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
		ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
		h = *(uint64_t *)zn->zn_key_orig;
	} else {
		h = zap->zap_salt;
		ASSERT(h != 0);
		ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);

		if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
			const uint64_t *wp = zn->zn_key_norm;

			ASSERT(zn->zn_key_intlen == 8);
			for (int i = 0; i < zn->zn_key_norm_numints;
			    i++, wp++) {
				uint64_t word = *wp;

				for (int j = 0; j < 8; j++) {
					h = (h >> 8) ^
					    zfs_crc64_table[(h ^ word) & 0xFF];
					word >>= NBBY;
				}
			}
		} else {
			const uint8_t *cp = zn->zn_key_norm;

			/*
			 * We previously stored the terminating null on
			 * disk, but didn't hash it, so we need to
			 * continue to not hash it.  (The
			 * zn_key_*_numints includes the terminating
			 * null for non-binary keys.)
			 */
			int len = zn->zn_key_norm_numints - 1;

			ASSERT(zn->zn_key_intlen == 1);
			for (int i = 0; i < len; cp++, i++) {
				h = (h >> 8) ^
				    zfs_crc64_table[(h ^ *cp) & 0xFF];
			}
		}
	}

	/*
	 * Don't use all 64 bits, since we need some in the cookie for
	 * the collision differentiator.  We MUST use the high bits,
	 * since those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);

	return (h);
}
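/*
 * Illustrative sketch (not compiled): for a default (non-HASH64) ZAP,
 * zap_hashbits() is 28, so after the masking above only the top 28 bits
 * of the CRC-64 survive:
 *
 *	h = hhhhhhhh hhhhhhhh hhhhhhhh hhhh0000 00000000 ... (28 hash bits)
 *
 * The cleared low bits are later available to carry the collision
 * differentiator (cd) when a cursor position is serialized.
 */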
static int
zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags,
    size_t outlen)
{
	ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));

	size_t inlen = strlen(name) + 1;

	int err = 0;
	(void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
	    normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID,
	    U8_UNICODE_LATEST, &err);

	return (err);
}
boolean_t
zap_match(zap_name_t *zn, const char *matchname)
{
	boolean_t res = B_FALSE;
	ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));

	if (zn->zn_matchtype & MT_NORMALIZE) {
		size_t namelen = zn->zn_normbuf_len;
		char normbuf[ZAP_MAXNAMELEN];
		char *norm = normbuf;

		/*
		 * Cannot allocate this on-stack as it exceeds the stack
		 * limit of 1024 bytes.
		 */
		if (namelen > ZAP_MAXNAMELEN)
			norm = kmem_alloc(namelen, KM_SLEEP);

		if (zap_normalize(zn->zn_zap, matchname, norm,
		    zn->zn_normflags, namelen) != 0) {
			res = B_FALSE;
		} else {
			res = (strcmp(zn->zn_key_norm, norm) == 0);
		}
		if (norm != normbuf)
			kmem_free(norm, namelen);
	} else {
		res = (strcmp(zn->zn_key_orig, matchname) == 0);
	}
	return (res);
}
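/*
 * Behavior sketch (illustrative only): on a dataset normalizing with
 * U8_TEXTPREP_TOUPPER, a zap_name_t built for "foo" with MT_NORMALIZE set
 * will satisfy zap_match(zn, "FOO"), since both names normalize to the
 * same string before the strcmp() above.
 */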
static kmem_cache_t *zap_name_cache;
static kmem_cache_t *zap_attr_cache;
static kmem_cache_t *zap_name_long_cache;
static kmem_cache_t *zap_attr_long_cache;
void
zap_init(void)
{
	zap_name_cache = kmem_cache_create("zap_name",
	    sizeof (zap_name_t) + ZAP_MAXNAMELEN, 0, NULL, NULL,
	    NULL, NULL, NULL, 0);

	zap_attr_cache = kmem_cache_create("zap_attr_cache",
	    sizeof (zap_attribute_t) + ZAP_MAXNAMELEN, 0, NULL,
	    NULL, NULL, NULL, NULL, 0);

	zap_name_long_cache = kmem_cache_create("zap_name_long",
	    sizeof (zap_name_t) + ZAP_MAXNAMELEN_NEW, 0, NULL, NULL,
	    NULL, NULL, NULL, 0);

	zap_attr_long_cache = kmem_cache_create("zap_attr_long_cache",
	    sizeof (zap_attribute_t) + ZAP_MAXNAMELEN_NEW, 0, NULL,
	    NULL, NULL, NULL, NULL, 0);
}

void
zap_fini(void)
{
	kmem_cache_destroy(zap_name_cache);
	kmem_cache_destroy(zap_attr_cache);
	kmem_cache_destroy(zap_name_long_cache);
	kmem_cache_destroy(zap_attr_long_cache);
}
static zap_name_t *
zap_name_alloc(zap_t *zap, boolean_t longname)
{
	kmem_cache_t *cache = longname ? zap_name_long_cache : zap_name_cache;
	zap_name_t *zn = kmem_cache_alloc(cache, KM_SLEEP);

	zn->zn_zap = zap;
	zn->zn_normbuf_len = longname ? ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN;
	return (zn);
}

void
zap_name_free(zap_name_t *zn)
{
	if (zn->zn_normbuf_len == ZAP_MAXNAMELEN) {
		kmem_cache_free(zap_name_cache, zn);
	} else {
		ASSERT3U(zn->zn_normbuf_len, ==, ZAP_MAXNAMELEN_NEW);
		kmem_cache_free(zap_name_long_cache, zn);
	}
}
static int
zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt)
{
	zap_t *zap = zn->zn_zap;
	size_t key_len = strlen(key) + 1;

	/* Make sure zn is allocated for longname if key is long */
	IMPLY(key_len > ZAP_MAXNAMELEN,
	    zn->zn_normbuf_len == ZAP_MAXNAMELEN_NEW);

	zn->zn_key_intlen = sizeof (*key);
	zn->zn_key_orig = key;
	zn->zn_key_orig_numints = key_len;
	zn->zn_matchtype = mt;
	zn->zn_normflags = zap->zap_normflags;

	/*
	 * If we're dealing with a case sensitive lookup on a mixed or
	 * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup
	 * will fold case to all caps overriding the lookup request.
	 */
	if (mt & MT_MATCH_CASE)
		zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER;

	if (zap->zap_normflags) {
		/*
		 * We *must* use zap_normflags because this normalization is
		 * what the hash is computed from.
		 */
		if (zap_normalize(zap, key, zn->zn_normbuf,
		    zap->zap_normflags, zn->zn_normbuf_len) != 0)
			return (SET_ERROR(ENOTSUP));
		zn->zn_key_norm = zn->zn_normbuf;
		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
	} else {
		if (mt != 0)
			return (SET_ERROR(ENOTSUP));
		zn->zn_key_norm = zn->zn_key_orig;
		zn->zn_key_norm_numints = zn->zn_key_orig_numints;
	}

	zn->zn_hash = zap_hash(zn);

	if (zap->zap_normflags != zn->zn_normflags) {
		/*
		 * We *must* use zn_normflags because this normalization is
		 * what the matching is based on.  (Not the hash!)
		 */
		if (zap_normalize(zap, key, zn->zn_normbuf,
		    zn->zn_normflags, zn->zn_normbuf_len) != 0)
			return (SET_ERROR(ENOTSUP));
		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
	}

	return (0);
}
zap_name_t *
zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt)
{
	size_t key_len = strlen(key) + 1;
	zap_name_t *zn = zap_name_alloc(zap, (key_len > ZAP_MAXNAMELEN));
	if (zap_name_init_str(zn, key, mt) != 0) {
		zap_name_free(zn);
		return (NULL);
	}
	return (zn);
}

static zap_name_t *
zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
{
	zap_name_t *zn = kmem_cache_alloc(zap_name_cache, KM_SLEEP);

	ASSERT(zap->zap_normflags == 0);
	zn->zn_zap = zap;
	zn->zn_key_intlen = sizeof (*key);
	zn->zn_key_orig = zn->zn_key_norm = key;
	zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
	zn->zn_matchtype = 0;
	zn->zn_normbuf_len = ZAP_MAXNAMELEN;

	zn->zn_hash = zap_hash(zn);
	return (zn);
}
static void
mzap_byteswap(mzap_phys_t *buf, size_t size)
{
	buf->mz_block_type = BSWAP_64(buf->mz_block_type);
	buf->mz_salt = BSWAP_64(buf->mz_salt);
	buf->mz_normflags = BSWAP_64(buf->mz_normflags);
	int max = (size / MZAP_ENT_LEN) - 1;
	for (int i = 0; i < max; i++) {
		buf->mz_chunk[i].mze_value =
		    BSWAP_64(buf->mz_chunk[i].mze_value);
		buf->mz_chunk[i].mze_cd =
		    BSWAP_32(buf->mz_chunk[i].mze_cd);
	}
}

void
zap_byteswap(void *buf, size_t size)
{
	uint64_t block_type = *(uint64_t *)buf;

	if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
		/* ASSERT(magic == ZAP_LEAF_MAGIC); */
		mzap_byteswap(buf, size);
	} else {
		fzap_byteswap(buf, size);
	}
}
__attribute__((always_inline)) inline
static int
mze_compare(const void *arg1, const void *arg2)
{
	const mzap_ent_t *mze1 = arg1;
	const mzap_ent_t *mze2 = arg2;

	return (TREE_CMP((uint64_t)(mze1->mze_hash) << 32 | mze1->mze_cd,
	    (uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd));
}

ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf, mzap_ent_t,
    mze_compare)
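/*
 * For illustration (not compiled): entries sort by the composite 64-bit key
 * (mze_hash << 32) | mze_cd, so all colliding entries (same 32-bit hash)
 * sit adjacent in the tree, ordered by their collision differentiator:
 *
 *	(hash 0x1234, cd 0) < (hash 0x1234, cd 1) < (hash 0x1235, cd 0)
 */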
static void
mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash)
{
	mzap_ent_t mze;

	ASSERT(zap->zap_ismicro);
	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

	mze.mze_chunkid = chunkid;
	ASSERT0(hash & 0xffffffff);
	mze.mze_hash = hash >> 32;
	ASSERT3U(MZE_PHYS(zap, &mze)->mze_cd, <=, 0xffff);
	mze.mze_cd = (uint16_t)MZE_PHYS(zap, &mze)->mze_cd;
	ASSERT(MZE_PHYS(zap, &mze)->mze_name[0] != 0);
	zfs_btree_add(&zap->zap_m.zap_tree, &mze);
}
static mzap_ent_t *
mze_find(zap_name_t *zn, zfs_btree_index_t *idx)
{
	mzap_ent_t mze_tofind;
	mzap_ent_t *mze;
	zfs_btree_t *tree = &zn->zn_zap->zap_m.zap_tree;

	ASSERT(zn->zn_zap->zap_ismicro);
	ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));

	ASSERT0(zn->zn_hash & 0xffffffff);
	mze_tofind.mze_hash = zn->zn_hash >> 32;
	mze_tofind.mze_cd = 0;

	mze = zfs_btree_find(tree, &mze_tofind, idx);
	if (mze == NULL)
		mze = zfs_btree_next(tree, idx, idx);
	for (; mze && mze->mze_hash == mze_tofind.mze_hash;
	    mze = zfs_btree_next(tree, idx, idx)) {
		ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
		if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
			return (mze);
	}

	return (NULL);
}
static uint32_t
mze_find_unused_cd(zap_t *zap, uint64_t hash)
{
	mzap_ent_t mze_tofind;
	zfs_btree_index_t idx;
	zfs_btree_t *tree = &zap->zap_m.zap_tree;

	ASSERT(zap->zap_ismicro);
	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));

	ASSERT0(hash & 0xffffffff);
	hash >>= 32;
	mze_tofind.mze_hash = hash;
	mze_tofind.mze_cd = 0;

	uint32_t cd = 0;
	for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx);
	    mze && mze->mze_hash == hash;
	    mze = zfs_btree_next(tree, &idx, &idx)) {
		if (mze->mze_cd != cd)
			break;
		cd++;
	}

	return (cd);
}
/*
 * Each mzap entry requires at most 4 chunks:
 * 3 chunks for the name + 1 chunk for the value.
 */
#define	MZAP_ENT_CHUNKS	(1 + ZAP_LEAF_ARRAY_NCHUNKS(MZAP_NAME_LEN) + \
	ZAP_LEAF_ARRAY_NCHUNKS(sizeof (uint64_t)))
/*
 * Check if the current entry keeps the colliding entries under the fatzap
 * leaf size.
 */
static boolean_t
mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash)
{
	zap_t *zap = zn->zn_zap;
	mzap_ent_t mze_tofind;
	zfs_btree_index_t idx;
	zfs_btree_t *tree = &zap->zap_m.zap_tree;
	uint32_t mzap_ents = 0;

	ASSERT0(hash & 0xffffffff);
	hash >>= 32;
	mze_tofind.mze_hash = hash;
	mze_tofind.mze_cd = 0;

	for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx);
	    mze && mze->mze_hash == hash;
	    mze = zfs_btree_next(tree, &idx, &idx)) {
		mzap_ents++;
	}

	/* Include the new entry being added */
	mzap_ents++;

	return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS));
}

static void
mze_destroy(zap_t *zap)
{
	zfs_btree_clear(&zap->zap_m.zap_tree);
	zfs_btree_destroy(&zap->zap_m.zap_tree);
}
static zap_t *
mzap_open(dmu_buf_t *db)
{
	zap_t *winner;
	uint64_t *zap_hdr = (uint64_t *)db->db_data;
	uint64_t zap_block_type = zap_hdr[0];
	uint64_t zap_magic = zap_hdr[1];

	ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));

	zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
	rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL);
	rw_enter(&zap->zap_rwlock, RW_WRITER);
	zap->zap_objset = dmu_buf_get_objset(db);
	zap->zap_object = db->db_object;
	zap->zap_dbuf = db;

	if (zap_block_type != ZBT_MICRO) {
		mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT,
		    0);
		zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1;
		if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) {
			winner = NULL;	/* No actual winner here... */
			goto handle_winner;
		}
	} else {
		zap->zap_ismicro = TRUE;
	}

	/*
	 * Make sure that zap_ismicro is set before we let others see
	 * it, because zap_lockdir() checks zap_ismicro without the lock
	 * held.
	 */
	dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf);
	winner = dmu_buf_set_user(db, &zap->zap_dbu);

	if (winner != NULL)
		goto handle_winner;

	if (zap->zap_ismicro) {
		zap->zap_salt = zap_m_phys(zap)->mz_salt;
		zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
		zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;

		/*
		 * Reduce B-tree leaf from 4KB to 512 bytes to reduce memmove()
		 * overhead on massive inserts below.  It still allows storing
		 * 62 entries before we have to add a 2KB B-tree core node.
		 */
		zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare,
		    mze_find_in_buf, sizeof (mzap_ent_t), 512);

		zap_name_t *zn = zap_name_alloc(zap, B_FALSE);
		for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) {
			mzap_ent_phys_t *mze =
			    &zap_m_phys(zap)->mz_chunk[i];
			if (mze->mze_name[0]) {
				zap->zap_m.zap_num_entries++;
				zap_name_init_str(zn, mze->mze_name, 0);
				mze_insert(zap, i, zn->zn_hash);
			}
		}
		zap_name_free(zn);
	} else {
		zap->zap_salt = zap_f_phys(zap)->zap_salt;
		zap->zap_normflags = zap_f_phys(zap)->zap_normflags;

		ASSERT3U(sizeof (struct zap_leaf_header), ==,
		    2*ZAP_LEAF_CHUNKSIZE);

		/*
		 * The embedded pointer table should not overlap the
		 * other members.
		 */
		ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
		    &zap_f_phys(zap)->zap_salt);

		/*
		 * The embedded pointer table should end at the end of
		 * the block.
		 */
		ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
		    1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
		    (uintptr_t)zap_f_phys(zap), ==,
		    zap->zap_dbuf->db_size);
	}
	rw_exit(&zap->zap_rwlock);
	return (zap);

handle_winner:
	rw_exit(&zap->zap_rwlock);
	rw_destroy(&zap->zap_rwlock);
	if (!zap->zap_ismicro)
		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
	kmem_free(zap, sizeof (zap_t));
	return (winner);
}
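/*
 * Note on the "winner" pattern above, sketched for illustration: two
 * threads may race to open the same ZAP dbuf.  Each builds a private
 * zap_t, but dmu_buf_set_user() installs only one; the loser receives the
 * winner's pointer, tears down its own copy at handle_winner, and returns
 * the shared zap_t (or NULL if the on-disk block looked corrupt).
 */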
/*
 * This routine "consumes" the caller's hold on the dbuf, which must
 * have the specified tag.
 */
static int
zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
    krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
{
	ASSERT0(db->db_offset);
	objset_t *os = dmu_buf_get_objset(db);
	uint64_t obj = db->db_object;
	dmu_object_info_t doi;

	*zapp = NULL;

	dmu_object_info_from_dnode(dn, &doi);
	if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
		return (SET_ERROR(EINVAL));

	zap_t *zap = dmu_buf_get_user(db);
	if (zap == NULL) {
		zap = mzap_open(db);
		if (zap == NULL) {
			/*
			 * mzap_open() didn't like what it saw on-disk.
			 * Check for corruption!
			 */
			return (SET_ERROR(EIO));
		}
	}

	/*
	 * We're checking zap_ismicro without the lock held, in order to
	 * tell what type of lock we want.  Once we have some sort of
	 * lock, see if it really is the right type.  In practice this
	 * can only be different if it was upgraded from micro to fat,
	 * and micro wanted WRITER but fat only needs READER.
	 */
	krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
	rw_enter(&zap->zap_rwlock, lt);
	if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
		/* it was upgraded, now we only need reader */
		ASSERT(lt == RW_WRITER);
		ASSERT(RW_READER ==
		    ((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
		rw_downgrade(&zap->zap_rwlock);
		lt = RW_READER;
	}

	zap->zap_objset = os;
	zap->zap_dnode = dn;

	if (lt == RW_WRITER)
		dmu_buf_will_dirty(db, tx);

	ASSERT3P(zap->zap_dbuf, ==, db);

	ASSERT(!zap->zap_ismicro ||
	    zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
	if (zap->zap_ismicro && tx && adding &&
	    zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
		uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
		if (newsz > zap_get_micro_max_size(dmu_objset_spa(os))) {
			dprintf("upgrading obj %llu: num_entries=%u\n",
			    (u_longlong_t)obj, zap->zap_m.zap_num_entries);
			*zapp = zap;
			int err = mzap_upgrade(zapp, tag, tx, 0);
			if (err != 0)
				rw_exit(&zap->zap_rwlock);
			return (err);
		}
		VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx));
		zap->zap_m.zap_num_chunks =
		    db->db_size / MZAP_ENT_LEN - 1;

		if (newsz > SPA_OLD_MAXBLOCKSIZE) {
			dsl_dataset_t *ds = dmu_objset_ds(os);
			if (!dsl_dataset_feature_is_active(ds,
			    SPA_FEATURE_LARGE_MICROZAP)) {
				/*
				 * A microzap just grew beyond the old limit
				 * for the first time, so we have to ensure the
				 * feature flag is activated.
				 * zap_get_micro_max_size() won't let us get
				 * here if the feature is not enabled, so we
				 * don't need any other checks beforehand.
				 *
				 * Since we're in open context, we can't
				 * activate the feature directly, so we instead
				 * flag it on the dataset for next sync.
				 */
				dsl_dataset_dirty(ds, tx);
				mutex_enter(&ds->ds_lock);
				ds->ds_feature_activation
				    [SPA_FEATURE_LARGE_MICROZAP] =
				    (void *)B_TRUE;
				mutex_exit(&ds->ds_lock);
			}
		}
	}

	*zapp = zap;
	return (0);
}
static int
zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
    krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
    zap_t **zapp)
{
	dmu_buf_t *db;
	int err;

	err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
	if (err != 0)
		return (err);
	err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
	if (err != 0)
		dmu_buf_rele(db, tag);
	else
		VERIFY(dnode_add_ref(dn, tag));
	return (err);
}

int
zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
    krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
    zap_t **zapp)
{
	dnode_t *dn;
	dmu_buf_t *db;
	int err;

	err = dnode_hold(os, obj, tag, &dn);
	if (err != 0)
		return (err);
	err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
	if (err != 0) {
		dnode_rele(dn, tag);
		return (err);
	}
	err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
	if (err != 0) {
		dmu_buf_rele(db, tag);
		dnode_rele(dn, tag);
	}
	return (err);
}
void
zap_unlockdir(zap_t *zap, const void *tag)
{
	rw_exit(&zap->zap_rwlock);
	dnode_rele(zap->zap_dnode, tag);
	dmu_buf_rele(zap->zap_dbuf, tag);
}
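/*
 * Typical pairing, for illustration only (not compiled; error handling
 * reduced to the minimum): zap_lockdir() takes the dnode/dbuf holds and
 * the rwlock, and every successful zap_lockdir() must be matched by one
 * zap_unlockdir() with the same tag:
 *
 *	zap_t *zap;
 *	int err = zap_lockdir(os, obj, NULL, RW_READER, TRUE, FALSE,
 *	    FTAG, &zap);
 *	if (err == 0) {
 *		... use zap ...
 *		zap_unlockdir(zap, FTAG);
 *	}
 */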
static int
mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags)
{
	int err = 0;
	zap_t *zap = *zapp;

	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

	int sz = zap->zap_dbuf->db_size;
	mzap_phys_t *mzp = vmem_alloc(sz, KM_SLEEP);
	memcpy(mzp, zap->zap_dbuf->db_data, sz);
	int nchunks = zap->zap_m.zap_num_chunks;

	if (!flags) {
		err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
		    1ULL << fzap_default_block_shift, 0, tx);
		if (err != 0) {
			vmem_free(mzp, sz);
			return (err);
		}
	}

	dprintf("upgrading obj=%llu with %u chunks\n",
	    (u_longlong_t)zap->zap_object, nchunks);
	/* XXX destroy the tree later, so we can use the stored hash value */
	mze_destroy(zap);

	fzap_upgrade(zap, tx, flags);

	zap_name_t *zn = zap_name_alloc(zap, B_FALSE);
	for (int i = 0; i < nchunks; i++) {
		mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
		if (mze->mze_name[0] == 0)
			continue;
		dprintf("adding %s=%llu\n",
		    mze->mze_name, (u_longlong_t)mze->mze_value);
		zap_name_init_str(zn, mze->mze_name, 0);
		/* If we fail here, we would end up losing entries */
		VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
		    tag, tx));
		zap = zn->zn_zap;	/* fzap_add_cd() may change zap */
	}
	zap_name_free(zn);
	vmem_free(mzp, sz);
	*zapp = zap;
	return (err);
}
/*
 * The "normflags" determine the behavior of the matchtype_t which is
 * passed to zap_lookup_norm().  Names which have the same normalized
 * version will be stored with the same hash value, and therefore we can
 * perform normalization-insensitive lookups.  We can be Unicode form-
 * insensitive and/or case-insensitive.  The flags valid for "normflags"
 * are the U8_TEXTPREP_NF* (Normalization Form) flags and
 * U8_TEXTPREP_TOUPPER.
 *
 * The *_NF* (Normalization Form) flags are mutually exclusive; at most one
 * of them may be supplied.
 */
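/*
 * Example (illustrative only, not compiled): to create a ZAP whose lookups
 * can be case-insensitive, pass a case-folding normflag at creation time
 * and MT_NORMALIZE at lookup time:
 *
 *	uint64_t obj = zap_create_norm(os, U8_TEXTPREP_TOUPPER,
 *	    DMU_OT_DIRECTORY_CONTENTS, DMU_OT_NONE, 0, tx);
 *	err = zap_lookup_norm(os, obj, "ReadMe.TXT", 8, 1, &val,
 *	    MT_NORMALIZE, NULL, 0, NULL);
 */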
static void
mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx)
{
	dmu_buf_t *db;

	VERIFY0(dmu_buf_hold_by_dnode(dn, 0, FTAG, &db, DMU_READ_NO_PREFETCH));

	dmu_buf_will_dirty(db, tx);
	mzap_phys_t *zp = db->db_data;
	zp->mz_block_type = ZBT_MICRO;
	zp->mz_salt =
	    ((uintptr_t)db ^ (uintptr_t)tx ^ (dn->dn_object << 1)) | 1ULL;
	zp->mz_normflags = normflags;

	if (flags != 0) {
		zap_t *zap;
		/* Only fat zap supports flags; upgrade immediately. */
		VERIFY(dnode_add_ref(dn, FTAG));
		VERIFY0(zap_lockdir_impl(dn, db, FTAG, tx, RW_WRITER,
		    B_FALSE, B_FALSE, &zap));
		VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags));
		zap_unlockdir(zap, FTAG);
	} else {
		dmu_buf_rele(db, FTAG);
	}
}
static uint64_t
zap_create_impl(objset_t *os, int normflags, zap_flags_t flags,
    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
{
	uint64_t obj;

	ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);

	if (allocated_dnode == NULL) {
		dnode_t *dn;
		obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
		    indirect_blockshift, bonustype, bonuslen, dnodesize,
		    &dn, FTAG, tx);
		mzap_create_impl(dn, normflags, flags, tx);
		dnode_rele(dn, FTAG);
	} else {
		obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
		    indirect_blockshift, bonustype, bonuslen, dnodesize,
		    allocated_dnode, tag, tx);
		mzap_create_impl(*allocated_dnode, normflags, flags, tx);
	}

	return (obj);
}
*os
, uint64_t obj
, dmu_object_type_t ot
,
883 dmu_object_type_t bonustype
, int bonuslen
, dmu_tx_t
*tx
)
885 return (zap_create_claim_dnsize(os
, obj
, ot
, bonustype
, bonuslen
,
890 zap_create_claim_dnsize(objset_t
*os
, uint64_t obj
, dmu_object_type_t ot
,
891 dmu_object_type_t bonustype
, int bonuslen
, int dnodesize
, dmu_tx_t
*tx
)
893 return (zap_create_claim_norm_dnsize(os
, obj
,
894 0, ot
, bonustype
, bonuslen
, dnodesize
, tx
));
898 zap_create_claim_norm(objset_t
*os
, uint64_t obj
, int normflags
,
899 dmu_object_type_t ot
,
900 dmu_object_type_t bonustype
, int bonuslen
, dmu_tx_t
*tx
)
902 return (zap_create_claim_norm_dnsize(os
, obj
, normflags
, ot
, bonustype
,
907 zap_create_claim_norm_dnsize(objset_t
*os
, uint64_t obj
, int normflags
,
908 dmu_object_type_t ot
, dmu_object_type_t bonustype
, int bonuslen
,
909 int dnodesize
, dmu_tx_t
*tx
)
914 ASSERT3U(DMU_OT_BYTESWAP(ot
), ==, DMU_BSWAP_ZAP
);
915 error
= dmu_object_claim_dnsize(os
, obj
, ot
, 0, bonustype
, bonuslen
,
920 error
= dnode_hold(os
, obj
, FTAG
, &dn
);
924 mzap_create_impl(dn
, normflags
, 0, tx
);
926 dnode_rele(dn
, FTAG
);
932 zap_create(objset_t
*os
, dmu_object_type_t ot
,
933 dmu_object_type_t bonustype
, int bonuslen
, dmu_tx_t
*tx
)
935 return (zap_create_norm(os
, 0, ot
, bonustype
, bonuslen
, tx
));
939 zap_create_dnsize(objset_t
*os
, dmu_object_type_t ot
,
940 dmu_object_type_t bonustype
, int bonuslen
, int dnodesize
, dmu_tx_t
*tx
)
942 return (zap_create_norm_dnsize(os
, 0, ot
, bonustype
, bonuslen
,
947 zap_create_norm(objset_t
*os
, int normflags
, dmu_object_type_t ot
,
948 dmu_object_type_t bonustype
, int bonuslen
, dmu_tx_t
*tx
)
950 return (zap_create_norm_dnsize(os
, normflags
, ot
, bonustype
, bonuslen
,
955 zap_create_norm_dnsize(objset_t
*os
, int normflags
, dmu_object_type_t ot
,
956 dmu_object_type_t bonustype
, int bonuslen
, int dnodesize
, dmu_tx_t
*tx
)
958 return (zap_create_impl(os
, normflags
, 0, ot
, 0, 0,
959 bonustype
, bonuslen
, dnodesize
, NULL
, NULL
, tx
));
uint64_t
zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (zap_create_flags_dnsize(os, normflags, flags, ot,
	    leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx));
}

uint64_t
zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags,
    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
	    indirect_blockshift, bonustype, bonuslen, dnodesize, NULL, NULL,
	    tx));
}
/*
 * Create a zap object and return a pointer to the newly allocated dnode via
 * the allocated_dnode argument.  The returned dnode will be held and the
 * caller is responsible for releasing the hold by calling dnode_rele().
 */
uint64_t
zap_create_hold(objset_t *os, int normflags, zap_flags_t flags,
    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
{
	return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
	    indirect_blockshift, bonustype, bonuslen, dnodesize,
	    allocated_dnode, tag, tx));
}

int
zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
{
	/*
	 * dmu_object_free will free the object number and free the
	 * data.  Freeing the data will cause our pageout function to be
	 * called, which will destroy our data (zap_leaf_t's and zap_t).
	 */
	return (dmu_object_free(os, zapobj, tx));
}

void
zap_evict_sync(void *dbu)
{
	zap_t *zap = dbu;

	rw_destroy(&zap->zap_rwlock);

	if (zap->zap_ismicro)
		mze_destroy(zap);
	else
		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);

	kmem_free(zap, sizeof (zap_t));
}
int
zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	if (!zap->zap_ismicro) {
		err = fzap_count(zap, count);
	} else {
		*count = zap->zap_m.zap_num_entries;
	}
	zap_unlockdir(zap, FTAG);
	return (err);
}
/*
 * zn may be NULL; if not specified, it will be computed if needed.
 * See also the comment above zap_entry_normalization_conflict().
 */
static boolean_t
mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze,
    zfs_btree_index_t *idx)
{
	boolean_t allocdzn = B_FALSE;
	mzap_ent_t *other;
	zfs_btree_index_t oidx;

	if (zap->zap_normflags == 0)
		return (B_FALSE);

	for (other = zfs_btree_prev(&zap->zap_m.zap_tree, idx, &oidx);
	    other && other->mze_hash == mze->mze_hash;
	    other = zfs_btree_prev(&zap->zap_m.zap_tree, &oidx, &oidx)) {
		if (zn == NULL) {
			zn = zap_name_alloc_str(zap,
			    MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
			allocdzn = B_TRUE;
		}
		if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
			if (allocdzn)
				zap_name_free(zn);
			return (B_TRUE);
		}
	}

	for (other = zfs_btree_next(&zap->zap_m.zap_tree, idx, &oidx);
	    other && other->mze_hash == mze->mze_hash;
	    other = zfs_btree_next(&zap->zap_m.zap_tree, &oidx, &oidx)) {
		if (zn == NULL) {
			zn = zap_name_alloc_str(zap,
			    MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
			allocdzn = B_TRUE;
		}
		if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
			if (allocdzn)
				zap_name_free(zn);
			return (B_TRUE);
		}
	}

	if (allocdzn)
		zap_name_free(zn);
	return (B_FALSE);
}
/*
 * Routines for manipulating attributes.
 */

int
zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf)
{
	return (zap_lookup_norm(os, zapobj, name, integer_size,
	    num_integers, buf, 0, NULL, 0, NULL));
}

static int
zap_lookup_impl(zap_t *zap, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf,
    matchtype_t mt, char *realname, int rn_len,
    boolean_t *ncp)
{
	int err = 0;

	zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
	if (zn == NULL)
		return (SET_ERROR(ENOTSUP));

	if (!zap->zap_ismicro) {
		err = fzap_lookup(zn, integer_size, num_integers, buf,
		    realname, rn_len, ncp);
	} else {
		zfs_btree_index_t idx;
		mzap_ent_t *mze = mze_find(zn, &idx);
		if (mze == NULL) {
			err = SET_ERROR(ENOENT);
		} else {
			if (num_integers < 1) {
				err = SET_ERROR(EOVERFLOW);
			} else if (integer_size != 8) {
				err = SET_ERROR(EINVAL);
			} else {
				*(uint64_t *)buf =
				    MZE_PHYS(zap, mze)->mze_value;
				if (realname != NULL)
					(void) strlcpy(realname,
					    MZE_PHYS(zap, mze)->mze_name,
					    rn_len);
				if (ncp) {
					*ncp = mzap_normalization_conflict(zap,
					    zn, mze, &idx);
				}
			}
		}
	}
	zap_name_free(zn);
	return (err);
}
*os
, uint64_t zapobj
, const char *name
,
1151 uint64_t integer_size
, uint64_t num_integers
, void *buf
,
1152 matchtype_t mt
, char *realname
, int rn_len
,
1158 zap_lockdir(os
, zapobj
, NULL
, RW_READER
, TRUE
, FALSE
, FTAG
, &zap
);
1161 err
= zap_lookup_impl(zap
, name
, integer_size
,
1162 num_integers
, buf
, mt
, realname
, rn_len
, ncp
);
1163 zap_unlockdir(zap
, FTAG
);
1168 zap_prefetch(objset_t
*os
, uint64_t zapobj
, const char *name
)
1174 err
= zap_lockdir(os
, zapobj
, NULL
, RW_READER
, TRUE
, FALSE
, FTAG
, &zap
);
1177 zn
= zap_name_alloc_str(zap
, name
, 0);
1179 zap_unlockdir(zap
, FTAG
);
1180 return (SET_ERROR(ENOTSUP
));
1185 zap_unlockdir(zap
, FTAG
);
int
zap_prefetch_object(objset_t *os, uint64_t zapobj)
{
	int error;
	dmu_object_info_t doi;

	error = dmu_object_info(os, zapobj, &doi);
	if (error == 0 && DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
		error = SET_ERROR(EINVAL);
	if (error == 0)
		dmu_prefetch_wait(os, zapobj, 0, doi.doi_max_offset);

	return (error);
}

int
zap_lookup_by_dnode(dnode_t *dn, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf)
{
	return (zap_lookup_norm_by_dnode(dn, name, integer_size,
	    num_integers, buf, 0, NULL, 0, NULL));
}

int
zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf,
    matchtype_t mt, char *realname, int rn_len,
    boolean_t *ncp)
{
	zap_t *zap;

	int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
	    FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_lookup_impl(zap, name, integer_size,
	    num_integers, buf, mt, realname, rn_len, ncp);
	zap_unlockdir(zap, FTAG);
	return (err);
}
static int
zap_prefetch_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints)
{
	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}

	fzap_prefetch(zn);
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (0);
}

int
zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_prefetch_uint64_impl(zap, key, key_numints);
	/* zap_prefetch_uint64_impl() calls zap_unlockdir() */
	return (err);
}

int
zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints)
{
	zap_t *zap;

	int err =
	    zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_prefetch_uint64_impl(zap, key, key_numints);
	/* zap_prefetch_uint64_impl() calls zap_unlockdir() */
	return (err);
}
static int
zap_lookup_uint64_impl(zap_t *zap, const uint64_t *key,
    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
{
	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}

	int err = fzap_lookup(zn, integer_size, num_integers, buf,
	    NULL, 0, NULL);
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_lookup_uint64_impl(zap, key, key_numints, integer_size,
	    num_integers, buf);
	/* zap_lookup_uint64_impl() calls zap_unlockdir() */
	return (err);
}

int
zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
{
	zap_t *zap;

	int err =
	    zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_lookup_uint64_impl(zap, key, key_numints, integer_size,
	    num_integers, buf);
	/* zap_lookup_uint64_impl() calls zap_unlockdir() */
	return (err);
}
int
zap_contains(objset_t *os, uint64_t zapobj, const char *name)
{
	int err = zap_lookup_norm(os, zapobj, name, 0,
	    0, NULL, 0, NULL, 0, NULL);
	if (err == EOVERFLOW || err == EINVAL)
		err = 0; /* found, but skipped reading the value */
	return (err);
}
*os
, uint64_t zapobj
, const char *name
,
1335 uint64_t *integer_size
, uint64_t *num_integers
)
1340 zap_lockdir(os
, zapobj
, NULL
, RW_READER
, TRUE
, FALSE
, FTAG
, &zap
);
1343 zap_name_t
*zn
= zap_name_alloc_str(zap
, name
, 0);
1345 zap_unlockdir(zap
, FTAG
);
1346 return (SET_ERROR(ENOTSUP
));
1348 if (!zap
->zap_ismicro
) {
1349 err
= fzap_length(zn
, integer_size
, num_integers
);
1351 zfs_btree_index_t idx
;
1352 mzap_ent_t
*mze
= mze_find(zn
, &idx
);
1354 err
= SET_ERROR(ENOENT
);
1363 zap_unlockdir(zap
, FTAG
);
1368 zap_length_uint64(objset_t
*os
, uint64_t zapobj
, const uint64_t *key
,
1369 int key_numints
, uint64_t *integer_size
, uint64_t *num_integers
)
1374 zap_lockdir(os
, zapobj
, NULL
, RW_READER
, TRUE
, FALSE
, FTAG
, &zap
);
1377 zap_name_t
*zn
= zap_name_alloc_uint64(zap
, key
, key_numints
);
1379 zap_unlockdir(zap
, FTAG
);
1380 return (SET_ERROR(ENOTSUP
));
1382 err
= fzap_length(zn
, integer_size
, num_integers
);
1384 zap_unlockdir(zap
, FTAG
);
static void
mzap_addent(zap_name_t *zn, uint64_t value)
{
	zap_t *zap = zn->zn_zap;
	uint16_t start = zap->zap_m.zap_alloc_next;

	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

#ifdef ZFS_DEBUG
	for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
		ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
	}
#endif

	uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash);
	/* given the limited size of the microzap, this can't happen */
	ASSERT(cd < zap_maxcd(zap));

again:
	for (uint16_t i = start; i < zap->zap_m.zap_num_chunks; i++) {
		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
		if (mze->mze_name[0] == 0) {
			mze->mze_value = value;
			mze->mze_cd = cd;
			(void) strlcpy(mze->mze_name, zn->zn_key_orig,
			    sizeof (mze->mze_name));
			zap->zap_m.zap_num_entries++;
			zap->zap_m.zap_alloc_next = i+1;
			if (zap->zap_m.zap_alloc_next ==
			    zap->zap_m.zap_num_chunks)
				zap->zap_m.zap_alloc_next = 0;
			mze_insert(zap, i, zn->zn_hash);
			return;
		}
	}
	if (start != 0) {
		start = 0;
		goto again;
	}
	cmn_err(CE_PANIC, "out of entries!");
}
static int
zap_add_impl(zap_t *zap, const char *key,
    int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx, const void *tag)
{
	const uint64_t *intval = val;
	int err = 0;

	zap_name_t *zn = zap_name_alloc_str(zap, key, 0);
	if (zn == NULL) {
		zap_unlockdir(zap, tag);
		return (SET_ERROR(ENOTSUP));
	}
	if (!zap->zap_ismicro) {
		err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
		zap = zn->zn_zap;	/* fzap_add() may change zap */
	} else if (integer_size != 8 || num_integers != 1 ||
	    strlen(key) >= MZAP_NAME_LEN ||
	    !mze_canfit_fzap_leaf(zn, zn->zn_hash)) {
		err = mzap_upgrade(&zn->zn_zap, tag, tx, 0);
		if (err == 0) {
			err = fzap_add(zn, integer_size, num_integers, val,
			    tag, tx);
		}
		zap = zn->zn_zap;	/* fzap_add() may change zap */
	} else {
		zfs_btree_index_t idx;
		if (mze_find(zn, &idx) != NULL) {
			err = SET_ERROR(EEXIST);
		} else {
			mzap_addent(zn, *intval);
		}
	}
	ASSERT(zap == zn->zn_zap);
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_add() failed */
		zap_unlockdir(zap, tag);
	return (err);
}
int
zap_add(objset_t *os, uint64_t zapobj, const char *key,
    int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx)
{
	zap_t *zap;
	int err;

	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
	/* zap_add_impl() calls zap_unlockdir() */
	return (err);
}

int
zap_add_by_dnode(dnode_t *dn, const char *key,
    int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx)
{
	zap_t *zap;
	int err;

	err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
	/* zap_add_impl() calls zap_unlockdir() */
	return (err);
}
static int
zap_add_uint64_impl(zap_t *zap, const uint64_t *key,
    int key_numints, int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx, const void *tag)
{
	int err;

	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, tag);
		return (SET_ERROR(ENOTSUP));
	}
	err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
	zap = zn->zn_zap;	/* fzap_add() may change zap */
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_add() failed */
		zap_unlockdir(zap, tag);
	return (err);
}

int
zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_add_uint64_impl(zap, key, key_numints,
	    integer_size, num_integers, val, tx, FTAG);
	/* zap_add_uint64_impl() calls zap_unlockdir() */
	return (err);
}

int
zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
    int key_numints, int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx)
{
	zap_t *zap;

	int err =
	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_add_uint64_impl(zap, key, key_numints,
	    integer_size, num_integers, val, tx, FTAG);
	/* zap_add_uint64_impl() calls zap_unlockdir() */
	return (err);
}
int
zap_update(objset_t *os, uint64_t zapobj, const char *name,
    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
	zap_t *zap;
	const uint64_t *intval = val;
	int err;

	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	if (!zap->zap_ismicro) {
		err = fzap_update(zn, integer_size, num_integers, val,
		    FTAG, tx);
		zap = zn->zn_zap;	/* fzap_update() may change zap */
	} else if (integer_size != 8 || num_integers != 1 ||
	    strlen(name) >= MZAP_NAME_LEN) {
		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
		    (u_longlong_t)zapobj, integer_size,
		    (u_longlong_t)num_integers, name);
		err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
		if (err == 0) {
			err = fzap_update(zn, integer_size, num_integers,
			    val, FTAG, tx);
		}
		zap = zn->zn_zap;	/* fzap_update() may change zap */
	} else {
		zfs_btree_index_t idx;
		mzap_ent_t *mze = mze_find(zn, &idx);
		if (mze != NULL) {
			MZE_PHYS(zap, mze)->mze_value = *intval;
		} else {
			mzap_addent(zn, *intval);
		}
	}
	ASSERT(zap == zn->zn_zap);
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
		zap_unlockdir(zap, FTAG);
	return (err);
}
static int
zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx,
    const void *tag)
{
	int err;

	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, tag);
		return (SET_ERROR(ENOTSUP));
	}
	err = fzap_update(zn, integer_size, num_integers, val, tag, tx);
	zap = zn->zn_zap;	/* fzap_update() may change zap */
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
		zap_unlockdir(zap, tag);
	return (err);
}

int
zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, int integer_size, uint64_t num_integers, const void *val,
    dmu_tx_t *tx)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_update_uint64_impl(zap, key, key_numints,
	    integer_size, num_integers, val, tx, FTAG);
	/* zap_update_uint64_impl() calls zap_unlockdir() */
	return (err);
}

int
zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
	zap_t *zap;

	int err =
	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_update_uint64_impl(zap, key, key_numints,
	    integer_size, num_integers, val, tx, FTAG);
	/* zap_update_uint64_impl() calls zap_unlockdir() */
	return (err);
}
int
zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
{
	return (zap_remove_norm(os, zapobj, name, 0, tx));
}

static int
zap_remove_impl(zap_t *zap, const char *name,
    matchtype_t mt, dmu_tx_t *tx)
{
	int err = 0;

	zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
	if (zn == NULL)
		return (SET_ERROR(ENOTSUP));
	if (!zap->zap_ismicro) {
		err = fzap_remove(zn, tx);
	} else {
		zfs_btree_index_t idx;
		mzap_ent_t *mze = mze_find(zn, &idx);
		if (mze == NULL) {
			err = SET_ERROR(ENOENT);
		} else {
			zap->zap_m.zap_num_entries--;
			memset(MZE_PHYS(zap, mze), 0, sizeof (mzap_ent_phys_t));
			zfs_btree_remove_idx(&zap->zap_m.zap_tree, &idx);
		}
	}
	zap_name_free(zn);
	return (err);
}

int
zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
    matchtype_t mt, dmu_tx_t *tx)
{
	zap_t *zap;
	int err;

	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_remove_impl(zap, name, mt, tx);
	zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
{
	zap_t *zap;
	int err;

	err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_remove_impl(zap, name, 0, tx);
	zap_unlockdir(zap, FTAG);
	return (err);
}
static int
zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
    dmu_tx_t *tx, const void *tag)
{
	int err;

	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, tag);
		return (SET_ERROR(ENOTSUP));
	}
	err = fzap_remove(zn, tx);
	zap_name_free(zn);
	zap_unlockdir(zap, tag);
	return (err);
}

int
zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, dmu_tx_t *tx)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
	/* zap_remove_uint64_impl() calls zap_unlockdir() */
	return (err);
}

int
zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
    dmu_tx_t *tx)
{
	zap_t *zap;

	int err =
	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
	/* zap_remove_uint64_impl() calls zap_unlockdir() */
	return (err);
}
static zap_attribute_t *
zap_attribute_alloc_impl(boolean_t longname)
{
	zap_attribute_t *za;

	za = kmem_cache_alloc((longname)? zap_attr_long_cache : zap_attr_cache,
	    KM_SLEEP);
	za->za_name_len = (longname)? ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN;
	return (za);
}

zap_attribute_t *
zap_attribute_alloc(void)
{
	return (zap_attribute_alloc_impl(B_FALSE));
}

zap_attribute_t *
zap_attribute_long_alloc(void)
{
	return (zap_attribute_alloc_impl(B_TRUE));
}

void
zap_attribute_free(zap_attribute_t *za)
{
	if (za->za_name_len == ZAP_MAXNAMELEN) {
		kmem_cache_free(zap_attr_cache, za);
	} else {
		ASSERT3U(za->za_name_len, ==, ZAP_MAXNAMELEN_NEW);
		kmem_cache_free(zap_attr_long_cache, za);
	}
}
/*
 * Routines for iterating over the attributes.
 */

static void
zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
    uint64_t serialized, boolean_t prefetch)
{
	zc->zc_objset = os;
	zc->zc_zap = NULL;
	zc->zc_leaf = NULL;
	zc->zc_zapobj = zapobj;
	zc->zc_serialized = serialized;
	zc->zc_hash = 0;
	zc->zc_cd = 0;
	zc->zc_prefetch = prefetch;
}

void
zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
    uint64_t serialized)
{
	zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE);
}

/*
 * Initialize a cursor at the beginning of the ZAP object.  The entire
 * ZAP object will be prefetched.
 */
void
zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
{
	zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE);
}

/*
 * Initialize a cursor at the beginning, but request that we not prefetch
 * the entire ZAP object.
 */
void
zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
{
	zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE);
}

void
zap_cursor_fini(zap_cursor_t *zc)
{
	if (zc->zc_zap) {
		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
		zap_unlockdir(zc->zc_zap, NULL);
		zc->zc_zap = NULL;
	}
	if (zc->zc_leaf) {
		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
		zap_put_leaf(zc->zc_leaf);
		zc->zc_leaf = NULL;
	}
	zc->zc_objset = NULL;
}
uint64_t
zap_cursor_serialize(zap_cursor_t *zc)
{
	if (zc->zc_hash == -1ULL)
		return (-1ULL);
	if (zc->zc_zap == NULL)
		return (zc->zc_serialized);
	ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0);
	ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));

	/*
	 * We want to keep the high 32 bits of the cursor zero if we can, so
	 * that 32-bit programs can access this.  So usually use a small
	 * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits
	 * of the cursor.
	 *
	 * [ collision differentiator | zap_hashbits()-bit hash value ]
	 */
	return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) |
	    ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
}
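/*
 * Worked example (illustrative only): with the default 28 hash bits, a
 * cursor at zc_hash = H << 36 and zc_cd = C serializes to the cookie
 * (H | C << 28).  zap_cursor_retrieve() inverts this on the next init:
 * cookie << 36 recovers the hash bits and cookie >> 28 recovers the cd,
 * so the cookie stays small enough for 32-bit callers in the common case.
 */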
int
zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
{
	int err;

	if (zc->zc_hash == -1ULL)
		return (SET_ERROR(ENOENT));

	if (zc->zc_zap == NULL) {
		int hb;
		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
		    RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
		if (err != 0)
			return (err);

		/*
		 * To support zap_cursor_init_serialized, advance, retrieve,
		 * we must add to the existing zc_cd, which may already
		 * be 1 due to the zap_cursor_advance.
		 */
		if (zc->zc_serialized != 0) {
			ASSERT(zc->zc_hash == 0);
			hb = zap_hashbits(zc->zc_zap);
			zc->zc_hash = zc->zc_serialized << (64 - hb);
			zc->zc_cd += zc->zc_serialized >> hb;
			if (zc->zc_cd >= zap_maxcd(zc->zc_zap))
				zc->zc_cd = 0; /* corrupt serialized */
		}
	} else {
		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
	}
	if (!zc->zc_zap->zap_ismicro) {
		err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
	} else {
		zfs_btree_index_t idx;
		mzap_ent_t mze_tofind;

		mze_tofind.mze_hash = zc->zc_hash >> 32;
		mze_tofind.mze_cd = zc->zc_cd;

		mzap_ent_t *mze = zfs_btree_find(&zc->zc_zap->zap_m.zap_tree,
		    &mze_tofind, &idx);
		if (mze == NULL) {
			mze = zfs_btree_next(&zc->zc_zap->zap_m.zap_tree,
			    &idx, &idx);
		}
		if (mze) {
			mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
			ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
			za->za_normalization_conflict =
			    mzap_normalization_conflict(zc->zc_zap, NULL,
			    mze, &idx);
			za->za_integer_length = 8;
			za->za_num_integers = 1;
			za->za_first_integer = mzep->mze_value;
			(void) strlcpy(za->za_name, mzep->mze_name,
			    za->za_name_len);
			zc->zc_hash = (uint64_t)mze->mze_hash << 32;
			zc->zc_cd = mze->mze_cd;
			err = 0;
		} else {
			zc->zc_hash = -1ULL;
			err = SET_ERROR(ENOENT);
		}
	}
	rw_exit(&zc->zc_zap->zap_rwlock);
	return (err);
}

void
zap_cursor_advance(zap_cursor_t *zc)
{
	if (zc->zc_hash == -1ULL)
		return;
	zc->zc_cd++;
}
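/*
 * Canonical iteration pattern, for illustration only (not compiled; error
 * handling reduced to the loop condition):
 *
 *	zap_cursor_t zc;
 *	zap_attribute_t *za = zap_attribute_alloc();
 *	for (zap_cursor_init(&zc, os, zapobj);
 *	    zap_cursor_retrieve(&zc, za) == 0;
 *	    zap_cursor_advance(&zc)) {
 *		... examine za->za_name, za->za_first_integer ...
 *	}
 *	zap_cursor_fini(&zc);
 *	zap_attribute_free(za);
 */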
int
zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);

	memset(zs, 0, sizeof (zap_stats_t));

	if (zap->zap_ismicro) {
		zs->zs_blocksize = zap->zap_dbuf->db_size;
		zs->zs_num_entries = zap->zap_m.zap_num_entries;
		zs->zs_num_blocks = 1;
	} else {
		fzap_get_stats(zap, zs);
	}
	zap_unlockdir(zap, FTAG);
	return (0);
}
#if defined(_KERNEL)
EXPORT_SYMBOL(zap_create);
EXPORT_SYMBOL(zap_create_dnsize);
EXPORT_SYMBOL(zap_create_norm);
EXPORT_SYMBOL(zap_create_norm_dnsize);
EXPORT_SYMBOL(zap_create_flags);
EXPORT_SYMBOL(zap_create_flags_dnsize);
EXPORT_SYMBOL(zap_create_claim);
EXPORT_SYMBOL(zap_create_claim_norm);
EXPORT_SYMBOL(zap_create_claim_norm_dnsize);
EXPORT_SYMBOL(zap_create_hold);
EXPORT_SYMBOL(zap_destroy);
EXPORT_SYMBOL(zap_lookup);
EXPORT_SYMBOL(zap_lookup_by_dnode);
EXPORT_SYMBOL(zap_lookup_norm);
EXPORT_SYMBOL(zap_lookup_uint64);
EXPORT_SYMBOL(zap_contains);
EXPORT_SYMBOL(zap_prefetch);
EXPORT_SYMBOL(zap_prefetch_uint64);
EXPORT_SYMBOL(zap_prefetch_object);
EXPORT_SYMBOL(zap_add);
EXPORT_SYMBOL(zap_add_by_dnode);
EXPORT_SYMBOL(zap_add_uint64);
EXPORT_SYMBOL(zap_add_uint64_by_dnode);
EXPORT_SYMBOL(zap_update);
EXPORT_SYMBOL(zap_update_uint64);
EXPORT_SYMBOL(zap_update_uint64_by_dnode);
EXPORT_SYMBOL(zap_length);
EXPORT_SYMBOL(zap_length_uint64);
EXPORT_SYMBOL(zap_remove);
EXPORT_SYMBOL(zap_remove_by_dnode);
EXPORT_SYMBOL(zap_remove_norm);
EXPORT_SYMBOL(zap_remove_uint64);
EXPORT_SYMBOL(zap_remove_uint64_by_dnode);
EXPORT_SYMBOL(zap_count);
EXPORT_SYMBOL(zap_value_search);
EXPORT_SYMBOL(zap_join);
EXPORT_SYMBOL(zap_join_increment);
EXPORT_SYMBOL(zap_add_int);
EXPORT_SYMBOL(zap_remove_int);
EXPORT_SYMBOL(zap_lookup_int);
EXPORT_SYMBOL(zap_increment_int);
EXPORT_SYMBOL(zap_add_int_key);
EXPORT_SYMBOL(zap_lookup_int_key);
EXPORT_SYMBOL(zap_increment);
EXPORT_SYMBOL(zap_cursor_init);
EXPORT_SYMBOL(zap_cursor_fini);
EXPORT_SYMBOL(zap_cursor_retrieve);
EXPORT_SYMBOL(zap_cursor_advance);
EXPORT_SYMBOL(zap_cursor_serialize);
EXPORT_SYMBOL(zap_cursor_init_serialized);
EXPORT_SYMBOL(zap_get_stats);
#endif
ZFS_MODULE_PARAM(zfs, , zap_micro_max_size, INT, ZMOD_RW,
	"Maximum micro ZAP size, before converting to a fat ZAP, in bytes");