zfs(4): remove "experimental" from zfs_bclone_enabled
[zfs.git] / module / zfs / dmu_object.c
blob56986ea4344629c8f85768e2d002028644f3f418
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
 * Copyright 2014 HybridCluster. All rights reserved.
 */
27 #include <sys/dbuf.h>
28 #include <sys/dmu.h>
29 #include <sys/dmu_impl.h>
30 #include <sys/dmu_objset.h>
31 #include <sys/dmu_tx.h>
32 #include <sys/dnode.h>
33 #include <sys/zap.h>
34 #include <sys/zfeature.h>
35 #include <sys/dsl_dataset.h>
/*
 * Each of the concurrent object allocators will grab
 * 2^dmu_object_alloc_chunk_shift dnode slots at a time.  The default is to
 * grab 128 slots, which is 4 blocks worth.  This was experimentally
 * determined to be the lowest value that eliminates the measurable effect
 * of lock contention from this code path.
 *
 * dmu_object_alloc_impl() clamps the resulting chunk size to no less than
 * DNODES_PER_BLOCK and no more than one L1 indirect block's worth of
 * dnodes, so out-of-range settings do not take effect verbatim.
 */
uint_t dmu_object_alloc_chunk_shift = 7;
46 static uint64_t
47 dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
48 int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
49 int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
51 uint64_t object;
52 uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
53 (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
54 dnode_t *dn = NULL;
55 int dn_slots = dnodesize >> DNODE_SHIFT;
56 boolean_t restarted = B_FALSE;
57 uint64_t *cpuobj = NULL;
58 uint_t dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
59 int error;
61 cpuobj = &os->os_obj_next_percpu[CPU_SEQID_UNSTABLE %
62 os->os_obj_next_percpu_len];
64 if (dn_slots == 0) {
65 dn_slots = DNODE_MIN_SLOTS;
66 } else {
67 ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
68 ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
72 * The "chunk" of dnodes that is assigned to a CPU-specific
73 * allocator needs to be at least one block's worth, to avoid
74 * lock contention on the dbuf. It can be at most one L1 block's
75 * worth, so that the "rescan after polishing off a L1's worth"
76 * logic below will be sure to kick in.
78 if (dnodes_per_chunk < DNODES_PER_BLOCK)
79 dnodes_per_chunk = DNODES_PER_BLOCK;
80 if (dnodes_per_chunk > L1_dnode_count)
81 dnodes_per_chunk = L1_dnode_count;
84 * The caller requested the dnode be returned as a performance
85 * optimization in order to avoid releasing the hold only to
86 * immediately reacquire it. Since they caller is responsible
87 * for releasing the hold they must provide the tag.
89 if (allocated_dnode != NULL) {
90 ASSERT3P(tag, !=, NULL);
91 } else {
92 ASSERT3P(tag, ==, NULL);
93 tag = FTAG;
96 object = *cpuobj;
97 for (;;) {
99 * If we finished a chunk of dnodes, get a new one from
100 * the global allocator.
102 if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
103 (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
104 dn_slots)) {
105 DNODE_STAT_BUMP(dnode_alloc_next_chunk);
106 mutex_enter(&os->os_obj_lock);
107 ASSERT0(P2PHASE(os->os_obj_next_chunk,
108 dnodes_per_chunk));
109 object = os->os_obj_next_chunk;
112 * Each time we polish off a L1 bp worth of dnodes
113 * (2^12 objects), move to another L1 bp that's
114 * still reasonably sparse (at most 1/4 full). Look
115 * from the beginning at most once per txg. If we
116 * still can't allocate from that L1 block, search
117 * for an empty L0 block, which will quickly skip
118 * to the end of the metadnode if no nearby L0
119 * blocks are empty. This fallback avoids a
120 * pathology where full dnode blocks containing
121 * large dnodes appear sparse because they have a
122 * low blk_fill, leading to many failed allocation
123 * attempts. In the long term a better mechanism to
124 * search for sparse metadnode regions, such as
125 * spacemaps, could be implemented.
127 * os_scan_dnodes is set during txg sync if enough
128 * objects have been freed since the previous
129 * rescan to justify backfilling again.
131 * Note that dmu_traverse depends on the behavior
132 * that we use multiple blocks of the dnode object
133 * before going back to reuse objects. Any change
134 * to this algorithm should preserve that property
135 * or find another solution to the issues described
136 * in traverse_visitbp.
138 if (P2PHASE(object, L1_dnode_count) == 0) {
139 uint64_t offset;
140 uint64_t blkfill;
141 int minlvl;
142 if (os->os_rescan_dnodes) {
143 offset = 0;
144 os->os_rescan_dnodes = B_FALSE;
145 } else {
146 offset = object << DNODE_SHIFT;
148 blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
149 minlvl = restarted ? 1 : 2;
150 restarted = B_TRUE;
151 error = dnode_next_offset(DMU_META_DNODE(os),
152 DNODE_FIND_HOLE, &offset, minlvl,
153 blkfill, 0);
154 if (error == 0) {
155 object = offset >> DNODE_SHIFT;
159 * Note: if "restarted", we may find a L0 that
160 * is not suitably aligned.
162 os->os_obj_next_chunk =
163 P2ALIGN_TYPED(object, dnodes_per_chunk, uint64_t) +
164 dnodes_per_chunk;
165 (void) atomic_swap_64(cpuobj, object);
166 mutex_exit(&os->os_obj_lock);
170 * The value of (*cpuobj) before adding dn_slots is the object
171 * ID assigned to us. The value afterwards is the object ID
172 * assigned to whoever wants to do an allocation next.
174 object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;
177 * XXX We should check for an i/o error here and return
178 * up to our caller. Actually we should pre-read it in
179 * dmu_tx_assign(), but there is currently no mechanism
180 * to do so.
182 error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
183 dn_slots, tag, &dn);
184 if (error == 0) {
185 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
187 * Another thread could have allocated it; check
188 * again now that we have the struct lock.
190 if (dn->dn_type == DMU_OT_NONE) {
191 dnode_allocate(dn, ot, blocksize,
192 indirect_blockshift, bonustype,
193 bonuslen, dn_slots, tx);
194 rw_exit(&dn->dn_struct_rwlock);
195 dmu_tx_add_new_object(tx, dn);
198 * Caller requested the allocated dnode be
199 * returned and is responsible for the hold.
201 if (allocated_dnode != NULL)
202 *allocated_dnode = dn;
203 else
204 dnode_rele(dn, tag);
206 return (object);
208 rw_exit(&dn->dn_struct_rwlock);
209 dnode_rele(dn, tag);
210 DNODE_STAT_BUMP(dnode_alloc_race);
214 * Skip to next known valid starting point on error. This
215 * is the start of the next block of dnodes.
217 if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
218 object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
219 DNODE_STAT_BUMP(dnode_alloc_next_block);
221 (void) atomic_swap_64(cpuobj, object);
225 uint64_t
226 dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
227 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
229 return dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
230 bonuslen, 0, NULL, NULL, tx);
233 uint64_t
234 dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
235 int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
236 dmu_tx_t *tx)
238 return dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
239 bonustype, bonuslen, 0, NULL, NULL, tx);
242 uint64_t
243 dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
244 dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
246 return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
247 bonuslen, dnodesize, NULL, NULL, tx));
251 * Allocate a new object and return a pointer to the newly allocated dnode
252 * via the allocated_dnode argument. The returned dnode will be held and
253 * the caller is responsible for releasing the hold by calling dnode_rele().
255 uint64_t
256 dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot, int blocksize,
257 int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
258 int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
260 return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
261 bonustype, bonuslen, dnodesize, allocated_dnode, tag, tx));
265 dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
266 int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
268 return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
269 bonuslen, 0, tx));
273 dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
274 int blocksize, dmu_object_type_t bonustype, int bonuslen,
275 int dnodesize, dmu_tx_t *tx)
277 dnode_t *dn;
278 int dn_slots = dnodesize >> DNODE_SHIFT;
279 int err;
281 if (dn_slots == 0)
282 dn_slots = DNODE_MIN_SLOTS;
283 ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
284 ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
286 if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
287 return (SET_ERROR(EBADF));
289 err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
290 FTAG, &dn);
291 if (err)
292 return (err);
294 dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
295 dmu_tx_add_new_object(tx, dn);
297 dnode_rele(dn, FTAG);
299 return (0);
303 dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
304 int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
306 return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
307 bonuslen, DNODE_MIN_SIZE, B_FALSE, tx));
311 dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
312 int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
313 boolean_t keep_spill, dmu_tx_t *tx)
315 dnode_t *dn;
316 int dn_slots = dnodesize >> DNODE_SHIFT;
317 int err;
319 if (dn_slots == 0)
320 dn_slots = DNODE_MIN_SLOTS;
322 if (object == DMU_META_DNODE_OBJECT)
323 return (SET_ERROR(EBADF));
325 err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
326 FTAG, &dn);
327 if (err)
328 return (err);
330 dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots,
331 keep_spill, tx);
333 dnode_rele(dn, FTAG);
334 return (err);
338 dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
340 dnode_t *dn;
341 int err;
343 err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
344 FTAG, &dn);
345 if (err)
346 return (err);
348 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
349 if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
350 dbuf_rm_spill(dn, tx);
351 dnode_rm_spill(dn, tx);
353 rw_exit(&dn->dn_struct_rwlock);
355 dnode_rele(dn, FTAG);
356 return (err);
360 dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
362 dnode_t *dn;
363 int err;
365 ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
367 err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
368 FTAG, &dn);
369 if (err)
370 return (err);
372 ASSERT(dn->dn_type != DMU_OT_NONE);
374 * If we don't create this free range, we'll leak indirect blocks when
375 * we get to freeing the dnode in syncing context.
377 dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
378 dnode_free(dn, tx);
379 dnode_rele(dn, FTAG);
381 return (0);
385 * Return (in *objectp) the next object which is allocated (or a hole)
386 * after *object, taking into account only objects that may have been modified
387 * after the specified txg.
390 dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
392 uint64_t offset;
393 uint64_t start_obj;
394 struct dsl_dataset *ds = os->os_dsl_dataset;
395 int error;
397 if (*objectp == 0) {
398 start_obj = 1;
399 } else if (ds && dsl_dataset_feature_is_active(ds,
400 SPA_FEATURE_LARGE_DNODE)) {
401 uint64_t i = *objectp + 1;
402 uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
403 dmu_object_info_t doi;
406 * Scan through the remaining meta dnode block. The contents
407 * of each slot in the block are known so it can be quickly
408 * checked. If the block is exhausted without a match then
409 * hand off to dnode_next_offset() for further scanning.
411 while (i <= last_obj) {
412 if (i == 0)
413 return (SET_ERROR(ESRCH));
414 error = dmu_object_info(os, i, &doi);
415 if (error == ENOENT) {
416 if (hole) {
417 *objectp = i;
418 return (0);
419 } else {
420 i++;
422 } else if (error == EEXIST) {
423 i++;
424 } else if (error == 0) {
425 if (hole) {
426 i += doi.doi_dnodesize >> DNODE_SHIFT;
427 } else {
428 *objectp = i;
429 return (0);
431 } else {
432 return (error);
436 start_obj = i;
437 } else {
438 start_obj = *objectp + 1;
441 offset = start_obj << DNODE_SHIFT;
443 error = dnode_next_offset(DMU_META_DNODE(os),
444 (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);
446 *objectp = offset >> DNODE_SHIFT;
448 return (error);
452 * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
453 * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
455 * Only for use from syncing context, on MOS objects.
457 void
458 dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
459 dmu_tx_t *tx)
461 dnode_t *dn;
463 ASSERT(dmu_tx_is_syncing(tx));
465 VERIFY0(dnode_hold(mos, object, FTAG, &dn));
466 if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
467 dnode_rele(dn, FTAG);
468 return;
470 ASSERT3U(dn->dn_type, ==, old_type);
471 ASSERT0(dn->dn_maxblkid);
474 * We must initialize the ZAP data before changing the type,
475 * so that concurrent calls to *_is_zapified() can determine if
476 * the object has been completely zapified by checking the type.
478 mzap_create_impl(dn, 0, 0, tx);
480 dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
481 DMU_OTN_ZAP_METADATA;
482 dnode_setdirty(dn, tx);
483 dnode_rele(dn, FTAG);
485 spa_feature_incr(dmu_objset_spa(mos),
486 SPA_FEATURE_EXTENSIBLE_DATASET, tx);
489 void
490 dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
492 dnode_t *dn;
493 dmu_object_type_t t;
495 ASSERT(dmu_tx_is_syncing(tx));
497 VERIFY0(dnode_hold(mos, object, FTAG, &dn));
498 t = dn->dn_type;
499 dnode_rele(dn, FTAG);
501 if (t == DMU_OTN_ZAP_METADATA) {
502 spa_feature_decr(dmu_objset_spa(mos),
503 SPA_FEATURE_EXTENSIBLE_DATASET, tx);
505 VERIFY0(dmu_object_free(mos, object, tx));
/*
 * Export the object allocation, claim, reclaim, free, iteration and
 * zapify entry points defined in this file.
 */
EXPORT_SYMBOL(dmu_object_alloc);
EXPORT_SYMBOL(dmu_object_alloc_ibs);
EXPORT_SYMBOL(dmu_object_alloc_dnsize);
EXPORT_SYMBOL(dmu_object_alloc_hold);
EXPORT_SYMBOL(dmu_object_claim);
EXPORT_SYMBOL(dmu_object_claim_dnsize);
EXPORT_SYMBOL(dmu_object_reclaim);
EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
EXPORT_SYMBOL(dmu_object_rm_spill);
EXPORT_SYMBOL(dmu_object_free);
EXPORT_SYMBOL(dmu_object_next);
EXPORT_SYMBOL(dmu_object_zapify);
EXPORT_SYMBOL(dmu_object_free_zapified);

/*
 * Register dmu_object_alloc_chunk_shift as a UINT module parameter with
 * ZMOD_RW access; the string literal is its description.
 */
/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs, , dmu_object_alloc_chunk_shift, UINT, ZMOD_RW,
	"CPU-specific allocator grabs 2^N objects at once");
/* END CSTYLED */