/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
 * Copyright 2014 HybridCluster. All rights reserved.
 * Copyright 2016 RackTop Systems.
 * Copyright (c) 2016 Actifio, Inc. All rights reserved.
 * Copyright (c) 2019, 2024, Klara, Inc.
 * Copyright (c) 2019, Allan Jude
 */
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/spa_impl.h>
#include <sys/zfs_ioctl.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_znode.h>
#include <zfs_fletcher.h>
#include <sys/zfs_onexit.h>
#include <sys/dmu_send.h>
#include <sys/dmu_recv.h>
#include <sys/dsl_destroy.h>
#include <sys/blkptr.h>
#include <sys/dsl_bookmark.h>
#include <sys/zfeature.h>
#include <sys/bqueue.h>
#include <sys/policy.h>
#include <sys/objlist.h>
#include <sys/zfs_vfsops.h>
/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
static int zfs_send_corrupt_data = B_FALSE;

/*
 * This tunable controls the amount of data (measured in bytes) that will be
 * prefetched by zfs send. If the main thread is blocking on reads that haven't
 * completed, this variable might need to be increased. If instead the main
 * thread is issuing new reads because the prefetches have fallen out of the
 * cache, this may need to be decreased.
 */
static uint_t zfs_send_queue_length = SPA_MAXBLOCKSIZE;

/*
 * This tunable controls the length of the queues that zfs send worker threads
 * use to communicate. If the send_main_thread is blocking on these queues,
 * this variable may need to be increased. If there is a significant slowdown
 * at the start of a send as these threads consume all the available IO
 * resources, this variable may need to be decreased.
 */
static uint_t zfs_send_no_prefetch_queue_length = 1024 * 1024;

/*
 * These tunables control the fill fraction of the queues by zfs send. The fill
 * fraction controls the frequency with which threads have to be cv_signaled.
 * If a lot of cpu time is being spent on cv_signal, then these should be tuned
 * down. If the queues empty before the signalled thread can catch up, then
 * these should be tuned up.
 */
static uint_t zfs_send_queue_ff = 20;
static uint_t zfs_send_no_prefetch_queue_ff = 20;

/*
 * Use this to override the recordsize calculation for fast zfs send estimates.
 */
static uint_t zfs_override_estimate_recordsize = 0;

/* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */
static const boolean_t zfs_send_set_freerecords_bit = B_TRUE;
/* Set this tunable to FALSE to disable sending unmodified spill blocks. */
static int zfs_send_unmodified_spill_blocks = B_TRUE;
static inline boolean_t
overflow_multiply(uint64_t a, uint64_t b, uint64_t *c)
{
	uint64_t temp = a * b;

	if (b != 0 && temp / b != a)
		return (B_FALSE);
	*c = temp;
	return (B_TRUE);
}
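/*
 * For example, overflow_multiply(UINT64_MAX, 2, &c) returns B_FALSE because
 * the 64-bit product wraps, while overflow_multiply(1ULL << 32, 1ULL << 31, &c)
 * stores 1ULL << 63 in c and returns B_TRUE.
 */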
struct send_thread_arg {
	bqueue_t	q;
	objset_t	*os;		/* Objset to traverse */
	uint64_t	fromtxg;	/* Traverse from this txg */
	int		flags;		/* flags to pass to traverse_dataset */
	int		error_code;
	boolean_t	cancel;
	zbookmark_phys_t resume;
	uint64_t	*num_blocks_visited;
};

struct redact_list_thread_arg {
	boolean_t		cancel;
	bqueue_t		q;
	zbookmark_phys_t	resume;
	redaction_list_t	*rl;
	boolean_t		mark_redact;
	int			error_code;
	uint64_t		*num_blocks_visited;
};
struct send_merge_thread_arg {
	bqueue_t			q;
	objset_t			*os;
	struct redact_list_thread_arg	*from_arg;
	struct send_thread_arg		*to_arg;
	struct redact_list_thread_arg	*redact_arg;
	int				error;
	boolean_t			cancel;
};

struct send_range {
	boolean_t		eos_marker; /* Marks the end of the stream */
	uint64_t		object;
	uint64_t		start_blkid;
	uint64_t		end_blkid;
	bqueue_node_t		ln;
	enum type {DATA, HOLE, OBJECT, OBJECT_RANGE, REDACT,
	    PREVIOUSLY_REDACTED} type;
	union {
		struct srd {
			dmu_object_type_t	obj_type;
			uint32_t		datablksz; // logical size
			uint32_t		datasz; // payload size
			blkptr_t		bp;
			arc_buf_t		*abuf;
			abd_t			*abd;
			kmutex_t		lock;
			kcondvar_t		cv;
			boolean_t		io_outstanding;
			boolean_t		io_compressed;
			int			io_err;
		} data;
		struct srh {
			uint32_t		datablksz;
		} hole;
		struct sro {
			/*
			 * This is a pointer because embedding it in the
			 * struct causes these structures to be massively larger
			 * for all range types; this makes the code much less
			 * memory efficient.
			 */
			dnode_phys_t		*dnp;
			blkptr_t		bp;
		} object;
		struct sror {
			blkptr_t		bp;
		} object_range;
		struct srr {
			uint32_t		datablksz;
		} redact;
	} sru;
};
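/*
 * A send_range describes one unit of work flowing through the send pipeline:
 * the traversal and redaction threads allocate them, send_merge_thread orders
 * and trims them, send_reader_thread prefetches any needed data, and the main
 * thread consumes them via do_dump() before range_free() releases them.
 */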
/*
 * The list of data whose inclusion in a send stream can be pending from
 * one call to backup_cb to another. Multiple calls to dump_free(),
 * dump_freeobjects(), and dump_redact() can be aggregated into a single
 * DRR_FREE, DRR_FREEOBJECTS, or DRR_REDACT replay record.
 */
typedef enum {
	PENDING_NONE,
	PENDING_FREE,
	PENDING_FREEOBJECTS,
	PENDING_REDACT
} dmu_pendop_t;

typedef struct dmu_send_cookie {
	dmu_replay_record_t	*dsc_drr;
	dmu_send_outparams_t	*dsc_dso;
	offset_t		*dsc_off;
	objset_t		*dsc_os;
	zio_cksum_t		dsc_zc;
	uint64_t		dsc_toguid;
	uint64_t		dsc_fromtxg;
	int			dsc_err;
	dmu_pendop_t		dsc_pending_op;
	uint64_t		dsc_featureflags;
	uint64_t		dsc_last_data_object;
	uint64_t		dsc_last_data_offset;
	uint64_t		dsc_resume_object;
	uint64_t		dsc_resume_offset;
	boolean_t		dsc_sent_begin;
	boolean_t		dsc_sent_end;
} dmu_send_cookie_t;

static int do_dump(dmu_send_cookie_t *dscp, struct send_range *range);
static void
range_free(struct send_range *range)
{
	if (range->type == OBJECT) {
		size_t size = sizeof (dnode_phys_t) *
		    (range->sru.object.dnp->dn_extra_slots + 1);
		kmem_free(range->sru.object.dnp, size);
	} else if (range->type == DATA) {
		mutex_enter(&range->sru.data.lock);
		while (range->sru.data.io_outstanding)
			cv_wait(&range->sru.data.cv, &range->sru.data.lock);
		if (range->sru.data.abd != NULL)
			abd_free(range->sru.data.abd);
		if (range->sru.data.abuf != NULL) {
			arc_buf_destroy(range->sru.data.abuf,
			    &range->sru.data.abuf);
		}
		mutex_exit(&range->sru.data.lock);

		cv_destroy(&range->sru.data.cv);
		mutex_destroy(&range->sru.data.lock);
	}
	kmem_free(range, sizeof (*range));
}
/*
 * For all record types except BEGIN, fill in the checksum (overlaid in
 * drr_u.drr_checksum.drr_checksum). The checksum verifies everything
 * up to the start of the checksum itself.
 */
static int
dump_record(dmu_send_cookie_t *dscp, void *payload, int payload_len)
{
	dmu_send_outparams_t *dso = dscp->dsc_dso;
	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
	(void) fletcher_4_incremental_native(dscp->dsc_drr,
	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
	    &dscp->dsc_zc);
	if (dscp->dsc_drr->drr_type == DRR_BEGIN) {
		dscp->dsc_sent_begin = B_TRUE;
	} else {
		ASSERT(ZIO_CHECKSUM_IS_ZERO(&dscp->dsc_drr->drr_u.
		    drr_checksum.drr_checksum));
		dscp->dsc_drr->drr_u.drr_checksum.drr_checksum = dscp->dsc_zc;
	}
	if (dscp->dsc_drr->drr_type == DRR_END) {
		dscp->dsc_sent_end = B_TRUE;
	}
	(void) fletcher_4_incremental_native(&dscp->dsc_drr->
	    drr_u.drr_checksum.drr_checksum,
	    sizeof (zio_cksum_t), &dscp->dsc_zc);
	*dscp->dsc_off += sizeof (dmu_replay_record_t);
	dscp->dsc_err = dso->dso_outfunc(dscp->dsc_os, dscp->dsc_drr,
	    sizeof (dmu_replay_record_t), dso->dso_arg);
	if (dscp->dsc_err != 0)
		return (SET_ERROR(EINTR));
	if (payload_len != 0) {
		*dscp->dsc_off += payload_len;
		/*
		 * payload is null when dso_dryrun == B_TRUE (i.e. when we're
		 * doing a send size calculation)
		 */
		if (payload != NULL) {
			(void) fletcher_4_incremental_native(
			    payload, payload_len, &dscp->dsc_zc);
		}

		/*
		 * The code does not rely on this (len being a multiple of 8).
		 * We keep this assertion because of the corresponding assertion
		 * in receive_read(). Keeping this assertion ensures that we do
		 * not inadvertently break backwards compatibility (causing the
		 * assertion in receive_read() to trigger on old software).
		 *
		 * Raw sends cannot be received on old software, and so can
		 * bypass this assertion.
		 */
		ASSERT((payload_len % 8 == 0) ||
		    (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW));

		dscp->dsc_err = dso->dso_outfunc(dscp->dsc_os, payload,
		    payload_len, dso->dso_arg);
		if (dscp->dsc_err != 0)
			return (SET_ERROR(EINTR));
	}
	return (0);
}
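/*
 * Each replay record is emitted as a fixed-size dmu_replay_record_t,
 * optionally followed by a payload; the fletcher-4 checksum accumulated in
 * dsc_zc covers the stream contents up to (but not including) each record's
 * embedded checksum field.
 */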
/*
 * Fill in the drr_free struct, or perform aggregation if the previous record is
 * also a free record, and the two are adjacent.
 *
 * Note that we send free records even for a full send, because we want to be
 * able to receive a full send as a clone, which requires a list of all the free
 * and freeobject records that were generated on the source.
 */
static int
dump_free(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset,
    uint64_t length)
{
	struct drr_free *drrf = &(dscp->dsc_drr->drr_u.drr_free);

	/*
	 * When we receive a free record, dbuf_free_range() assumes
	 * that the receiving system doesn't have any dbufs in the range
	 * being freed. This is always true because there is a one-record
	 * constraint: we only send one WRITE record for any given
	 * object,offset. We know that the one-record constraint is
	 * true because we always send data in increasing order by
	 * object,offset.
	 *
	 * If the increasing-order constraint ever changes, we should find
	 * another way to assert that the one-record constraint is still
	 * met.
	 */
	ASSERT(object > dscp->dsc_last_data_object ||
	    (object == dscp->dsc_last_data_object &&
	    offset > dscp->dsc_last_data_offset));

	/*
	 * If there is a pending op, but it's not PENDING_FREE, push it out,
	 * since free block aggregation can only be done for blocks of the
	 * same type (i.e., DRR_FREE records can only be aggregated with
	 * other DRR_FREE records. DRR_FREEOBJECTS records can only be
	 * aggregated with other DRR_FREEOBJECTS records).
	 */
	if (dscp->dsc_pending_op != PENDING_NONE &&
	    dscp->dsc_pending_op != PENDING_FREE) {
		if (dump_record(dscp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dscp->dsc_pending_op = PENDING_NONE;
	}

	if (dscp->dsc_pending_op == PENDING_FREE) {
		/*
		 * Check to see whether this free block can be aggregated
		 * with the pending one.
		 */
		if (drrf->drr_object == object && drrf->drr_offset +
		    drrf->drr_length == offset) {
			if (offset + length < offset || length == UINT64_MAX)
				drrf->drr_length = UINT64_MAX;
			else
				drrf->drr_length += length;
			return (0);
		} else {
			/* not a continuation. Push out pending record */
			if (dump_record(dscp, NULL, 0) != 0)
				return (SET_ERROR(EINTR));
			dscp->dsc_pending_op = PENDING_NONE;
		}
	}
	/* create a FREE record and make it pending */
	memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t));
	dscp->dsc_drr->drr_type = DRR_FREE;
	drrf->drr_object = object;
	drrf->drr_offset = offset;
	if (offset + length < offset)
		drrf->drr_length = DMU_OBJECT_END;
	else
		drrf->drr_length = length;
	drrf->drr_toguid = dscp->dsc_toguid;
	if (length == DMU_OBJECT_END) {
		if (dump_record(dscp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
	} else {
		dscp->dsc_pending_op = PENDING_FREE;
	}

	return (0);
}
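/*
 * For example, dump_free(dscp, 5, 0, 8192) followed immediately by
 * dump_free(dscp, 5, 8192, 8192) leaves a single pending DRR_FREE record
 * covering bytes 0-16383 of object 5; the record is only pushed to the
 * stream once a non-adjacent or differently-typed operation (or the end of
 * the stream) forces it out.
 */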
/*
 * Fill in the drr_redact struct, or perform aggregation if the previous record
 * is also a redaction record, and the two are adjacent.
 */
static int
dump_redact(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset,
    uint64_t length)
{
	struct drr_redact *drrr = &dscp->dsc_drr->drr_u.drr_redact;

	/*
	 * If there is a pending op, but it's not PENDING_REDACT, push it out,
	 * since free block aggregation can only be done for blocks of the
	 * same type (i.e., DRR_REDACT records can only be aggregated with
	 * other DRR_REDACT records).
	 */
	if (dscp->dsc_pending_op != PENDING_NONE &&
	    dscp->dsc_pending_op != PENDING_REDACT) {
		if (dump_record(dscp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dscp->dsc_pending_op = PENDING_NONE;
	}

	if (dscp->dsc_pending_op == PENDING_REDACT) {
		/*
		 * Check to see whether this redacted block can be aggregated
		 * with the pending one.
		 */
		if (drrr->drr_object == object && drrr->drr_offset +
		    drrr->drr_length == offset) {
			drrr->drr_length += length;
			return (0);
		} else {
			/* not a continuation. Push out pending record */
			if (dump_record(dscp, NULL, 0) != 0)
				return (SET_ERROR(EINTR));
			dscp->dsc_pending_op = PENDING_NONE;
		}
	}
	/* create a REDACT record and make it pending */
	memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t));
	dscp->dsc_drr->drr_type = DRR_REDACT;
	drrr->drr_object = object;
	drrr->drr_offset = offset;
	drrr->drr_length = length;
	drrr->drr_toguid = dscp->dsc_toguid;
	dscp->dsc_pending_op = PENDING_REDACT;

	return (0);
}
static int
dmu_dump_write(dmu_send_cookie_t *dscp, dmu_object_type_t type, uint64_t object,
    uint64_t offset, int lsize, int psize, const blkptr_t *bp,
    boolean_t io_compressed, void *data)
{
	uint64_t payload_size;
	boolean_t raw = (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW);
	struct drr_write *drrw = &(dscp->dsc_drr->drr_u.drr_write);

	/*
	 * We send data in increasing object, offset order.
	 * See comment in dump_free() for details.
	 */
	ASSERT(object > dscp->dsc_last_data_object ||
	    (object == dscp->dsc_last_data_object &&
	    offset > dscp->dsc_last_data_offset));
	dscp->dsc_last_data_object = object;
	dscp->dsc_last_data_offset = offset + lsize - 1;

	/*
	 * If there is any kind of pending aggregation (currently either
	 * a grouping of free objects or free blocks), push it out to
	 * the stream, since aggregation can't be done across operations
	 * of different types.
	 */
	if (dscp->dsc_pending_op != PENDING_NONE) {
		if (dump_record(dscp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dscp->dsc_pending_op = PENDING_NONE;
	}
	/* write a WRITE record */
	memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t));
	dscp->dsc_drr->drr_type = DRR_WRITE;
	drrw->drr_object = object;
	drrw->drr_type = type;
	drrw->drr_offset = offset;
	drrw->drr_toguid = dscp->dsc_toguid;
	drrw->drr_logical_size = lsize;

	/* only set the compression fields if the buf is compressed or raw */
	boolean_t compressed =
	    (bp != NULL ? BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
	    io_compressed : lsize != psize);
	if (raw || compressed) {
		ASSERT(raw || dscp->dsc_featureflags &
		    DMU_BACKUP_FEATURE_COMPRESSED);
		ASSERT(!BP_IS_EMBEDDED(bp));
		ASSERT3S(psize, >, 0);

		if (raw) {
			ASSERT(BP_IS_PROTECTED(bp));

			/*
			 * This is a raw protected block so we need to pass
			 * along everything the receiving side will need to
			 * interpret this block, including the byteswap, salt,
			 * IV, and MAC.
			 */
			if (BP_SHOULD_BYTESWAP(bp))
				drrw->drr_flags |= DRR_RAW_BYTESWAP;
			zio_crypt_decode_params_bp(bp, drrw->drr_salt,
			    drrw->drr_iv);
			zio_crypt_decode_mac_bp(bp, drrw->drr_mac);
		} else {
			/* this is a compressed block */
			ASSERT(dscp->dsc_featureflags &
			    DMU_BACKUP_FEATURE_COMPRESSED);
			ASSERT(!BP_SHOULD_BYTESWAP(bp));
			ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)));
			ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF);
			ASSERT3S(lsize, >=, psize);
		}

		/* set fields common to compressed and raw sends */
		drrw->drr_compressiontype = BP_GET_COMPRESS(bp);
		drrw->drr_compressed_size = psize;
		payload_size = drrw->drr_compressed_size;
	} else {
		payload_size = drrw->drr_logical_size;
	}

	if (bp == NULL || BP_IS_EMBEDDED(bp) || (BP_IS_PROTECTED(bp) && !raw)) {
		/*
		 * There's no pre-computed checksum for partial-block writes,
		 * embedded BP's, or encrypted BP's that are being sent as
		 * plaintext, so (like fletcher4-checksummed blocks) userland
		 * will have to compute a dedup-capable checksum itself.
		 */
		drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
	} else {
		drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
		if (zio_checksum_table[drrw->drr_checksumtype].ci_flags &
		    ZCHECKSUM_FLAG_DEDUP)
			drrw->drr_flags |= DRR_CHECKSUM_DEDUP;
		DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
		DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
		DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
		DDK_SET_CRYPT(&drrw->drr_key, BP_IS_PROTECTED(bp));
		drrw->drr_key.ddk_cksum = bp->blk_cksum;
	}

	if (dump_record(dscp, data, payload_size) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}
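/*
 * Note that the payload accompanying a DRR_WRITE record is psize bytes for
 * raw and compressed sends and lsize bytes otherwise, matching the
 * payload_size chosen above.
 */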
static int
dump_write_embedded(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset,
    int blksz, const blkptr_t *bp)
{
	char buf[BPE_PAYLOAD_SIZE];
	struct drr_write_embedded *drrw =
	    &(dscp->dsc_drr->drr_u.drr_write_embedded);

	if (dscp->dsc_pending_op != PENDING_NONE) {
		if (dump_record(dscp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dscp->dsc_pending_op = PENDING_NONE;
	}

	ASSERT(BP_IS_EMBEDDED(bp));

	memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t));
	dscp->dsc_drr->drr_type = DRR_WRITE_EMBEDDED;
	drrw->drr_object = object;
	drrw->drr_offset = offset;
	drrw->drr_length = blksz;
	drrw->drr_toguid = dscp->dsc_toguid;
	drrw->drr_compression = BP_GET_COMPRESS(bp);
	drrw->drr_etype = BPE_GET_ETYPE(bp);
	drrw->drr_lsize = BPE_GET_LSIZE(bp);
	drrw->drr_psize = BPE_GET_PSIZE(bp);

	decode_embedded_bp_compressed(bp, buf);

	uint32_t psize = drrw->drr_psize;
	uint32_t rsize = P2ROUNDUP(psize, 8);

	if (psize != rsize)
		memset(buf + psize, 0, rsize - psize);

	if (dump_record(dscp, buf, rsize) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}
static int
dump_spill(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object,
    void *data)
{
	struct drr_spill *drrs = &(dscp->dsc_drr->drr_u.drr_spill);
	uint64_t blksz = BP_GET_LSIZE(bp);
	uint64_t payload_size = blksz;

	if (dscp->dsc_pending_op != PENDING_NONE) {
		if (dump_record(dscp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dscp->dsc_pending_op = PENDING_NONE;
	}

	/* write a SPILL record */
	memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t));
	dscp->dsc_drr->drr_type = DRR_SPILL;
	drrs->drr_object = object;
	drrs->drr_length = blksz;
	drrs->drr_toguid = dscp->dsc_toguid;

	/* See comment in dump_dnode() for full details */
	if (zfs_send_unmodified_spill_blocks &&
	    (BP_GET_LOGICAL_BIRTH(bp) <= dscp->dsc_fromtxg)) {
		drrs->drr_flags |= DRR_SPILL_UNMODIFIED;
	}

	/* handle raw send fields */
	if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) {
		ASSERT(BP_IS_PROTECTED(bp));

		if (BP_SHOULD_BYTESWAP(bp))
			drrs->drr_flags |= DRR_RAW_BYTESWAP;
		drrs->drr_compressiontype = BP_GET_COMPRESS(bp);
		drrs->drr_compressed_size = BP_GET_PSIZE(bp);
		zio_crypt_decode_params_bp(bp, drrs->drr_salt, drrs->drr_iv);
		zio_crypt_decode_mac_bp(bp, drrs->drr_mac);
		payload_size = drrs->drr_compressed_size;
	}

	if (dump_record(dscp, data, payload_size) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}
static int
dump_freeobjects(dmu_send_cookie_t *dscp, uint64_t firstobj, uint64_t numobjs)
{
	struct drr_freeobjects *drrfo = &(dscp->dsc_drr->drr_u.drr_freeobjects);
	uint64_t maxobj = DNODES_PER_BLOCK *
	    (DMU_META_DNODE(dscp->dsc_os)->dn_maxblkid + 1);

	/*
	 * ZoL < 0.7 does not handle large FREEOBJECTS records correctly,
	 * leading to zfs recv never completing. To avoid this issue, don't
	 * send FREEOBJECTS records for object IDs which cannot exist on the
	 * receiving side.
	 */
	if (maxobj <= firstobj)
		return (0);

	if (maxobj < firstobj + numobjs)
		numobjs = maxobj - firstobj;

	/*
	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
	 * push it out, since free block aggregation can only be done for
	 * blocks of the same type (i.e., DRR_FREE records can only be
	 * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records
	 * can only be aggregated with other DRR_FREEOBJECTS records).
	 */
	if (dscp->dsc_pending_op != PENDING_NONE &&
	    dscp->dsc_pending_op != PENDING_FREEOBJECTS) {
		if (dump_record(dscp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dscp->dsc_pending_op = PENDING_NONE;
	}

	if (dscp->dsc_pending_op == PENDING_FREEOBJECTS) {
		/*
		 * See whether this free object array can be aggregated
		 * with the pending one.
		 */
		if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
			drrfo->drr_numobjs += numobjs;
			return (0);
		} else {
			/* can't be aggregated. Push out pending record */
			if (dump_record(dscp, NULL, 0) != 0)
				return (SET_ERROR(EINTR));
			dscp->dsc_pending_op = PENDING_NONE;
		}
	}

	/* write a FREEOBJECTS record */
	memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t));
	dscp->dsc_drr->drr_type = DRR_FREEOBJECTS;
	drrfo->drr_firstobj = firstobj;
	drrfo->drr_numobjs = numobjs;
	drrfo->drr_toguid = dscp->dsc_toguid;

	dscp->dsc_pending_op = PENDING_FREEOBJECTS;

	return (0);
}
static int
dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object,
    dnode_phys_t *dnp)
{
	struct drr_object *drro = &(dscp->dsc_drr->drr_u.drr_object);
	int bonuslen;

	if (object < dscp->dsc_resume_object) {
		/*
		 * Note: when resuming, we will visit all the dnodes in
		 * the block of dnodes that we are resuming from. In
		 * this case it's unnecessary to send the dnodes prior to
		 * the one we are resuming from. We should be at most one
		 * block's worth of dnodes behind the resume point.
		 */
		ASSERT3U(dscp->dsc_resume_object - object, <,
		    1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT));
		return (0);
	}

	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
		return (dump_freeobjects(dscp, object, 1));

	if (dscp->dsc_pending_op != PENDING_NONE) {
		if (dump_record(dscp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dscp->dsc_pending_op = PENDING_NONE;
	}

	/* write an OBJECT record */
	memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t));
	dscp->dsc_drr->drr_type = DRR_OBJECT;
	drro->drr_object = object;
	drro->drr_type = dnp->dn_type;
	drro->drr_bonustype = dnp->dn_bonustype;
	drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	drro->drr_bonuslen = dnp->dn_bonuslen;
	drro->drr_dn_slots = dnp->dn_extra_slots + 1;
	drro->drr_checksumtype = dnp->dn_checksum;
	drro->drr_compress = dnp->dn_compress;
	drro->drr_toguid = dscp->dsc_toguid;

	if (!(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
	    drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
		drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;

	bonuslen = P2ROUNDUP(dnp->dn_bonuslen, 8);

	if ((dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW)) {
		ASSERT(BP_IS_ENCRYPTED(bp));

		if (BP_SHOULD_BYTESWAP(bp))
			drro->drr_flags |= DRR_RAW_BYTESWAP;

		/* needed for reconstructing dnp on recv side */
		drro->drr_maxblkid = dnp->dn_maxblkid;
		drro->drr_indblkshift = dnp->dn_indblkshift;
		drro->drr_nlevels = dnp->dn_nlevels;
		drro->drr_nblkptr = dnp->dn_nblkptr;

		/*
		 * Since we encrypt the entire bonus area, the (raw) part
		 * beyond the bonuslen is actually nonzero, so we need
		 * to send it.
		 */
		if (drro->drr_bonuslen > DN_MAX_BONUS_LEN(dnp))
			return (SET_ERROR(EINVAL));
		drro->drr_raw_bonuslen = DN_MAX_BONUS_LEN(dnp);
		bonuslen = drro->drr_raw_bonuslen;
	}

	/*
	 * DRR_OBJECT_SPILL is set for every dnode which references a
	 * spill block. This allows the receiving pool to definitively
	 * determine when a spill block should be kept or freed.
	 */
	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
		drro->drr_flags |= DRR_OBJECT_SPILL;

	if (dump_record(dscp, DN_BONUS(dnp), bonuslen) != 0)
		return (SET_ERROR(EINTR));

	/* Free anything past the end of the file. */
	if (dump_free(dscp, object, (dnp->dn_maxblkid + 1) *
	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0)
		return (SET_ERROR(EINTR));

	/*
	 * Send DRR_SPILL records for unmodified spill blocks. This is useful
	 * because changing certain attributes of the object (e.g. blocksize)
	 * can cause old versions of ZFS to incorrectly remove a spill block.
	 * Including these records in the stream forces an up to date version
	 * to always be written ensuring they're never lost. Current versions
	 * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can
	 * ignore these unmodified spill blocks.
	 */
	if (zfs_send_unmodified_spill_blocks &&
	    (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) &&
	    (BP_GET_LOGICAL_BIRTH(DN_SPILL_BLKPTR(dnp)) <= dscp->dsc_fromtxg)) {
		struct send_range record;
		blkptr_t *bp = DN_SPILL_BLKPTR(dnp);

		memset(&record, 0, sizeof (struct send_range));
		record.type = DATA;
		record.object = object;
		record.eos_marker = B_FALSE;
		record.start_blkid = DMU_SPILL_BLKID;
		record.end_blkid = record.start_blkid + 1;
		record.sru.data.bp = *bp;
		record.sru.data.obj_type = dnp->dn_type;
		record.sru.data.datablksz = BP_GET_LSIZE(bp);

		if (do_dump(dscp, &record) != 0)
			return (SET_ERROR(EINTR));
	}

	if (dscp->dsc_err != 0)
		return (SET_ERROR(EINTR));

	return (0);
}
static int
dump_object_range(dmu_send_cookie_t *dscp, const blkptr_t *bp,
    uint64_t firstobj, uint64_t numslots)
{
	struct drr_object_range *drror =
	    &(dscp->dsc_drr->drr_u.drr_object_range);

	/* we only use this record type for raw sends */
	ASSERT(BP_IS_PROTECTED(bp));
	ASSERT(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW);
	ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
	ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_DNODE);
	ASSERT0(BP_GET_LEVEL(bp));

	if (dscp->dsc_pending_op != PENDING_NONE) {
		if (dump_record(dscp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dscp->dsc_pending_op = PENDING_NONE;
	}

	memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t));
	dscp->dsc_drr->drr_type = DRR_OBJECT_RANGE;
	drror->drr_firstobj = firstobj;
	drror->drr_numslots = numslots;
	drror->drr_toguid = dscp->dsc_toguid;
	if (BP_SHOULD_BYTESWAP(bp))
		drror->drr_flags |= DRR_RAW_BYTESWAP;
	zio_crypt_decode_params_bp(bp, drror->drr_salt, drror->drr_iv);
	zio_crypt_decode_mac_bp(bp, drror->drr_mac);

	if (dump_record(dscp, NULL, 0) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}
static boolean_t
send_do_embed(const blkptr_t *bp, uint64_t featureflags)
{
	if (!BP_IS_EMBEDDED(bp))
		return (B_FALSE);

	/*
	 * Compression function must be legacy, or explicitly enabled.
	 */
	if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS &&
	    !(featureflags & DMU_BACKUP_FEATURE_LZ4)))
		return (B_FALSE);

	/*
	 * If we have not set the ZSTD feature flag, we can't send ZSTD
	 * compressed embedded blocks, as the receiver may not support them.
	 */
	if ((BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD &&
	    !(featureflags & DMU_BACKUP_FEATURE_ZSTD)))
		return (B_FALSE);

	/*
	 * Embed type must be explicitly enabled.
	 */
	switch (BPE_GET_ETYPE(bp)) {
	case BP_EMBEDDED_TYPE_DATA:
		if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
			return (B_TRUE);
		break;
	default:
		return (B_FALSE);
	}
	return (B_FALSE);
}
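/*
 * In short, a block is sent as a DRR_WRITE_EMBEDDED record only when it is
 * actually embedded, its compression algorithm is one the receiver has agreed
 * to handle (legacy, LZ4, or ZSTD as negotiated via the feature flags), and
 * the embedded-data feature itself was enabled for the stream.
 */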
/*
 * This function actually handles figuring out what kind of record needs to be
 * dumped, and calling the appropriate helper function. In most cases,
 * the data has already been read by send_reader_thread().
 */
static int
do_dump(dmu_send_cookie_t *dscp, struct send_range *range)
{
	int err = 0;
	switch (range->type) {
	case OBJECT:
		err = dump_dnode(dscp, &range->sru.object.bp, range->object,
		    range->sru.object.dnp);
		return (err);
	case OBJECT_RANGE: {
		ASSERT3U(range->start_blkid + 1, ==, range->end_blkid);
		if (!(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW)) {
			return (0);
		}
		uint64_t epb = BP_GET_LSIZE(&range->sru.object_range.bp) >>
		    DNODE_SHIFT;
		uint64_t firstobj = range->start_blkid * epb;
		err = dump_object_range(dscp, &range->sru.object_range.bp,
		    firstobj, epb);
		return (err);
	}
	case REDACT: {
		struct srr *srrp = &range->sru.redact;
		err = dump_redact(dscp, range->object, range->start_blkid *
		    srrp->datablksz, (range->end_blkid - range->start_blkid) *
		    srrp->datablksz);
		return (err);
	}
	case DATA: {
		struct srd *srdp = &range->sru.data;
		blkptr_t *bp = &srdp->bp;
		spa_t *spa =
		    dmu_objset_spa(dscp->dsc_os);

		ASSERT3U(srdp->datablksz, ==, BP_GET_LSIZE(bp));
		ASSERT3U(range->start_blkid + 1, ==, range->end_blkid);
		if (BP_GET_TYPE(bp) == DMU_OT_SA) {
			arc_flags_t aflags = ARC_FLAG_WAIT;
			zio_flag_t zioflags = ZIO_FLAG_CANFAIL;

			if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) {
				ASSERT(BP_IS_PROTECTED(bp));
				zioflags |= ZIO_FLAG_RAW;
			}

			zbookmark_phys_t zb;
			ASSERT3U(range->start_blkid, ==, DMU_SPILL_BLKID);
			zb.zb_objset = dmu_objset_id(dscp->dsc_os);
			zb.zb_object = range->object;
			zb.zb_level = 0;
			zb.zb_blkid = range->start_blkid;

			arc_buf_t *abuf = NULL;
			if (!dscp->dsc_dso->dso_dryrun && arc_read(NULL, spa,
			    bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
			    zioflags, &aflags, &zb) != 0)
				return (SET_ERROR(EIO));

			err = dump_spill(dscp, bp, zb.zb_object,
			    (abuf == NULL ? NULL : abuf->b_data));
			if (abuf != NULL)
				arc_buf_destroy(abuf, &abuf);
			return (err);
		}
		if (send_do_embed(bp, dscp->dsc_featureflags)) {
			err = dump_write_embedded(dscp, range->object,
			    range->start_blkid * srdp->datablksz,
			    srdp->datablksz, bp);
			return (err);
		}
		ASSERT(range->object > dscp->dsc_resume_object ||
		    (range->object == dscp->dsc_resume_object &&
		    range->start_blkid * srdp->datablksz >=
		    dscp->dsc_resume_offset));
		/* it's a level-0 block of a regular object */

		mutex_enter(&srdp->lock);
		while (srdp->io_outstanding)
			cv_wait(&srdp->cv, &srdp->lock);
		err = srdp->io_err;
		mutex_exit(&srdp->lock);

		if (err != 0) {
			if (zfs_send_corrupt_data &&
			    !dscp->dsc_dso->dso_dryrun) {
				/*
				 * Send a block filled with 0x"zfs badd bloc"
				 */
				srdp->abuf = arc_alloc_buf(spa, &srdp->abuf,
				    ARC_BUFC_DATA, srdp->datablksz);
				uint64_t *ptr;
				for (ptr = srdp->abuf->b_data;
				    (char *)ptr < (char *)srdp->abuf->b_data +
				    srdp->datablksz; ptr++)
					*ptr = 0x2f5baddb10cULL;
			} else {
				return (SET_ERROR(EIO));
			}
		}

		ASSERT(dscp->dsc_dso->dso_dryrun ||
		    srdp->abuf != NULL || srdp->abd != NULL);

		uint64_t offset = range->start_blkid * srdp->datablksz;

		char *data = NULL;
		if (srdp->abd != NULL) {
			data = abd_to_buf(srdp->abd);
			ASSERT3P(srdp->abuf, ==, NULL);
		} else if (srdp->abuf != NULL) {
			data = srdp->abuf->b_data;
		}

		/*
		 * If we have large blocks stored on disk but the send flags
		 * don't allow us to send large blocks, we split the data from
		 * the arc buf into chunks.
		 */
		if (srdp->datablksz > SPA_OLD_MAXBLOCKSIZE &&
		    !(dscp->dsc_featureflags &
		    DMU_BACKUP_FEATURE_LARGE_BLOCKS)) {
			while (srdp->datablksz > 0 && err == 0) {
				int n = MIN(srdp->datablksz,
				    SPA_OLD_MAXBLOCKSIZE);
				err = dmu_dump_write(dscp, srdp->obj_type,
				    range->object, offset, n, n, NULL, B_FALSE,
				    data);
				offset += n;
				/*
				 * When doing dry run, data==NULL is used as a
				 * sentinel value by
				 * dmu_dump_write()->dump_record().
				 */
				if (data != NULL)
					data += n;
				srdp->datablksz -= n;
			}
		} else {
			err = dmu_dump_write(dscp, srdp->obj_type,
			    range->object, offset,
			    srdp->datablksz, srdp->datasz, bp,
			    srdp->io_compressed, data);
		}
		return (err);
	}
	case HOLE: {
		struct srh *srhp = &range->sru.hole;
		if (range->object == DMU_META_DNODE_OBJECT) {
			uint32_t span = srhp->datablksz >> DNODE_SHIFT;
			uint64_t first_obj = range->start_blkid * span;
			uint64_t numobj = range->end_blkid * span - first_obj;
			return (dump_freeobjects(dscp, first_obj, numobj));
		}
		uint64_t offset = 0;

		/*
		 * If this multiply overflows, we don't need to send this block.
		 * Even if it has a birth time, it can never not be a hole, so
		 * we don't need to send records for it.
		 */
		if (!overflow_multiply(range->start_blkid, srhp->datablksz,
		    &offset))
			return (0);
		uint64_t len = 0;

		if (!overflow_multiply(range->end_blkid, srhp->datablksz, &len))
			len = UINT64_MAX;
		len = len - offset;
		return (dump_free(dscp, range->object, offset, len));
	}
	default:
		panic("Invalid range type in do_dump: %d", range->type);
	}
	return (err);
}
static struct send_range *
range_alloc(enum type type, uint64_t object, uint64_t start_blkid,
    uint64_t end_blkid, boolean_t eos)
{
	struct send_range *range = kmem_alloc(sizeof (*range), KM_SLEEP);
	range->type = type;
	range->object = object;
	range->start_blkid = start_blkid;
	range->end_blkid = end_blkid;
	range->eos_marker = eos;
	if (type == DATA) {
		range->sru.data.abd = NULL;
		range->sru.data.abuf = NULL;
		mutex_init(&range->sru.data.lock, NULL, MUTEX_DEFAULT, NULL);
		cv_init(&range->sru.data.cv, NULL, CV_DEFAULT, NULL);
		range->sru.data.io_outstanding = 0;
		range->sru.data.io_err = 0;
		range->sru.data.io_compressed = B_FALSE;
	}
	return (range);
}
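/*
 * Note that only DATA ranges carry a mutex, condition variable, and I/O
 * buffers; range_free() waits for any outstanding read on such a range
 * before tearing these down.
 */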
/*
 * This is the callback function to traverse_dataset that acts as a worker
 * thread for dmu_send_impl.
 */
static int
send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
{
	(void) zilog;
	struct send_thread_arg *sta = arg;
	struct send_range *record;

	ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
	    zb->zb_object >= sta->resume.zb_object);

	/*
	 * All bps of an encrypted os should have the encryption bit set.
	 * If this is not true it indicates tampering and we report an error.
	 */
	if (sta->os->os_encrypted &&
	    !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) {
		spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp));
		return (SET_ERROR(EIO));
	}

	if (sta->cancel)
		return (SET_ERROR(EINTR));
	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
	    DMU_OBJECT_IS_SPECIAL(zb->zb_object))
		return (0);
	atomic_inc_64(sta->num_blocks_visited);

	if (zb->zb_level == ZB_DNODE_LEVEL) {
		if (zb->zb_object == DMU_META_DNODE_OBJECT)
			return (0);
		record = range_alloc(OBJECT, zb->zb_object, 0, 0, B_FALSE);
		record->sru.object.bp = *bp;
		size_t size = sizeof (*dnp) * (dnp->dn_extra_slots + 1);
		record->sru.object.dnp = kmem_alloc(size, KM_SLEEP);
		memcpy(record->sru.object.dnp, dnp, size);
		bqueue_enqueue(&sta->q, record, sizeof (*record));
		return (0);
	}
	if (zb->zb_level == 0 && zb->zb_object == DMU_META_DNODE_OBJECT &&
	    !BP_IS_HOLE(bp)) {
		record = range_alloc(OBJECT_RANGE, 0, zb->zb_blkid,
		    zb->zb_blkid + 1, B_FALSE);
		record->sru.object_range.bp = *bp;
		bqueue_enqueue(&sta->q, record, sizeof (*record));
		return (0);
	}
	if (zb->zb_level < 0 || (zb->zb_level > 0 && !BP_IS_HOLE(bp)))
		return (0);
	if (zb->zb_object == DMU_META_DNODE_OBJECT && !BP_IS_HOLE(bp))
		return (0);

	uint64_t span = bp_span_in_blocks(dnp->dn_indblkshift, zb->zb_level);
	uint64_t start;

	/*
	 * If this multiply overflows, we don't need to send this block.
	 * Even if it has a birth time, it can never not be a hole, so
	 * we don't need to send records for it.
	 */
	if (!overflow_multiply(span, zb->zb_blkid, &start) || (!(zb->zb_blkid ==
	    DMU_SPILL_BLKID || DMU_OT_IS_METADATA(dnp->dn_type)) &&
	    span * zb->zb_blkid > dnp->dn_maxblkid)) {
		ASSERT(BP_IS_HOLE(bp));
		return (0);
	}

	if (zb->zb_blkid == DMU_SPILL_BLKID)
		ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_SA);

	enum type record_type = DATA;
	if (BP_IS_HOLE(bp))
		record_type = HOLE;
	else if (BP_IS_REDACTED(bp))
		record_type = REDACT;
	else
		record_type = DATA;

	record = range_alloc(record_type, zb->zb_object, start,
	    (start + span < start ? 0 : start + span), B_FALSE);

	uint64_t datablksz = (zb->zb_blkid == DMU_SPILL_BLKID ?
	    BP_GET_LSIZE(bp) : dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);

	if (BP_IS_HOLE(bp)) {
		record->sru.hole.datablksz = datablksz;
	} else if (BP_IS_REDACTED(bp)) {
		record->sru.redact.datablksz = datablksz;
	} else {
		record->sru.data.datablksz = datablksz;
		record->sru.data.obj_type = dnp->dn_type;
		record->sru.data.bp = *bp;
	}

	bqueue_enqueue(&sta->q, record, sizeof (*record));
	return (0);
}
struct redact_list_cb_arg {
	uint64_t *num_blocks_visited;
	bqueue_t *q;
	boolean_t *cancel;
	boolean_t mark_redact;
};

static int
redact_list_cb(redact_block_phys_t *rb, void *arg)
{
	struct redact_list_cb_arg *rlcap = arg;

	atomic_inc_64(rlcap->num_blocks_visited);
	if (*rlcap->cancel)
		return (-1);

	struct send_range *data = range_alloc(REDACT, rb->rbp_object,
	    rb->rbp_blkid, rb->rbp_blkid + redact_block_get_count(rb), B_FALSE);
	ASSERT3U(data->end_blkid, >, rb->rbp_blkid);
	if (rlcap->mark_redact) {
		data->type = REDACT;
		data->sru.redact.datablksz = redact_block_get_size(rb);
	} else {
		data->type = PREVIOUSLY_REDACTED;
	}
	bqueue_enqueue(rlcap->q, data, sizeof (*data));

	return (0);
}
/*
 * This function kicks off the traverse_dataset. It also handles setting the
 * error code of the thread in case something goes wrong, and pushes the End of
 * Stream record when the traverse_dataset call has finished.
 */
static __attribute__((noreturn)) void
send_traverse_thread(void *arg)
{
	struct send_thread_arg *st_arg = arg;
	int err = 0;
	struct send_range *data;
	fstrans_cookie_t cookie = spl_fstrans_mark();

	err = traverse_dataset_resume(st_arg->os->os_dsl_dataset,
	    st_arg->fromtxg, &st_arg->resume,
	    st_arg->flags, send_cb, st_arg);

	if (err != EINTR)
		st_arg->error_code = err;
	data = range_alloc(DATA, 0, 0, 0, B_TRUE);
	bqueue_enqueue_flush(&st_arg->q, data, sizeof (*data));
	spl_fstrans_unmark(cookie);
	thread_exit();
}
/*
 * Utility function that causes End of Stream records to compare after all
 * others, so that other threads' comparison logic can stay simple.
 */
static int __attribute__((unused))
send_range_after(const struct send_range *from, const struct send_range *to)
{
	if (from->eos_marker == B_TRUE)
		return (1);
	if (to->eos_marker == B_TRUE)
		return (-1);

	uint64_t from_obj = from->object;
	uint64_t from_end_obj = from->object + 1;
	uint64_t to_obj = to->object;
	uint64_t to_end_obj = to->object + 1;
	if (from_obj == 0) {
		ASSERT(from->type == HOLE || from->type == OBJECT_RANGE);
		from_obj = from->start_blkid << DNODES_PER_BLOCK_SHIFT;
		from_end_obj = from->end_blkid << DNODES_PER_BLOCK_SHIFT;
	}
	if (to_obj == 0) {
		ASSERT(to->type == HOLE || to->type == OBJECT_RANGE);
		to_obj = to->start_blkid << DNODES_PER_BLOCK_SHIFT;
		to_end_obj = to->end_blkid << DNODES_PER_BLOCK_SHIFT;
	}

	if (from_end_obj <= to_obj)
		return (-1);
	if (from_obj >= to_end_obj)
		return (1);
	int64_t cmp = TREE_CMP(to->type == OBJECT_RANGE, from->type ==
	    OBJECT_RANGE);
	if (cmp != 0)
		return (cmp);
	cmp = TREE_CMP(to->type == OBJECT, from->type == OBJECT);
	if (cmp != 0)
		return (cmp);
	if (from->end_blkid <= to->start_blkid)
		return (-1);
	if (from->start_blkid >= to->end_blkid)
		return (1);
	return (0);
}
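/*
 * send_range_after() returns -1 when "from" sorts entirely before "to",
 * 1 when it sorts entirely after, and 0 when the two ranges overlap;
 * end-of-stream markers always compare after everything else.
 */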
/*
 * Pop the new data off the queue, check that the records we receive are in
 * the right order, but do not free the old data. This is used so that the
 * records can be sent on to the main thread without copying the data.
 */
static struct send_range *
get_next_range_nofree(bqueue_t *bq, struct send_range *prev)
{
	struct send_range *next = bqueue_dequeue(bq);
	ASSERT3S(send_range_after(prev, next), ==, -1);
	return (next);
}

/*
 * Pop the new data off the queue, check that the records we receive are in
 * the right order, and free the old data.
 */
static struct send_range *
get_next_range(bqueue_t *bq, struct send_range *prev)
{
	struct send_range *next = get_next_range_nofree(bq, prev);
	range_free(prev);
	return (next);
}
static __attribute__((noreturn)) void
redact_list_thread(void *arg)
{
	struct redact_list_thread_arg *rlt_arg = arg;
	struct send_range *record;
	fstrans_cookie_t cookie = spl_fstrans_mark();
	if (rlt_arg->rl != NULL) {
		struct redact_list_cb_arg rlcba = {0};
		rlcba.cancel = &rlt_arg->cancel;
		rlcba.q = &rlt_arg->q;
		rlcba.num_blocks_visited = rlt_arg->num_blocks_visited;
		rlcba.mark_redact = rlt_arg->mark_redact;
		int err = dsl_redaction_list_traverse(rlt_arg->rl,
		    &rlt_arg->resume, redact_list_cb, &rlcba);
		if (err != EINTR)
			rlt_arg->error_code = err;
	}
	record = range_alloc(DATA, 0, 0, 0, B_TRUE);
	bqueue_enqueue_flush(&rlt_arg->q, record, sizeof (*record));
	spl_fstrans_unmark(cookie);
	thread_exit();
}
/*
 * Compare the start point of the two provided ranges. End of stream ranges
 * compare last, objects compare before any data or hole inside that object
 * and before multi-object holes that start at the same object.
 */
static int
send_range_start_compare(struct send_range *r1, struct send_range *r2)
{
	uint64_t r1_objequiv = r1->object;
	uint64_t r1_l0equiv = r1->start_blkid;
	uint64_t r2_objequiv = r2->object;
	uint64_t r2_l0equiv = r2->start_blkid;
	int64_t cmp = TREE_CMP(r1->eos_marker, r2->eos_marker);
	if (cmp != 0)
		return (cmp);
	if (r1->object == 0) {
		r1_objequiv = r1->start_blkid * DNODES_PER_BLOCK;
		r1_l0equiv = 0;
	}
	if (r2->object == 0) {
		r2_objequiv = r2->start_blkid * DNODES_PER_BLOCK;
		r2_l0equiv = 0;
	}

	cmp = TREE_CMP(r1_objequiv, r2_objequiv);
	if (cmp != 0)
		return (cmp);
	cmp = TREE_CMP(r2->type == OBJECT_RANGE, r1->type == OBJECT_RANGE);
	if (cmp != 0)
		return (cmp);
	cmp = TREE_CMP(r2->type == OBJECT, r1->type == OBJECT);
	if (cmp != 0)
		return (cmp);

	return (TREE_CMP(r1_l0equiv, r2_l0equiv));
}
enum q_idx {
	REDACT_IDX = 0,
	TO_IDX,
	FROM_IDX,
	NUM_THREADS
};

/*
 * This function returns the next range the send_merge_thread should operate on.
 * The inputs are two arrays; the first one stores the range at the front of the
 * queues stored in the second one. The ranges are sorted in descending
 * priority order; the metadata from earlier ranges overrules metadata from
 * later ranges. out_mask is used to return which threads the ranges came from;
 * bit i is set if ranges[i] started at the same place as the returned range.
 *
 * This code is not hardcoded to compare a specific number of threads; it could
 * be used with any number, just by changing the q_idx enum.
 *
 * The "next range" is the one with the earliest start; if two starts are equal,
 * the highest-priority range is the next to operate on. If a higher-priority
 * range starts in the middle of the first range, then the first range will be
 * truncated to end where the higher-priority range starts, and we will operate
 * on that one next time. In this way, we make sure that each block covered by
 * some range gets covered by a returned range, and each block covered is
 * returned using the metadata of the highest-priority range it appears in.
 *
 * For example, if the three ranges at the front of the queues were [2,4),
 * [3,5), and [1,3), then the ranges returned would be [1,2) with the metadata
 * from the third range, [2,4) with the metadata from the first range, and then
 * [4,5) with the metadata from the second.
 */
static struct send_range *
find_next_range(struct send_range **ranges, bqueue_t **qs, uint64_t *out_mask)
{
	int idx = 0; // index of the range with the earliest start
	int i;
	uint64_t bmask = 0;
	for (i = 1; i < NUM_THREADS; i++) {
		if (send_range_start_compare(ranges[i], ranges[idx]) < 0)
			idx = i;
	}
	if (ranges[idx]->eos_marker) {
		struct send_range *ret = range_alloc(DATA, 0, 0, 0, B_TRUE);
		*out_mask = 0;
		return (ret);
	}
	/*
	 * Find all the ranges that start at that same point.
	 */
	for (i = 0; i < NUM_THREADS; i++) {
		if (send_range_start_compare(ranges[i], ranges[idx]) == 0)
			bmask |= 1 << i;
	}
	*out_mask = bmask;
	/*
	 * OBJECT_RANGE records only come from the TO thread, and should always
	 * be treated as overlapping with nothing and sent on immediately. They
	 * are only used in raw sends, and are never redacted.
	 */
	if (ranges[idx]->type == OBJECT_RANGE) {
		ASSERT3U(idx, ==, TO_IDX);
		ASSERT3U(*out_mask, ==, 1 << TO_IDX);
		struct send_range *ret = ranges[idx];
		ranges[idx] = get_next_range_nofree(qs[idx], ranges[idx]);
		return (ret);
	}
	/*
	 * Find the first start or end point after the start of the first range.
	 */
	uint64_t first_change = ranges[idx]->end_blkid;
	for (i = 0; i < NUM_THREADS; i++) {
		if (i == idx || ranges[i]->eos_marker ||
		    ranges[i]->object > ranges[idx]->object ||
		    ranges[i]->object == DMU_META_DNODE_OBJECT)
			continue;
		ASSERT3U(ranges[i]->object, ==, ranges[idx]->object);
		if (first_change > ranges[i]->start_blkid &&
		    (bmask & (1 << i)) == 0)
			first_change = ranges[i]->start_blkid;
		else if (first_change > ranges[i]->end_blkid)
			first_change = ranges[i]->end_blkid;
	}
	/*
	 * Update all ranges to no longer overlap with the range we're
	 * returning. All such ranges must start at the same place as the range
	 * being returned, and end at or after first_change. Thus we update
	 * their start to first_change. If that makes them size 0, then free
	 * them and pull a new range from that thread.
	 */
	for (i = 0; i < NUM_THREADS; i++) {
		if (i == idx || (bmask & (1 << i)) == 0)
			continue;
		ASSERT3U(first_change, >, ranges[i]->start_blkid);
		ranges[i]->start_blkid = first_change;
		ASSERT3U(ranges[i]->start_blkid, <=, ranges[i]->end_blkid);
		if (ranges[i]->start_blkid == ranges[i]->end_blkid)
			ranges[i] = get_next_range(qs[i], ranges[i]);
	}
	/*
	 * Short-circuit the simple case; if the range doesn't overlap with
	 * anything else, or it only overlaps with things that start at the same
	 * place and are longer, send it on.
	 */
	if (first_change == ranges[idx]->end_blkid) {
		struct send_range *ret = ranges[idx];
		ranges[idx] = get_next_range_nofree(qs[idx], ranges[idx]);
		return (ret);
	}

	/*
	 * Otherwise, return a truncated copy of ranges[idx] and move the start
	 * of ranges[idx] back to first_change.
	 */
	struct send_range *ret = kmem_alloc(sizeof (*ret), KM_SLEEP);
	*ret = *ranges[idx];
	ret->end_blkid = first_change;
	ranges[idx]->start_blkid = first_change;
	return (ret);
}

#define FROM_AND_REDACT_BITS ((1 << REDACT_IDX) | (1 << FROM_IDX))
/*
 * Merge the results from the from thread and the to thread, and then hand the
 * records off to send_prefetch_thread to prefetch them. If this is not a
 * send from a redaction bookmark, the from thread will push an end of stream
 * record and stop, and we'll just send everything that was changed in the
 * to_ds since the ancestor's creation txg. If it is, then since
 * traverse_dataset has a canonical order, we can compare each change as
 * they're pulled off the queues. That will give us a stream that is
 * appropriately sorted, and covers all records. In addition, we pull the
 * data from the redact_list_thread and use that to determine which blocks
 * should be redacted.
 */
static __attribute__((noreturn)) void
send_merge_thread(void *arg)
{
	struct send_merge_thread_arg *smt_arg = arg;
	struct send_range *front_ranges[NUM_THREADS];
	bqueue_t *queues[NUM_THREADS];
	int err = 0;
	fstrans_cookie_t cookie = spl_fstrans_mark();

	if (smt_arg->redact_arg == NULL) {
		front_ranges[REDACT_IDX] =
		    kmem_zalloc(sizeof (struct send_range), KM_SLEEP);
		front_ranges[REDACT_IDX]->eos_marker = B_TRUE;
		front_ranges[REDACT_IDX]->type = REDACT;
		queues[REDACT_IDX] = NULL;
	} else {
		front_ranges[REDACT_IDX] =
		    bqueue_dequeue(&smt_arg->redact_arg->q);
		queues[REDACT_IDX] = &smt_arg->redact_arg->q;
	}
	front_ranges[TO_IDX] = bqueue_dequeue(&smt_arg->to_arg->q);
	queues[TO_IDX] = &smt_arg->to_arg->q;
	front_ranges[FROM_IDX] = bqueue_dequeue(&smt_arg->from_arg->q);
	queues[FROM_IDX] = &smt_arg->from_arg->q;
	uint64_t mask = 0;
	struct send_range *range;
	for (range = find_next_range(front_ranges, queues, &mask);
	    !range->eos_marker && err == 0 && !smt_arg->cancel;
	    range = find_next_range(front_ranges, queues, &mask)) {
		/*
		 * If the range in question was in both the from redact bookmark
		 * and the bookmark we're using to redact, then don't send it.
		 * It's already redacted on the receiving system, so a redaction
		 * record would be redundant.
		 */
		if ((mask & FROM_AND_REDACT_BITS) == FROM_AND_REDACT_BITS) {
			ASSERT3U(range->type, ==, REDACT);
			range_free(range);
			continue;
		}
		bqueue_enqueue(&smt_arg->q, range, sizeof (*range));

		if (smt_arg->to_arg->error_code != 0) {
			err = smt_arg->to_arg->error_code;
		} else if (smt_arg->from_arg->error_code != 0) {
			err = smt_arg->from_arg->error_code;
		} else if (smt_arg->redact_arg != NULL &&
		    smt_arg->redact_arg->error_code != 0) {
			err = smt_arg->redact_arg->error_code;
		}
	}
	if (smt_arg->cancel && err == 0)
		err = SET_ERROR(EINTR);
	smt_arg->error = err;
	if (smt_arg->error != 0) {
		smt_arg->to_arg->cancel = B_TRUE;
		smt_arg->from_arg->cancel = B_TRUE;
		if (smt_arg->redact_arg != NULL)
			smt_arg->redact_arg->cancel = B_TRUE;
	}
	for (int i = 0; i < NUM_THREADS; i++) {
		while (!front_ranges[i]->eos_marker) {
			front_ranges[i] = get_next_range(queues[i],
			    front_ranges[i]);
		}
		range_free(front_ranges[i]);
	}
	range->eos_marker = B_TRUE;
	bqueue_enqueue_flush(&smt_arg->q, range, 1);
	spl_fstrans_unmark(cookie);
	thread_exit();
}
struct send_reader_thread_arg {
	struct send_merge_thread_arg *smta;
	bqueue_t q;
	boolean_t cancel;
	boolean_t issue_reads;
	uint64_t featureflags;
	int error;
};

static void
dmu_send_read_done(zio_t *zio)
{
	struct send_range *range = zio->io_private;

	mutex_enter(&range->sru.data.lock);
	if (zio->io_error != 0) {
		abd_free(range->sru.data.abd);
		range->sru.data.abd = NULL;
		range->sru.data.io_err = zio->io_error;
	}

	ASSERT(range->sru.data.io_outstanding);
	range->sru.data.io_outstanding = B_FALSE;
	cv_broadcast(&range->sru.data.cv);
	mutex_exit(&range->sru.data.lock);
}
static void
issue_data_read(struct send_reader_thread_arg *srta, struct send_range *range)
{
	struct srd *srdp = &range->sru.data;
	blkptr_t *bp = &srdp->bp;
	objset_t *os = srta->smta->os;

	ASSERT3U(range->type, ==, DATA);
	ASSERT3U(range->start_blkid + 1, ==, range->end_blkid);
	/*
	 * If we have large blocks stored on disk but
	 * the send flags don't allow us to send large
	 * blocks, we split the data from the arc buf
	 * into chunks.
	 */
	boolean_t split_large_blocks =
	    srdp->datablksz > SPA_OLD_MAXBLOCKSIZE &&
	    !(srta->featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS);
	/*
	 * We should only request compressed data from the ARC if all
	 * the following are true:
	 *  - stream compression was requested
	 *  - we aren't splitting large blocks into smaller chunks
	 *  - the data won't need to be byteswapped before sending
	 *  - this isn't an embedded block
	 *  - this isn't metadata (if receiving on a different endian
	 *    system it can be byteswapped more easily)
	 */
	boolean_t request_compressed =
	    (srta->featureflags & DMU_BACKUP_FEATURE_COMPRESSED) &&
	    !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) &&
	    !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp));

	zio_flag_t zioflags = ZIO_FLAG_CANFAIL;

	if (srta->featureflags & DMU_BACKUP_FEATURE_RAW) {
		zioflags |= ZIO_FLAG_RAW;
		srdp->io_compressed = B_TRUE;
	} else if (request_compressed) {
		zioflags |= ZIO_FLAG_RAW_COMPRESS;
		srdp->io_compressed = B_TRUE;
	}

	srdp->datasz = (zioflags & ZIO_FLAG_RAW_COMPRESS) ?
	    BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp);

	if (!srta->issue_reads)
		return;
	if (BP_IS_REDACTED(bp))
		return;
	if (send_do_embed(bp, srta->featureflags))
		return;

	zbookmark_phys_t zb = {
	    .zb_objset = dmu_objset_id(os),
	    .zb_object = range->object,
	    .zb_level = 0,
	    .zb_blkid = range->start_blkid,
	};

	arc_flags_t aflags = ARC_FLAG_CACHED_ONLY;

	int arc_err = arc_read(NULL, os->os_spa, bp,
	    arc_getbuf_func, &srdp->abuf, ZIO_PRIORITY_ASYNC_READ,
	    zioflags, &aflags, &zb);
	/*
	 * If the data is not already cached in the ARC, we read directly
	 * from zio. This avoids the performance overhead of adding a new
	 * entry to the ARC, and we also avoid polluting the ARC cache with
	 * data that is not likely to be used in the future.
	 */
	if (arc_err != 0) {
		srdp->abd = abd_alloc_linear(srdp->datasz, B_FALSE);
		srdp->io_outstanding = B_TRUE;
		zio_nowait(zio_read(NULL, os->os_spa, bp, srdp->abd,
		    srdp->datasz, dmu_send_read_done, range,
		    ZIO_PRIORITY_ASYNC_READ, zioflags, &zb));
	}
}
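/*
 * After issue_data_read() returns, srdp->datasz holds the number of bytes
 * that will actually travel in the stream: the physical size when the block
 * is read raw or compressed, and the logical size otherwise.
 */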
/*
 * Create a new record with the given values.
 */
static void
enqueue_range(struct send_reader_thread_arg *srta, bqueue_t *q, dnode_t *dn,
    uint64_t blkid, uint64_t count, const blkptr_t *bp, uint32_t datablksz)
{
	enum type range_type = (bp == NULL || BP_IS_HOLE(bp) ? HOLE :
	    (BP_IS_REDACTED(bp) ? REDACT : DATA));

	struct send_range *range = range_alloc(range_type, dn->dn_object,
	    blkid, blkid + count, B_FALSE);

	if (blkid == DMU_SPILL_BLKID) {
		ASSERT3P(bp, !=, NULL);
		ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_SA);
	}

	switch (range_type) {
	case HOLE:
		range->sru.hole.datablksz = datablksz;
		break;
	case DATA:
		ASSERT3U(count, ==, 1);
		range->sru.data.datablksz = datablksz;
		range->sru.data.obj_type = dn->dn_type;
		range->sru.data.bp = *bp;
		issue_data_read(srta, range);
		break;
	case REDACT:
		range->sru.redact.datablksz = datablksz;
		break;
	default:
		break;
	}
	bqueue_enqueue(q, range, datablksz);
}
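/*
 * Note that bqueue_enqueue() above is charged with datablksz rather than the
 * size of the record structure, so the queue-length tunables on this path
 * bound the amount of buffered data in bytes, not the number of records.
 */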
1746 * This thread is responsible for two things: First, it retrieves the correct
1747 * blkptr in the to ds if we need to send the data because of something from
 * the from thread. As a result of this, we're the first ones to discover that
 * some indirect blocks can be discarded because they're not holes. Second,
 * it issues prefetches for the data we need to send.
 */
static __attribute__((noreturn)) void
send_reader_thread(void *arg)
{
	struct send_reader_thread_arg *srta = arg;
	struct send_merge_thread_arg *smta = srta->smta;
	bqueue_t *inq = &smta->q;
	bqueue_t *outq = &srta->q;
	objset_t *os = smta->os;
	fstrans_cookie_t cookie = spl_fstrans_mark();
	struct send_range *range = bqueue_dequeue(inq);
	int err = 0;

	/*
	 * If the record we're analyzing is from a redaction bookmark from the
	 * fromds, then we need to know whether or not it exists in the tods so
	 * we know whether to create records for it or not. If it does, we need
	 * the datablksz so we can generate an appropriate record for it.
	 * Finally, if it isn't redacted, we need the blkptr so that we can send
	 * a WRITE record containing the actual data.
	 */
	uint64_t last_obj = UINT64_MAX;
	uint64_t last_obj_exists = B_TRUE;
	while (!range->eos_marker && !srta->cancel && smta->error == 0 &&
	    err == 0) {
		switch (range->type) {
		case DATA:
			issue_data_read(srta, range);
			bqueue_enqueue(outq, range,
			    range->sru.data.datablksz);
			range = get_next_range_nofree(inq, range);
			break;
		case HOLE:
		case OBJECT:
		case OBJECT_RANGE:
		case REDACT: // Redacted blocks must exist
			bqueue_enqueue(outq, range, sizeof (*range));
			range = get_next_range_nofree(inq, range);
			break;
		case PREVIOUSLY_REDACTED: {
			/*
			 * This entry came from the "from bookmark" when
			 * sending from a bookmark that has a redaction
			 * list. We need to check if this object/blkid
			 * exists in the target ("to") dataset, and if
			 * not then we drop this entry. We also need
			 * to fill in the block pointer so that we know
			 * what to prefetch.
			 *
			 * To accomplish the above, we first cache whether or
			 * not the last object we examined exists. If it
			 * doesn't, we can drop this record. If it does, we hold
			 * the dnode and use it to call dbuf_dnode_findbp. We do
			 * this instead of dbuf_bookmark_findbp because we will
			 * often operate on large ranges, and holding the dnode
			 * once is more efficient.
			 */
			boolean_t object_exists = B_TRUE;
			/*
			 * If the data is redacted, we only care if it exists,
			 * so that we don't send records for objects that have
			 * been deleted.
			 */
			dnode_t *dn;
			if (range->object == last_obj && !last_obj_exists) {
				/*
				 * If we're still examining the same object as
				 * previously, and it doesn't exist, we don't
				 * need to call dbuf_bookmark_findbp.
				 */
				object_exists = B_FALSE;
			} else {
				err = dnode_hold(os, range->object, FTAG, &dn);
				if (err == ENOENT) {
					object_exists = B_FALSE;
					err = 0;
				}
				last_obj = range->object;
				last_obj_exists = object_exists;
			}

			if (err != 0) {
				break;
			} else if (!object_exists) {
				/*
				 * The block was modified, but doesn't
				 * exist in the to dataset; if it was
				 * deleted in the to dataset, then we'll
				 * visit the hole bp for it at some point.
				 */
				range = get_next_range(inq, range);
				continue;
			}
			uint64_t file_max =
			    MIN(dn->dn_maxblkid, range->end_blkid);
			/*
			 * The object exists, so we need to try to find the
			 * blkptr for each block in the range we're processing.
			 */
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			for (uint64_t blkid = range->start_blkid;
			    blkid < file_max; blkid++) {
				blkptr_t bp;
				uint32_t datablksz =
				    dn->dn_phys->dn_datablkszsec <<
				    SPA_MINBLOCKSHIFT;
				uint64_t offset = blkid * datablksz;
				/*
				 * This call finds the next non-hole block in
				 * the object. This is to prevent a
				 * performance problem where we're unredacting
				 * a large hole. Using dnode_next_offset to
				 * skip over the large hole avoids iterating
				 * over every block in it.
				 */
				err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
				    &offset, 1, 1, 0);
				if (err == ESRCH) {
					offset = UINT64_MAX;
					err = 0;
				} else if (err != 0) {
					break;
				}
				if (offset != blkid * datablksz) {
					/*
					 * if there is a hole from here
					 * (blkid) to offset
					 */
					offset = MIN(offset, file_max *
					    datablksz);
					uint64_t nblks = (offset / datablksz) -
					    blkid;
					enqueue_range(srta, outq, dn, blkid,
					    nblks, NULL, datablksz);
					blkid += nblks;
				}
				if (blkid >= file_max)
					break;
				err = dbuf_dnode_findbp(dn, 0, blkid, &bp,
				    NULL, NULL);
				if (err != 0)
					break;
				ASSERT(!BP_IS_HOLE(&bp));
				enqueue_range(srta, outq, dn, blkid, 1, &bp,
				    datablksz);
			}
			rw_exit(&dn->dn_struct_rwlock);
			dnode_rele(dn, FTAG);
			range = get_next_range(inq, range);
			break;
		}
		}
	}
	if (srta->cancel || err != 0) {
		smta->cancel = B_TRUE;
		srta->error = err;
	} else if (smta->error != 0) {
		srta->error = smta->error;
	}
	while (!range->eos_marker)
		range = get_next_range(inq, range);

	bqueue_enqueue_flush(outq, range, 1);
	spl_fstrans_unmark(cookie);
	thread_exit();
}
#define	NUM_SNAPS_NOT_REDACTED UINT64_MAX

struct dmu_send_params {
	/* Pool args */
	const void *tag; // Tag dp was held with, will be used to release dp.
	dsl_pool_t *dp;
	/* To snapshot args */
	const char *tosnap;
	dsl_dataset_t *to_ds;
	/* From snapshot args */
	zfs_bookmark_phys_t ancestor_zb;
	uint64_t *fromredactsnaps;
	/* NUM_SNAPS_NOT_REDACTED if not sending from redaction bookmark */
	uint64_t numfromredactsnaps;
	/* Stream params */
	boolean_t is_clone;
	boolean_t embedok;
	boolean_t large_block_ok;
	boolean_t compressok;
	boolean_t rawok;
	boolean_t savedok;
	uint64_t resumeobj;
	uint64_t resumeoff;
	uint64_t saved_guid;
	zfs_bookmark_phys_t *redactbook;
	/* Stream output params */
	dmu_send_outparams_t *dso;

	/* Stream progress params */
	offset_t *off;
	int outfd;
	char saved_toname[MAXNAMELEN];
};
static int
setup_featureflags(struct dmu_send_params *dspp, objset_t *os,
    uint64_t *featureflags)
{
	dsl_dataset_t *to_ds = dspp->to_ds;
	dsl_pool_t *dp = dspp->dp;

	if (dmu_objset_type(os) == DMU_OST_ZFS) {
		uint64_t version;
		if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0)
			return (SET_ERROR(EINVAL));

		if (version >= ZPL_VERSION_SA)
			*featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
	}

	/* raw sends imply large_block_ok */
	if ((dspp->rawok || dspp->large_block_ok) &&
	    dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_BLOCKS)) {
		*featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
	}

	/* encrypted datasets will not have embedded blocks */
	if ((dspp->embedok || dspp->rawok) && !os->os_encrypted &&
	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
		*featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
	}

	/* raw send implies compressok */
	if (dspp->compressok || dspp->rawok)
		*featureflags |= DMU_BACKUP_FEATURE_COMPRESSED;

	if (dspp->rawok && os->os_encrypted)
		*featureflags |= DMU_BACKUP_FEATURE_RAW;

	if ((*featureflags &
	    (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED |
	    DMU_BACKUP_FEATURE_RAW)) != 0 &&
	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) {
		*featureflags |= DMU_BACKUP_FEATURE_LZ4;
	}

	/*
	 * We specifically do not include DMU_BACKUP_FEATURE_EMBED_DATA here to
	 * allow sending ZSTD compressed datasets to a receiver that does not
	 * support the zstd feature.
	 */
	if ((*featureflags &
	    (DMU_BACKUP_FEATURE_COMPRESSED | DMU_BACKUP_FEATURE_RAW)) != 0 &&
	    dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_ZSTD_COMPRESS)) {
		*featureflags |= DMU_BACKUP_FEATURE_ZSTD;
	}

	if (dspp->resumeobj != 0 || dspp->resumeoff != 0) {
		*featureflags |= DMU_BACKUP_FEATURE_RESUMING;
	}

	if (dspp->redactbook != NULL) {
		*featureflags |= DMU_BACKUP_FEATURE_REDACTED;
	}

	if (dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_DNODE)) {
		*featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE;
	}

	if (dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LONGNAME)) {
		*featureflags |= DMU_BACKUP_FEATURE_LONGNAME;
	}

	if (dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_MICROZAP)) {
		/*
		 * We must never split a large microzap block, so we can only
		 * send large microzaps if LARGE_BLOCKS is already enabled.
		 */
		if (!(*featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS))
			return (SET_ERROR(ZFS_ERR_STREAM_LARGE_MICROZAP));
		*featureflags |= DMU_BACKUP_FEATURE_LARGE_MICROZAP;
	}

	return (0);
}
static dmu_replay_record_t *
create_begin_record(struct dmu_send_params *dspp, objset_t *os,
    uint64_t featureflags)
{
	dmu_replay_record_t *drr = kmem_zalloc(sizeof (dmu_replay_record_t),
	    KM_SLEEP);
	drr->drr_type = DRR_BEGIN;

	struct drr_begin *drrb = &drr->drr_u.drr_begin;
	dsl_dataset_t *to_ds = dspp->to_ds;

	drrb->drr_magic = DMU_BACKUP_MAGIC;
	drrb->drr_creation_time = dsl_dataset_phys(to_ds)->ds_creation_time;
	drrb->drr_type = dmu_objset_type(os);
	drrb->drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
	drrb->drr_fromguid = dspp->ancestor_zb.zbm_guid;

	DMU_SET_STREAM_HDRTYPE(drrb->drr_versioninfo, DMU_SUBSTREAM);
	DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, featureflags);

	if (dspp->is_clone)
		drrb->drr_flags |= DRR_FLAG_CLONE;
	if (dsl_dataset_phys(dspp->to_ds)->ds_flags & DS_FLAG_CI_DATASET)
		drrb->drr_flags |= DRR_FLAG_CI_DATA;
	if (zfs_send_set_freerecords_bit)
		drrb->drr_flags |= DRR_FLAG_FREERECORDS;
	drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_SPILL_BLOCK;

	if (dspp->savedok) {
		drrb->drr_toguid = dspp->saved_guid;
		strlcpy(drrb->drr_toname, dspp->saved_toname,
		    sizeof (drrb->drr_toname));
	} else {
		dsl_dataset_name(to_ds, drrb->drr_toname);
		if (!to_ds->ds_is_snapshot) {
			(void) strlcat(drrb->drr_toname, "@--head--",
			    sizeof (drrb->drr_toname));
		}
	}
	return (drr);
}
static void
setup_to_thread(struct send_thread_arg *to_arg, objset_t *to_os,
    dmu_sendstatus_t *dssp, uint64_t fromtxg, boolean_t rawok)
{
	VERIFY0(bqueue_init(&to_arg->q, zfs_send_no_prefetch_queue_ff,
	    MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize),
	    offsetof(struct send_range, ln)));
	to_arg->error_code = 0;
	to_arg->cancel = B_FALSE;
	to_arg->os = to_os;
	to_arg->fromtxg = fromtxg;
	to_arg->flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA;
	if (rawok)
		to_arg->flags |= TRAVERSE_NO_DECRYPT;
	if (zfs_send_corrupt_data)
		to_arg->flags |= TRAVERSE_HARD;
	to_arg->num_blocks_visited = &dssp->dss_blocks;
	(void) thread_create(NULL, 0, send_traverse_thread, to_arg, 0,
	    curproc, TS_RUN, minclsyspri);
}
static void
setup_from_thread(struct redact_list_thread_arg *from_arg,
    redaction_list_t *from_rl, dmu_sendstatus_t *dssp)
{
	VERIFY0(bqueue_init(&from_arg->q, zfs_send_no_prefetch_queue_ff,
	    MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize),
	    offsetof(struct send_range, ln)));
	from_arg->error_code = 0;
	from_arg->cancel = B_FALSE;
	from_arg->rl = from_rl;
	from_arg->mark_redact = B_FALSE;
	from_arg->num_blocks_visited = &dssp->dss_blocks;
	/*
	 * If from_ds is null, send_traverse_thread just returns success and
	 * enqueues an eos marker.
	 */
	(void) thread_create(NULL, 0, redact_list_thread, from_arg, 0,
	    curproc, TS_RUN, minclsyspri);
}
static void
setup_redact_list_thread(struct redact_list_thread_arg *rlt_arg,
    struct dmu_send_params *dspp, redaction_list_t *rl, dmu_sendstatus_t *dssp)
{
	if (dspp->redactbook == NULL)
		return;

	rlt_arg->cancel = B_FALSE;
	VERIFY0(bqueue_init(&rlt_arg->q, zfs_send_no_prefetch_queue_ff,
	    MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize),
	    offsetof(struct send_range, ln)));
	rlt_arg->error_code = 0;
	rlt_arg->mark_redact = B_TRUE;
	rlt_arg->rl = rl;
	rlt_arg->num_blocks_visited = &dssp->dss_blocks;

	(void) thread_create(NULL, 0, redact_list_thread, rlt_arg, 0,
	    curproc, TS_RUN, minclsyspri);
}
static void
setup_merge_thread(struct send_merge_thread_arg *smt_arg,
    struct dmu_send_params *dspp, struct redact_list_thread_arg *from_arg,
    struct send_thread_arg *to_arg, struct redact_list_thread_arg *rlt_arg,
    objset_t *os)
{
	VERIFY0(bqueue_init(&smt_arg->q, zfs_send_no_prefetch_queue_ff,
	    MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize),
	    offsetof(struct send_range, ln)));
	smt_arg->cancel = B_FALSE;
	smt_arg->error = 0;
	smt_arg->from_arg = from_arg;
	smt_arg->to_arg = to_arg;
	if (dspp->redactbook != NULL)
		smt_arg->redact_arg = rlt_arg;

	smt_arg->os = os;
	(void) thread_create(NULL, 0, send_merge_thread, smt_arg, 0, curproc,
	    TS_RUN, minclsyspri);
}
static void
setup_reader_thread(struct send_reader_thread_arg *srt_arg,
    struct dmu_send_params *dspp, struct send_merge_thread_arg *smt_arg,
    uint64_t featureflags)
{
	VERIFY0(bqueue_init(&srt_arg->q, zfs_send_queue_ff,
	    MAX(zfs_send_queue_length, 2 * zfs_max_recordsize),
	    offsetof(struct send_range, ln)));
	srt_arg->smta = smt_arg;
	srt_arg->issue_reads = !dspp->dso->dso_dryrun;
	srt_arg->featureflags = featureflags;
	(void) thread_create(NULL, 0, send_reader_thread, srt_arg, 0,
	    curproc, TS_RUN, minclsyspri);
}
static int
setup_resume_points(struct dmu_send_params *dspp,
    struct send_thread_arg *to_arg, struct redact_list_thread_arg *from_arg,
    struct redact_list_thread_arg *rlt_arg,
    struct send_merge_thread_arg *smt_arg, boolean_t resuming, objset_t *os,
    redaction_list_t *redact_rl, nvlist_t *nvl)
{
	(void) smt_arg;
	dsl_dataset_t *to_ds = dspp->to_ds;
	int err = 0;

	uint64_t obj = 0;
	uint64_t blkid = 0;
	if (resuming) {
		obj = dspp->resumeobj;
		dmu_object_info_t to_doi;
		err = dmu_object_info(os, obj, &to_doi);
		if (err != 0)
			return (err);

		blkid = dspp->resumeoff / to_doi.doi_data_block_size;
	}
	/*
	 * If we're resuming a redacted send, we can skip to the appropriate
	 * point in the redaction bookmark by binary searching through it.
	 */
	if (redact_rl != NULL) {
		SET_BOOKMARK(&rlt_arg->resume, to_ds->ds_object, obj, 0, blkid);
	}

	SET_BOOKMARK(&to_arg->resume, to_ds->ds_object, obj, 0, blkid);
	if (nvlist_exists(nvl, BEGINNV_REDACT_FROM_SNAPS)) {
		uint64_t objset = dspp->ancestor_zb.zbm_redaction_obj;
		/*
		 * Note: If the resume point is in an object whose
		 * blocksize is different in the from vs to snapshots,
		 * we will have divided by the "wrong" blocksize.
		 * However, in this case fromsnap's send_cb() will
		 * detect that the blocksize has changed and therefore
		 * ignore this object.
		 *
		 * If we're resuming a send from a redaction bookmark,
		 * we still cannot accidentally suggest blocks behind
		 * the to_ds. In addition, we know that any blocks in
		 * the object in the to_ds will have to be sent, since
		 * the size changed. Therefore, we can't cause any harm
		 * this way either.
		 */
		SET_BOOKMARK(&from_arg->resume, objset, obj, 0, blkid);
	}
	if (resuming) {
		fnvlist_add_uint64(nvl, BEGINNV_RESUME_OBJECT, dspp->resumeobj);
		fnvlist_add_uint64(nvl, BEGINNV_RESUME_OFFSET, dspp->resumeoff);
	}
	return (0);
}
static dmu_sendstatus_t *
setup_send_progress(struct dmu_send_params *dspp)
{
	dmu_sendstatus_t *dssp = kmem_zalloc(sizeof (*dssp), KM_SLEEP);
	dssp->dss_outfd = dspp->outfd;
	dssp->dss_off = dspp->off;
	dssp->dss_proc = curproc;
	mutex_enter(&dspp->to_ds->ds_sendstream_lock);
	list_insert_head(&dspp->to_ds->ds_sendstreams, dssp);
	mutex_exit(&dspp->to_ds->ds_sendstream_lock);
	return (dssp);
}
/*
 * Actually do the bulk of the work in a zfs send.
 *
 * The idea is that we want to do a send from ancestor_zb to to_ds. We also
 * want to not send any data that has been modified by all the datasets in
 * redactsnaparr, and store the list of blocks that are redacted in this way in
 * a bookmark named redactbook, created on the to_ds. We do this by creating
 * several worker threads, whose function is described below.
 *
 * There are three cases.
 * The first case is a redacted zfs send. In this case there are 5 threads.
 * The first thread is the to_ds traversal thread: it calls dataset_traverse on
 * the to_ds and finds all the blocks that have changed since ancestor_zb (if
 * it's a full send, that's all blocks in the dataset). It then sends those
 * blocks on to the send merge thread. The redact list thread takes the data
 * from the redaction bookmark and sends those blocks on to the send merge
 * thread. The send merge thread takes the data from the to_ds traversal
 * thread, and combines it with the redaction records from the redact list
 * thread. If a block appears in both the to_ds's data and the redaction data,
 * the send merge thread will mark it as redacted and send it on to the prefetch
 * thread. Otherwise, the send merge thread will send the block on to the
 * prefetch thread unchanged. The prefetch thread will issue prefetch reads for
 * any data that isn't redacted, and then send the data on to the main thread.
 * The main thread behaves the same as in a normal send case, issuing demand
 * reads for data blocks and sending out records over the network.
 *
 * The graphic below diagrams the flow of data in the case of a redacted zfs
 * send. Each box represents a thread, and each line represents the flow of
 * data.
 *
 *             Records from the |
 *           redaction bookmark |
 * +--------------------+       |  +---------------------------+
 * |                    |       v  | Send Merge Thread         |
 * | Redact List Thread +----------> Apply redaction marks to  |
 * |                    |          | records as specified by   |
 * +--------------------+          | redaction ranges          |
 *                                 +----^---------------+------+
 *                                      |               | Merged data
 *                                      |               |
 *                                      |  +------------v--------+
 *                                      |  | Prefetch Thread     |
 * +--------------------+               |  | Issues prefetch     |
 * | to_ds Traversal    |               |  | reads of data blocks|
 * | Thread (finds      +---------------+  +------------+--------+
 * | candidate blocks)  |  Blocks modified              | Prefetched data
 * +--------------------+  by to_ds since               |
 *                         ancestor_zb     +------------v----+
 *                                         | Main Thread     |  File Descriptor
 *                                         | Sends data over +->(to zfs receive)
 *                                         | wire            |
 *                                         +-----------------+
 *
 * The second case is an incremental send from a redaction bookmark. The to_ds
 * traversal thread and the main thread behave the same as in the redacted
 * send case. The new thread is the from bookmark traversal thread. It
 * iterates over the redaction list in the redaction bookmark, and enqueues
 * records for each block that was redacted in the original send. The send
 * merge thread now has to merge the data from the two threads. For details
 * about that process, see the header comment of send_merge_thread(). Any data
 * it decides to send on will be prefetched by the prefetch thread. Note that
 * you can perform a redacted send from a redaction bookmark; in that case,
 * the data flow behaves very similarly to the flow in the redacted send case,
 * except with the addition of the bookmark traversal thread iterating over the
 * redaction bookmark. The send_merge_thread also has to take on the
 * responsibility of merging the redact list thread's records, the bookmark
 * traversal thread's records, and the to_ds records.
 *
 * +---------------------+
 * |                     |
 * | Redact List Thread  +--------------+
 * |                     |              |
 * +---------------------+              |
 *        Blocks in redaction list      | Ranges modified by every secure snap
 *        of from bookmark              | (or EOS if not redacted)
 *                                      |
 * +---------------------+   |     +----v----------------------+
 * | bookmark Traversal  |   v     | Send Merge Thread         |
 * | Thread (finds       +---------> Merges bookmark, rlt, and |
 * | candidate blocks)   |         | to_ds send records        |
 * +---------------------+         +----^---------------+------+
 *                                      |               | Merged data
 *                                      |  +------------v--------+
 *                                      |  | Prefetch Thread     |
 * +--------------------+               |  | Issues prefetch     |
 * | to_ds Traversal    |               |  | reads of data blocks|
 * | Thread (finds      +---------------+  +------------+--------+
 * | candidate blocks)  |  Blocks modified              | Prefetched data
 * +--------------------+  by to_ds since  +------------v----+
 *                         ancestor_zb     | Main Thread     |  File Descriptor
 *                                         | Sends data over +->(to zfs receive)
 *                                         | wire            |
 *                                         +-----------------+
 *
 * The final case is a simple zfs full or incremental send. The to_ds traversal
 * thread behaves the same as always. The redact list thread is never started.
 * The send merge thread takes all the blocks that the to_ds traversal thread
 * sends it, prefetches the data, and sends the blocks on to the main thread.
 * The main thread sends the data over the wire.
 *
 * To keep performance acceptable, we want to prefetch the data in the worker
 * threads. While the to_ds thread could simply use the TRAVERSE_PREFETCH
 * feature built into traverse_dataset, the combining and deletion of records
 * due to redaction and sends from redaction bookmarks mean that we could
 * issue many unnecessary prefetches. As a result, we only prefetch data
 * after we've determined that the record is not going to be redacted. To
 * prevent the prefetching from getting too far ahead of the main thread, the
 * blocking queues that are used for communication are capped not by the
 * number of entries in the queue, but by the sum of the size of the
 * prefetches associated with them. The limit on the amount of data that the
 * thread can prefetch beyond what the main thread has reached is controlled
 * by the global variable zfs_send_queue_length. In addition, to prevent poor
 * performance in the beginning of a send, we also limit the distance ahead
 * that the traversal threads can be. That distance is controlled by the
 * zfs_send_no_prefetch_queue_length tunable.
 *
 * Note: Releases dp using the specified tag.
 */
static int
dmu_send_impl(struct dmu_send_params *dspp)
{
	objset_t *os;
	dmu_replay_record_t *drr;
	dmu_sendstatus_t *dssp;
	dmu_send_cookie_t dsc = {0};
	int err;
	uint64_t fromtxg = dspp->ancestor_zb.zbm_creation_txg;
	uint64_t featureflags = 0;
	struct redact_list_thread_arg *from_arg;
	struct send_thread_arg *to_arg;
	struct redact_list_thread_arg *rlt_arg;
	struct send_merge_thread_arg *smt_arg;
	struct send_reader_thread_arg *srt_arg;
	struct send_range *range;
	redaction_list_t *from_rl = NULL;
	redaction_list_t *redact_rl = NULL;
	boolean_t resuming = (dspp->resumeobj != 0 || dspp->resumeoff != 0);
	boolean_t book_resuming = resuming;

	dsl_dataset_t *to_ds = dspp->to_ds;
	zfs_bookmark_phys_t *ancestor_zb = &dspp->ancestor_zb;
	dsl_pool_t *dp = dspp->dp;
	const void *tag = dspp->tag;

	err = dmu_objset_from_ds(to_ds, &os);
	if (err != 0) {
		dsl_pool_rele(dp, tag);
		return (err);
	}

	/*
	 * If this is a non-raw send of an encrypted ds, we can ensure that
	 * the objset_phys_t is authenticated. This is safe because this is
	 * either a snapshot or we have owned the dataset, ensuring that
	 * it can't be modified.
	 */
	if (!dspp->rawok && os->os_encrypted &&
	    arc_is_unauthenticated(os->os_phys_buf)) {
		zbookmark_phys_t zb;

		SET_BOOKMARK(&zb, to_ds->ds_object, ZB_ROOT_OBJECT,
		    ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
		err = arc_untransform(os->os_phys_buf, os->os_spa,
		    &zb, B_FALSE);
		if (err != 0) {
			dsl_pool_rele(dp, tag);
			return (err);
		}

		ASSERT0(arc_is_unauthenticated(os->os_phys_buf));
	}

	if ((err = setup_featureflags(dspp, os, &featureflags)) != 0) {
		dsl_pool_rele(dp, tag);
		return (err);
	}

	/*
	 * If we're doing a redacted send, hold the bookmark's redaction list.
	 */
	if (dspp->redactbook != NULL) {
		err = dsl_redaction_list_hold_obj(dp,
		    dspp->redactbook->zbm_redaction_obj, FTAG,
		    &redact_rl);
		if (err != 0) {
			dsl_pool_rele(dp, tag);
			return (SET_ERROR(EINVAL));
		}
		dsl_redaction_list_long_hold(dp, redact_rl, FTAG);
	}

	/*
	 * If we're sending from a redaction bookmark, hold the redaction list
	 * so that we can consider sending the redacted blocks.
	 */
	if (ancestor_zb->zbm_redaction_obj != 0) {
		err = dsl_redaction_list_hold_obj(dp,
		    ancestor_zb->zbm_redaction_obj, FTAG, &from_rl);
		if (err != 0) {
			if (redact_rl != NULL) {
				dsl_redaction_list_long_rele(redact_rl, FTAG);
				dsl_redaction_list_rele(redact_rl, FTAG);
			}
			dsl_pool_rele(dp, tag);
			return (SET_ERROR(EINVAL));
		}
		dsl_redaction_list_long_hold(dp, from_rl, FTAG);
	}

	dsl_dataset_long_hold(to_ds, FTAG);

	from_arg = kmem_zalloc(sizeof (*from_arg), KM_SLEEP);
	to_arg = kmem_zalloc(sizeof (*to_arg), KM_SLEEP);
	rlt_arg = kmem_zalloc(sizeof (*rlt_arg), KM_SLEEP);
	smt_arg = kmem_zalloc(sizeof (*smt_arg), KM_SLEEP);
	srt_arg = kmem_zalloc(sizeof (*srt_arg), KM_SLEEP);

	drr = create_begin_record(dspp, os, featureflags);
	dssp = setup_send_progress(dspp);

	dsc.dsc_drr = drr;
	dsc.dsc_dso = dspp->dso;
	dsc.dsc_os = os;
	dsc.dsc_off = dspp->off;
	dsc.dsc_toguid = dsl_dataset_phys(to_ds)->ds_guid;
	dsc.dsc_fromtxg = fromtxg;
	dsc.dsc_pending_op = PENDING_NONE;
	dsc.dsc_featureflags = featureflags;
	dsc.dsc_resume_object = dspp->resumeobj;
	dsc.dsc_resume_offset = dspp->resumeoff;

	dsl_pool_rele(dp, tag);

	void *payload = NULL;
	size_t payload_len = 0;
	nvlist_t *nvl = fnvlist_alloc();

	/*
	 * If we're doing a redacted send, we include the snapshots we're
	 * redacted with respect to so that the target system knows what send
	 * streams can be correctly received on top of this dataset. If we're
	 * instead sending a redacted dataset, we include the snapshots that the
	 * dataset was created with respect to.
	 */
	if (dspp->redactbook != NULL) {
		fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_SNAPS,
		    redact_rl->rl_phys->rlp_snaps,
		    redact_rl->rl_phys->rlp_num_snaps);
	} else if (dsl_dataset_feature_is_active(to_ds,
	    SPA_FEATURE_REDACTED_DATASETS)) {
		uint64_t *tods_guids;
		uint64_t length;
		VERIFY(dsl_dataset_get_uint64_array_feature(to_ds,
		    SPA_FEATURE_REDACTED_DATASETS, &length, &tods_guids));
		fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_SNAPS, tods_guids,
		    length);
	}

	/*
	 * If we're sending from a redaction bookmark, then we should retrieve
	 * the guids of that bookmark so we can send them over the wire.
	 */
	if (from_rl != NULL) {
		fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_FROM_SNAPS,
		    from_rl->rl_phys->rlp_snaps,
		    from_rl->rl_phys->rlp_num_snaps);
	}

	/*
	 * If the snapshot we're sending from is redacted, include the redaction
	 * list in the stream.
	 */
	if (dspp->numfromredactsnaps != NUM_SNAPS_NOT_REDACTED) {
		ASSERT3P(from_rl, ==, NULL);
		fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_FROM_SNAPS,
		    dspp->fromredactsnaps, (uint_t)dspp->numfromredactsnaps);
		if (dspp->numfromredactsnaps > 0) {
			kmem_free(dspp->fromredactsnaps,
			    dspp->numfromredactsnaps * sizeof (uint64_t));
			dspp->fromredactsnaps = NULL;
		}
	}

	if (resuming || book_resuming) {
		err = setup_resume_points(dspp, to_arg, from_arg,
		    rlt_arg, smt_arg, resuming, os, redact_rl, nvl);
		if (err != 0)
			goto out;
	}

	if (featureflags & DMU_BACKUP_FEATURE_RAW) {
		uint64_t ivset_guid = ancestor_zb->zbm_ivset_guid;
		nvlist_t *keynvl = NULL;
		ASSERT(os->os_encrypted);

		err = dsl_crypto_populate_key_nvlist(os, ivset_guid,
		    &keynvl);
		if (err != 0) {
			fnvlist_free(nvl);
			goto out;
		}

		fnvlist_add_nvlist(nvl, "crypt_keydata", keynvl);
		fnvlist_free(keynvl);
	}

	if (!nvlist_empty(nvl)) {
		payload = fnvlist_pack(nvl, &payload_len);
		drr->drr_payloadlen = payload_len;
	}

	fnvlist_free(nvl);
	err = dump_record(&dsc, payload, payload_len);
	fnvlist_pack_free(payload, payload_len);
	if (err != 0) {
		err = dsc.dsc_err;
		goto out;
	}

	setup_to_thread(to_arg, os, dssp, fromtxg, dspp->rawok);
	setup_from_thread(from_arg, from_rl, dssp);
	setup_redact_list_thread(rlt_arg, dspp, redact_rl, dssp);
	setup_merge_thread(smt_arg, dspp, from_arg, to_arg, rlt_arg, os);
	setup_reader_thread(srt_arg, dspp, smt_arg, featureflags);

	range = bqueue_dequeue(&srt_arg->q);
	while (err == 0 && !range->eos_marker) {
		err = do_dump(&dsc, range);
		range = get_next_range(&srt_arg->q, range);
		if (issig())
			err = SET_ERROR(EINTR);
	}

	/*
	 * If we hit an error or are interrupted, cancel our worker threads and
	 * clear the queue of any pending records. The threads will pass the
	 * cancel up the tree of worker threads, and each one will clean up any
	 * pending records before exiting.
	 */
	if (err != 0) {
		srt_arg->cancel = B_TRUE;
		while (!range->eos_marker) {
			range = get_next_range(&srt_arg->q, range);
		}
	}
	range_free(range);

	bqueue_destroy(&srt_arg->q);
	bqueue_destroy(&smt_arg->q);
	if (dspp->redactbook != NULL)
		bqueue_destroy(&rlt_arg->q);
	bqueue_destroy(&to_arg->q);
	bqueue_destroy(&from_arg->q);

	if (err == 0 && srt_arg->error != 0)
		err = srt_arg->error;

	if (err != 0)
		goto out;

	if (dsc.dsc_pending_op != PENDING_NONE)
		if (dump_record(&dsc, NULL, 0) != 0)
			err = SET_ERROR(EINTR);

	if (err != 0) {
		if (err == EINTR && dsc.dsc_err != 0)
			err = dsc.dsc_err;
		goto out;
	}

	/*
	 * Send the DRR_END record if this is not a saved stream.
	 * Otherwise, the omitted DRR_END record will signal to
	 * the receive side that the stream is incomplete.
	 */
	if (!dspp->savedok) {
		memset(drr, 0, sizeof (dmu_replay_record_t));
		drr->drr_type = DRR_END;
		drr->drr_u.drr_end.drr_checksum = dsc.dsc_zc;
		drr->drr_u.drr_end.drr_toguid = dsc.dsc_toguid;

		if (dump_record(&dsc, NULL, 0) != 0)
			err = dsc.dsc_err;
	}
out:
	mutex_enter(&to_ds->ds_sendstream_lock);
	list_remove(&to_ds->ds_sendstreams, dssp);
	mutex_exit(&to_ds->ds_sendstream_lock);

	VERIFY(err != 0 || (dsc.dsc_sent_begin &&
	    (dsc.dsc_sent_end || dspp->savedok)));

	kmem_free(drr, sizeof (dmu_replay_record_t));
	kmem_free(dssp, sizeof (dmu_sendstatus_t));
	kmem_free(from_arg, sizeof (*from_arg));
	kmem_free(to_arg, sizeof (*to_arg));
	kmem_free(rlt_arg, sizeof (*rlt_arg));
	kmem_free(smt_arg, sizeof (*smt_arg));
	kmem_free(srt_arg, sizeof (*srt_arg));

	dsl_dataset_long_rele(to_ds, FTAG);
	if (from_rl != NULL) {
		dsl_redaction_list_long_rele(from_rl, FTAG);
		dsl_redaction_list_rele(from_rl, FTAG);
	}
	if (redact_rl != NULL) {
		dsl_redaction_list_long_rele(redact_rl, FTAG);
		dsl_redaction_list_rele(redact_rl, FTAG);
	}

	return (err);
}
int
dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
    boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
    boolean_t rawok, boolean_t savedok, int outfd, offset_t *off,
    dmu_send_outparams_t *dsop)
{
	int err;
	dsl_dataset_t *fromds;
	ds_hold_flags_t dsflags;
	struct dmu_send_params dspp = {0};
	dspp.embedok = embedok;
	dspp.large_block_ok = large_block_ok;
	dspp.compressok = compressok;
	dspp.outfd = outfd;
	dspp.off = off;
	dspp.dso = dsop;
	dspp.tag = FTAG;
	dspp.rawok = rawok;
	dspp.savedok = savedok;

	dsflags = (rawok) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT;
	err = dsl_pool_hold(pool, FTAG, &dspp.dp);
	if (err != 0)
		return (err);

	err = dsl_dataset_hold_obj_flags(dspp.dp, tosnap, dsflags, FTAG,
	    &dspp.to_ds);
	if (err != 0) {
		dsl_pool_rele(dspp.dp, FTAG);
		return (err);
	}

	if (fromsnap != 0) {
		err = dsl_dataset_hold_obj_flags(dspp.dp, fromsnap, dsflags,
		    FTAG, &fromds);
		if (err != 0) {
			dsl_dataset_rele_flags(dspp.to_ds, dsflags, FTAG);
			dsl_pool_rele(dspp.dp, FTAG);
			return (err);
		}
		dspp.ancestor_zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
		dspp.ancestor_zb.zbm_creation_txg =
		    dsl_dataset_phys(fromds)->ds_creation_txg;
		dspp.ancestor_zb.zbm_creation_time =
		    dsl_dataset_phys(fromds)->ds_creation_time;

		if (dsl_dataset_is_zapified(fromds)) {
			(void) zap_lookup(dspp.dp->dp_meta_objset,
			    fromds->ds_object, DS_FIELD_IVSET_GUID, 8, 1,
			    &dspp.ancestor_zb.zbm_ivset_guid);
		}

		/* See dmu_send for the reasons behind this. */
		uint64_t *fromredact;

		if (!dsl_dataset_get_uint64_array_feature(fromds,
		    SPA_FEATURE_REDACTED_DATASETS,
		    &dspp.numfromredactsnaps,
		    &fromredact)) {
			dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED;
		} else if (dspp.numfromredactsnaps > 0) {
			uint64_t size = dspp.numfromredactsnaps *
			    sizeof (uint64_t);
			dspp.fromredactsnaps = kmem_zalloc(size, KM_SLEEP);
			memcpy(dspp.fromredactsnaps, fromredact, size);
		}

		boolean_t is_before =
		    dsl_dataset_is_before(dspp.to_ds, fromds, 0);
		dspp.is_clone = (dspp.to_ds->ds_dir !=
		    fromds->ds_dir);
		dsl_dataset_rele(fromds, FTAG);
		if (!is_before) {
			dsl_pool_rele(dspp.dp, FTAG);
			err = SET_ERROR(EXDEV);
		} else {
			err = dmu_send_impl(&dspp);
		}
	} else {
		dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED;
		err = dmu_send_impl(&dspp);
	}
	if (dspp.fromredactsnaps)
		kmem_free(dspp.fromredactsnaps,
		    dspp.numfromredactsnaps * sizeof (uint64_t));

	dsl_dataset_rele(dspp.to_ds, FTAG);
	return (err);
}
int
dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
    boolean_t large_block_ok, boolean_t compressok, boolean_t rawok,
    boolean_t savedok, uint64_t resumeobj, uint64_t resumeoff,
    const char *redactbook, int outfd, offset_t *off,
    dmu_send_outparams_t *dsop)
{
	int err = 0;
	ds_hold_flags_t dsflags;
	boolean_t owned = B_FALSE;
	dsl_dataset_t *fromds = NULL;
	zfs_bookmark_phys_t book = {0};
	struct dmu_send_params dspp = {0};

	dsflags = (rawok) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT;
	dspp.tosnap = tosnap;
	dspp.embedok = embedok;
	dspp.large_block_ok = large_block_ok;
	dspp.compressok = compressok;
	dspp.outfd = outfd;
	dspp.off = off;
	dspp.dso = dsop;
	dspp.tag = FTAG;
	dspp.resumeobj = resumeobj;
	dspp.resumeoff = resumeoff;
	dspp.rawok = rawok;
	dspp.savedok = savedok;

	if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL)
		return (SET_ERROR(EINVAL));

	err = dsl_pool_hold(tosnap, FTAG, &dspp.dp);
	if (err != 0)
		return (err);

	if (strchr(tosnap, '@') == NULL && spa_writeable(dspp.dp->dp_spa)) {
		/*
		 * We are sending a filesystem or volume. Ensure
		 * that it doesn't change by owning the dataset.
		 */

		if (savedok) {
			/*
			 * We are looking for the dataset that represents the
			 * partially received send stream. If this stream was
			 * received as a new snapshot of an existing dataset,
			 * this will be saved in a hidden clone named
			 * "<pool>/<dataset>/%recv". Otherwise, the stream
			 * will be saved in the live dataset itself. In
			 * either case we need to use dsl_dataset_own_force()
			 * because the stream is marked as inconsistent,
			 * which would normally make it unavailable to be
			 * owned.
			 */
			char *name = kmem_asprintf("%s/%s", tosnap,
			    recv_clone_name);
			err = dsl_dataset_own_force(dspp.dp, name, dsflags,
			    FTAG, &dspp.to_ds);
			if (err == ENOENT) {
				err = dsl_dataset_own_force(dspp.dp, tosnap,
				    dsflags, FTAG, &dspp.to_ds);
			}

			if (err == 0) {
				owned = B_TRUE;
				err = zap_lookup(dspp.dp->dp_meta_objset,
				    dspp.to_ds->ds_object,
				    DS_FIELD_RESUME_TOGUID, 8, 1,
				    &dspp.saved_guid);
			}

			if (err == 0) {
				err = zap_lookup(dspp.dp->dp_meta_objset,
				    dspp.to_ds->ds_object,
				    DS_FIELD_RESUME_TONAME, 1,
				    sizeof (dspp.saved_toname),
				    dspp.saved_toname);
			}
			/* Only disown if there was an error in the lookups */
			if (owned && (err != 0))
				dsl_dataset_disown(dspp.to_ds, dsflags, FTAG);

			kmem_strfree(name);
		} else {
			err = dsl_dataset_own(dspp.dp, tosnap, dsflags,
			    FTAG, &dspp.to_ds);
			if (err == 0)
				owned = B_TRUE;
		}
	} else {
		err = dsl_dataset_hold_flags(dspp.dp, tosnap, dsflags, FTAG,
		    &dspp.to_ds);
	}

	if (err != 0) {
		/* Note: dsl dataset is not owned at this point */
		dsl_pool_rele(dspp.dp, FTAG);
		return (err);
	}

	if (redactbook != NULL) {
		char path[ZFS_MAX_DATASET_NAME_LEN];
		(void) strlcpy(path, tosnap, sizeof (path));
		char *at = strchr(path, '@');
		if (at == NULL) {
			err = EINVAL;
		} else {
			(void) snprintf(at, sizeof (path) - (at - path), "#%s",
			    redactbook);
			err = dsl_bookmark_lookup(dspp.dp, path,
			    NULL, &book);
			dspp.redactbook = &book;
		}
	}

	if (err != 0) {
		dsl_pool_rele(dspp.dp, FTAG);
		if (owned)
			dsl_dataset_disown(dspp.to_ds, dsflags, FTAG);
		else
			dsl_dataset_rele_flags(dspp.to_ds, dsflags, FTAG);
		return (err);
	}

	if (fromsnap != NULL) {
		zfs_bookmark_phys_t *zb = &dspp.ancestor_zb;
		int fsnamelen;
		if (strpbrk(tosnap, "@#") != NULL)
			fsnamelen = strpbrk(tosnap, "@#") - tosnap;
		else
			fsnamelen = strlen(tosnap);

		/*
		 * If the fromsnap is in a different filesystem, then
		 * mark the send stream as a clone.
		 */
		if (strncmp(tosnap, fromsnap, fsnamelen) != 0 ||
		    (fromsnap[fsnamelen] != '@' &&
		    fromsnap[fsnamelen] != '#')) {
			dspp.is_clone = B_TRUE;
		}

		if (strchr(fromsnap, '@') != NULL) {
			err = dsl_dataset_hold(dspp.dp, fromsnap, FTAG,
			    &fromds);

			if (err != 0) {
				ASSERT3P(fromds, ==, NULL);
			} else {
				/*
				 * We need to make a deep copy of the redact
				 * snapshots of the from snapshot, because the
				 * array will be freed when we evict from_ds.
				 */
				uint64_t *fromredact;
				if (!dsl_dataset_get_uint64_array_feature(
				    fromds, SPA_FEATURE_REDACTED_DATASETS,
				    &dspp.numfromredactsnaps,
				    &fromredact)) {
					dspp.numfromredactsnaps =
					    NUM_SNAPS_NOT_REDACTED;
				} else if (dspp.numfromredactsnaps > 0) {
					uint64_t size =
					    dspp.numfromredactsnaps *
					    sizeof (uint64_t);
					dspp.fromredactsnaps = kmem_zalloc(size,
					    KM_SLEEP);
					memcpy(dspp.fromredactsnaps, fromredact,
					    size);
				}
				if (!dsl_dataset_is_before(dspp.to_ds, fromds,
				    0)) {
					err = SET_ERROR(EXDEV);
				} else {
					zb->zbm_creation_txg =
					    dsl_dataset_phys(fromds)->
					    ds_creation_txg;
					zb->zbm_creation_time =
					    dsl_dataset_phys(fromds)->
					    ds_creation_time;
					zb->zbm_guid =
					    dsl_dataset_phys(fromds)->ds_guid;
					zb->zbm_redaction_obj = 0;

					if (dsl_dataset_is_zapified(fromds)) {
						(void) zap_lookup(
						    dspp.dp->dp_meta_objset,
						    fromds->ds_object,
						    DS_FIELD_IVSET_GUID, 8, 1,
						    &zb->zbm_ivset_guid);
					}
				}
				dsl_dataset_rele(fromds, FTAG);
			}
		} else {
			dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED;
			err = dsl_bookmark_lookup(dspp.dp, fromsnap, dspp.to_ds,
			    zb);
			if (err == EXDEV && zb->zbm_redaction_obj != 0 &&
			    zb->zbm_guid ==
			    dsl_dataset_phys(dspp.to_ds)->ds_guid)
				err = 0;
		}

		if (err == 0) {
			/* dmu_send_impl will call dsl_pool_rele for us. */
			err = dmu_send_impl(&dspp);
		} else {
			if (dspp.fromredactsnaps)
				kmem_free(dspp.fromredactsnaps,
				    dspp.numfromredactsnaps *
				    sizeof (uint64_t));
			dsl_pool_rele(dspp.dp, FTAG);
		}
	} else {
		dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED;
		err = dmu_send_impl(&dspp);
	}
	if (owned)
		dsl_dataset_disown(dspp.to_ds, dsflags, FTAG);
	else
		dsl_dataset_rele_flags(dspp.to_ds, dsflags, FTAG);
	return (err);
}
static int
dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed,
    uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep)
{
	int err = 0;
	uint64_t size;
	/*
	 * Assume that space (both on-disk and in-stream) is dominated by
	 * data. We will adjust for indirect blocks and the copies property,
	 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
	 */
	uint64_t recordsize;
	uint64_t record_count;
	objset_t *os;
	VERIFY0(dmu_objset_from_ds(ds, &os));

	/* Assume all (uncompressed) blocks are recordsize. */
	if (zfs_override_estimate_recordsize != 0) {
		recordsize = zfs_override_estimate_recordsize;
	} else if (os->os_phys->os_type == DMU_OST_ZVOL) {
		err = dsl_prop_get_int_ds(ds,
		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &recordsize);
	} else {
		err = dsl_prop_get_int_ds(ds,
		    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), &recordsize);
	}
	if (err != 0)
		return (err);
	record_count = uncompressed / recordsize;

	/*
	 * If we're estimating a send size for a compressed stream, use the
	 * compressed data size to estimate the stream size. Otherwise, use the
	 * uncompressed data size.
	 */
	size = stream_compressed ? compressed : uncompressed;

	/*
	 * Subtract out approximate space used by indirect blocks.
	 * Assume most space is used by data blocks (non-indirect, non-dnode).
	 * Assume no ditto blocks or internal fragmentation.
	 *
	 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
	 * block.
	 */
	size -= record_count * sizeof (blkptr_t);

	/* Add in the space for the record associated with each block. */
	size += record_count * sizeof (dmu_replay_record_t);

	*sizep = size;

	return (0);
}
int
dmu_send_estimate_fast(dsl_dataset_t *origds, dsl_dataset_t *fromds,
    zfs_bookmark_phys_t *frombook, boolean_t stream_compressed,
    boolean_t saved, uint64_t *sizep)
{
	int err;
	dsl_dataset_t *ds = origds;
	uint64_t uncomp, comp;

	ASSERT(dsl_pool_config_held(origds->ds_dir->dd_pool));
	ASSERT(fromds == NULL || frombook == NULL);

	/*
	 * If this is a saved send we may actually be sending
	 * from the %recv clone used for resuming.
	 */
	if (saved) {
		objset_t *mos = origds->ds_dir->dd_pool->dp_meta_objset;
		uint64_t guid;
		char dsname[ZFS_MAX_DATASET_NAME_LEN + 6];

		dsl_dataset_name(origds, dsname);
		(void) strcat(dsname, "/");
		(void) strlcat(dsname, recv_clone_name, sizeof (dsname));

		err = dsl_dataset_hold(origds->ds_dir->dd_pool,
		    dsname, FTAG, &ds);
		if (err != ENOENT && err != 0) {
			return (err);
		} else if (err == ENOENT) {
			ds = origds;
		}

		/* check that this dataset has partially received data */
		err = zap_lookup(mos, ds->ds_object,
		    DS_FIELD_RESUME_TOGUID, 8, 1, &guid);
		if (err != 0) {
			err = SET_ERROR(err == ENOENT ? EINVAL : err);
			goto out;
		}

		err = zap_lookup(mos, ds->ds_object,
		    DS_FIELD_RESUME_TONAME, 1, sizeof (dsname), dsname);
		if (err != 0) {
			err = SET_ERROR(err == ENOENT ? EINVAL : err);
			goto out;
		}
	}

	/* tosnap must be a snapshot or the target of a saved send */
	if (!ds->ds_is_snapshot && ds == origds)
		return (SET_ERROR(EINVAL));

	if (fromds != NULL) {
		uint64_t used;
		if (!fromds->ds_is_snapshot) {
			err = SET_ERROR(EINVAL);
			goto out;
		}

		if (!dsl_dataset_is_before(ds, fromds, 0)) {
			err = SET_ERROR(EXDEV);
			goto out;
		}

		err = dsl_dataset_space_written(fromds, ds, &used, &comp,
		    &uncomp);
		if (err != 0)
			goto out;
	} else if (frombook != NULL) {
		uint64_t used;
		err = dsl_dataset_space_written_bookmark(frombook, ds, &used,
		    &comp, &uncomp);
		if (err != 0)
			goto out;
	} else {
		uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
		comp = dsl_dataset_phys(ds)->ds_compressed_bytes;
	}

	err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp,
	    stream_compressed, sizep);
	/*
	 * Add the size of the BEGIN and END records to the estimate.
	 */
	*sizep += 2 * sizeof (dmu_replay_record_t);

out:
	if (ds != origds)
		dsl_dataset_rele(ds, FTAG);
	return (err);
}
, zfs_send_
, corrupt_data
, INT
, ZMOD_RW
,
3119 "Allow sending corrupt data");
3121 ZFS_MODULE_PARAM(zfs_send
, zfs_send_
, queue_length
, UINT
, ZMOD_RW
,
3122 "Maximum send queue length");
3124 ZFS_MODULE_PARAM(zfs_send
, zfs_send_
, unmodified_spill_blocks
, INT
, ZMOD_RW
,
3125 "Send unmodified spill blocks");
3127 ZFS_MODULE_PARAM(zfs_send
, zfs_send_
, no_prefetch_queue_length
, UINT
, ZMOD_RW
,
3128 "Maximum send queue length for non-prefetch queues");
3130 ZFS_MODULE_PARAM(zfs_send
, zfs_send_
, queue_ff
, UINT
, ZMOD_RW
,
3131 "Send queue fill fraction");
3133 ZFS_MODULE_PARAM(zfs_send
, zfs_send_
, no_prefetch_queue_ff
, UINT
, ZMOD_RW
,
3134 "Send queue fill fraction for non-prefetch queues");
3136 ZFS_MODULE_PARAM(zfs_send
, zfs_
, override_estimate_recordsize
, UINT
, ZMOD_RW
,
3137 "Override block size estimate with fixed size");