/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/arc.h>
#include <sys/stat.h>
#include <sys/resource.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/dsl_dataset.h>
#include <sys/vdev.h>
#include <sys/dmu_tx.h>
/*
 * The zfs intent log (ZIL) saves transaction records of system calls
 * that change the file system in memory with enough information
 * to be able to replay them. These are stored in memory until
 * either the DMU transaction group (txg) commits them to the stable pool
 * and they can be discarded, or they are flushed to the stable log
 * (also in the pool) due to a fsync, O_DSYNC or other synchronous
 * requirement. In the event of a panic or power failure those log
 * records (transactions) are replayed.
 *
 * There is one ZIL per file system. Its on-disk (pool) format consists
 * of 3 parts:
 *
 * 	- ZIL header
 * 	- ZIL blocks
 * 	- ZIL records
 *
 * A log record holds a system call transaction. Log blocks can
 * hold many log records and the blocks are chained together.
 * Each ZIL block contains a block pointer (blkptr_t) to the next
 * ZIL block in the chain. The ZIL header points to the first
 * block in the chain. Note there is not a fixed place in the pool
 * to hold blocks. They are dynamically allocated and freed as
 * needed from the blocks available. Figure X shows the ZIL structure:
 */
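
/*
 * Illustrative sketch, not part of the ZIL proper: the helper below is
 * hypothetical and only shows how the chain described above is linked.
 * Each log block ends in a zil_trailer_t whose zit_next_blk names the
 * next block, and that next block is expected to carry this block's
 * checksum with the ZIL_ZC_SEQ word incremented (see zil_read_log_block()
 * and zil_parse() below for the real walk).
 */
static void
zil_chain_next_sketch(const blkptr_t *bp, const char *data, blkptr_t *nextbp,
    zio_cksum_t *expect_cksum)
{
	const zil_trailer_t *ztp =
	    (const zil_trailer_t *)(data + BP_GET_LSIZE(bp)) - 1;

	*nextbp = ztp->zit_next_blk;	/* chain link lives in the trailer */
	*expect_cksum = bp->blk_cksum;	/* next block's seq = this seq + 1 */
	expect_cksum->zc_word[ZIL_ZC_SEQ]++;
}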
/*
 * This global ZIL switch affects all pools
 */
int zil_disable = 0;	/* disable intent logging */

/*
 * Tunable parameter for debugging or performance analysis. Setting
 * zfs_nocacheflush will cause corruption on power loss if a volatile
 * out-of-order write cache is enabled.
 */
boolean_t zfs_nocacheflush = B_FALSE;

static kmem_cache_t *zil_lwb_cache;
static int
zil_dva_compare(const void *x1, const void *x2)
{
	const dva_t *dva1 = x1;
	const dva_t *dva2 = x2;

	if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2))
		return (-1);
	if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2))
		return (1);

	if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2))
		return (-1);
	if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2))
		return (1);

	return (0);
}
static void
zil_dva_tree_init(avl_tree_t *t)
{
	avl_create(t, zil_dva_compare, sizeof (zil_dva_node_t),
	    offsetof(zil_dva_node_t, zn_node));
}

static void
zil_dva_tree_fini(avl_tree_t *t)
{
	zil_dva_node_t *zn;
	void *cookie = NULL;

	while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
		kmem_free(zn, sizeof (zil_dva_node_t));

	avl_destroy(t);
}
static int
zil_dva_tree_add(avl_tree_t *t, dva_t *dva)
{
	zil_dva_node_t *zn;
	avl_index_t where;

	if (avl_find(t, dva, &where) != NULL)
		return (EEXIST);

	zn = kmem_alloc(sizeof (zil_dva_node_t), KM_SLEEP);
	zn->zn_dva = *dva;
	avl_insert(t, zn, where);

	return (0);
}
static zil_header_t *
zil_header_in_syncing_context(zilog_t *zilog)
{
	return ((zil_header_t *)zilog->zl_header);
}
static void
zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
{
	zio_cksum_t *zc = &bp->blk_cksum;

	zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
	zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
	zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
	zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
}
/*
 * Read a log block, make sure it's valid, and byteswap it if necessary.
 */
static int
zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp)
{
	blkptr_t blk = *bp;
	zbookmark_t zb;
	uint32_t aflags = ARC_WAIT;
	int error;

	zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET];
	zb.zb_object = 0;
	zb.zb_level = -1;
	zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];

	*abufpp = NULL;

	/*
	 * We shouldn't be doing any scrubbing while we're doing log
	 * replay, it's OK to not lock.
	 */
	error = arc_read_nolock(NULL, zilog->zl_spa, &blk,
	    arc_getbuf_func, abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL |
	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, &aflags, &zb);

	if (error == 0) {
		char *data = (*abufpp)->b_data;
		uint64_t blksz = BP_GET_LSIZE(bp);
		zil_trailer_t *ztp = (zil_trailer_t *)(data + blksz) - 1;
		zio_cksum_t cksum = bp->blk_cksum;

		/*
		 * Validate the checksummed log block.
		 *
		 * Sequence numbers should be... sequential.  The checksum
		 * verifier for the next block should be bp's checksum plus 1.
		 *
		 * Also check the log chain linkage and size used.
		 */
		cksum.zc_word[ZIL_ZC_SEQ]++;

		if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum,
		    sizeof (cksum)) || BP_IS_HOLE(&ztp->zit_next_blk) ||
		    (ztp->zit_nused > (blksz - sizeof (zil_trailer_t)))) {
			error = ECKSUM;
		}

		if (error) {
			VERIFY(arc_buf_remove_ref(*abufpp, abufpp) == 1);
			*abufpp = NULL;
		}
	}

	dprintf("error %d on %llu:%llu\n", error, zb.zb_objset, zb.zb_blkid);

	return (error);
}
/*
 * Parse the intent log, and call parse_func for each valid record within.
 * Return the highest sequence number.
 */
uint64_t
zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
    zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
{
	const zil_header_t *zh = zilog->zl_header;
	uint64_t claim_seq = zh->zh_claim_seq;
	uint64_t seq = 0;
	uint64_t max_seq = 0;
	blkptr_t blk = zh->zh_log;
	arc_buf_t *abuf;
	char *lrbuf, *lrp;
	zil_trailer_t *ztp;
	int reclen, error;

	if (BP_IS_HOLE(&blk))
		return (max_seq);

	/*
	 * Starting at the block pointed to by zh_log we read the log chain.
	 * For each block in the chain we strongly check that block to
	 * ensure its validity.  We stop when an invalid block is found.
	 * For each block pointer in the chain we call parse_blk_func().
	 * For each record in each valid block we call parse_lr_func().
	 * If the log has been claimed, stop if we encounter a sequence
	 * number greater than the highest claimed sequence number.
	 */
	zil_dva_tree_init(&zilog->zl_dva_tree);
	for (;;) {
		seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];

		if (claim_seq != 0 && seq > claim_seq)
			break;

		ASSERT(max_seq < seq);
		max_seq = seq;

		error = zil_read_log_block(zilog, &blk, &abuf);

		if (parse_blk_func != NULL)
			parse_blk_func(zilog, &blk, arg, txg);

		if (error)
			break;

		lrbuf = abuf->b_data;
		ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
		blk = ztp->zit_next_blk;

		if (parse_lr_func == NULL) {
			VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
			continue;
		}

		for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) {
			lr_t *lr = (lr_t *)lrp;
			reclen = lr->lrc_reclen;
			ASSERT3U(reclen, >=, sizeof (lr_t));
			parse_lr_func(zilog, lr, arg, txg);
		}
		VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
	}
	zil_dva_tree_fini(&zilog->zl_dva_tree);

	return (max_seq);
}
static void
zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
{
	spa_t *spa = zilog->zl_spa;
	int err;

	/*
	 * Claim log block if not already committed and not already claimed.
	 */
	if (bp->blk_birth >= first_txg &&
	    zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) == 0) {
		err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL,
		    ZIO_FLAG_MUSTSUCCEED));
		ASSERT(err == 0);
	}
}

static void
zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
{
	if (lrc->lrc_txtype == TX_WRITE) {
		lr_write_t *lr = (lr_write_t *)lrc;
		zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg);
	}
}

static void
zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
{
	zio_free_blk(zilog->zl_spa, bp, dmu_tx_get_txg(tx));
}

static void
zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
{
	/*
	 * If we previously claimed it, we need to free it.
	 */
	if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE) {
		lr_write_t *lr = (lr_write_t *)lrc;
		blkptr_t *bp = &lr->lr_blkptr;
		if (bp->blk_birth >= claim_txg &&
		    !zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp))) {
			(void) arc_free(NULL, zilog->zl_spa,
			    dmu_tx_get_txg(tx), bp, NULL, NULL, ARC_WAIT);
		}
	}
}
/*
 * Create an on-disk intent log.
 */
static void
zil_create(zilog_t *zilog)
{
	const zil_header_t *zh = zilog->zl_header;
	lwb_t *lwb;
	uint64_t txg = 0;
	dmu_tx_t *tx = NULL;
	blkptr_t blk;
	int error = 0;

	/*
	 * Wait for any previous destroy to complete.
	 */
	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);

	ASSERT(zh->zh_claim_txg == 0);
	ASSERT(zh->zh_replay_seq == 0);

	blk = zh->zh_log;

	/*
	 * If we don't already have an initial log block or we have one
	 * but it's the wrong endianness then allocate one.
	 */
	if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
		tx = dmu_tx_create(zilog->zl_os);
		(void) dmu_tx_assign(tx, TXG_WAIT);
		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
		txg = dmu_tx_get_txg(tx);

		if (!BP_IS_HOLE(&blk)) {
			zio_free_blk(zilog->zl_spa, &blk, txg);
			BP_ZERO(&blk);
		}

		error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk,
		    NULL, txg);

		if (error == 0)
			zil_init_log_chain(zilog, &blk);
	}

	/*
	 * Allocate a log write buffer (lwb) for the first log block.
	 */
	if (error == 0) {
		lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
		lwb->lwb_zilog = zilog;
		lwb->lwb_blk = blk;
		lwb->lwb_nused = 0;
		lwb->lwb_sz = BP_GET_LSIZE(&lwb->lwb_blk);
		lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz);
		lwb->lwb_max_txg = txg;
		lwb->lwb_zio = NULL;

		mutex_enter(&zilog->zl_lock);
		list_insert_tail(&zilog->zl_lwb_list, lwb);
		mutex_exit(&zilog->zl_lock);
	}

	/*
	 * If we just allocated the first log block, commit our transaction
	 * and wait for zil_sync() to stuff the block pointer into zh_log.
	 * (zh is part of the MOS, so we cannot modify it in open context.)
	 */
	if (tx != NULL) {
		dmu_tx_commit(tx);
		txg_wait_synced(zilog->zl_dmu_pool, txg);
	}

	ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
}
/*
 * In one tx, free all log blocks and clear the log header.
 * If keep_first is set, then we're replaying a log with no content.
 * We want to keep the first block, however, so that the first
 * synchronous transaction doesn't require a txg_wait_synced()
 * in zil_create().  We don't need to txg_wait_synced() here either
 * when keep_first is set, because both zil_create() and zil_destroy()
 * will wait for any in-progress destroys to complete.
 */
void
zil_destroy(zilog_t *zilog, boolean_t keep_first)
{
	const zil_header_t *zh = zilog->zl_header;
	lwb_t *lwb;
	dmu_tx_t *tx;
	uint64_t txg;

	/*
	 * Wait for any previous destroy to complete.
	 */
	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);

	if (BP_IS_HOLE(&zh->zh_log))
		return;

	tx = dmu_tx_create(zilog->zl_os);
	(void) dmu_tx_assign(tx, TXG_WAIT);
	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
	txg = dmu_tx_get_txg(tx);

	mutex_enter(&zilog->zl_lock);

	/*
	 * It is possible for the ZIL to get the previously mounted zilog
	 * structure of the same dataset if quickly remounted and the dbuf
	 * eviction has not completed. In this case we can see a non-empty
	 * lwb list and keep_first will be set. We fix this by clearing
	 * keep_first. This will be slower but it's very rare.
	 */
	if (!list_is_empty(&zilog->zl_lwb_list) && keep_first)
		keep_first = B_FALSE;

	ASSERT3U(zilog->zl_destroy_txg, <, txg);
	zilog->zl_destroy_txg = txg;
	zilog->zl_keep_first = keep_first;

	if (!list_is_empty(&zilog->zl_lwb_list)) {
		ASSERT(zh->zh_claim_txg == 0);
		ASSERT(!keep_first);
		while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
			list_remove(&zilog->zl_lwb_list, lwb);
			if (lwb->lwb_buf != NULL)
				zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
			zio_free_blk(zilog->zl_spa, &lwb->lwb_blk, txg);
			kmem_cache_free(zil_lwb_cache, lwb);
		}
	} else {
		if (!keep_first) {
			(void) zil_parse(zilog, zil_free_log_block,
			    zil_free_log_record, tx, zh->zh_claim_txg);
		}
	}
	mutex_exit(&zilog->zl_lock);

	dmu_tx_commit(tx);
}
/*
 * zil_rollback_destroy() is only called by the rollback code.
 * We already have a syncing tx. Rollback has exclusive access to the
 * dataset, so we don't have to worry about concurrent zil access.
 * The actual freeing of any log blocks occurs in zil_sync() later in
 * this txg syncing phase.
 */
void
zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx)
{
	const zil_header_t *zh = zilog->zl_header;
	uint64_t txg;

	if (BP_IS_HOLE(&zh->zh_log))
		return;

	txg = dmu_tx_get_txg(tx);
	ASSERT3U(zilog->zl_destroy_txg, <, txg);
	zilog->zl_destroy_txg = txg;
	zilog->zl_keep_first = B_FALSE;

	/*
	 * Ensure there's no outstanding ZIL IO.  No lwbs or just the
	 * unused one that is allocated in advance is ok.
	 */
	ASSERT(zilog->zl_lwb_list.list_head.list_next ==
	    zilog->zl_lwb_list.list_head.list_prev);
	(void) zil_parse(zilog, zil_free_log_block, zil_free_log_record,
	    tx, zh->zh_claim_txg);
}
int
zil_claim(char *osname, void *txarg)
{
	dmu_tx_t *tx = txarg;
	uint64_t first_txg = dmu_tx_get_txg(tx);
	zilog_t *zilog;
	zil_header_t *zh;
	objset_t *os;
	int error;

	error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os);
	if (error) {
		cmn_err(CE_WARN, "can't open objset for %s", osname);
		return (0);
	}

	zilog = dmu_objset_zil(os);
	zh = zil_header_in_syncing_context(zilog);

	/*
	 * Claim all log blocks if we haven't already done so, and remember
	 * the highest claimed sequence number.  This ensures that if we can
	 * read only part of the log now (e.g. due to a missing device),
	 * but we can read the entire log later, we will not try to replay
	 * or destroy beyond the last block we successfully claimed.
	 */
	ASSERT3U(zh->zh_claim_txg, <=, first_txg);
	if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
		zh->zh_claim_txg = first_txg;
		zh->zh_claim_seq = zil_parse(zilog, zil_claim_log_block,
		    zil_claim_log_record, tx, first_txg);
		dsl_dataset_dirty(dmu_objset_ds(os), tx);
	}

	ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
	dmu_objset_close(os);
	return (0);
}
/*
 * Check the log by walking the log chain.
 * Checksum errors are ok as they indicate the end of the chain.
 * Any other error (no device or read failure) returns an error.
 */
int
zil_check_log_chain(char *osname, void *txarg)
{
	zilog_t *zilog;
	zil_header_t *zh;
	blkptr_t blk;
	arc_buf_t *abuf;
	objset_t *os;
	char *lrbuf;
	zil_trailer_t *ztp;
	int error;

	error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os);
	if (error) {
		cmn_err(CE_WARN, "can't open objset for %s", osname);
		return (0);
	}

	zilog = dmu_objset_zil(os);
	zh = zil_header_in_syncing_context(zilog);
	blk = zh->zh_log;
	if (BP_IS_HOLE(&blk)) {
		dmu_objset_close(os);
		return (0); /* no chain */
	}

	for (;;) {
		error = zil_read_log_block(zilog, &blk, &abuf);
		if (error)
			break;
		lrbuf = abuf->b_data;
		ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
		blk = ztp->zit_next_blk;
		VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
	}
	dmu_objset_close(os);
	if (error == ECKSUM)
		return (0); /* normal end of chain */
	return (error);
}
int
zil_clear_log_chain(char *osname, void *txarg)
{
	zilog_t *zilog;
	zil_header_t *zh;
	objset_t *os;
	dmu_tx_t *tx;
	int error;

	error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os);
	if (error) {
		cmn_err(CE_WARN, "can't open objset for %s", osname);
		return (0);
	}

	zilog = dmu_objset_zil(os);
	tx = dmu_tx_create(zilog->zl_os);
	(void) dmu_tx_assign(tx, TXG_WAIT);
	zh = zil_header_in_syncing_context(zilog);
	BP_ZERO(&zh->zh_log);
	dsl_dataset_dirty(dmu_objset_ds(os), tx);
	dmu_tx_commit(tx);
	dmu_objset_close(os);
	return (0);
}
static int
zil_vdev_compare(const void *x1, const void *x2)
{
	uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
	uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;

	if (v1 < v2)
		return (-1);
	if (v1 > v2)
		return (1);

	return (0);
}
void
zil_add_block(zilog_t *zilog, blkptr_t *bp)
{
	avl_tree_t *t = &zilog->zl_vdev_tree;
	avl_index_t where;
	zil_vdev_node_t *zv, zvsearch;
	int ndvas = BP_GET_NDVAS(bp);
	int i;

	if (zfs_nocacheflush)
		return;

	ASSERT(zilog->zl_writer);

	/*
	 * Even though we're zl_writer, we still need a lock because the
	 * zl_get_data() callbacks may have dmu_sync() done callbacks
	 * that will run concurrently.
	 */
	mutex_enter(&zilog->zl_vdev_lock);
	for (i = 0; i < ndvas; i++) {
		zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
		if (avl_find(t, &zvsearch, &where) == NULL) {
			zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
			zv->zv_vdev = zvsearch.zv_vdev;
			avl_insert(t, zv, where);
		}
	}
	mutex_exit(&zilog->zl_vdev_lock);
}
static void
zil_flush_vdevs(zilog_t *zilog)
{
	spa_t *spa = zilog->zl_spa;
	avl_tree_t *t = &zilog->zl_vdev_tree;
	void *cookie = NULL;
	zil_vdev_node_t *zv;
	zio_t *zio;

	ASSERT(zilog->zl_writer);

	/*
	 * We don't need zl_vdev_lock here because we're the zl_writer,
	 * and all zl_get_data() callbacks are done.
	 */
	if (avl_numnodes(t) == 0)
		return;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

	while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
		vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
		if (vd != NULL)
			zio_flush(zio, vd);
		kmem_free(zv, sizeof (*zv));
	}

	/*
	 * Wait for all the flushes to complete.  Not all devices actually
	 * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
	 */
	(void) zio_wait(zio);

	spa_config_exit(spa, SCL_STATE, FTAG);
}
/*
 * Function called when a log block write completes
 */
static void
zil_lwb_write_done(zio_t *zio)
{
	lwb_t *lwb = zio->io_private;
	zilog_t *zilog = lwb->lwb_zilog;

	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
	ASSERT(BP_GET_CHECKSUM(zio->io_bp) == ZIO_CHECKSUM_ZILOG);
	ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
	ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
	ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
	ASSERT(!BP_IS_GANG(zio->io_bp));
	ASSERT(!BP_IS_HOLE(zio->io_bp));
	ASSERT(zio->io_bp->blk_fill == 0);

	/*
	 * Now that we've written this log block, we have a stable pointer
	 * to the next block in the chain, so it's OK to let the txg in
	 * which we allocated the next block sync.
	 */
	txg_rele_to_sync(&lwb->lwb_txgh);

	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
	mutex_enter(&zilog->zl_lock);
	lwb->lwb_buf = NULL;
	if (zio->io_error)
		zilog->zl_log_error = B_TRUE;
	mutex_exit(&zilog->zl_lock);
}
/*
 * Initialize the io for a log block.
 */
static void
zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
{
	zbookmark_t zb;

	zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET];
	zb.zb_object = 0;
	zb.zb_level = -1;
	zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ];

	if (zilog->zl_root_zio == NULL) {
		zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL);
	}
	if (lwb->lwb_zio == NULL) {
		lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
		    0, &lwb->lwb_blk, lwb->lwb_buf,
		    lwb->lwb_sz, zil_lwb_write_done, lwb,
		    ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_CANFAIL, &zb);
	}
}
/*
 * Start a log block write and advance to the next log block.
 * Calls are serialized.
 */
static lwb_t *
zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
{
	lwb_t *nlwb;
	zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
	spa_t *spa = zilog->zl_spa;
	blkptr_t *bp = &ztp->zit_next_blk;
	uint64_t txg;
	uint64_t zil_blksz;
	int error;

	ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb));

	/*
	 * Allocate the next block and save its address in this block
	 * before writing it in order to establish the log chain.
	 * Note that if the allocation of nlwb synced before we wrote
	 * the block that points at it (lwb), we'd leak it if we crashed.
	 * Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done().
	 */
	txg = txg_hold_open(zilog->zl_dmu_pool, &lwb->lwb_txgh);
	txg_rele_to_quiesce(&lwb->lwb_txgh);

	/*
	 * Pick a ZIL blocksize. We request a size that is the
	 * maximum of the previously used size, the current used size and
	 * the amount waiting in the queue.
	 */
	zil_blksz = MAX(zilog->zl_prev_used,
	    zilog->zl_cur_used + sizeof (*ztp));
	zil_blksz = MAX(zil_blksz, zilog->zl_itx_list_sz + sizeof (*ztp));
	zil_blksz = P2ROUNDUP_TYPED(zil_blksz, ZIL_MIN_BLKSZ, uint64_t);
	if (zil_blksz > ZIL_MAX_BLKSZ)
		zil_blksz = ZIL_MAX_BLKSZ;

	BP_ZERO(bp);
	/* pass the old blkptr in order to spread log blocks across devs */
	error = zio_alloc_blk(spa, zil_blksz, bp, &lwb->lwb_blk, txg);
	if (error) {
		dmu_tx_t *tx = dmu_tx_create_assigned(zilog->zl_dmu_pool, txg);

		/*
		 * We dirty the dataset to ensure that zil_sync() will
		 * be called to remove this lwb from our zl_lwb_list.
		 * Failing to do so, may leave an lwb with a NULL lwb_buf
		 * hanging around on the zl_lwb_list.
		 */
		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
		dmu_tx_commit(tx);

		/*
		 * We've just experienced an allocation failure, so we
		 * terminate the current lwb and send it on its way.
		 */
		ztp->zit_nused = lwb->lwb_nused;
		ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
		zio_nowait(lwb->lwb_zio);

		/*
		 * By returning NULL the caller will call txg_wait_synced()
		 */
		return (NULL);
	}

	ASSERT3U(bp->blk_birth, ==, txg);
	ztp->zit_nused = lwb->lwb_nused;
	ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
	bp->blk_cksum = lwb->lwb_blk.blk_cksum;
	bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;

	/*
	 * Allocate a new log write buffer (lwb).
	 */
	nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);

	nlwb->lwb_zilog = zilog;
	nlwb->lwb_blk = *bp;
	nlwb->lwb_nused = 0;
	nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk);
	nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz);
	nlwb->lwb_max_txg = txg;
	nlwb->lwb_zio = NULL;

	/*
	 * Put new lwb at the end of the log chain
	 */
	mutex_enter(&zilog->zl_lock);
	list_insert_tail(&zilog->zl_lwb_list, nlwb);
	mutex_exit(&zilog->zl_lock);

	/* Record the block for later vdev flushing */
	zil_add_block(zilog, &lwb->lwb_blk);

	/*
	 * kick off the write for the old log block
	 */
	dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
	ASSERT(lwb->lwb_zio);
	zio_nowait(lwb->lwb_zio);

	return (nlwb);
}
static lwb_t *
zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
{
	lr_t *lrc = &itx->itx_lr; /* common log record */
	lr_write_t *lr = (lr_write_t *)lrc;
	uint64_t txg = lrc->lrc_txg;
	uint64_t reclen = lrc->lrc_reclen;
	uint64_t dlen;

	if (lwb == NULL)
		return (NULL);
	ASSERT(lwb->lwb_buf != NULL);

	if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
		dlen = P2ROUNDUP_TYPED(
		    lr->lr_length, sizeof (uint64_t), uint64_t);
	else
		dlen = 0;

	zilog->zl_cur_used += (reclen + dlen);

	zil_lwb_write_init(zilog, lwb);

	/*
	 * If this record won't fit in the current log block, start a new one.
	 */
	if (lwb->lwb_nused + reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
		lwb = zil_lwb_write_start(zilog, lwb);
		if (lwb == NULL)
			return (NULL);
		zil_lwb_write_init(zilog, lwb);
		ASSERT(lwb->lwb_nused == 0);
		if (reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
			txg_wait_synced(zilog->zl_dmu_pool, txg);
			return (lwb);
		}
	}

	/*
	 * Update the lrc_seq, to be the log record sequence number.  See zil.h
	 * Then copy the record to the log buffer.
	 */
	lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
	bcopy(lrc, lwb->lwb_buf + lwb->lwb_nused, reclen);

	/*
	 * If it's a write, fetch the data or get its blkptr as appropriate.
	 */
	if (lrc->lrc_txtype == TX_WRITE) {
		if (txg > spa_freeze_txg(zilog->zl_spa))
			txg_wait_synced(zilog->zl_dmu_pool, txg);
		if (itx->itx_wr_state != WR_COPIED) {
			char *dbuf;
			int error;

			/* alignment is guaranteed */
			lr = (lr_write_t *)(lwb->lwb_buf + lwb->lwb_nused);
			if (dlen) {
				ASSERT(itx->itx_wr_state == WR_NEED_COPY);
				dbuf = lwb->lwb_buf + lwb->lwb_nused + reclen;
				lr->lr_common.lrc_reclen += dlen;
			} else {
				ASSERT(itx->itx_wr_state == WR_INDIRECT);
				dbuf = NULL;
			}
			error = zilog->zl_get_data(
			    itx->itx_private, lr, dbuf, lwb->lwb_zio);
			if (error) {
				ASSERT(error == ENOENT || error == EEXIST ||
				    error == EALREADY);
				return (lwb);
			}
		}
	}

	lwb->lwb_nused += reclen + dlen;
	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
	ASSERT3U(lwb->lwb_nused, <=, ZIL_BLK_DATA_SZ(lwb));
	ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0);

	return (lwb);
}
itx_t *
zil_itx_create(uint64_t txtype, size_t lrsize)
{
	itx_t *itx;

	lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t);

	itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
	itx->itx_lr.lrc_txtype = txtype;
	itx->itx_lr.lrc_reclen = lrsize;
	itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */
	itx->itx_lr.lrc_seq = 0; /* defensive */

	return (itx);
}
uint64_t
zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
{
	uint64_t seq;

	ASSERT(itx->itx_lr.lrc_seq == 0);

	mutex_enter(&zilog->zl_lock);
	list_insert_tail(&zilog->zl_itx_list, itx);
	zilog->zl_itx_list_sz += itx->itx_sod;
	itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
	itx->itx_lr.lrc_seq = seq = ++zilog->zl_itx_seq;
	mutex_exit(&zilog->zl_lock);

	return (seq);
}
/*
 * Free up all in-memory intent log transactions that have now been synced.
 */
static void
zil_itx_clean(zilog_t *zilog)
{
	uint64_t synced_txg = spa_last_synced_txg(zilog->zl_spa);
	uint64_t freeze_txg = spa_freeze_txg(zilog->zl_spa);
	list_t clean_list;
	itx_t *itx;

	list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));

	mutex_enter(&zilog->zl_lock);
	/* wait for a log writer to finish walking list */
	while (zilog->zl_writer) {
		cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
	}

	/*
	 * Move the sync'd log transactions to a separate list so we can call
	 * kmem_free without holding the zl_lock.
	 *
	 * There is no need to set zl_writer as we don't drop zl_lock here
	 */
	while ((itx = list_head(&zilog->zl_itx_list)) != NULL &&
	    itx->itx_lr.lrc_txg <= MIN(synced_txg, freeze_txg)) {
		list_remove(&zilog->zl_itx_list, itx);
		zilog->zl_itx_list_sz -= itx->itx_sod;
		list_insert_tail(&clean_list, itx);
	}
	cv_broadcast(&zilog->zl_cv_writer);
	mutex_exit(&zilog->zl_lock);

	/* destroy sync'd log transactions */
	while ((itx = list_head(&clean_list)) != NULL) {
		list_remove(&clean_list, itx);
		kmem_free(itx, offsetof(itx_t, itx_lr)
		    + itx->itx_lr.lrc_reclen);
	}
	list_destroy(&clean_list);
}
/*
 * If there are any in-memory intent log transactions which have now been
 * synced then start up a taskq to free them.
 */
void
zil_clean(zilog_t *zilog)
{
	itx_t *itx;

	mutex_enter(&zilog->zl_lock);
	itx = list_head(&zilog->zl_itx_list);
	if ((itx != NULL) &&
	    (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) {
		(void) taskq_dispatch(zilog->zl_clean_taskq,
		    (void (*)(void *))zil_itx_clean, zilog, TQ_NOSLEEP);
	}
	mutex_exit(&zilog->zl_lock);
}
static void
zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid)
{
	uint64_t txg;
	uint64_t commit_seq = 0;
	itx_t *itx, *itx_next = (itx_t *)-1;
	lwb_t *lwb;
	spa_t *spa;

	zilog->zl_writer = B_TRUE;
	ASSERT(zilog->zl_root_zio == NULL);
	spa = zilog->zl_spa;

	if (zilog->zl_suspend) {
		lwb = NULL;
	} else {
		lwb = list_tail(&zilog->zl_lwb_list);
		if (lwb == NULL) {
			/*
			 * Return if there's nothing to flush before we
			 * dirty the fs by calling zil_create()
			 */
			if (list_is_empty(&zilog->zl_itx_list)) {
				zilog->zl_writer = B_FALSE;
				return;
			}
			mutex_exit(&zilog->zl_lock);
			zil_create(zilog);
			mutex_enter(&zilog->zl_lock);
			lwb = list_tail(&zilog->zl_lwb_list);
		}
	}

	/* Loop through in-memory log transactions filling log blocks. */
	DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
	for (;;) {
		/*
		 * Find the next itx to push:
		 * Push all transactions related to specified foid and all
		 * other transactions except TX_WRITE, TX_TRUNCATE,
		 * TX_SETATTR and TX_ACL for all other files.
		 */
		if (itx_next != (itx_t *)-1)
			itx = itx_next;
		else
			itx = list_head(&zilog->zl_itx_list);
		for (; itx != NULL; itx = list_next(&zilog->zl_itx_list, itx)) {
			if (foid == 0) /* push all foids? */
				break;
			if (itx->itx_sync) /* push all O_[D]SYNC */
				break;
			switch (itx->itx_lr.lrc_txtype) {
			case TX_SETATTR:
			case TX_WRITE:
			case TX_TRUNCATE:
			case TX_ACL:
				/* lr_foid is same offset for these records */
				if (((lr_write_t *)&itx->itx_lr)->lr_foid
				    != foid)
					continue; /* skip this record */
			}
			break;
		}
		if (itx == NULL)
			break;

		if ((itx->itx_lr.lrc_seq > seq) &&
		    ((lwb == NULL) || (lwb->lwb_nused == 0) ||
		    (lwb->lwb_nused + itx->itx_sod > ZIL_BLK_DATA_SZ(lwb)))) {
			break;
		}

		/*
		 * Save the next pointer.  Even though we soon drop
		 * zl_lock all threads that may change the list
		 * (another writer or zil_itx_clean) can't do so until
		 * they have zl_writer.
		 */
		itx_next = list_next(&zilog->zl_itx_list, itx);
		list_remove(&zilog->zl_itx_list, itx);
		zilog->zl_itx_list_sz -= itx->itx_sod;
		mutex_exit(&zilog->zl_lock);
		txg = itx->itx_lr.lrc_txg;

		if (txg > spa_last_synced_txg(spa) ||
		    txg > spa_freeze_txg(spa))
			lwb = zil_lwb_commit(zilog, itx, lwb);
		kmem_free(itx, offsetof(itx_t, itx_lr)
		    + itx->itx_lr.lrc_reclen);
		mutex_enter(&zilog->zl_lock);
	}
	DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);
	/* determine commit sequence number */
	itx = list_head(&zilog->zl_itx_list);
	if (itx)
		commit_seq = itx->itx_lr.lrc_seq;
	else
		commit_seq = zilog->zl_itx_seq;
	mutex_exit(&zilog->zl_lock);

	/* write the last block out */
	if (lwb != NULL && lwb->lwb_zio != NULL)
		lwb = zil_lwb_write_start(zilog, lwb);

	zilog->zl_prev_used = zilog->zl_cur_used;
	zilog->zl_cur_used = 0;

	/*
	 * Wait if necessary for the log blocks to be on stable storage.
	 */
	if (zilog->zl_root_zio) {
		DTRACE_PROBE1(zil__cw3, zilog_t *, zilog);
		(void) zio_wait(zilog->zl_root_zio);
		zilog->zl_root_zio = NULL;
		DTRACE_PROBE1(zil__cw4, zilog_t *, zilog);
		zil_flush_vdevs(zilog);
	}

	if (zilog->zl_log_error || lwb == NULL) {
		zilog->zl_log_error = 0;
		txg_wait_synced(zilog->zl_dmu_pool, 0);
	}

	mutex_enter(&zilog->zl_lock);
	zilog->zl_writer = B_FALSE;

	ASSERT3U(commit_seq, >=, zilog->zl_commit_seq);
	zilog->zl_commit_seq = commit_seq;
}
/*
 * Push zfs transactions to stable storage up to the supplied sequence number.
 * If foid is 0 push out all transactions, otherwise push only those
 * for that file or that might have been used to create that file.
 */
void
zil_commit(zilog_t *zilog, uint64_t seq, uint64_t foid)
{
	if (zilog == NULL || seq == 0)
		return;

	mutex_enter(&zilog->zl_lock);

	seq = MIN(seq, zilog->zl_itx_seq); /* cap seq at largest itx seq */

	while (zilog->zl_writer) {
		cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
		if (seq < zilog->zl_commit_seq) {
			mutex_exit(&zilog->zl_lock);
			return;
		}
	}
	zil_commit_writer(zilog, seq, foid); /* drops zl_lock */
	/* wake up others waiting on the commit */
	cv_broadcast(&zilog->zl_cv_writer);
	mutex_exit(&zilog->zl_lock);
}
/*
 * Called in syncing context to free committed log blocks and update log header.
 */
void
zil_sync(zilog_t *zilog, dmu_tx_t *tx)
{
	zil_header_t *zh = zil_header_in_syncing_context(zilog);
	uint64_t txg = dmu_tx_get_txg(tx);
	spa_t *spa = zilog->zl_spa;
	lwb_t *lwb;

	mutex_enter(&zilog->zl_lock);

	ASSERT(zilog->zl_stop_sync == 0);

	zh->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK];

	if (zilog->zl_destroy_txg == txg) {
		blkptr_t blk = zh->zh_log;

		ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
		ASSERT(spa_sync_pass(spa) == 1);

		bzero(zh, sizeof (zil_header_t));
		bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq));

		if (zilog->zl_keep_first) {
			/*
			 * If this block was part of a log chain that couldn't
			 * be claimed because a device was missing during
			 * zil_claim(), but that device later returns,
			 * then this block could erroneously appear valid.
			 * To guard against this, assign a new GUID to the new
			 * log chain so it doesn't matter what blk points to.
			 */
			zil_init_log_chain(zilog, &blk);
			zh->zh_log = blk;
		}
	}

	for (;;) {
		lwb = list_head(&zilog->zl_lwb_list);
		if (lwb == NULL) {
			mutex_exit(&zilog->zl_lock);
			return;
		}
		zh->zh_log = lwb->lwb_blk;
		if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
			break;
		list_remove(&zilog->zl_lwb_list, lwb);
		zio_free_blk(spa, &lwb->lwb_blk, txg);
		kmem_cache_free(zil_lwb_cache, lwb);

		/*
		 * If we don't have anything left in the lwb list then
		 * we've had an allocation failure and we need to zero
		 * out the zil_header blkptr so that we don't end
		 * up freeing the same block twice.
		 */
		if (list_head(&zilog->zl_lwb_list) == NULL)
			BP_ZERO(&zh->zh_log);
	}
	mutex_exit(&zilog->zl_lock);
}
void
zil_init(void)
{
	zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
	    sizeof (struct lwb), 0, NULL, NULL, NULL, NULL, NULL, 0);
}

void
zil_fini(void)
{
	kmem_cache_destroy(zil_lwb_cache);
}
zilog_t *
zil_alloc(objset_t *os, zil_header_t *zh_phys)
{
	zilog_t *zilog;

	zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);

	zilog->zl_header = zh_phys;
	zilog->zl_os = os;
	zilog->zl_spa = dmu_objset_spa(os);
	zilog->zl_dmu_pool = dmu_objset_pool(os);
	zilog->zl_destroy_txg = TXG_INITIAL - 1;

	mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);

	list_create(&zilog->zl_itx_list, sizeof (itx_t),
	    offsetof(itx_t, itx_node));

	list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
	    offsetof(lwb_t, lwb_node));

	mutex_init(&zilog->zl_vdev_lock, NULL, MUTEX_DEFAULT, NULL);

	avl_create(&zilog->zl_vdev_tree, zil_vdev_compare,
	    sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node));

	cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL);
	cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);

	return (zilog);
}
void
zil_free(zilog_t *zilog)
{
	lwb_t *lwb;

	zilog->zl_stop_sync = 1;

	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
		list_remove(&zilog->zl_lwb_list, lwb);
		if (lwb->lwb_buf != NULL)
			zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
		kmem_cache_free(zil_lwb_cache, lwb);
	}
	list_destroy(&zilog->zl_lwb_list);

	avl_destroy(&zilog->zl_vdev_tree);
	mutex_destroy(&zilog->zl_vdev_lock);

	ASSERT(list_head(&zilog->zl_itx_list) == NULL);
	list_destroy(&zilog->zl_itx_list);
	mutex_destroy(&zilog->zl_lock);

	cv_destroy(&zilog->zl_cv_writer);
	cv_destroy(&zilog->zl_cv_suspend);

	kmem_free(zilog, sizeof (zilog_t));
}
/*
 * return true if the initial log block is not valid
 */
static int
zil_empty(zilog_t *zilog)
{
	const zil_header_t *zh = zilog->zl_header;
	arc_buf_t *abuf = NULL;

	if (BP_IS_HOLE(&zh->zh_log))
		return (1);

	if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0)
		return (1);

	VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
	return (0);
}
/*
 * Open an intent log.
 */
zilog_t *
zil_open(objset_t *os, zil_get_data_t *get_data)
{
	zilog_t *zilog = dmu_objset_zil(os);

	zilog->zl_get_data = get_data;
	zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri,
	    2, 2, TASKQ_PREPOPULATE);

	return (zilog);
}
/*
 * Close an intent log.
 */
void
zil_close(zilog_t *zilog)
{
	/*
	 * If the log isn't already committed, mark the objset dirty
	 * (so zil_sync() will be called) and wait for that txg to sync.
	 */
	if (!zil_is_committed(zilog)) {
		uint64_t txg;
		dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
		(void) dmu_tx_assign(tx, TXG_WAIT);
		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
		txg = dmu_tx_get_txg(tx);
		dmu_tx_commit(tx);
		txg_wait_synced(zilog->zl_dmu_pool, txg);
	}

	taskq_destroy(zilog->zl_clean_taskq);
	zilog->zl_clean_taskq = NULL;
	zilog->zl_get_data = NULL;

	zil_itx_clean(zilog);
	ASSERT(list_head(&zilog->zl_itx_list) == NULL);
}
/*
 * Suspend an intent log.  While in suspended mode, we still honor
 * synchronous semantics, but we rely on txg_wait_synced() to do it.
 * We suspend the log briefly when taking a snapshot so that the snapshot
 * contains all the data it's supposed to, and has an empty intent log.
 */
int
zil_suspend(zilog_t *zilog)
{
	const zil_header_t *zh = zilog->zl_header;

	mutex_enter(&zilog->zl_lock);
	if (zh->zh_claim_txg != 0) {		/* unplayed log */
		mutex_exit(&zilog->zl_lock);
		return (EBUSY);
	}
	if (zilog->zl_suspend++ != 0) {
		/*
		 * Someone else already began a suspend.
		 * Just wait for them to finish.
		 */
		while (zilog->zl_suspending)
			cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
		mutex_exit(&zilog->zl_lock);
		return (0);
	}
	zilog->zl_suspending = B_TRUE;
	mutex_exit(&zilog->zl_lock);

	zil_commit(zilog, UINT64_MAX, 0);

	/*
	 * Wait for any in-flight log writes to complete.
	 */
	mutex_enter(&zilog->zl_lock);
	while (zilog->zl_writer)
		cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
	mutex_exit(&zilog->zl_lock);

	zil_destroy(zilog, B_FALSE);

	mutex_enter(&zilog->zl_lock);
	zilog->zl_suspending = B_FALSE;
	cv_broadcast(&zilog->zl_cv_suspend);
	mutex_exit(&zilog->zl_lock);

	return (0);
}
void
zil_resume(zilog_t *zilog)
{
	mutex_enter(&zilog->zl_lock);
	ASSERT(zilog->zl_suspend != 0);
	zilog->zl_suspend--;
	mutex_exit(&zilog->zl_lock);
}
typedef struct zil_replay_arg {
	objset_t *zr_os;
	zil_replay_func_t **zr_replay;
	zil_replay_cleaner_t *zr_replay_cleaner;
	void *zr_arg;
	uint64_t *zr_txgp;
	boolean_t zr_byteswap;
	char *zr_lrbuf;
} zil_replay_arg_t;
static void
zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
{
	zil_replay_arg_t *zr = zra;
	const zil_header_t *zh = zilog->zl_header;
	uint64_t reclen = lr->lrc_reclen;
	uint64_t txtype = lr->lrc_txtype;
	char *name;
	int pass, error, sunk;

	if (zilog->zl_stop_replay)
		return;

	if (lr->lrc_txg < claim_txg)		/* already committed */
		return;

	if (lr->lrc_seq <= zh->zh_replay_seq)	/* already replayed */
		return;

	/* Strip case-insensitive bit, still present in log record */
	txtype &= ~TX_CI;

	/*
	 * Make a copy of the data so we can revise and extend it.
	 */
	bcopy(lr, zr->zr_lrbuf, reclen);

	/*
	 * The log block containing this lr may have been byteswapped
	 * so that we can easily examine common fields like lrc_txtype.
	 * However, the log is a mix of different data types, and only the
	 * replay vectors know how to byteswap their records.  Therefore, if
	 * the lr was byteswapped, undo it before invoking the replay vector.
	 */
	if (zr->zr_byteswap)
		byteswap_uint64_array(zr->zr_lrbuf, reclen);

	/*
	 * If this is a TX_WRITE with a blkptr, suck in the data.
	 */
	if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
		lr_write_t *lrw = (lr_write_t *)lr;
		blkptr_t *wbp = &lrw->lr_blkptr;
		uint64_t wlen = lrw->lr_length;
		char *wbuf = zr->zr_lrbuf + reclen;

		if (BP_IS_HOLE(wbp)) {	/* compressed to a hole */
			bzero(wbuf, wlen);
		} else {
			/*
			 * A subsequent write may have overwritten this block,
			 * in which case wbp may have been freed and
			 * reallocated, and our read of wbp may fail with a
			 * checksum error.  We can safely ignore this because
			 * the later write will provide the correct data.
			 */
			zbookmark_t zb;

			zb.zb_objset = dmu_objset_id(zilog->zl_os);
			zb.zb_object = lrw->lr_foid;
			zb.zb_level = -1;
			zb.zb_blkid = lrw->lr_offset / BP_GET_LSIZE(wbp);

			(void) zio_wait(zio_read(NULL, zilog->zl_spa,
			    wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL,
			    ZIO_PRIORITY_SYNC_READ,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb));
			(void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen);
		}
	}

	/*
	 * Replay of large truncates can end up needing additional txs
	 * and a different txg. If they are nested within the replay tx
	 * as below then a hang is possible. So we do the truncate here
	 * and redo the truncate later (a no-op) and update the sequence
	 * number whilst in the replay tx. Fortunately, it's safe to repeat
	 * a truncate if we crash and the truncate commits. A create over
	 * an existing file will also come in as a TX_TRUNCATE record.
	 *
	 * Note, remove of large files and renames over large files is
	 * handled by putting the deleted object on a stable list
	 * and if necessary force deleting the object outside of the replay
	 * transaction using the zr_replay_cleaner.
	 */
	if (txtype == TX_TRUNCATE) {
		*zr->zr_txgp = TXG_NOWAIT;
		error = zr->zr_replay[TX_TRUNCATE](zr->zr_arg, zr->zr_lrbuf,
		    zr->zr_byteswap);
		if (error)
			goto bad;
		zr->zr_byteswap = 0; /* only byteswap once */
	}

	/*
	 * We must now do two things atomically: replay this log record,
	 * and update the log header to reflect the fact that we did so.
	 * We use the DMU's ability to assign into a specific txg to do this.
	 */
	for (pass = 1, sunk = B_FALSE; /* CONSTANTCONDITION */; pass++) {
		uint64_t replay_txg;
		dmu_tx_t *replay_tx;

		replay_tx = dmu_tx_create(zr->zr_os);
		error = dmu_tx_assign(replay_tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(replay_tx);
			break;
		}

		replay_txg = dmu_tx_get_txg(replay_tx);

		if (txtype == 0 || txtype >= TX_MAX_TYPE) {
			error = EINVAL;
		} else {
			/*
			 * On the first pass, arrange for the replay vector
			 * to fail its dmu_tx_assign().  That's the only way
			 * to ensure that those code paths remain well tested.
			 *
			 * Only byteswap (if needed) on the 1st pass.
			 */
			*zr->zr_txgp = replay_txg - (pass == 1);
			error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
			    zr->zr_byteswap && pass == 1);
			*zr->zr_txgp = TXG_NOWAIT;
		}

		if (error == 0) {
			dsl_dataset_dirty(dmu_objset_ds(zr->zr_os), replay_tx);
			zilog->zl_replay_seq[replay_txg & TXG_MASK] =
			    lr->lrc_seq;
		}

		dmu_tx_commit(replay_tx);

		if (!error)
			return;

		/*
		 * The DMU's dnode layer doesn't see removes until the txg
		 * commits, so a subsequent claim can spuriously fail with
		 * EEXIST. So if we receive any error other than ERESTART
		 * we try syncing out any removes then retrying the
		 * transaction.
		 */
		if (error != ERESTART && !sunk) {
			if (zr->zr_replay_cleaner)
				zr->zr_replay_cleaner(zr->zr_arg);
			txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
			sunk = B_TRUE;
			continue; /* retry */
		}

		if (error != ERESTART)
			break;

		if (pass != 1)
			txg_wait_open(spa_get_dsl(zilog->zl_spa),
			    replay_txg + 1);

		dprintf("pass %d, retrying\n", pass);
	}

bad:
	ASSERT(error && error != ERESTART);
	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
	dmu_objset_name(zr->zr_os, name);
	cmn_err(CE_WARN, "ZFS replay transaction error %d, "
	    "dataset %s, seq 0x%llx, txtype %llu %s\n",
	    error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype,
	    (lr->lrc_txtype & TX_CI) ? "CI" : "");
	zilog->zl_stop_replay = 1;
	kmem_free(name, MAXNAMELEN);
}
/* ARGSUSED */
static void
zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
	zilog->zl_replay_blks++;
}
/*
 * If this dataset has a non-empty intent log, replay it and destroy it.
 */
void
zil_replay(objset_t *os, void *arg, uint64_t *txgp,
    zil_replay_func_t *replay_func[TX_MAX_TYPE],
    zil_replay_cleaner_t *replay_cleaner)
{
	zilog_t *zilog = dmu_objset_zil(os);
	const zil_header_t *zh = zilog->zl_header;
	zil_replay_arg_t zr;

	if (zil_empty(zilog)) {
		zil_destroy(zilog, B_TRUE);
		return;
	}

	zr.zr_os = os;
	zr.zr_replay = replay_func;
	zr.zr_replay_cleaner = replay_cleaner;
	zr.zr_arg = arg;
	zr.zr_txgp = txgp;
	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
	zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);

	/*
	 * Wait for in-progress removes to sync before starting replay.
	 */
	txg_wait_synced(zilog->zl_dmu_pool, 0);

	zilog->zl_stop_replay = 0;
	zilog->zl_replay_time = lbolt;
	ASSERT(zilog->zl_replay_blks == 0);
	(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
	    zh->zh_claim_txg);
	kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE);

	zil_destroy(zilog, B_FALSE);
	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
}
/*
 * Report whether all transactions are committed
 */
int
zil_is_committed(zilog_t *zilog)
{
	lwb_t *lwb;
	int ret;

	mutex_enter(&zilog->zl_lock);
	while (zilog->zl_writer)
		cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);

	/* recent unpushed intent log transactions? */
	if (!list_is_empty(&zilog->zl_itx_list)) {
		ret = B_FALSE;
		goto out;
	}

	/* intent log never used? */
	lwb = list_head(&zilog->zl_lwb_list);
	if (lwb == NULL) {
		ret = B_TRUE;
		goto out;
	}

	/*
	 * more than 1 log buffer means zil_sync() hasn't yet freed
	 * entries after a txg has committed
	 */
	if (list_next(&zilog->zl_lwb_list, lwb)) {
		ret = B_FALSE;
		goto out;
	}

	ASSERT(zil_empty(zilog));
	ret = B_TRUE;
out:
	cv_broadcast(&zilog->zl_cv_writer);
	mutex_exit(&zilog->zl_lock);
	return (ret);
}