/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
/*
 * ==========================================================================
 * I/O priority table
 * ==========================================================================
 */
uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
        0,      /* ZIO_PRIORITY_NOW             */
        0,      /* ZIO_PRIORITY_SYNC_READ       */
        0,      /* ZIO_PRIORITY_SYNC_WRITE      */
        6,      /* ZIO_PRIORITY_ASYNC_READ      */
        4,      /* ZIO_PRIORITY_ASYNC_WRITE     */
        4,      /* ZIO_PRIORITY_FREE            */
        0,      /* ZIO_PRIORITY_CACHE_FILL      */
        0,      /* ZIO_PRIORITY_LOG_WRITE       */
        10,     /* ZIO_PRIORITY_RESILVER        */
        20,     /* ZIO_PRIORITY_SCRUB           */
};
/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
char *zio_type_name[ZIO_TYPES] = {
        "null", "read", "write", "free", "claim", "ioctl" };
#define SYNC_PASS_DEFERRED_FREE 1       /* defer frees after this pass */
#define SYNC_PASS_DONT_COMPRESS 4       /* don't compress after this pass */
#define SYNC_PASS_REWRITE       1       /* rewrite new bps after this pass */
/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#if defined(_KERNEL) && !defined(__NetBSD__)
extern vmem_t *zio_alloc_arena;
#endif

/*
 * An allocating zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define IO_IS_ALLOCATING(zio) \
        ((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE))
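/*
 * Illustrative note (not part of the original source): because the macro
 * tests io_orig_pipeline rather than io_pipeline, a logical write created
 * with ZIO_WRITE_PIPELINE counts as "allocating" for its entire lifetime,
 * even before or after the DVA allocate stage actually runs; a plain read
 * pipeline never sets the bit.
 */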
void
zio_init(void)
{
        size_t c;
        vmem_t *data_alloc_arena = NULL;

#if defined(_KERNEL) && !defined(__NetBSD__)
        data_alloc_arena = zio_alloc_arena;
#endif
        zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0,
            NULL, NULL, NULL, NULL, NULL, 0);

#ifndef __NetBSD__
        /*
         * For small buffers, we want a cache for each multiple of
         * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
         * for each quarter-power of 2.  For large buffers, we want
         * a cache for each multiple of PAGESIZE.
         */
        for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
                size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
                size_t p2 = size;
                size_t align = 0;

                while (p2 & (p2 - 1))
                        p2 &= p2 - 1;

                if (size <= 4 * SPA_MINBLOCKSIZE) {
                        align = SPA_MINBLOCKSIZE;
                } else if (P2PHASE(size, PAGESIZE) == 0) {
                        align = PAGESIZE;
                } else if (P2PHASE(size, p2 >> 2) == 0) {
                        align = p2 >> 2;
                }

                if (align != 0) {
                        char name[36];

                        (void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
                        zio_buf_cache[c] = kmem_cache_create(name, size,
                            align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);

                        (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
                        zio_data_buf_cache[c] = kmem_cache_create(name, size,
                            align, NULL, NULL, NULL, NULL, data_alloc_arena,
                            KMC_NODEBUG);
                }
        }

        while (--c != 0) {
                ASSERT(zio_buf_cache[c] != NULL);
                if (zio_buf_cache[c - 1] == NULL)
                        zio_buf_cache[c - 1] = zio_buf_cache[c];

                ASSERT(zio_data_buf_cache[c] != NULL);
                if (zio_data_buf_cache[c - 1] == NULL)
                        zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
        }
#endif /* __NetBSD__ */
}
void
zio_fini(void)
{
        size_t c;
        kmem_cache_t *last_cache = NULL;
        kmem_cache_t *last_data_cache = NULL;

#ifndef __NetBSD__
        for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
                if (zio_buf_cache[c] != last_cache) {
                        last_cache = zio_buf_cache[c];
                        kmem_cache_destroy(zio_buf_cache[c]);
                }
                zio_buf_cache[c] = NULL;

                if (zio_data_buf_cache[c] != last_data_cache) {
                        last_data_cache = zio_data_buf_cache[c];
                        kmem_cache_destroy(zio_data_buf_cache[c]);
                }
                zio_data_buf_cache[c] = NULL;
        }
#endif /* __NetBSD__ */

        kmem_cache_destroy(zio_cache);
}
/*
 * ==========================================================================
 * Allocate and free I/O buffers
 * ==========================================================================
 */

/*
 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 * excess / transient data in-core during a crashdump.
 */
void *
zio_buf_alloc(size_t size)
{
        size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

        ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

#ifdef __NetBSD__
        return (kmem_alloc(size, KM_SLEEP));
#else
        return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
#endif
}
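/*
 * Illustrative usage sketch (not part of the original file): a caller must
 * free with the same size it allocated, e.g.
 *
 *      void *buf = zio_buf_alloc(lsize);
 *      ...
 *      zio_buf_free(buf, lsize);
 *
 * because the size is what selects the zio_buf_cache[] entry (or the
 * kmem_free() size on NetBSD) on the free path.
 */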
/*
 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 * crashdump if the kernel panics.  This exists so that we will limit the
 * amount of ZFS data that shows up in a kernel crashdump.  (Thus reducing
 * the amount of kernel heap dumped to disk when the kernel panics)
 */
void *
zio_data_buf_alloc(size_t size)
{
        size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

        ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

#ifdef __NetBSD__
        return (kmem_alloc(size, KM_SLEEP));
#else
        return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
#endif
}
void
zio_buf_free(void *buf, size_t size)
{
        size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

        ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

#ifdef __NetBSD__
        kmem_free(buf, size);
#else
        kmem_cache_free(zio_buf_cache[c], buf);
#endif
}
void
zio_data_buf_free(void *buf, size_t size)
{
        size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

        ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

#ifdef __NetBSD__
        kmem_free(buf, size);
#else
        kmem_cache_free(zio_data_buf_cache[c], buf);
#endif
}
/*
 * ==========================================================================
 * Push and pop I/O transform buffers
 * ==========================================================================
 */
static void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
    zio_transform_func_t *transform)
{
        zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

        zt->zt_orig_data = zio->io_data;
        zt->zt_orig_size = zio->io_size;
        zt->zt_bufsize = bufsize;
        zt->zt_transform = transform;

        zt->zt_next = zio->io_transform_stack;
        zio->io_transform_stack = zt;

        zio->io_data = data;
        zio->io_size = size;
}
static void
zio_pop_transforms(zio_t *zio)
{
        zio_transform_t *zt;

        while ((zt = zio->io_transform_stack) != NULL) {
                if (zt->zt_transform != NULL)
                        zt->zt_transform(zio,
                            zt->zt_orig_data, zt->zt_orig_size);

                if (zt->zt_bufsize != 0)
                        zio_buf_free(zio->io_data, zt->zt_bufsize);

                zio->io_data = zt->zt_orig_data;
                zio->io_size = zt->zt_orig_size;
                zio->io_transform_stack = zt->zt_next;

                kmem_free(zt, sizeof (zio_transform_t));
        }
}
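/*
 * Illustrative pairing (not part of the original file): a stage that needs a
 * temporary buffer pushes a transform and lets zio_pop_transforms() undo it
 * when the I/O completes, e.g.
 *
 *      void *cbuf = zio_buf_alloc(csize);
 *      zio_push_transform(zio, cbuf, csize, csize, zio_decompress);
 *
 * The non-zero bufsize is what tells the pop path to free cbuf.
 */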
/*
 * ==========================================================================
 * I/O transform callbacks for subblocks and decompression
 * ==========================================================================
 */
static void
zio_subblock(zio_t *zio, void *data, uint64_t size)
{
        ASSERT(zio->io_size > size);

        if (zio->io_type == ZIO_TYPE_READ)
                bcopy(zio->io_data, data, size);
}

static void
zio_decompress(zio_t *zio, void *data, uint64_t size)
{
        if (zio->io_error == 0 &&
            zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
            zio->io_data, zio->io_size, data, size) != 0)
                zio->io_error = EIO;
}
/*
 * ==========================================================================
 * I/O parent/child relationships and pipeline interlocks
 * ==========================================================================
 */

static void
zio_add_child(zio_t *pio, zio_t *zio)
{
        mutex_enter(&pio->io_lock);
        if (zio->io_stage < ZIO_STAGE_READY)
                pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++;
        if (zio->io_stage < ZIO_STAGE_DONE)
                pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++;
        zio->io_sibling_prev = NULL;
        zio->io_sibling_next = pio->io_child;
        if (pio->io_child != NULL)
                pio->io_child->io_sibling_prev = zio;
        pio->io_child = zio;
        zio->io_parent = pio;
        mutex_exit(&pio->io_lock);
}
static void
zio_remove_child(zio_t *pio, zio_t *zio)
{
        zio_t *next, *prev;

        ASSERT(zio->io_parent == pio);

        mutex_enter(&pio->io_lock);
        next = zio->io_sibling_next;
        prev = zio->io_sibling_prev;
        if (next != NULL)
                next->io_sibling_prev = prev;
        if (prev != NULL)
                prev->io_sibling_next = next;
        if (pio->io_child == zio)
                pio->io_child = next;
        mutex_exit(&pio->io_lock);
}
static boolean_t
zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
{
        uint64_t *countp = &zio->io_children[child][wait];
        boolean_t waiting = B_FALSE;

        mutex_enter(&zio->io_lock);
        ASSERT(zio->io_stall == NULL);
        if (*countp != 0) {
                zio->io_stage--;
                zio->io_stall = countp;
                waiting = B_TRUE;
        }
        mutex_exit(&zio->io_lock);

        return (waiting);
}
static void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
        uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
        int *errorp = &pio->io_child_error[zio->io_child_type];

        mutex_enter(&pio->io_lock);
        if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
                *errorp = zio_worst_error(*errorp, zio->io_error);
        pio->io_reexecute |= zio->io_reexecute;
        ASSERT3U(*countp, >, 0);
        if (--*countp == 0 && pio->io_stall == countp) {
                pio->io_stall = NULL;
                mutex_exit(&pio->io_lock);
                zio_execute(pio);
        } else {
                mutex_exit(&pio->io_lock);
        }
}
static void
zio_inherit_child_errors(zio_t *zio, enum zio_child c)
{
        if (zio->io_child_error[c] != 0 && zio->io_error == 0)
                zio->io_error = zio->io_child_error[c];
}
/*
 * ==========================================================================
 * Create the various types of I/O (read, write, free, etc)
 * ==========================================================================
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, int priority, int flags, vdev_t *vd, uint64_t offset,
    const zbookmark_t *zb, uint8_t stage, uint32_t pipeline)
{
        zio_t *zio;

        ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
        ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
        ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

        ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
        ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
        ASSERT(vd || stage == ZIO_STAGE_OPEN);

        zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
        bzero(zio, sizeof (zio_t));

        mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);

        if (vd != NULL)
                zio->io_child_type = ZIO_CHILD_VDEV;
        else if (flags & ZIO_FLAG_GANG_CHILD)
                zio->io_child_type = ZIO_CHILD_GANG;
        else
                zio->io_child_type = ZIO_CHILD_LOGICAL;

        if (bp != NULL) {
                zio->io_bp = bp;
                zio->io_bp_copy = *bp;
                zio->io_bp_orig = *bp;
                if (type != ZIO_TYPE_WRITE)
                        zio->io_bp = &zio->io_bp_copy;  /* so caller can free */
                if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
                        if (BP_IS_GANG(bp))
                                pipeline |= ZIO_GANG_STAGES;
                        zio->io_logical = zio;
                }
        }

        zio->io_spa = spa;
        zio->io_txg = txg;
        zio->io_data = data;
        zio->io_size = size;
        zio->io_done = done;
        zio->io_private = private;
        zio->io_type = type;
        zio->io_priority = priority;
        zio->io_vd = vd;
        zio->io_offset = offset;
        zio->io_orig_flags = zio->io_flags = flags;
        zio->io_orig_stage = zio->io_stage = stage;
        zio->io_orig_pipeline = zio->io_pipeline = pipeline;

        if (zb != NULL)
                zio->io_bookmark = *zb;

        if (pio != NULL) {
                /*
                 * Logical I/Os can have logical, gang, or vdev children.
                 * Gang I/Os can have gang or vdev children.
                 * Vdev I/Os can only have vdev children.
                 * The following ASSERT captures all of these constraints.
                 */
                ASSERT(zio->io_child_type <= pio->io_child_type);
                if (zio->io_logical == NULL)
                        zio->io_logical = pio->io_logical;
                zio_add_child(pio, zio);
        }

        return (zio);
}
static void
zio_destroy(zio_t *zio)
{
        spa_t *spa = zio->io_spa;
        uint8_t async_root = zio->io_async_root;

        mutex_destroy(&zio->io_lock);
        cv_destroy(&zio->io_cv);
        kmem_cache_free(zio_cache, zio);

        if (async_root) {
                mutex_enter(&spa->spa_async_root_lock);
                if (--spa->spa_async_root_count == 0)
                        cv_broadcast(&spa->spa_async_root_cv);
                mutex_exit(&spa->spa_async_root_lock);
        }
}
zio_t *
zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
    int flags)
{
        zio_t *zio;

        zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
            ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL,
            ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);

        return (zio);
}
zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
{
        return (zio_null(NULL, spa, done, private, flags));
}
zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    int priority, int flags, const zbookmark_t *zb)
{
        zio_t *zio;

        zio = zio_create(pio, spa, bp->blk_birth, (blkptr_t *)bp,
            data, size, done, private,
            ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
            ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);

        return (zio);
}
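/*
 * Illustrative caller sketch (hypothetical, not from this file): a simple
 * synchronous read of a block pointer might look like
 *
 *      zio_t *zio = zio_read(NULL, spa, bp, buf, BP_GET_LSIZE(bp),
 *          NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb);
 *      error = zio_wait(zio);
 *
 * Decompression, if any, is handled by the pipeline via zio_read_bp_init().
 */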
void
zio_skip_write(zio_t *zio)
{
        ASSERT(zio->io_type == ZIO_TYPE_WRITE);
        ASSERT(zio->io_stage == ZIO_STAGE_READY);
        ASSERT(!BP_IS_GANG(zio->io_bp));

        zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
}
zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, zio_prop_t *zp,
    zio_done_func_t *ready, zio_done_func_t *done, void *private,
    int priority, int flags, const zbookmark_t *zb)
{
        zio_t *zio;

        ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
            zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
            zp->zp_compress >= ZIO_COMPRESS_OFF &&
            zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
            zp->zp_type < DMU_OT_NUMTYPES &&
            zp->zp_ndvas > 0 &&
            zp->zp_ndvas <= spa_max_replication(spa));
        ASSERT(ready != NULL);

        zio = zio_create(pio, spa, txg, bp, data, size, done, private,
            ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
            ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);

        zio->io_ready = ready;
        zio->io_prop = *zp;

        return (zio);
}
zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private, int priority,
    int flags, zbookmark_t *zb)
{
        zio_t *zio;

        zio = zio_create(pio, spa, txg, bp, data, size, done, private,
            ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
            ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);

        return (zio);
}
zio_t *
zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    zio_done_func_t *done, void *private, int flags)
{
        zio_t *zio;

        ASSERT(!BP_IS_HOLE(bp));

        if (bp->blk_fill == BLK_FILL_ALREADY_FREED)
                return (zio_null(pio, spa, NULL, NULL, flags));

        if (txg == spa->spa_syncing_txg &&
            spa_sync_pass(spa) > SYNC_PASS_DEFERRED_FREE) {
                bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
                return (zio_null(pio, spa, NULL, NULL, flags));
        }

        zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
            done, private, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
            NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);

        return (zio);
}
zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    zio_done_func_t *done, void *private, int flags)
{
        zio_t *zio;

        /*
         * A claim is an allocation of a specific block.  Claims are needed
         * to support immediate writes in the intent log.  The issue is that
         * immediate writes contain committed data, but in a txg that was
         * *not* committed.  Upon opening the pool after an unclean shutdown,
         * the intent log claims all blocks that contain immediate write data
         * so that the SPA knows they're in use.
         *
         * All claims *must* be resolved in the first txg -- before the SPA
         * starts allocating blocks -- so that nothing is allocated twice.
         */
        ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
        ASSERT3U(spa_first_txg(spa), <=, txg);

        zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
            done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
            NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);

        return (zio);
}
zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
    zio_done_func_t *done, void *private, int priority, int flags)
{
        zio_t *zio;
        int c;

        if (vd->vdev_children == 0) {
                zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
                    ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL,
                    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

                zio->io_cmd = cmd;
        } else {
                zio = zio_null(pio, spa, NULL, NULL, flags);

                for (c = 0; c < vd->vdev_children; c++)
                        zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
                            done, private, priority, flags));
        }

        return (zio);
}
zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, int flags, boolean_t labels)
{
        zio_t *zio;

        ASSERT(vd->vdev_children == 0);
        ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
            offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
        ASSERT3U(offset + size, <=, vd->vdev_psize);

        zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
            ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
            ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

        zio->io_prop.zp_checksum = checksum;

        return (zio);
}
zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, int flags, boolean_t labels)
{
        zio_t *zio;

        ASSERT(vd->vdev_children == 0);
        ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
            offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
        ASSERT3U(offset + size, <=, vd->vdev_psize);

        zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
            ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
            ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

        zio->io_prop.zp_checksum = checksum;

        if (zio_checksum_table[checksum].ci_zbt) {
                /*
                 * zbt checksums are necessarily destructive -- they modify
                 * the end of the write buffer to hold the verifier/checksum.
                 * Therefore, we must make a local copy in case the data is
                 * being written to multiple places in parallel.
                 */
                void *wbuf = zio_buf_alloc(size);
                bcopy(data, wbuf, size);
                zio_push_transform(zio, wbuf, size, size, NULL);
        }

        return (zio);
}
/*
 * Create a child I/O to do some work for us.
 */
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, int priority, int flags,
    zio_done_func_t *done, void *private)
{
        uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
        zio_t *zio;

        ASSERT(vd->vdev_parent ==
            (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));

        if (type == ZIO_TYPE_READ && bp != NULL) {
                /*
                 * If we have the bp, then the child should perform the
                 * checksum and the parent need not.  This pushes error
                 * detection as close to the leaves as possible and
                 * eliminates redundant checksums in the interior nodes.
                 */
                pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
                pio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
        }

        if (vd->vdev_children == 0)
                offset += VDEV_LABEL_START_SIZE;

        zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
            done, private, type, priority,
            (pio->io_flags & ZIO_FLAG_VDEV_INHERIT) |
            ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | flags,
            vd, offset, &pio->io_bookmark,
            ZIO_STAGE_VDEV_IO_START - 1, pipeline);

        return (zio);
}
zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
    int type, int priority, int flags, zio_done_func_t *done, void *private)
{
        zio_t *zio;

        ASSERT(vd->vdev_ops->vdev_op_leaf);

        zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
            data, size, done, private, type, priority,
            flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
            vd, offset, NULL,
            ZIO_STAGE_VDEV_IO_START - 1, ZIO_VDEV_CHILD_PIPELINE);

        return (zio);
}
void
zio_flush(zio_t *zio, vdev_t *vd)
{
        zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
            NULL, NULL, ZIO_PRIORITY_NOW,
            ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}
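/*
 * Illustrative usage sketch (not part of the original file): write-cache
 * flushes are typically issued against each vdev of interest under a common
 * parent, then collected with zio_wait() on that parent:
 *
 *      zio_t *root = zio_root(spa, NULL, NULL, 0);
 *      zio_flush(root, vd);
 *      (void) zio_wait(root);
 */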
/*
 * ==========================================================================
 * Prepare to read and write logical blocks
 * ==========================================================================
 */

static int
zio_read_bp_init(zio_t *zio)
{
        blkptr_t *bp = zio->io_bp;

        if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_logical == zio) {
                uint64_t csize = BP_GET_PSIZE(bp);
                void *cbuf = zio_buf_alloc(csize);

                zio_push_transform(zio, cbuf, csize, csize, zio_decompress);
        }

        if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
                zio->io_flags |= ZIO_FLAG_DONT_CACHE;

        return (ZIO_PIPELINE_CONTINUE);
}
static int
zio_write_bp_init(zio_t *zio)
{
        zio_prop_t *zp = &zio->io_prop;
        int compress = zp->zp_compress;
        blkptr_t *bp = zio->io_bp;
        void *cbuf;
        uint64_t lsize = zio->io_size;
        uint64_t csize = lsize;
        uint64_t cbufsize = 0;
        int pass = 1;

        /*
         * If our children haven't all reached the ready stage,
         * wait for them and then repeat this pipeline stage.
         */
        if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
            zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
                return (ZIO_PIPELINE_STOP);

        if (!IO_IS_ALLOCATING(zio))
                return (ZIO_PIPELINE_CONTINUE);

        ASSERT(compress != ZIO_COMPRESS_INHERIT);

        if (bp->blk_birth == zio->io_txg) {
                /*
                 * We're rewriting an existing block, which means we're
                 * working on behalf of spa_sync().  For spa_sync() to
                 * converge, it must eventually be the case that we don't
                 * have to allocate new blocks.  But compression changes
                 * the blocksize, which forces a reallocate, and makes
                 * convergence take longer.  Therefore, after the first
                 * few passes, stop compressing to ensure convergence.
                 */
                pass = spa_sync_pass(zio->io_spa);
                ASSERT(pass > 1);

                if (pass > SYNC_PASS_DONT_COMPRESS)
                        compress = ZIO_COMPRESS_OFF;

                /*
                 * Only MOS (objset 0) data should need to be rewritten.
                 */
                ASSERT(zio->io_logical->io_bookmark.zb_objset == 0);

                /* Make sure someone doesn't change their mind on overwrites */
                ASSERT(MIN(zp->zp_ndvas + BP_IS_GANG(bp),
                    spa_max_replication(zio->io_spa)) == BP_GET_NDVAS(bp));
        }

        if (compress != ZIO_COMPRESS_OFF) {
                if (!zio_compress_data(compress, zio->io_data, zio->io_size,
                    &cbuf, &csize, &cbufsize)) {
                        compress = ZIO_COMPRESS_OFF;
                } else if (csize != 0) {
                        zio_push_transform(zio, cbuf, csize, cbufsize, NULL);
                }
        }

        /*
         * The final pass of spa_sync() must be all rewrites, but the first
         * few passes offer a trade-off: allocating blocks defers convergence,
         * but newly allocated blocks are sequential, so they can be written
         * to disk faster.  Therefore, we allow the first few passes of
         * spa_sync() to allocate new blocks, but force rewrites after that.
         * There should only be a handful of blocks after pass 1 in any case.
         */
        if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
            pass > SYNC_PASS_REWRITE) {
                uint32_t gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
                zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
                zio->io_flags |= ZIO_FLAG_IO_REWRITE;
        } else {
                BP_ZERO(bp);
                zio->io_pipeline = ZIO_WRITE_PIPELINE;
        }

        if (csize == 0) {
                zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
        } else {
                ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
                BP_SET_LSIZE(bp, lsize);
                BP_SET_PSIZE(bp, csize);
                BP_SET_COMPRESS(bp, compress);
                BP_SET_CHECKSUM(bp, zp->zp_checksum);
                BP_SET_TYPE(bp, zp->zp_type);
                BP_SET_LEVEL(bp, zp->zp_level);
                BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
        }

        return (ZIO_PIPELINE_CONTINUE);
}
/*
 * ==========================================================================
 * Execute the I/O pipeline
 * ==========================================================================
 */

static void
zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q)
{
        zio_type_t t = zio->io_type;

        /*
         * If we're a config writer, the normal issue and interrupt threads
         * may all be blocked waiting for the config lock.  In this case,
         * select the otherwise-unused taskq for ZIO_TYPE_NULL.
         */
        if (zio->io_flags & ZIO_FLAG_CONFIG_WRITER)
                t = ZIO_TYPE_NULL;

        /*
         * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
         */
        if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
                t = ZIO_TYPE_NULL;

        (void) taskq_dispatch(zio->io_spa->spa_zio_taskq[t][q],
            (task_func_t *)zio_execute, zio, TQ_SLEEP);
}

static boolean_t
zio_taskq_member(zio_t *zio, enum zio_taskq_type q)
{
        kthread_t *executor = zio->io_executor;
        spa_t *spa = zio->io_spa;

        for (zio_type_t t = 0; t < ZIO_TYPES; t++)
                if (taskq_member(spa->spa_zio_taskq[t][q], executor))
                        return (B_TRUE);

        return (B_FALSE);
}

static int
zio_issue_async(zio_t *zio)
{
        zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);

        return (ZIO_PIPELINE_STOP);
}
void
zio_interrupt(zio_t *zio)
{
        zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT);
}
/*
 * Execute the I/O pipeline until one of the following occurs:
 * (1) the I/O completes; (2) the pipeline stalls waiting for
 * dependent child I/Os; (3) the I/O issues, so we're waiting
 * for an I/O completion interrupt; (4) the I/O is delegated by
 * vdev-level caching or aggregation; (5) the I/O is deferred
 * due to vdev-level queueing; (6) the I/O is handed off to
 * another thread.  In all cases, the pipeline stops whenever
 * there's no CPU work; it never burns a thread in cv_wait().
 *
 * There's no locking on io_stage because there's no legitimate way
 * for multiple threads to be attempting to process the same I/O.
 */
static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES];

void
zio_execute(zio_t *zio)
{
        zio->io_executor = curthread;

        while (zio->io_stage < ZIO_STAGE_DONE) {
                uint32_t pipeline = zio->io_pipeline;
                zio_stage_t stage = zio->io_stage;
                int rv;

                ASSERT(!MUTEX_HELD(&zio->io_lock));

                while (((1U << ++stage) & pipeline) == 0)
                        continue;

                ASSERT(stage <= ZIO_STAGE_DONE);
                ASSERT(zio->io_stall == NULL);

                /*
                 * If we are in interrupt context and this pipeline stage
                 * will grab a config lock that is held across I/O,
                 * issue async to avoid deadlock.
                 */
                if (((1U << stage) & ZIO_CONFIG_LOCK_BLOCKING_STAGES) &&
                    zio->io_vd == NULL &&
                    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
                        zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
                        return;
                }

                zio->io_stage = stage;
                rv = zio_pipeline[stage](zio);

                if (rv == ZIO_PIPELINE_STOP)
                        return;

                ASSERT(rv == ZIO_PIPELINE_CONTINUE);
        }
}
/*
 * ==========================================================================
 * Initiate I/O, either sync or async
 * ==========================================================================
 */
int
zio_wait(zio_t *zio)
{
        int error;

        ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
        ASSERT(zio->io_executor == NULL);

        zio->io_waiter = curthread;

        zio_execute(zio);

        mutex_enter(&zio->io_lock);
        while (zio->io_executor != NULL)
                cv_wait(&zio->io_cv, &zio->io_lock);
        mutex_exit(&zio->io_lock);

        error = zio->io_error;
        zio_destroy(zio);

        return (error);
}
void
zio_nowait(zio_t *zio)
{
        ASSERT(zio->io_executor == NULL);

        if (zio->io_parent == NULL && zio->io_child_type == ZIO_CHILD_LOGICAL) {
                /*
                 * This is a logical async I/O with no parent to wait for it.
                 * Attach it to the pool's global async root zio so that
                 * spa_unload() has a way of waiting for async I/O to finish.
                 */
                spa_t *spa = zio->io_spa;
                zio->io_async_root = B_TRUE;
                mutex_enter(&spa->spa_async_root_lock);
                spa->spa_async_root_count++;
                mutex_exit(&spa->spa_async_root_lock);
        }

        zio_execute(zio);
}
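/*
 * Illustrative caller sketch (hypothetical, not from this file): fire-and-
 * forget children are usually attached to an explicit root so the caller can
 * still wait for the whole group:
 *
 *      zio_t *root = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 *      zio_nowait(zio_read(root, spa, bp, buf, size, done, arg,
 *          ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &zb));
 *      error = zio_wait(root);
 */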
/*
 * ==========================================================================
 * Reexecute or suspend/resume failed I/O
 * ==========================================================================
 */

static void
zio_reexecute(zio_t *pio)
{
        zio_t *zio, *zio_next;

        pio->io_flags = pio->io_orig_flags;
        pio->io_stage = pio->io_orig_stage;
        pio->io_pipeline = pio->io_orig_pipeline;
        pio->io_reexecute = 0;
        pio->io_error = 0;
        for (int c = 0; c < ZIO_CHILD_TYPES; c++)
                pio->io_child_error[c] = 0;

        if (IO_IS_ALLOCATING(pio)) {
                /*
                 * Remember the failed bp so that the io_ready() callback
                 * can update its accounting upon reexecution.  The block
                 * was already freed in zio_done(); we indicate this with
                 * a fill count of -1 so that zio_free() knows to skip it.
                 */
                blkptr_t *bp = pio->io_bp;
                ASSERT(bp->blk_birth == 0 || bp->blk_birth == pio->io_txg);
                bp->blk_fill = BLK_FILL_ALREADY_FREED;
                pio->io_bp_orig = *bp;
                BP_ZERO(bp);
        }

        /*
         * As we reexecute pio's children, new children could be created.
         * New children go to the head of the io_child list, however,
         * so we will (correctly) not reexecute them.  The key is that
         * the remainder of the io_child list, from 'zio_next' onward,
         * cannot be affected by any side effects of reexecuting 'zio'.
         */
        for (zio = pio->io_child; zio != NULL; zio = zio_next) {
                zio_next = zio->io_sibling_next;
                mutex_enter(&pio->io_lock);
                pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++;
                pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++;
                mutex_exit(&pio->io_lock);
                zio_reexecute(zio);
        }

        /*
         * Now that all children have been reexecuted, execute the parent.
         */
        zio_execute(pio);
}
void
zio_suspend(spa_t *spa, zio_t *zio)
{
        if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
                fm_panic("Pool '%s' has encountered an uncorrectable I/O "
                    "failure and the failure mode property for this pool "
                    "is set to panic.", spa_name(spa));

        zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);

        mutex_enter(&spa->spa_suspend_lock);

        if (spa->spa_suspend_zio_root == NULL)
                spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 0);

        spa->spa_suspended = B_TRUE;

        if (zio != NULL) {
                ASSERT(zio != spa->spa_suspend_zio_root);
                ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
                ASSERT(zio->io_parent == NULL);
                ASSERT(zio->io_stage == ZIO_STAGE_DONE);
                zio_add_child(spa->spa_suspend_zio_root, zio);
        }

        mutex_exit(&spa->spa_suspend_lock);
}
void
zio_resume(spa_t *spa)
{
        zio_t *pio, *zio;

        /*
         * Reexecute all previously suspended i/o.
         */
        mutex_enter(&spa->spa_suspend_lock);
        spa->spa_suspended = B_FALSE;
        cv_broadcast(&spa->spa_suspend_cv);
        pio = spa->spa_suspend_zio_root;
        spa->spa_suspend_zio_root = NULL;
        mutex_exit(&spa->spa_suspend_lock);

        if (pio == NULL)
                return;

        while ((zio = pio->io_child) != NULL) {
                zio_remove_child(pio, zio);
                zio->io_parent = NULL;
                zio_reexecute(zio);
        }

        ASSERT(pio->io_children[ZIO_CHILD_LOGICAL][ZIO_WAIT_DONE] == 0);

        (void) zio_wait(pio);
}
void
zio_resume_wait(spa_t *spa)
{
        mutex_enter(&spa->spa_suspend_lock);
        while (spa_suspended(spa))
                cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
        mutex_exit(&spa->spa_suspend_lock);
}
/*
 * ==========================================================================
 * Gang blocks.
 *
 * A gang block is a collection of small blocks that looks to the DMU
 * like one large block.  When zio_dva_allocate() cannot find a block
 * of the requested size, due to either severe fragmentation or the pool
 * being nearly full, it calls zio_write_gang_block() to construct the
 * block from smaller fragments.
 *
 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
 * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
 * an indirect block: it's an array of block pointers.  It consumes
 * only one sector and hence is allocatable regardless of fragmentation.
 * The gang header's bps point to its gang members, which hold the data.
 *
 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
 * as the verifier to ensure uniqueness of the SHA256 checksum.
 * Critically, the gang block bp's blk_cksum is the checksum of the data,
 * not the gang header.  This ensures that data block signatures (needed for
 * deduplication) are independent of how the block is physically stored.
 *
 * Gang blocks can be nested: a gang member may itself be a gang block.
 * Thus every gang block is a tree in which root and all interior nodes are
 * gang headers, and the leaves are normal blocks that contain user data.
 * The root of the gang tree is called the gang leader.
 *
 * To perform any operation (read, rewrite, free, claim) on a gang block,
 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
 * in the io_gang_tree field of the original logical i/o by recursively
 * reading the gang leader and all gang headers below it.  This yields
 * an in-core tree containing the contents of every gang header and the
 * bps for every constituent of the gang block.
 *
 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
 * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
 * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
 * of the gang header plus zio_checksum_compute() of the data to update the
 * gang header's blk_cksum as described above.
 *
 * The two-phase assemble/issue model solves the problem of partial failure --
 * what if you'd freed part of a gang block but then couldn't read the
 * gang header for another part?  Assembling the entire gang tree first
 * ensures that all the necessary gang header I/O has succeeded before
 * starting the actual work of free, claim, or write.  Once the gang tree
 * is assembled, free and claim are in-memory operations that cannot fail.
 *
 * In the event that a gang write fails, zio_dva_unallocate() walks the
 * gang tree to immediately free (i.e. insert back into the space map)
 * everything we've allocated.  This ensures that we don't get ENOSPC
 * errors during repeated suspend/resume cycles due to a flaky device.
 *
 * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
 * the gang tree, we won't modify the block, so we can safely defer the free
 * (knowing that the block is still intact).  If we *can* assemble the gang
 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
 * each constituent bp and we can allocate a new block on the next sync pass.
 *
 * In all cases, the gang tree allows complete recovery from partial failure.
 * ==========================================================================
 */
static zio_t *
zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
        if (gn != NULL)
                return (pio);

        return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
            NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
            &pio->io_bookmark));
}
zio_t *
zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
        zio_t *zio;

        if (gn != NULL) {
                zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
                    gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
                    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
                /*
                 * As we rewrite each gang header, the pipeline will compute
                 * a new gang block header checksum for it; but no one will
                 * compute a new data checksum, so we do that here.  The one
                 * exception is the gang leader: the pipeline already computed
                 * its data checksum because that stage precedes gang assembly.
                 * (Presently, nothing actually uses interior data checksums;
                 * this is just good hygiene.)
                 */
                if (gn != pio->io_logical->io_gang_tree) {
                        zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
                            data, BP_GET_PSIZE(bp));
                }
        } else {
                zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
                    data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
                    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
        }

        return (zio);
}
/* ARGSUSED */
zio_t *
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
        return (zio_free(pio, pio->io_spa, pio->io_txg, bp,
            NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
}

/* ARGSUSED */
zio_t *
zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
        return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
            NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
}

static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
        NULL,
        zio_read_gang,
        zio_rewrite_gang,
        zio_free_gang,
        zio_claim_gang,
        NULL
};
static void zio_gang_tree_assemble_done(zio_t *zio);

static zio_gang_node_t *
zio_gang_node_alloc(zio_gang_node_t **gnpp)
{
        zio_gang_node_t *gn;

        ASSERT(*gnpp == NULL);

        gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
        gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
        *gnpp = gn;

        return (gn);
}
static void
zio_gang_node_free(zio_gang_node_t **gnpp)
{
        zio_gang_node_t *gn = *gnpp;

        for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
                ASSERT(gn->gn_child[g] == NULL);

        zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
        kmem_free(gn, sizeof (*gn));
        *gnpp = NULL;
}
static void
zio_gang_tree_free(zio_gang_node_t **gnpp)
{
        zio_gang_node_t *gn = *gnpp;

        if (gn == NULL)
                return;

        for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
                zio_gang_tree_free(&gn->gn_child[g]);

        zio_gang_node_free(gnpp);
}
static void
zio_gang_tree_assemble(zio_t *lio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
        zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);

        ASSERT(lio->io_logical == lio);
        ASSERT(BP_IS_GANG(bp));

        zio_nowait(zio_read(lio, lio->io_spa, bp, gn->gn_gbh,
            SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
            lio->io_priority, ZIO_GANG_CHILD_FLAGS(lio), &lio->io_bookmark));
}
static void
zio_gang_tree_assemble_done(zio_t *zio)
{
        zio_t *lio = zio->io_logical;
        zio_gang_node_t *gn = zio->io_private;
        blkptr_t *bp = zio->io_bp;

        ASSERT(zio->io_parent == lio);
        ASSERT(zio->io_child == NULL);

        if (zio->io_error)
                return;

        if (BP_SHOULD_BYTESWAP(bp))
                byteswap_uint64_array(zio->io_data, zio->io_size);

        ASSERT(zio->io_data == gn->gn_gbh);
        ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
        ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);

        for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
                blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
                if (!BP_IS_GANG(gbp))
                        continue;
                zio_gang_tree_assemble(lio, gbp, &gn->gn_child[g]);
        }
}
static void
zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
{
        zio_t *lio = pio->io_logical;
        zio_t *zio;

        ASSERT(BP_IS_GANG(bp) == !!gn);
        ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(lio->io_bp));
        ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == lio->io_gang_tree);

        /*
         * If you're a gang header, your data is in gn->gn_gbh.
         * If you're a gang member, your data is in 'data' and gn == NULL.
         */
        zio = zio_gang_issue_func[lio->io_type](pio, bp, gn, data);

        if (gn != NULL) {
                ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);

                for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
                        blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
                        if (BP_IS_HOLE(gbp))
                                continue;
                        zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
                        data = (char *)data + BP_GET_PSIZE(gbp);
                }
        }

        if (gn == lio->io_gang_tree)
                ASSERT3P((char *)lio->io_data + lio->io_size, ==, data);

        if (zio != pio)
                zio_nowait(zio);
}
static int
zio_gang_assemble(zio_t *zio)
{
        blkptr_t *bp = zio->io_bp;

        ASSERT(BP_IS_GANG(bp) && zio == zio->io_logical);

        zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);

        return (ZIO_PIPELINE_CONTINUE);
}
static int
zio_gang_issue(zio_t *zio)
{
        zio_t *lio = zio->io_logical;
        blkptr_t *bp = zio->io_bp;

        if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
                return (ZIO_PIPELINE_STOP);

        ASSERT(BP_IS_GANG(bp) && zio == lio);

        if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
                zio_gang_tree_issue(lio, lio->io_gang_tree, bp, lio->io_data);
        else
                zio_gang_tree_free(&lio->io_gang_tree);

        zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

        return (ZIO_PIPELINE_CONTINUE);
}
static void
zio_write_gang_member_ready(zio_t *zio)
{
        zio_t *pio = zio->io_parent;
        zio_t *lio = zio->io_logical;
        dva_t *cdva = zio->io_bp->blk_dva;
        dva_t *pdva = pio->io_bp->blk_dva;
        uint64_t asize;

        if (BP_IS_HOLE(zio->io_bp))
                return;

        ASSERT(BP_IS_HOLE(&zio->io_bp_orig));

        ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
        ASSERT3U(zio->io_prop.zp_ndvas, ==, lio->io_prop.zp_ndvas);
        ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
        ASSERT3U(pio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(pio->io_bp));
        ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));

        mutex_enter(&pio->io_lock);
        for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
                ASSERT(DVA_GET_GANG(&pdva[d]));
                asize = DVA_GET_ASIZE(&pdva[d]);
                asize += DVA_GET_ASIZE(&cdva[d]);
                DVA_SET_ASIZE(&pdva[d], asize);
        }
        mutex_exit(&pio->io_lock);
}
static int
zio_write_gang_block(zio_t *pio)
{
        spa_t *spa = pio->io_spa;
        blkptr_t *bp = pio->io_bp;
        zio_t *lio = pio->io_logical;
        zio_t *zio;
        zio_gang_node_t *gn, **gnpp;
        zio_gbh_phys_t *gbh;
        uint64_t txg = pio->io_txg;
        uint64_t resid = pio->io_size;
        uint64_t lsize;
        int ndvas = lio->io_prop.zp_ndvas;
        int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
        zio_prop_t zp;
        int error;

        error = metaslab_alloc(spa, spa->spa_normal_class, SPA_GANGBLOCKSIZE,
            bp, gbh_ndvas, txg, pio == lio ? NULL : lio->io_bp,
            METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
        if (error) {
                pio->io_error = error;
                return (ZIO_PIPELINE_CONTINUE);
        }

        if (pio == lio) {
                gnpp = &lio->io_gang_tree;
        } else {
                gnpp = pio->io_private;
                ASSERT(pio->io_ready == zio_write_gang_member_ready);
        }

        gn = zio_gang_node_alloc(gnpp);
        gbh = gn->gn_gbh;
        bzero(gbh, SPA_GANGBLOCKSIZE);

        /*
         * Create the gang header.
         */
        zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
            pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);

        /*
         * Create and nowait the gang children.
         */
        for (int g = 0; resid != 0; resid -= lsize, g++) {
                lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
                    SPA_MINBLOCKSIZE);
                ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);

                zp.zp_checksum = lio->io_prop.zp_checksum;
                zp.zp_compress = ZIO_COMPRESS_OFF;
                zp.zp_type = DMU_OT_NONE;
                zp.zp_level = 0;
                zp.zp_ndvas = lio->io_prop.zp_ndvas;

                zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
                    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
                    zio_write_gang_member_ready, NULL, &gn->gn_child[g],
                    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
                    &pio->io_bookmark));
        }

        /*
         * Set pio's pipeline to just wait for zio to finish.
         */
        pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

        zio_nowait(zio);

        return (ZIO_PIPELINE_CONTINUE);
}
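/*
 * Illustrative arithmetic (not part of the original source): the P2ROUNDUP()
 * above splits the residual evenly across the remaining gang slots.  For a
 * 128K write with SPA_GBH_NBLKPTRS == 3 and SPA_MINBLOCKSIZE == 512, the
 * members come out as 44032, 43520, and 43520 bytes (86 + 85 + 85 sectors),
 * which sums back to the original 131072 bytes.
 */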
/*
 * ==========================================================================
 * Allocate and free blocks
 * ==========================================================================
 */
static int
zio_dva_allocate(zio_t *zio)
{
        spa_t *spa = zio->io_spa;
        metaslab_class_t *mc = spa->spa_normal_class;
        blkptr_t *bp = zio->io_bp;
        int error;

        ASSERT(BP_IS_HOLE(bp));
        ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
        ASSERT3U(zio->io_prop.zp_ndvas, >, 0);
        ASSERT3U(zio->io_prop.zp_ndvas, <=, spa_max_replication(spa));
        ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

        error = metaslab_alloc(spa, mc, zio->io_size, bp,
            zio->io_prop.zp_ndvas, zio->io_txg, NULL, 0);

        if (error) {
                if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
                        return (zio_write_gang_block(zio));
                zio->io_error = error;
        }

        return (ZIO_PIPELINE_CONTINUE);
}
static int
zio_dva_free(zio_t *zio)
{
        metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);

        return (ZIO_PIPELINE_CONTINUE);
}
static int
zio_dva_claim(zio_t *zio)
{
        int error;

        error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
        if (error)
                zio->io_error = error;

        return (ZIO_PIPELINE_CONTINUE);
}
/*
 * Undo an allocation.  This is used by zio_done() when an I/O fails
 * and we want to give back the block we just allocated.
 * This handles both normal blocks and gang blocks.
 */
static void
zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
{
        spa_t *spa = zio->io_spa;
        boolean_t now = !(zio->io_flags & ZIO_FLAG_IO_REWRITE);

        ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));

        if (zio->io_bp == bp && !now) {
                /*
                 * This is a rewrite for sync-to-convergence.
                 * We can't do a metaslab_free(NOW) because bp wasn't allocated
                 * during this sync pass, which means that metaslab_sync()
                 * already committed the allocation.
                 */
                ASSERT(DVA_EQUAL(BP_IDENTITY(bp),
                    BP_IDENTITY(&zio->io_bp_orig)));
                ASSERT(spa_sync_pass(spa) > 1);

                if (BP_IS_GANG(bp) && gn == NULL) {
                        /*
                         * This is a gang leader whose gang header(s) we
                         * couldn't read now, so defer the free until later.
                         * The block should still be intact because without
                         * the headers, we'd never even start the rewrite.
                         */
                        bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
                        return;
                }
        }

        if (!BP_IS_HOLE(bp))
                metaslab_free(spa, bp, bp->blk_birth, now);

        if (gn != NULL) {
                for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
                        zio_dva_unallocate(zio, gn->gn_child[g],
                            &gn->gn_gbh->zg_blkptr[g]);
                }
        }
}
/*
 * Try to allocate an intent log block.  Return 0 on success, errno on failure.
 */
int
zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
    uint64_t txg)
{
        int error;

        error = metaslab_alloc(spa, spa->spa_log_class, size,
            new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);

        if (error)
                error = metaslab_alloc(spa, spa->spa_normal_class, size,
                    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);

        if (error == 0) {
                BP_SET_LSIZE(new_bp, size);
                BP_SET_PSIZE(new_bp, size);
                BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
                BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
                BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
                BP_SET_LEVEL(new_bp, 0);
                BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
        }

        return (error);
}
/*
 * Free an intent log block.  We know it can't be a gang block, so there's
 * nothing to do except metaslab_free() it.
 */
void
zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
{
        ASSERT(!BP_IS_GANG(bp));

        metaslab_free(spa, bp, txg, B_FALSE);
}
/*
 * ==========================================================================
 * Read and write to physical devices
 * ==========================================================================
 */
static void
zio_vdev_io_probe_done(zio_t *zio)
{
        zio_t *dio;
        vdev_t *vd = zio->io_private;

        mutex_enter(&vd->vdev_probe_lock);
        ASSERT(vd->vdev_probe_zio == zio);
        vd->vdev_probe_zio = NULL;
        mutex_exit(&vd->vdev_probe_lock);

        while ((dio = zio->io_delegate_list) != NULL) {
                zio->io_delegate_list = dio->io_delegate_next;
                dio->io_delegate_next = NULL;
                if (!vdev_accessible(vd, dio))
                        dio->io_error = ENXIO;
                zio_execute(dio);
        }
}
/*
 * Probe the device to determine whether I/O failure is specific to this
 * zio (e.g. a bad sector) or affects the entire vdev (e.g. unplugged).
 */
static int
zio_vdev_io_probe(zio_t *zio)
{
        vdev_t *vd = zio->io_vd;
        zio_t *pio;
        boolean_t created_pio = B_FALSE;

        /*
         * Don't probe the probe.
         */
        if (zio->io_flags & ZIO_FLAG_PROBE)
                return (ZIO_PIPELINE_CONTINUE);

        /*
         * To prevent 'probe storms' when a device fails, we create
         * just one probe i/o at a time.  All zios that want to probe
         * this vdev will join the probe zio's io_delegate_list.
         */
        mutex_enter(&vd->vdev_probe_lock);

        if ((pio = vd->vdev_probe_zio) == NULL) {
                vd->vdev_probe_zio = pio = zio_root(zio->io_spa,
                    zio_vdev_io_probe_done, vd, ZIO_FLAG_CANFAIL);
                created_pio = B_TRUE;
                vd->vdev_probe_wanted = B_TRUE;
                spa_async_request(zio->io_spa, SPA_ASYNC_PROBE);
        }

        zio->io_delegate_next = pio->io_delegate_list;
        pio->io_delegate_list = zio;

        mutex_exit(&vd->vdev_probe_lock);

        if (created_pio)
                zio_nowait(vdev_probe(vd, pio));

        return (ZIO_PIPELINE_STOP);
}
static int
zio_vdev_io_start(zio_t *zio)
{
        vdev_t *vd = zio->io_vd;
        uint64_t align;
        spa_t *spa = zio->io_spa;

        ASSERT(zio->io_error == 0);
        ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);

        if (vd == NULL) {
                if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
                        spa_config_enter(spa, SCL_ZIO, zio, RW_READER);

                /*
                 * The mirror_ops handle multiple DVAs in a single BP.
                 */
                return (vdev_mirror_ops.vdev_op_io_start(zio));
        }

        align = 1ULL << vd->vdev_top->vdev_ashift;

        if (P2PHASE(zio->io_size, align) != 0) {
                uint64_t asize = P2ROUNDUP(zio->io_size, align);
                char *abuf = zio_buf_alloc(asize);
                ASSERT(vd == vd->vdev_top);
                if (zio->io_type == ZIO_TYPE_WRITE) {
                        bcopy(zio->io_data, abuf, zio->io_size);
                        bzero(abuf + zio->io_size, asize - zio->io_size);
                }
                zio_push_transform(zio, abuf, asize, asize, zio_subblock);
        }

        ASSERT(P2PHASE(zio->io_offset, align) == 0);
        ASSERT(P2PHASE(zio->io_size, align) == 0);
        ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));

        if (vd->vdev_ops->vdev_op_leaf &&
            (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {

                if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
                        return (ZIO_PIPELINE_STOP);

                if ((zio = vdev_queue_io(zio)) == NULL)
                        return (ZIO_PIPELINE_STOP);

                if (!vdev_accessible(vd, zio)) {
                        zio->io_error = ENXIO;
                        zio_interrupt(zio);
                        return (ZIO_PIPELINE_STOP);
                }
        }

        return (vd->vdev_ops->vdev_op_io_start(zio));
}
static int
zio_vdev_io_done(zio_t *zio)
{
        vdev_t *vd = zio->io_vd;
        vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
        boolean_t unexpected_error = B_FALSE;

        if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
                return (ZIO_PIPELINE_STOP);

        ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);

        if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {

                vdev_queue_io_done(zio);

                if (zio->io_type == ZIO_TYPE_WRITE)
                        vdev_cache_write(zio);

                if (zio_injection_enabled && zio->io_error == 0)
                        zio->io_error = zio_handle_device_injection(vd, EIO);

                if (zio_injection_enabled && zio->io_error == 0)
                        zio->io_error = zio_handle_label_injection(zio, EIO);

                if (zio->io_error) {
                        if (!vdev_accessible(vd, zio)) {
                                zio->io_error = ENXIO;
                        } else {
                                unexpected_error = B_TRUE;
                        }
                }
        }

        ops->vdev_op_io_done(zio);

        if (unexpected_error)
                return (zio_vdev_io_probe(zio));

        return (ZIO_PIPELINE_CONTINUE);
}
static int
zio_vdev_io_assess(zio_t *zio)
{
        vdev_t *vd = zio->io_vd;

        if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
                return (ZIO_PIPELINE_STOP);

        if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
                spa_config_exit(zio->io_spa, SCL_ZIO, zio);

        if (zio->io_vsd != NULL) {
                zio->io_vsd_free(zio);
                zio->io_vsd = NULL;
        }

        if (zio_injection_enabled && zio->io_error == 0)
                zio->io_error = zio_handle_fault_injection(zio, EIO);

        /*
         * If the I/O failed, determine whether we should attempt to retry it.
         */
        if (zio->io_error && vd == NULL &&
            !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
                ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
                ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));  /* not a leaf */

                zio->io_flags |= ZIO_FLAG_IO_RETRY |
                    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
                zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;
                zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
                return (ZIO_PIPELINE_STOP);
        }

        /*
         * If we got an error on a leaf device, convert it to ENXIO
         * if the device is not accessible at all.
         */
        if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
            !vdev_accessible(vd, zio))
                zio->io_error = ENXIO;

        /*
         * If we can't write to an interior vdev (mirror or RAID-Z),
         * set vdev_cant_write so that we stop trying to allocate from it.
         */
        if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
            vd != NULL && !vd->vdev_ops->vdev_op_leaf)
                vd->vdev_cant_write = B_TRUE;

        if (zio->io_error)
                zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

        return (ZIO_PIPELINE_CONTINUE);
}
void
zio_vdev_io_reissue(zio_t *zio)
{
        ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
        ASSERT(zio->io_error == 0);

        zio->io_stage--;
}

void
zio_vdev_io_redone(zio_t *zio)
{
        ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);

        zio->io_stage--;
}

void
zio_vdev_io_bypass(zio_t *zio)
{
        ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
        ASSERT(zio->io_error == 0);

        zio->io_flags |= ZIO_FLAG_IO_BYPASS;
        zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
}
/*
 * ==========================================================================
 * Generate and verify checksums
 * ==========================================================================
 */
static int
zio_checksum_generate(zio_t *zio)
{
        blkptr_t *bp = zio->io_bp;
        enum zio_checksum checksum;

        if (bp == NULL) {
                /*
                 * This is zio_write_phys().
                 * We're either generating a label checksum, or none at all.
                 */
                checksum = zio->io_prop.zp_checksum;

                if (checksum == ZIO_CHECKSUM_OFF)
                        return (ZIO_PIPELINE_CONTINUE);

                ASSERT(checksum == ZIO_CHECKSUM_LABEL);
        } else {
                if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
                        ASSERT(!IO_IS_ALLOCATING(zio));
                        checksum = ZIO_CHECKSUM_GANG_HEADER;
                } else {
                        checksum = BP_GET_CHECKSUM(bp);
                }
        }

        zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);

        return (ZIO_PIPELINE_CONTINUE);
}
static int
zio_checksum_verify(zio_t *zio)
{
        blkptr_t *bp = zio->io_bp;
        int error;

        if (bp == NULL) {
                /*
                 * This is zio_read_phys().
                 * We're either verifying a label checksum, or nothing at all.
                 */
                if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
                        return (ZIO_PIPELINE_CONTINUE);

                ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
        }

        if ((error = zio_checksum_error(zio)) != 0) {
                zio->io_error = error;
                if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
                        zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
                            zio->io_spa, zio->io_vd, zio, 0, 0);
                }
        }

        return (ZIO_PIPELINE_CONTINUE);
}
/*
 * Called by RAID-Z to ensure we don't compute the checksum twice.
 */
void
zio_checksum_verified(zio_t *zio)
{
        zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
}
/*
 * ==========================================================================
 * Error rank.  Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
 * An error of 0 indicates success.  ENXIO indicates whole-device failure,
 * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
 * indicate errors that are specific to one I/O, and most likely permanent.
 * Any other error is presumed to be worse because we weren't expecting it.
 * ==========================================================================
 */
int
zio_worst_error(int e1, int e2)
{
        static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
        int r1, r2;

        for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
                if (e1 == zio_error_rank[r1])
                        break;

        for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
                if (e2 == zio_error_rank[r2])
                        break;

        return (r1 > r2 ? e1 : e2);
}
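/*
 * Illustrative examples (not part of the original source):
 *      zio_worst_error(0, ENXIO) == ENXIO      (any error beats success)
 *      zio_worst_error(ENXIO, EIO) == EIO      (per-I/O errors rank worse)
 *      zio_worst_error(EIO, EINVAL) == EINVAL  (unexpected errors rank worst)
 */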
/*
 * ==========================================================================
 * I/O completion
 * ==========================================================================
 */
static int
zio_ready(zio_t *zio)
{
        blkptr_t *bp = zio->io_bp;
        zio_t *pio = zio->io_parent;

        if (zio->io_ready) {
                if (BP_IS_GANG(bp) &&
                    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY))
                        return (ZIO_PIPELINE_STOP);

                ASSERT(IO_IS_ALLOCATING(zio));
                ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
                ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);

                zio->io_ready(zio);
        }

        if (bp != NULL && bp != &zio->io_bp_copy)
                zio->io_bp_copy = *bp;

        if (zio->io_error)
                zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

        if (pio != NULL)
                zio_notify_parent(pio, zio, ZIO_WAIT_READY);

        return (ZIO_PIPELINE_CONTINUE);
}
static int
zio_done(zio_t *zio)
{
        spa_t *spa = zio->io_spa;
        zio_t *pio = zio->io_parent;
        zio_t *lio = zio->io_logical;
        blkptr_t *bp = zio->io_bp;
        vdev_t *vd = zio->io_vd;
        uint64_t psize = zio->io_size;

        /*
         * If our children haven't all completed,
         * wait for them and then repeat this pipeline stage.
         */
        if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
            zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
            zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
                return (ZIO_PIPELINE_STOP);

        for (int c = 0; c < ZIO_CHILD_TYPES; c++)
                for (int w = 0; w < ZIO_WAIT_TYPES; w++)
                        ASSERT(zio->io_children[c][w] == 0);

        if (bp != NULL) {
                ASSERT(bp->blk_pad[0] == 0);
                ASSERT(bp->blk_pad[1] == 0);
                ASSERT(bp->blk_pad[2] == 0);
                ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
                    (pio != NULL && bp == pio->io_bp));
                if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
                    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
                        ASSERT(!BP_SHOULD_BYTESWAP(bp));
                        ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(bp));
                        ASSERT(BP_COUNT_GANG(bp) == 0 ||
                            (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
                }
        }

        /*
         * If there were child vdev or gang errors, they apply to us now.
         */
        zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
        zio_inherit_child_errors(zio, ZIO_CHILD_GANG);

        zio_pop_transforms(zio);        /* note: may set zio->io_error */

        vdev_stat_update(zio, psize);

        if (zio->io_error) {
                /*
                 * If this I/O is attached to a particular vdev,
                 * generate an error message describing the I/O failure
                 * at the block level.  We ignore these errors if the
                 * device is currently unavailable.
                 */
                if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
                        zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);

                if ((zio->io_error == EIO ||
                    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && zio == lio) {
                        /*
                         * For logical I/O requests, tell the SPA to log the
                         * error and generate a logical data ereport.
                         */
                        spa_log_error(spa, zio);
                        zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
                            0, 0);
                }
        }

        if (zio->io_error && zio == lio) {
                /*
                 * Determine whether zio should be reexecuted.  This will
                 * propagate all the way to the root via zio_notify_parent().
                 */
                ASSERT(vd == NULL && bp != NULL);

                if (IO_IS_ALLOCATING(zio))
                        if (zio->io_error != ENOSPC)
                                zio->io_reexecute |= ZIO_REEXECUTE_NOW;
                        else
                                zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

                if ((zio->io_type == ZIO_TYPE_READ ||
                    zio->io_type == ZIO_TYPE_FREE) &&
                    zio->io_error == ENXIO &&
                    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
                        zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

                if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
                        zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
        }

        /*
         * If there were logical child errors, they apply to us now.
         * We defer this until now to avoid conflating logical child
         * errors with errors that happened to the zio itself when
         * updating vdev stats and reporting FMA events above.
         */
        zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);

        if (zio->io_reexecute) {
                /*
                 * This is a logical I/O that wants to reexecute.
                 *
                 * Reexecute is top-down.  When an i/o fails, if it's not
                 * the root, it simply notifies its parent and sticks around.
                 * The parent, seeing that it still has children in zio_done(),
                 * does the same.  This percolates all the way up to the root.
                 * The root i/o will reexecute or suspend the entire tree.
                 *
                 * This approach ensures that zio_reexecute() honors
                 * all the original i/o dependency relationships, e.g.
                 * parents not executing until children are ready.
                 */
                ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

                if (IO_IS_ALLOCATING(zio))
                        zio_dva_unallocate(zio, zio->io_gang_tree, bp);

                zio_gang_tree_free(&zio->io_gang_tree);

                if (pio != NULL) {
                        /*
                         * We're not a root i/o, so there's nothing to do
                         * but notify our parent.  Don't propagate errors
                         * upward since we haven't permanently failed yet.
                         */
                        zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
                        zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
                } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
                        /*
                         * We'd fail again if we reexecuted now, so suspend
                         * until conditions improve (e.g. device comes online).
                         */
                        zio_suspend(spa, zio);
                } else {
                        /*
                         * Reexecution is potentially a huge amount of work.
                         * Hand it off to the otherwise-unused claim taskq.
                         */
                        (void) taskq_dispatch(
                            spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
                            (task_func_t *)zio_reexecute, zio, TQ_SLEEP);
                }
                return (ZIO_PIPELINE_STOP);
        }

        ASSERT(zio->io_child == NULL);
        ASSERT(zio->io_reexecute == 0);
        ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));

        if (zio->io_done)
                zio->io_done(zio);

        zio_gang_tree_free(&zio->io_gang_tree);

        ASSERT(zio->io_delegate_list == NULL);
        ASSERT(zio->io_delegate_next == NULL);

        if (pio != NULL) {
                zio_remove_child(pio, zio);
                zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
        }

        if (zio->io_waiter != NULL) {
                mutex_enter(&zio->io_lock);
                zio->io_executor = NULL;
                cv_broadcast(&zio->io_cv);
                mutex_exit(&zio->io_lock);
        } else {
                zio_destroy(zio);
        }

        return (ZIO_PIPELINE_STOP);
}
/*
 * ==========================================================================
 * I/O pipeline definition
 * ==========================================================================
 */
static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES] = {
        NULL,
        zio_issue_async,
        zio_read_bp_init,
        zio_write_bp_init,
        zio_checksum_generate,
        zio_gang_assemble,
        zio_gang_issue,
        zio_dva_allocate,
        zio_dva_free,
        zio_dva_claim,
        zio_ready,
        zio_vdev_io_start,
        zio_vdev_io_done,
        zio_vdev_io_assess,
        zio_checksum_verify,
        zio_done
};