/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2017, Intel Corporation.
 * Copyright (c) 2024, Klara Inc.
 */
/*
 * To handle fault injection, we keep track of a series of zinject_record_t
 * structures which describe which logical block(s) should be injected with a
 * fault. These are kept in a global list. Each record corresponds to a given
 * spa_t and maintains a special hold on the spa_t so that it cannot be deleted
 * or exported while the injection record exists.
 *
 * Device level injection is done using the 'zi_guid' field. If this is set, it
 * means that the error is destined for a particular device, not a piece of
 * data.
 *
 * This is a rather poor data structure and algorithm, but we don't expect more
 * than a few faults at any one time, so it should be sufficient for our needs.
 */
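
/*
 * Illustrative sketch (not part of the original file): consumers elsewhere
 * in ZFS typically guard their calls into this framework with the
 * zio_injection_enabled counter so the common, no-injection case costs a
 * single load, e.g.:
 *
 *	if (zio_injection_enabled)
 *		zio->io_error = zio_handle_fault_injection(zio, EIO);
 *
 * The real call sites live in zio.c and arc.c; the snippet above only shows
 * the intended usage pattern of the handlers defined below.
 */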
#include <sys/arc.h>
#include <sys/zio.h>
#include <sys/zfs_ioctl.h>
#include <sys/vdev_impl.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/fs/zfs.h>

uint32_t zio_injection_enabled = 0;
/*
 * Data describing each zinject handler registered on the system, and
 * contains the list node linking the handler in the global zinject
 * handler list.
 */
typedef struct inject_handler {
	int		zi_id;
	spa_t		*zi_spa;
	char		*zi_spa_name;	/* ZINJECT_DELAY_IMPORT only */
	zinject_record_t zi_record;
	uint64_t	*zi_lanes;
	int		zi_next_lane;
	list_node_t	zi_link;
} inject_handler_t;

/*
 * List of all zinject handlers registered on the system, protected by
 * the inject_lock defined below.
 */
static list_t inject_handlers;
/*
 * This protects insertion into, and traversal of, the inject handler
 * list defined above, as well as the inject_delay_count. Any time a
 * handler is inserted or removed from the list, this lock must be
 * taken as RW_WRITER; any time the list is traversed without being
 * modified, this lock must be taken as (at least) RW_READER.
 */
static krwlock_t inject_lock;

/*
 * This holds the number of zinject delay handlers that have been
 * registered on the system. It is protected by the inject_lock defined
 * above. Thus modifications to this count must be made as a RW_WRITER
 * of the inject_lock, and reads of this count must be made as (at least)
 * a RW_READER of the inject_lock.
 */
static int inject_delay_count = 0;

/*
 * This lock is used only in zio_handle_io_delay(); refer to the comment
 * in that function for more details.
 */
static kmutex_t inject_delay_mtx;

/*
 * Used to assign unique identifying numbers to each new zinject handler.
 */
static int inject_next_id = 1;
/*
 * Test if the requested frequency was triggered
 */
static boolean_t
freq_triggered(uint32_t frequency)
{
	/*
	 * zero implies always (100%)
	 */
	if (frequency == 0)
		return (B_TRUE);

	/*
	 * Note: we still handle legacy (unscaled) frequency values
	 */
	uint32_t maximum = (frequency <= 100) ? 100 : ZI_PERCENTAGE_MAX;

	return (random_in_range(maximum) < frequency);
}
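
/*
 * Example of the frequency check above: a legacy value of 25 is treated as a
 * percentage, so random_in_range(100) < 25 fires roughly 25% of the time; a
 * scaled value (anything above 100) is compared against ZI_PERCENTAGE_MAX
 * instead, which allows finer-grained probabilities.
 */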
/*
 * Returns true if the given record matches the I/O in progress.
 */
static boolean_t
zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva,
    zinject_record_t *record, int error)
{
	/*
	 * Check for a match against the MOS, which is based on type
	 */
	if (zb->zb_objset == DMU_META_OBJSET &&
	    record->zi_objset == DMU_META_OBJSET &&
	    record->zi_object == DMU_META_DNODE_OBJECT) {
		if (record->zi_type == DMU_OT_NONE ||
		    type == record->zi_type)
			return (freq_triggered(record->zi_freq));
		else
			return (B_FALSE);
	}

	/*
	 * Check for an exact match.
	 */
	if (zb->zb_objset == record->zi_objset &&
	    zb->zb_object == record->zi_object &&
	    zb->zb_level == record->zi_level &&
	    zb->zb_blkid >= record->zi_start &&
	    zb->zb_blkid <= record->zi_end &&
	    (record->zi_dvas == 0 ||
	    (dva != ZI_NO_DVA && (record->zi_dvas & (1ULL << dva)))) &&
	    error == record->zi_error) {
		return (freq_triggered(record->zi_freq));
	}

	return (B_FALSE);
}
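
/*
 * Example of the zi_dvas match above: zi_dvas is a bitmask of acceptable DVA
 * indices, so a record with zi_dvas = 0x5 (binary 101) matches I/Os resolved
 * to DVA 0 or DVA 2, while zi_dvas == 0 matches regardless of which DVA the
 * I/O was issued to.
 */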
/*
 * Panic the system when a config change happens in the function
 * specified by tag.
 */
void
zio_handle_panic_injection(spa_t *spa, const char *tag, uint64_t type)
{
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (spa != handler->zi_spa)
			continue;

		if (handler->zi_record.zi_type == type &&
		    strcmp(tag, handler->zi_record.zi_func) == 0)
			panic("Panic requested in function %s\n", tag);
	}

	rw_exit(&inject_lock);
}
/*
 * Inject a decryption failure. Decryption failures can occur in
 * both the ARC and the ZIO layers.
 */
int
zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb,
    uint64_t type, int error)
{
	int ret = 0;
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_DECRYPT_FAULT)
			continue;

		if (zio_match_handler(zb, type, ZI_NO_DVA,
		    &handler->zi_record, error)) {
			ret = error;
			break;
		}
	}

	rw_exit(&inject_lock);
	return (ret);
}
/*
 * If this is a physical I/O for a vdev child, determine which DVA it is
 * for. We iterate backwards through the DVAs matching on the offset so
 * that we end up with ZI_NO_DVA (-1) if we don't find a match.
 */
static int
zio_match_dva(zio_t *zio)
{
	int i = ZI_NO_DVA;

	if (zio->io_bp != NULL && zio->io_vd != NULL &&
	    zio->io_child_type == ZIO_CHILD_VDEV) {
		for (i = BP_GET_NDVAS(zio->io_bp) - 1; i >= 0; i--) {
			dva_t *dva = &zio->io_bp->blk_dva[i];
			uint64_t off = DVA_GET_OFFSET(dva);
			vdev_t *vd = vdev_lookup_top(zio->io_spa,
			    DVA_GET_VDEV(dva));

			/* Compensate for vdev label added to leaves */
			if (zio->io_vd->vdev_ops->vdev_op_leaf)
				off += VDEV_LABEL_START_SIZE;

			if (zio->io_vd == vd && zio->io_offset == off)
				break;
		}
	}

	return (i);
}
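
/*
 * Example of the search above: for a block with two DVAs read through a leaf
 * vdev, the loop checks DVA 1 first and then DVA 0, comparing each DVA offset
 * (plus the label/boot reserve, VDEV_LABEL_START_SIZE, for leaves) against
 * the absolute zio->io_offset. If neither matches, the loop counter runs down
 * to ZI_NO_DVA and that is what gets returned.
 */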
/*
 * Determine if the I/O in question should return failure. Returns the errno
 * to be returned to the caller.
 */
int
zio_handle_fault_injection(zio_t *zio, int error)
{
	int ret = 0;
	inject_handler_t *handler;

	/*
	 * Ignore I/O not associated with any logical data.
	 */
	if (zio->io_logical == NULL)
		return (0);

	/*
	 * Currently, we only support fault injection on reads.
	 */
	if (zio->io_type != ZIO_TYPE_READ)
		return (0);

	/*
	 * A rebuild I/O has no checksum to verify.
	 */
	if (zio->io_priority == ZIO_PRIORITY_REBUILD && error == ECKSUM)
		return (0);

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {
		if (zio->io_spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
			continue;

		/* If this handler matches, return the specified error */
		if (zio_match_handler(&zio->io_logical->io_bookmark,
		    zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
		    zio_match_dva(zio), &handler->zi_record, error)) {
			ret = error;
			break;
		}
	}

	rw_exit(&inject_lock);

	return (ret);
}
/*
 * Determine if the zio is part of a label update and has an injection
 * handler associated with that portion of the label. Currently, we
 * allow error injection in either the nvlist or the uberblock region of
 * the vdev label.
 */
int
zio_handle_label_injection(zio_t *zio, int error)
{
	inject_handler_t *handler;
	vdev_t *vd = zio->io_vd;
	uint64_t offset = zio->io_offset;
	int label;
	int ret = 0;

	if (offset >= VDEV_LABEL_START_SIZE &&
	    offset < vd->vdev_psize - VDEV_LABEL_END_SIZE)
		return (0);

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {
		uint64_t start = handler->zi_record.zi_start;
		uint64_t end = handler->zi_record.zi_end;

		if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
			continue;

		/*
		 * The injection region is the relative offsets within a
		 * vdev label. We must determine the label which is being
		 * updated and adjust our region accordingly.
		 */
		label = vdev_label_number(vd->vdev_psize, offset);
		start = vdev_label_offset(vd->vdev_psize, label, start);
		end = vdev_label_offset(vd->vdev_psize, label, end);

		if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid &&
		    (offset >= start && offset <= end)) {
			ret = error;
			break;
		}
	}

	rw_exit(&inject_lock);
	return (ret);
}
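
/*
 * Example of the remapping above: zinject records label ranges relative to
 * the start of a single label. If the write lands in label 2 (the first of
 * the two trailing labels), vdev_label_offset() converts the handler's
 * relative [zi_start, zi_end] range into absolute offsets near the end of
 * the device before it is compared with zio->io_offset.
 */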
static int
zio_inject_bitflip_cb(void *data, size_t len, void *private)
{
	zio_t *zio = private;
	uint8_t *buffer = data;
	uint_t byte = random_in_range(len);

	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);

	/* flip a single random bit in an abd data buffer */
	buffer[byte] ^= 1 << random_in_range(8);

	return (1);	/* stop after first flip */
}
static int
zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2)
{
	inject_handler_t *handler;
	int ret = 0;

	/*
	 * We skip over faults in the labels unless it's during device open
	 * (i.e. zio == NULL) or a device flush (offset is meaningless)
	 */
	if (zio != NULL && zio->io_type != ZIO_TYPE_FLUSH) {
		uint64_t offset = zio->io_offset;

		if (offset < VDEV_LABEL_START_SIZE ||
		    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE)
			return (0);
	}

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
			continue;

		if (vd->vdev_guid == handler->zi_record.zi_guid) {
			if (handler->zi_record.zi_failfast &&
			    (zio == NULL || (zio->io_flags &
			    (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) {
				continue;
			}

			/* Handle type specific I/O failures */
			if (zio != NULL &&
			    handler->zi_record.zi_iotype != ZIO_TYPES &&
			    handler->zi_record.zi_iotype != zio->io_type)
				continue;

			if (handler->zi_record.zi_error == err1 ||
			    handler->zi_record.zi_error == err2) {
				/*
				 * limit error injection if requested
				 */
				if (!freq_triggered(handler->zi_record.zi_freq))
					continue;

				/*
				 * For a failed open, pretend like the device
				 * has gone away.
				 */
				if (err1 == ENXIO)
					vd->vdev_stat.vs_aux =
					    VDEV_AUX_OPEN_FAILED;

				/*
				 * Treat these errors as if they had been
				 * retried so that all the appropriate stats
				 * and FMA events are generated.
				 */
				if (!handler->zi_record.zi_failfast &&
				    zio != NULL)
					zio->io_flags |= ZIO_FLAG_IO_RETRY;

				/*
				 * EILSEQ means flip a bit after a read
				 */
				if (handler->zi_record.zi_error == EILSEQ) {
					if (zio == NULL)
						break;

					/* locate buffer data and flip a bit */
					(void) abd_iterate_func(zio->io_abd, 0,
					    zio->io_size, zio_inject_bitflip_cb,
					    zio);
					break;
				}

				ret = handler->zi_record.zi_error;
				break;
			}
			if (handler->zi_record.zi_error == ENXIO) {
				ret = SET_ERROR(EIO);
				break;
			}
		}
	}

	rw_exit(&inject_lock);

	return (ret);
}
int
zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
{
	return (zio_handle_device_injection_impl(vd, zio, error, INT_MAX));
}

int
zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1, int err2)
{
	return (zio_handle_device_injection_impl(vd, zio, err1, err2));
}
/*
 * Simulate hardware that ignores cache flushes. For the requested number
 * of seconds, nix the actual writing to disk.
 */
void
zio_handle_ignored_writes(zio_t *zio)
{
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		/* Ignore errors not destined for this pool */
		if (zio->io_spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
			continue;

		/*
		 * Positive duration implies # of seconds, negative
		 * a number of txgs
		 */
		if (handler->zi_record.zi_timer == 0) {
			if (handler->zi_record.zi_duration > 0)
				handler->zi_record.zi_timer = ddi_get_lbolt64();
			else
				handler->zi_record.zi_timer = zio->io_txg;
		}

		/* Have a "problem" writing 60% of the time */
		if (random_in_range(100) < 60)
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
		break;
	}

	rw_exit(&inject_lock);
}
void
spa_handle_ignored_writes(spa_t *spa)
{
	inject_handler_t *handler;

	if (zio_injection_enabled == 0)
		return;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
			continue;

		if (handler->zi_record.zi_duration > 0) {
			VERIFY(handler->zi_record.zi_timer == 0 ||
			    ddi_time_after64(
			    (int64_t)handler->zi_record.zi_timer +
			    handler->zi_record.zi_duration * hz,
			    ddi_get_lbolt64()));
		} else {
			/* duration is negative so the subtraction here adds */
			VERIFY(handler->zi_record.zi_timer == 0 ||
			    handler->zi_record.zi_timer -
			    handler->zi_record.zi_duration >=
			    spa_syncing_txg(spa));
		}
	}

	rw_exit(&inject_lock);
}
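
/*
 * Example of the duration convention checked above: zinject stores a positive
 * zi_duration as wall-clock seconds (zi_timer then holds an lbolt timestamp,
 * hence the "* hz"), while a negative zi_duration means a number of txgs
 * (zi_timer holds the txg of the first ignored write, and subtracting the
 * negative duration adds the txg count).
 */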
hrtime_t
zio_handle_io_delay(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	inject_handler_t *min_handler = NULL;
	hrtime_t min_target = 0;

	rw_enter(&inject_lock, RW_READER);

	/*
	 * inject_delay_count is a subset of zio_injection_enabled that
	 * is only incremented for delay handlers. These checks are
	 * mainly added to remind the reader why we're not explicitly
	 * checking zio_injection_enabled like the other functions.
	 */
	IMPLY(inject_delay_count > 0, zio_injection_enabled > 0);
	IMPLY(zio_injection_enabled == 0, inject_delay_count == 0);

	/*
	 * If there aren't any inject delay handlers registered, then we
	 * can short circuit and simply return 0 here. A value of zero
	 * informs zio_delay_interrupt() that this request should not be
	 * delayed. This short circuit keeps us from acquiring the
	 * inject_delay_mutex unnecessarily.
	 */
	if (inject_delay_count == 0) {
		rw_exit(&inject_lock);
		return (0);
	}

	/*
	 * Each inject handler has a number of "lanes" associated with
	 * it. Each lane is able to handle requests independently of one
	 * another, and at a latency defined by the inject handler
	 * record's zi_timer field. Thus if a handler is configured with
	 * a single lane with a 10ms latency, it will delay requests
	 * such that only a single request is completed every 10ms. So,
	 * if more than one request is attempted per each 10ms interval,
	 * the average latency of the requests will be greater than
	 * 10ms; but if only a single request is submitted each 10ms
	 * interval the average latency will be 10ms.
	 *
	 * We need to acquire this mutex to prevent multiple concurrent
	 * threads being assigned to the same lane of a given inject
	 * handler. The mutex allows us to perform the following two
	 * operations atomically:
	 *
	 *	1. determine the minimum handler and minimum target
	 *	   value of all the possible handlers
	 *	2. update that minimum handler's lane array
	 *
	 * Without atomicity, two (or more) threads could pick the same
	 * lane in step (1), and then conflict with each other in step
	 * (2). This could allow a single lane handler to process
	 * multiple requests simultaneously, which shouldn't be possible.
	 */
	mutex_enter(&inject_delay_mtx);

	for (inject_handler_t *handler = list_head(&inject_handlers);
	    handler != NULL; handler = list_next(&inject_handlers, handler)) {
		if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
			continue;

		if (!freq_triggered(handler->zi_record.zi_freq))
			continue;

		if (vd->vdev_guid != handler->zi_record.zi_guid)
			continue;

		/* also match on I/O type (e.g., -T read) */
		if (handler->zi_record.zi_iotype != ZIO_TYPES &&
		    handler->zi_record.zi_iotype != zio->io_type) {
			continue;
		}

		/*
		 * Defensive; should never happen as the array allocation
		 * occurs prior to inserting this handler on the list.
		 */
		ASSERT3P(handler->zi_lanes, !=, NULL);

		/*
		 * This should never happen, the zinject command should
		 * prevent a user from setting an IO delay with zero lanes.
		 */
		ASSERT3U(handler->zi_record.zi_nlanes, !=, 0);

		ASSERT3U(handler->zi_record.zi_nlanes, >,
		    handler->zi_next_lane);

		/*
		 * We want to issue this IO to the lane that will become
		 * idle the soonest, so we compare the soonest this
		 * specific handler can complete the IO with all other
		 * handlers, to find the lowest value of all possible
		 * lanes. We then use this lane to submit the request.
		 *
		 * Since each handler has a constant value for its
		 * delay, we can just use the "next" lane for that
		 * handler; as it will always be the lane with the
		 * lowest value for that particular handler (i.e. the
		 * lane that will become idle the soonest). This saves a
		 * scan of each handler's lanes array.
		 *
		 * There are two cases to consider when determining when
		 * this specific IO request should complete. If this
		 * lane is idle, we want to "submit" the request now so
		 * it will complete after zi_timer milliseconds. Thus,
		 * we set the target to now + zi_timer.
		 *
		 * If the lane is busy, we want this request to complete
		 * zi_timer milliseconds after the lane becomes idle.
		 * Since the 'zi_lanes' array holds the time at which
		 * each lane will become idle, we use that value to
		 * determine when this request should complete.
		 */
		hrtime_t idle = handler->zi_record.zi_timer + gethrtime();
		hrtime_t busy = handler->zi_record.zi_timer +
		    handler->zi_lanes[handler->zi_next_lane];
		hrtime_t target = MAX(idle, busy);

		if (min_handler == NULL) {
			min_handler = handler;
			min_target = target;
			continue;
		}

		ASSERT3P(min_handler, !=, NULL);
		ASSERT3U(min_target, !=, 0);

		/*
		 * We don't yet increment the "next lane" variable since
		 * we still might find a lower value lane in another
		 * handler during any remaining iterations. Once we're
		 * sure we've selected the absolute minimum, we'll claim
		 * the lane and increment the handler's "next lane"
		 * variable below.
		 */
		if (target < min_target) {
			min_handler = handler;
			min_target = target;
		}
	}

	/*
	 * 'min_handler' will be NULL if no IO delays are registered for
	 * this vdev, otherwise it will point to the handler containing
	 * the lane that will become idle the soonest.
	 */
	if (min_handler != NULL) {
		ASSERT3U(min_target, !=, 0);
		min_handler->zi_lanes[min_handler->zi_next_lane] = min_target;

		/*
		 * If we've used all possible lanes for this handler,
		 * loop back and start using the first lane again;
		 * otherwise, just increment the lane index.
		 */
		min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) %
		    min_handler->zi_record.zi_nlanes;
	}

	mutex_exit(&inject_delay_mtx);
	rw_exit(&inject_lock);

	return (min_target);
}
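
/*
 * Worked example of the lane model above (illustrative numbers): a handler
 * with zi_nlanes = 2 and a 10ms zi_timer that receives three I/Os at the same
 * instant assigns the first two to separate idle lanes (completion targets of
 * now + 10ms each) and the third to the lane that frees up first (target of
 * now + 20ms), so throughput through that vdev is capped at two I/Os per 10ms.
 */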
static void
zio_handle_pool_delay(spa_t *spa, hrtime_t elapsed, zinject_type_t command)
{
	inject_handler_t *handler;
	hrtime_t delay = 0;
	int id = 0;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers);
	    handler != NULL && handler->zi_record.zi_cmd == command;
	    handler = list_next(&inject_handlers, handler)) {
		ASSERT3P(handler->zi_spa_name, !=, NULL);
		if (strcmp(spa_name(spa), handler->zi_spa_name) == 0) {
			hrtime_t pause =
			    SEC2NSEC(handler->zi_record.zi_duration);
			if (pause > elapsed) {
				delay = pause - elapsed;
			}
			id = handler->zi_id;
			break;
		}
	}

	rw_exit(&inject_lock);

	if (delay) {
		if (command == ZINJECT_DELAY_IMPORT) {
			spa_import_progress_set_notes(spa, "injecting %llu "
			    "sec delay", (u_longlong_t)NSEC2SEC(delay));
		}
		zfs_sleep_until(gethrtime() + delay);
	}
	if (id) {
		/* all done with this one-shot handler */
		zio_clear_fault(id);
	}
}
/*
 * For testing, inject a delay during an import
 */
void
zio_handle_import_delay(spa_t *spa, hrtime_t elapsed)
{
	zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_IMPORT);
}

/*
 * For testing, inject a delay during an export
 */
void
zio_handle_export_delay(spa_t *spa, hrtime_t elapsed)
{
	zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_EXPORT);
}
static int
zio_calculate_range(const char *pool, zinject_record_t *record)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	objset_t *os = NULL;
	dnode_t *dn = NULL;
	int error;

	/*
	 * Obtain the dnode for object using pool, objset, and object
	 */
	error = dsl_pool_hold(pool, FTAG, &dp);
	if (error)
		return (error);

	error = dsl_dataset_hold_obj(dp, record->zi_objset, FTAG, &ds);
	dsl_pool_rele(dp, FTAG);
	if (error)
		return (error);

	error = dmu_objset_from_ds(ds, &os);
	dsl_dataset_rele(ds, FTAG);
	if (error)
		return (error);

	error = dnode_hold(os, record->zi_object, FTAG, &dn);
	if (error)
		return (error);

	/*
	 * Translate the range into block IDs
	 */
	if (record->zi_start != 0 || record->zi_end != -1ULL) {
		record->zi_start >>= dn->dn_datablkshift;
		record->zi_end >>= dn->dn_datablkshift;
	}
	if (record->zi_level > 0) {
		if (record->zi_level >= dn->dn_nlevels) {
			dnode_rele(dn, FTAG);
			return (SET_ERROR(EDOM));
		}

		if (record->zi_start != 0 || record->zi_end != 0) {
			int shift = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

			for (int level = record->zi_level; level > 0; level--) {
				record->zi_start >>= shift;
				record->zi_end >>= shift;
			}
		}
	}

	dnode_rele(dn, FTAG);
	return (0);
}
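
/*
 * Worked example of the translation above (assuming a 128K data block size,
 * i.e. dn_datablkshift == 17): a byte range covering the first 1 MiB becomes
 * L0 block IDs 0 through 7. If zi_level > 0, each additional level divides
 * the range again by the number of block pointers per indirect block,
 * 1 << (dn_indblkshift - SPA_BLKPTRSHIFT).
 */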
static boolean_t
zio_pool_handler_exists(const char *name, zinject_type_t command)
{
	boolean_t exists = B_FALSE;

	rw_enter(&inject_lock, RW_READER);
	for (inject_handler_t *handler = list_head(&inject_handlers);
	    handler != NULL; handler = list_next(&inject_handlers, handler)) {
		if (command != handler->zi_record.zi_cmd)
			continue;

		const char *pool = (handler->zi_spa_name != NULL) ?
		    handler->zi_spa_name : spa_name(handler->zi_spa);
		if (strcmp(name, pool) == 0) {
			exists = B_TRUE;
			break;
		}
	}
	rw_exit(&inject_lock);

	return (exists);
}
/*
 * Create a new handler for the given record. We add it to the list, adding
 * a reference to the spa_t in the process. We increment zio_injection_enabled,
 * which is the switch to trigger all fault injection.
 */
int
zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
{
	inject_handler_t *handler;
	int error;
	spa_t *spa;

	/*
	 * If this is pool-wide metadata, make sure we unload the corresponding
	 * spa_t, so that the next attempt to load it will trigger the fault.
	 * We call spa_reset() to unload the pool appropriately.
	 */
	if (flags & ZINJECT_UNLOAD_SPA)
		if ((error = spa_reset(name)) != 0)
			return (error);

	if (record->zi_cmd == ZINJECT_DELAY_IO) {
		/*
		 * A value of zero for the number of lanes or for the
		 * delay time doesn't make sense.
		 */
		if (record->zi_timer == 0 || record->zi_nlanes == 0)
			return (SET_ERROR(EINVAL));

		/*
		 * The number of lanes is directly mapped to the size of
		 * an array used by the handler. Thus, to ensure the
		 * user doesn't trigger an allocation that's "too large"
		 * we cap the number of lanes here.
		 */
		if (record->zi_nlanes >= UINT16_MAX)
			return (SET_ERROR(EINVAL));
	}

	/*
	 * If the supplied range was in bytes -- calculate the actual blkid
	 */
	if (flags & ZINJECT_CALC_RANGE) {
		error = zio_calculate_range(name, record);
		if (error != 0)
			return (error);
	}

	if (!(flags & ZINJECT_NULL)) {
		/*
		 * Pool delays for import or export don't take an
		 * injection reference on the spa. Instead they
		 * rely on matching by name.
		 */
		if (record->zi_cmd == ZINJECT_DELAY_IMPORT ||
		    record->zi_cmd == ZINJECT_DELAY_EXPORT) {
			if (record->zi_duration <= 0)
				return (SET_ERROR(EINVAL));
			/*
			 * Only one import | export delay handler per pool.
			 */
			if (zio_pool_handler_exists(name, record->zi_cmd))
				return (SET_ERROR(EEXIST));

			mutex_enter(&spa_namespace_lock);
			boolean_t has_spa = spa_lookup(name) != NULL;
			mutex_exit(&spa_namespace_lock);

			if (record->zi_cmd == ZINJECT_DELAY_IMPORT && has_spa)
				return (SET_ERROR(EEXIST));
			if (record->zi_cmd == ZINJECT_DELAY_EXPORT && !has_spa)
				return (SET_ERROR(ENOENT));
			spa = NULL;
		} else {
			/*
			 * spa_inject_ref() will add an injection reference,
			 * which will prevent the pool from being removed
			 * from the namespace while still allowing it to be
			 * unloaded.
			 */
			if ((spa = spa_inject_addref(name)) == NULL)
				return (SET_ERROR(ENOENT));
		}

		handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
		handler->zi_spa = spa;	/* note: can be NULL */
		handler->zi_record = *record;

		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
			handler->zi_lanes = kmem_zalloc(
			    sizeof (*handler->zi_lanes) *
			    handler->zi_record.zi_nlanes, KM_SLEEP);
			handler->zi_next_lane = 0;
		} else {
			handler->zi_lanes = NULL;
			handler->zi_next_lane = 0;
		}

		if (handler->zi_spa == NULL)
			handler->zi_spa_name = spa_strdup(name);
		else
			handler->zi_spa_name = NULL;

		rw_enter(&inject_lock, RW_WRITER);

		/*
		 * We can't move this increment into the conditional
		 * above because we need to hold the RW_WRITER lock of
		 * inject_lock, and we don't want to hold that while
		 * allocating the handler's zi_lanes array.
		 */
		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
			ASSERT3S(inject_delay_count, >=, 0);
			inject_delay_count++;
			ASSERT3S(inject_delay_count, >, 0);
		}

		*id = handler->zi_id = inject_next_id++;
		list_insert_tail(&inject_handlers, handler);
		atomic_inc_32(&zio_injection_enabled);

		rw_exit(&inject_lock);
	}

	/*
	 * Flush the ARC, so that any attempts to read this data will end up
	 * going to the ZIO layer. Note that this is a little overkill, but
	 * we don't have the necessary ARC interfaces to do anything else, and
	 * fault injection isn't a performance critical path.
	 */
	if (flags & ZINJECT_FLUSH_ARC)
		/*
		 * We must use FALSE to ensure arc_flush returns, since
		 * we're not preventing concurrent ARC insertions.
		 */
		arc_flush(NULL, FALSE);

	return (0);
}
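
/*
 * Illustrative flow (not part of the original file): zinject(8) builds a
 * zinject_record_t in userspace and submits it through the ZFS ioctl layer
 * (zfs_ioctl.c), which lands here as zio_inject_fault(); the identifier
 * returned via *id is what a later "zinject -c <id>" passes down to
 * zio_clear_fault() to remove the record.
 */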
/*
 * Returns the next record with an ID greater than that supplied to the
 * function. Used to iterate over all handlers in the system.
 */
int
zio_inject_list_next(int *id, char *name, size_t buflen,
    zinject_record_t *record)
{
	inject_handler_t *handler;
	int ret;

	mutex_enter(&spa_namespace_lock);
	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler))
		if (handler->zi_id > *id)
			break;

	if (handler) {
		*record = handler->zi_record;
		*id = handler->zi_id;
		ASSERT(handler->zi_spa || handler->zi_spa_name);
		if (handler->zi_spa != NULL)
			(void) strlcpy(name, spa_name(handler->zi_spa), buflen);
		else
			(void) strlcpy(name, handler->zi_spa_name, buflen);
		ret = 0;
	} else {
		ret = SET_ERROR(ENOENT);
	}

	rw_exit(&inject_lock);
	mutex_exit(&spa_namespace_lock);

	return (ret);
}
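
/*
 * Example of the iteration contract above: a caller lists all handlers by
 * starting with *id = 0 and calling zio_inject_list_next() repeatedly; each
 * call returns the handler with the next-larger ID until ENOENT is returned.
 * This is how zinject(8) enumerates active records when run with no
 * arguments.
 */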
/*
 * Clear the fault handler with the given identifier, or return ENOENT if none
 * exists.
 */
int
zio_clear_fault(int id)
{
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_WRITER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler))
		if (handler->zi_id == id)
			break;

	if (handler == NULL) {
		rw_exit(&inject_lock);
		return (SET_ERROR(ENOENT));
	}

	if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
		ASSERT3S(inject_delay_count, >, 0);
		inject_delay_count--;
		ASSERT3S(inject_delay_count, >=, 0);
	}

	list_remove(&inject_handlers, handler);
	rw_exit(&inject_lock);

	if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
		ASSERT3P(handler->zi_lanes, !=, NULL);
		kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) *
		    handler->zi_record.zi_nlanes);
	} else {
		ASSERT3P(handler->zi_lanes, ==, NULL);
	}

	if (handler->zi_spa_name != NULL)
		spa_strfree(handler->zi_spa_name);

	if (handler->zi_spa != NULL)
		spa_inject_delref(handler->zi_spa);
	kmem_free(handler, sizeof (inject_handler_t));
	atomic_dec_32(&zio_injection_enabled);

	return (0);
}
void
zio_inject_init(void)
{
	rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL);
	list_create(&inject_handlers, sizeof (inject_handler_t),
	    offsetof(inject_handler_t, zi_link));
}

void
zio_inject_fini(void)
{
	list_destroy(&inject_handlers);
	mutex_destroy(&inject_delay_mtx);
	rw_destroy(&inject_lock);
}
#if defined(_KERNEL)
EXPORT_SYMBOL(zio_injection_enabled);
EXPORT_SYMBOL(zio_inject_fault);
EXPORT_SYMBOL(zio_inject_list_next);
EXPORT_SYMBOL(zio_clear_fault);
EXPORT_SYMBOL(zio_handle_fault_injection);
EXPORT_SYMBOL(zio_handle_device_injection);
EXPORT_SYMBOL(zio_handle_label_injection);
#endif