4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #include <sys/zfs_context.h>
28 #include <sys/vdev_impl.h>
/*
 * These tunables are for performance analysis.
 */

/*
 * zfs_vdev_max_pending is the maximum number of i/os concurrently
 * pending to each device.  zfs_vdev_min_pending is the initial number
 * of i/os pending to each device (before it starts ramping up to
 * zfs_vdev_max_pending).
 *
 * vdev_queue_io() issues against the min limit; vdev_queue_io_done()
 * issues against the max limit.
 */
int zfs_vdev_max_pending = 35;
int zfs_vdev_min_pending = 4;

/* deadline = pri + (lbolt >> time_shift) */
int zfs_vdev_time_shift = 6;

/*
 * exponential I/O issue ramp-up rate: the number of issue attempts
 * vdev_queue_io_done() makes for every i/o that completes.
 */
int zfs_vdev_ramp_rate = 2;
51 * i/os will be aggregated into a single large i/o up to
52 * zfs_vdev_aggregation_limit bytes long.
54 int zfs_vdev_aggregation_limit
= SPA_MAXBLOCKSIZE
;
/*
 * Virtual device vector for disk I/O scheduling.
 */
/*
 * Ordering callback handed to avl_create() for vq_deadline_tree in
 * vdev_queue_init().  The visible tests examine io_deadline first and
 * io_offset second.
 *
 * NOTE(review): this listing is damaged.  The return type, the braces,
 * the declarations of z1/z2 (presumably x1/x2 viewed as zio_t pointers),
 * the statement guarded by each test and several lines after the last
 * test are not visible, so the values returned -- and any further
 * tie-break -- cannot be confirmed from here.
 */
60 vdev_queue_deadline_compare(const void *x1
, const void *x2
)
/* first key: io_deadline, lower than ... */
65 if (z1
->io_deadline
< z2
->io_deadline
)
/* ... then greater than */
67 if (z1
->io_deadline
> z2
->io_deadline
)
/* second key: io_offset, lower than ... */
70 if (z1
->io_offset
< z2
->io_offset
)
/* ... then greater than */
72 if (z1
->io_offset
> z2
->io_offset
)
/*
 * Ordering callback handed to avl_create() for vq_read_tree,
 * vq_write_tree and vq_pending_tree in vdev_queue_init().  The visible
 * tests compare io_offset only.
 *
 * NOTE(review): this listing is damaged.  The return type, the braces,
 * the declarations of z1/z2, the statement guarded by each test and the
 * lines after the last test are not visible; the returned values and
 * the handling of equal offsets cannot be confirmed from here.
 */
84 vdev_queue_offset_compare(const void *x1
, const void *x2
)
/* io_offset lower than ... */
89 if (z1
->io_offset
< z2
->io_offset
)
/* ... then greater than */
91 if (z1
->io_offset
> z2
->io_offset
)
103 vdev_queue_init(vdev_t
*vd
)
105 vdev_queue_t
*vq
= &vd
->vdev_queue
;
107 mutex_init(&vq
->vq_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
109 avl_create(&vq
->vq_deadline_tree
, vdev_queue_deadline_compare
,
110 sizeof (zio_t
), offsetof(struct zio
, io_deadline_node
));
112 avl_create(&vq
->vq_read_tree
, vdev_queue_offset_compare
,
113 sizeof (zio_t
), offsetof(struct zio
, io_offset_node
));
115 avl_create(&vq
->vq_write_tree
, vdev_queue_offset_compare
,
116 sizeof (zio_t
), offsetof(struct zio
, io_offset_node
));
118 avl_create(&vq
->vq_pending_tree
, vdev_queue_offset_compare
,
119 sizeof (zio_t
), offsetof(struct zio
, io_offset_node
));
123 vdev_queue_fini(vdev_t
*vd
)
125 vdev_queue_t
*vq
= &vd
->vdev_queue
;
127 avl_destroy(&vq
->vq_deadline_tree
);
128 avl_destroy(&vq
->vq_read_tree
);
129 avl_destroy(&vq
->vq_write_tree
);
130 avl_destroy(&vq
->vq_pending_tree
);
132 mutex_destroy(&vq
->vq_lock
);
136 vdev_queue_io_add(vdev_queue_t
*vq
, zio_t
*zio
)
138 avl_add(&vq
->vq_deadline_tree
, zio
);
139 avl_add(zio
->io_vdev_tree
, zio
);
143 vdev_queue_io_remove(vdev_queue_t
*vq
, zio_t
*zio
)
145 avl_remove(&vq
->vq_deadline_tree
, zio
);
146 avl_remove(zio
->io_vdev_tree
, zio
);
/*
 * Completion callback for an aggregate i/o; vdev_queue_io_to_issue()
 * passes it to zio_vdev_delegated_io() when it builds the aggregate.
 * Walks aio->io_delegate_list, hands each delegate its share of the
 * result, then frees the aggregate buffer.
 *
 * NOTE(review): this listing is damaged.  The return type, the braces,
 * the declarations of dio and offset, the final argument of the bcopy()
 * call and whatever follows the io_error assignment inside the loop are
 * not visible.
 */
150 vdev_queue_agg_io_done(zio_t
*aio
)
/* pop delegates off the head of the list until it is empty */
155 while ((dio
= aio
->io_delegate_list
) != NULL
) {
/* only reads need data copied back out of the aggregate buffer */
156 if (aio
->io_type
== ZIO_TYPE_READ
)
/* source is this delegate's slice of aio->io_data (length argument not visible) */
157 bcopy((char *)aio
->io_data
+ offset
, dio
->io_data
,
/* advance to the next delegate's slice */
159 offset
+= dio
->io_size
;
/* unlink dio from the delegate chain */
160 aio
->io_delegate_list
= dio
->io_delegate_next
;
161 dio
->io_delegate_next
= NULL
;
/* every delegate inherits the aggregate's error status */
162 dio
->io_error
= aio
->io_error
;
/* the delegate sizes must add up to exactly the aggregate size */
165 ASSERT3U(offset
, ==, aio
->io_size
);
/* release the aggregate buffer (allocated with zio_buf_alloc() in vdev_queue_io_to_issue()) */
167 zio_buf_free(aio
->io_data
, aio
->io_size
);
/*
 * True when nio starts at the exact byte where io ends, i.e. the two
 * i/os are contiguous on the device with io first.  Both arguments are
 * pointers to objects with io_offset and io_size members.  Note that io
 * is evaluated twice, so it must be free of side effects.
 */
#define	IS_ADJACENT(io, nio) \
	((io)->io_offset + (io)->io_size == (nio)->io_offset)
/*
 * Choose the next i/o to hand to the device, aggregating neighbouring
 * i/os where possible.  The caller must hold vq_lock (asserted below).
 * pending_limit is compared against the population of vq_pending_tree.
 * Callers store the result in a zio_t pointer.
 *
 * NOTE(review): this listing is damaged.  The return type, the braces,
 * some local declarations (tree, size, offset), the initial value of
 * size, the statement guarded by the first test, any update of fio/lio
 * inside the two while loops, the condition that selects the aggregate
 * path and every return statement are not visible.  The comments below
 * describe only the statements that are.
 */
174 vdev_queue_io_to_issue(vdev_queue_t
*vq
, uint64_t pending_limit
)
176 zio_t
*fio
, *lio
, *aio
, *dio
;
180 ASSERT(MUTEX_HELD(&vq
->vq_lock
));
/* pending tree already at pending_limit, or nothing queued (guarded statement not visible) */
182 if (avl_numnodes(&vq
->vq_pending_tree
) >= pending_limit
||
183 avl_numnodes(&vq
->vq_deadline_tree
) == 0)
/* start from the first entry of the deadline tree; fio anchors the backward scan, lio the forward scan */
186 fio
= lio
= avl_first(&vq
->vq_deadline_tree
);
/* the read or write offset tree this i/o was filed under by vdev_queue_io() */
188 tree
= fio
->io_vdev_tree
;
/*
 * Look backwards in offset order: the predecessor must end exactly where
 * fio starts, neither i/o may carry ZIO_FLAG_DONT_AGGREGATE, and the
 * running size must stay within zfs_vdev_aggregation_limit.
 */
191 while ((dio
= AVL_PREV(tree
, fio
)) != NULL
&& IS_ADJACENT(dio
, fio
) &&
192 !((dio
->io_flags
| fio
->io_flags
) & ZIO_FLAG_DONT_AGGREGATE
) &&
193 size
+ dio
->io_size
<= zfs_vdev_aggregation_limit
) {
/* chain the predecessor in front of fio */
194 dio
->io_delegate_next
= fio
;
196 size
+= dio
->io_size
;
/* same test in the forward direction, starting from lio */
199 while ((dio
= AVL_NEXT(tree
, lio
)) != NULL
&& IS_ADJACENT(lio
, dio
) &&
200 !((lio
->io_flags
| dio
->io_flags
) & ZIO_FLAG_DONT_AGGREGATE
) &&
201 size
+ dio
->io_size
<= zfs_vdev_aggregation_limit
) {
/* chain the successor behind lio */
202 lio
->io_delegate_next
= dio
;
204 size
+= dio
->io_size
;
/* aggregate path: one buffer of size bytes for the whole run (entry condition not visible) */
208 char *buf
= zio_buf_alloc(size
);
211 ASSERT(size
<= zfs_vdev_aggregation_limit
);
/* build the aggregate i/o at fio's offset; its completion goes to vdev_queue_agg_io_done() */
213 aio
= zio_vdev_delegated_io(fio
->io_vd
, fio
->io_offset
,
214 buf
, size
, fio
->io_type
, ZIO_PRIORITY_NOW
,
215 ZIO_FLAG_DONT_CACHE
| ZIO_FLAG_DONT_QUEUE
,
216 vdev_queue_agg_io_done
, NULL
);
/* the aggregate owns the delegate chain, starting at fio */
218 aio
->io_delegate_list
= fio
;
/* visit every delegate on the chain */
220 for (dio
= fio
; dio
!= NULL
; dio
= dio
->io_delegate_next
) {
221 ASSERT(dio
->io_type
== aio
->io_type
);
222 ASSERT(dio
->io_vdev_tree
== tree
);
/* writes: gather the delegate's payload into its slice of the aggregate buffer */
223 if (dio
->io_type
== ZIO_TYPE_WRITE
)
224 bcopy(dio
->io_data
, buf
+ offset
, dio
->io_size
);
225 offset
+= dio
->io_size
;
/* take the delegate off the deadline and offset trees, then pass it to zio_vdev_io_bypass() */
226 vdev_queue_io_remove(vq
, dio
);
227 zio_vdev_io_bypass(dio
);
/* the delegates must fill the buffer exactly */
230 ASSERT(offset
== size
);
/* the aggregate is tracked in the pending tree */
232 avl_add(&vq
->vq_pending_tree
, aio
);
/* presumably the non-aggregated case: move fio from the queue trees to the pending tree */
237 ASSERT(fio
->io_vdev_tree
== tree
);
238 vdev_queue_io_remove(vq
, fio
);
240 avl_add(&vq
->vq_pending_tree
, fio
);
/*
 * Queue a read or write zio on the queue of its vdev, then ask
 * vdev_queue_io_to_issue() for an i/o to issue, limited by
 * zfs_vdev_min_pending.
 *
 * NOTE(review): this listing is damaged.  The return type, the braces,
 * the declaration of nio, the statement guarded by the DONT_QUEUE test,
 * the else between the two io_vdev_tree assignments, the body of the
 * final test and every return statement are not visible.
 */
246 vdev_queue_io(zio_t
*zio
)
248 vdev_queue_t
*vq
= &zio
->io_vd
->vdev_queue
;
/* only reads and writes are expected here */
251 ASSERT(zio
->io_type
== ZIO_TYPE_READ
|| zio
->io_type
== ZIO_TYPE_WRITE
);
/* i/os already flagged DONT_QUEUE take the guarded path (not visible) */
253 if (zio
->io_flags
& ZIO_FLAG_DONT_QUEUE
)
/* set both flags on the zio before it is queued */
256 zio
->io_flags
|= ZIO_FLAG_DONT_CACHE
| ZIO_FLAG_DONT_QUEUE
;
/* pick the offset tree by i/o type: reads here ... */
258 if (zio
->io_type
== ZIO_TYPE_READ
)
259 zio
->io_vdev_tree
= &vq
->vq_read_tree
;
/* ... writes here (the else keyword is not visible in this listing) */
261 zio
->io_vdev_tree
= &vq
->vq_write_tree
;
263 mutex_enter(&vq
->vq_lock
);
/* deadline = pri + (lbolt >> time_shift), as described at the tunable */
265 zio
->io_deadline
= (lbolt64
>> zfs_vdev_time_shift
) + zio
->io_priority
;
/* file the zio in the deadline tree and in the offset tree chosen above */
267 vdev_queue_io_add(vq
, zio
);
/* issue against the initial (min) pending limit */
269 nio
= vdev_queue_io_to_issue(vq
, zfs_vdev_min_pending
);
271 mutex_exit(&vq
->vq_lock
);
/* an aggregate is recognised by its done callback (body of this test not visible) */
276 if (nio
->io_done
== vdev_queue_agg_io_done
) {
/*
 * Remove zio from the pending tree of its vdev queue, then make up to
 * zfs_vdev_ramp_rate attempts to issue further queued i/os, each
 * limited by zfs_vdev_max_pending.
 *
 * NOTE(review): this listing is damaged.  The return type, the braces,
 * whatever sits between the vdev_queue_io_to_issue() call and the
 * mutex_exit() in the loop, the body of the aggregate test and the
 * statements around zio_vdev_io_reissue() are not visible.
 */
285 vdev_queue_io_done(zio_t
*zio
)
287 vdev_queue_t
*vq
= &zio
->io_vd
->vdev_queue
;
289 mutex_enter(&vq
->vq_lock
);
/* zio is no longer pending on the device */
291 avl_remove(&vq
->vq_pending_tree
, zio
);
/* one pass per unit of ramp rate */
293 for (int i
= 0; i
< zfs_vdev_ramp_rate
; i
++) {
/* issue against the max pending limit */
294 zio_t
*nio
= vdev_queue_io_to_issue(vq
, zfs_vdev_max_pending
);
/* the queue lock is dropped while nio is handed on ... */
297 mutex_exit(&vq
->vq_lock
);
/* an aggregate is recognised by its done callback (body of this test not visible) */
298 if (nio
->io_done
== vdev_queue_agg_io_done
) {
301 zio_vdev_io_reissue(nio
);
/* ... and retaken before the next pass */
304 mutex_enter(&vq
->vq_lock
);
307 mutex_exit(&vq
->vq_lock
);