4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
24 #include <sys/dmu_impl.h>
26 #include <sys/dnode.h>
27 #include <sys/zfs_context.h>
28 #include <sys/zfs_racct.h>
29 #include <sys/dsl_dataset.h>
30 #include <sys/dmu_objset.h>
33 make_abd_for_dbuf(dmu_buf_impl_t
*db
, abd_t
*data
, uint64_t offset
,
36 size_t buf_size
= db
->db
.db_size
;
37 abd_t
*pre_buf
= NULL
, *post_buf
= NULL
, *mbuf
= NULL
;
40 ASSERT(MUTEX_HELD(&db
->db_mtx
));
42 if (offset
> db
->db
.db_offset
) {
43 size_t pre_size
= offset
- db
->db
.db_offset
;
44 pre_buf
= abd_alloc_for_io(pre_size
, B_TRUE
);
48 buf_off
= db
->db
.db_offset
- offset
;
52 if (size
< buf_size
) {
53 size_t post_size
= buf_size
- size
;
54 post_buf
= abd_alloc_for_io(post_size
, B_TRUE
);
55 buf_size
-= post_size
;
58 ASSERT3U(buf_size
, >, 0);
59 abd_t
*buf
= abd_get_offset_size(data
, buf_off
, buf_size
);
61 if (pre_buf
|| post_buf
) {
62 mbuf
= abd_alloc_gang();
64 abd_gang_add(mbuf
, pre_buf
, B_TRUE
);
65 abd_gang_add(mbuf
, buf
, B_TRUE
);
67 abd_gang_add(mbuf
, post_buf
, B_TRUE
);
/*
 * ZIO completion callback for a Direct I/O read: release the (possibly
 * gang) ABD that make_abd_for_dbuf() attached to the zio.
 */
76 dmu_read_abd_done(zio_t
*zio
)
78 abd_free(zio
->io_abd
);
/*
 * ZIO ready callback for a Direct I/O write: forward to dmu_sync_ready()
 * with the dmu_sync_arg_t stashed in io_private.
 */
82 dmu_write_direct_ready(zio_t
*zio
)
84 dmu_sync_ready(zio
, NULL
, zio
->io_private
);
/*
 * ZIO done callback for a Direct I/O write.  Frees the write ABD, marks
 * the dbuf uncached (a Direct I/O write never populates db_data),
 * completes the dmu_sync state machine, and on error undirties the
 * record in open context.  Finally frees the block pointer allocated by
 * dmu_write_direct().
 *
 * NOTE(review): incomplete extract — braces, the non-checksum error
 * handling, and the trailing cleanup around kmem_free() are partly
 * missing from view.
 */
88 dmu_write_direct_done(zio_t
*zio
)
90 dmu_sync_arg_t
*dsa
= zio
->io_private
;
91 dbuf_dirty_record_t
*dr
= dsa
->dsa_dr
;
92 dmu_buf_impl_t
*db
= dr
->dr_dbuf
;
/* The data ABD was only borrowed for the duration of the write. */
94 abd_free(zio
->io_abd
);
/*
 * A Direct I/O write bypasses the ARC entirely, so the dbuf must have
 * no attached buffer or data; return it to DB_UNCACHED.
 */
96 mutex_enter(&db
->db_mtx
);
97 ASSERT3P(db
->db_buf
, ==, NULL
);
98 ASSERT3P(dr
->dt
.dl
.dr_data
, ==, NULL
);
99 ASSERT3P(db
->db
.db_data
, ==, NULL
);
100 db
->db_state
= DB_UNCACHED
;
101 mutex_exit(&db
->db_mtx
);
103 dmu_sync_done(zio
, NULL
, zio
->io_private
);
105 if (zio
->io_error
!= 0) {
/* A Direct I/O checksum failure is surfaced to the caller as EIO. */
106 if (zio
->io_flags
& ZIO_FLAG_DIO_CHKSUM_ERR
)
107 ASSERT3U(zio
->io_error
, ==, EIO
);
110 * In the event of an I/O error this block has been freed in
111 * zio_done() through zio_dva_unallocate(). Calling
112 * dmu_sync_done() above set dr_override_state to
113 * DR_NOT_OVERRIDDEN. In this case when dbuf_undirty() calls
114 * dbuf_unoverride(), it will skip doing zio_free() to free
115 * this block as that was already taken care of.
117 * Since we are undirtying the record in open-context, we must
118 * have a hold on the db, so it should never be evicted after
119 * calling dbuf_undirty().
121 mutex_enter(&db
->db_mtx
);
122 VERIFY3B(dbuf_undirty(db
, dsa
->dsa_tx
), ==, B_FALSE
);
123 mutex_exit(&db
->db_mtx
);
/* Free the blkptr_t allocated for this zio in dmu_write_direct(). */
126 kmem_free(zio
->io_bp
, sizeof (blkptr_t
));
/*
 * Issue a synchronous Direct I/O write of one dbuf-sized block.  The
 * caller's data ABD is written straight to disk via zio_write(),
 * bypassing the ARC; the dbuf is dirtied as clone-or-dio so no buffer
 * is attached to it.  Returns the result of zio_wait() on the child
 * write zio.
 *
 * NOTE(review): incomplete extract — the declarations of zb and zp, the
 * else branch pairing with the db_blkptr test, and parts of the
 * dmu_sync_arg_t setup are not visible here.
 */
131 dmu_write_direct(zio_t
*pio
, dmu_buf_impl_t
*db
, abd_t
*data
, dmu_tx_t
*tx
)
133 objset_t
*os
= db
->db_objset
;
134 dsl_dataset_t
*ds
= dmu_objset_ds(os
);
136 dbuf_dirty_record_t
*dr_head
;
/* Bookmark identifying this block for the zio. */
138 SET_BOOKMARK(&zb
, ds
->ds_object
,
139 db
->db
.db_object
, db
->db_level
, db
->db_blkid
);
/* Choose compression/checksum/copies policy for a direct sync write. */
143 dmu_write_policy(os
, DB_DNODE(db
), db
->db_level
,
144 WP_DMU_SYNC
| WP_DIRECT_WR
, &zp
);
148 * Dirty this dbuf with DB_NOFILL since we will not have any data
149 * associated with the dbuf.
151 dmu_buf_will_clone_or_dio(&db
->db
, tx
);
153 mutex_enter(&db
->db_mtx
);
/* The txg we dirtied in must still be open (not synced or syncing). */
155 uint64_t txg
= dmu_tx_get_txg(tx
);
156 ASSERT3U(txg
, >, spa_last_synced_txg(os
->os_spa
));
157 ASSERT3U(txg
, >, spa_syncing_txg(os
->os_spa
));
/*
 * The newest dirty record must belong to this txg; flag it as a
 * Direct I/O write and account the full block size against it.
 */
159 dr_head
= list_head(&db
->db_dirty_records
);
160 ASSERT3U(dr_head
->dr_txg
, ==, txg
);
161 dr_head
->dt
.dl
.dr_diowrite
= B_TRUE
;
162 dr_head
->dr_accounted
= db
->db
.db_size
;
/* bp is freed by dmu_write_direct_done() after the zio completes. */
164 blkptr_t
*bp
= kmem_alloc(sizeof (blkptr_t
), KM_SLEEP
);
165 if (db
->db_blkptr
!= NULL
) {
167 * Fill in bp with the current block pointer so that
168 * the nopwrite code can check if we're writing the same
169 * data that's already on disk.
171 *bp
= *db
->db_blkptr
;
/* No existing on-disk block: start from an all-zero block pointer. */
173 memset(bp
, 0, sizeof (blkptr_t
));
177 * Disable nopwrite if the current block pointer could change
178 * before this TXG syncs.
180 if (list_next(&db
->db_dirty_records
, dr_head
) != NULL
)
181 zp
.zp_nopwrite
= B_FALSE
;
/* Hand the dirty record to the dmu_sync override machinery. */
183 ASSERT0(dr_head
->dt
.dl
.dr_has_raw_params
);
184 ASSERT3S(dr_head
->dt
.dl
.dr_override_state
, ==, DR_NOT_OVERRIDDEN
);
185 dr_head
->dt
.dl
.dr_override_state
= DR_IN_DMU_SYNC
;
187 mutex_exit(&db
->db_mtx
);
/* Charge the space this write will consume to the objset. */
189 dmu_objset_willuse_space(os
, dr_head
->dr_accounted
, tx
);
/* Callback state shared by the ready/done callbacks below. */
191 dmu_sync_arg_t
*dsa
= kmem_zalloc(sizeof (dmu_sync_arg_t
), KM_SLEEP
);
192 dsa
->dsa_dr
= dr_head
;
195 zio_t
*zio
= zio_write(pio
, os
->os_spa
, txg
, bp
, data
,
196 db
->db
.db_size
, db
->db
.db_size
, &zp
,
197 dmu_write_direct_ready
, NULL
, dmu_write_direct_done
, dsa
,
198 ZIO_PRIORITY_SYNC_WRITE
, ZIO_FLAG_CANFAIL
, &zb
);
/* Synchronous: wait for the write (and its callbacks) to finish. */
201 return (zio_wait(zio
));
/*
 * Direct I/O write of "size" bytes at "offset" from the caller's ABD.
 * Holds the covered dbufs, then issues one dmu_write_direct() per block
 * under a root zio, slicing the caller's ABD per block.
 *
 * NOTE(review): incomplete extract — the declarations of err, numbufs
 * and dbp, the error return paths, the abd_free() of each per-block
 * slice, and the final zio_wait()/return are not visible here.
 */
209 dmu_write_abd(dnode_t
*dn
, uint64_t offset
, uint64_t size
,
210 abd_t
*data
, uint32_t flags
, dmu_tx_t
*tx
)
213 spa_t
*spa
= dn
->dn_objset
->os_spa
;
/* This path is only reached for Direct I/O requests. */
216 ASSERT(flags
& DMU_DIRECTIO
);
218 err
= dmu_buf_hold_array_by_dnode(dn
, offset
,
219 size
, B_FALSE
, FTAG
, &numbufs
, &dbp
, flags
);
/* Parent zio so all per-block writes can be waited on together. */
223 zio_t
*pio
= zio_root(spa
, NULL
, NULL
, ZIO_FLAG_CANFAIL
);
/* Stop issuing further blocks as soon as one write fails. */
225 for (int i
= 0; i
< numbufs
&& err
== 0; i
++) {
226 dmu_buf_impl_t
*db
= (dmu_buf_impl_t
*)dbp
[i
];
/* Slice of the caller's ABD that maps onto this block. */
228 abd_t
*abd
= abd_get_offset_size(data
,
229 db
->db
.db_offset
- offset
, dn
->dn_datablksz
);
231 zfs_racct_write(spa
, db
->db
.db_size
, 1, flags
);
232 err
= dmu_write_direct(pio
, db
, abd
, tx
);
239 * The dbuf must be held until the Direct I/O write has completed in
240 * the event there was any errors and dbuf_undirty() was called.
242 dmu_buf_rele_array(dbp
, numbufs
, FTAG
);
/*
 * Direct I/O read of "size" bytes at "offset" into the caller's ABD.
 * For each covered dbuf: if the block is a hole or already cached,
 * copy/zero the relevant bytes directly; otherwise issue a zio_read()
 * of the whole block through an ABD built by make_abd_for_dbuf().
 *
 * NOTE(review): incomplete extract — declarations of err, numbufs, dbp,
 * zb, bp and mbuf, several error-path branches, and the closing braces
 * are not visible here.
 */
248 dmu_read_abd(dnode_t
*dn
, uint64_t offset
, uint64_t size
,
249 abd_t
*data
, uint32_t flags
)
251 objset_t
*os
= dn
->dn_objset
;
252 spa_t
*spa
= os
->os_spa
;
/* This path is only reached for Direct I/O requests. */
256 ASSERT(flags
& DMU_DIRECTIO
);
258 err
= dmu_buf_hold_array_by_dnode(dn
, offset
,
259 size
, B_FALSE
, FTAG
, &numbufs
, &dbp
, flags
);
/* Root zio so all per-block reads can be waited on together. */
263 zio_t
*rio
= zio_root(spa
, NULL
, NULL
, ZIO_FLAG_CANFAIL
);
265 for (int i
= 0; i
< numbufs
; i
++) {
266 dmu_buf_impl_t
*db
= (dmu_buf_impl_t
*)dbp
[i
];
271 mutex_enter(&db
->db_mtx
);
/* Bookmark identifying this block for the read zio. */
273 SET_BOOKMARK(&zb
, dmu_objset_ds(os
)->ds_object
,
274 db
->db
.db_object
, db
->db_level
, db
->db_blkid
);
277 * If there is another read for this dbuf, we will wait for
278 * that to complete first before checking the db_state below.
280 while (db
->db_state
== DB_READ
)
281 cv_wait(&db
->db_changed
, &db
->db_mtx
);
283 err
= dmu_buf_get_bp_from_dbuf(db
, &bp
);
285 mutex_exit(&db
->db_mtx
);
290 * There is no need to read if this is a hole or the data is
291 * cached. This will not be considered a direct read for IO
292 * accounting in the same way that an ARC hit is not counted.
294 if (bp
== NULL
|| BP_IS_HOLE(bp
) || db
->db_state
== DB_CACHED
) {
/*
 * aoff: where this block's data lands in the caller's ABD;
 * boff: where the caller's range starts within this block;
 * len: the number of overlapping bytes.
 */
295 size_t aoff
= offset
< db
->db
.db_offset
?
296 db
->db
.db_offset
- offset
: 0;
297 size_t boff
= offset
> db
->db
.db_offset
?
298 offset
- db
->db
.db_offset
: 0;
299 size_t len
= MIN(size
- aoff
, db
->db
.db_size
- boff
);
301 if (db
->db_state
== DB_CACHED
) {
303 * We need to untransform the ARC buf data
304 * before we copy it over.
306 err
= dmu_buf_untransform_direct(db
, spa
);
308 abd_copy_from_buf_off(data
,
309 (char *)db
->db
.db_data
+ boff
, aoff
, len
);
/* Hole: the overlapping range reads as zeros. */
311 abd_zero_off(data
, aoff
, len
);
314 mutex_exit(&db
->db_mtx
);
/* Full-block ABD: caller slice plus any zeroed pre/post padding. */
318 mbuf
= make_abd_for_dbuf(db
, data
, offset
, size
);
319 ASSERT3P(mbuf
, !=, NULL
);
322 * The dbuf mutex (db_mtx) must be held when creating the ZIO
323 * for the read. The BP returned from
324 * dmu_buf_get_bp_from_dbuf() could be from a pending block
325 * clone or a yet to be synced Direct I/O write that is in the
326 * dbuf's dirty record. When zio_read() is called, zio_create()
327 * will make a copy of the BP. However, if zio_read() is called
328 * without the mutex being held then the dirty record from the
329 * dbuf could be freed in dbuf_write_done() resulting in garbage
330 * being set for the zio BP.
332 zio_t
*cio
= zio_read(rio
, spa
, bp
, mbuf
, db
->db
.db_size
,
333 dmu_read_abd_done
, NULL
, ZIO_PRIORITY_SYNC_READ
,
334 ZIO_FLAG_CANFAIL
| ZIO_FLAG_DIO_READ
, &zb
);
335 mutex_exit(&db
->db_mtx
);
337 zfs_racct_read(spa
, db
->db
.db_size
, 1, flags
);
/* Success path: drop the dbuf holds, then wait for all reads. */
341 dmu_buf_rele_array(dbp
, numbufs
, FTAG
);
343 return (zio_wait(rio
));
/* Error path: still release holds and drain the root zio. */
346 dmu_buf_rele_array(dbp
, numbufs
, FTAG
);
347 (void) zio_wait(rio
);
/*
 * Direct I/O read entry point for a zfs_uio_t backed by pinned user
 * pages: wrap the relevant pages in an ABD and delegate to
 * dmu_read_abd(), advancing the uio on success.
 *
 * NOTE(review): incomplete extract — the declaration of err, the
 * abd_free() of data, and the return statement are not visible here.
 */
353 dmu_read_uio_direct(dnode_t
*dn
, zfs_uio_t
*uio
, uint64_t size
)
355 offset_t offset
= zfs_uio_offset(uio
);
/* Index of the page containing the current uio offset. */
356 offset_t page_index
= (offset
- zfs_uio_soffset(uio
)) >> PAGESHIFT
;
359 ASSERT(uio
->uio_extflg
& UIO_DIRECT
);
360 ASSERT3U(page_index
, <, uio
->uio_dio
.npages
);
/* Borrow the pinned user pages directly — no bounce buffer. */
362 abd_t
*data
= abd_alloc_from_pages(&uio
->uio_dio
.pages
[page_index
],
363 offset
& (PAGESIZE
- 1), size
);
364 err
= dmu_read_abd(dn
, offset
, size
, data
, DMU_DIRECTIO
);
/* Advance the uio past the bytes just read. */
368 zfs_uioskip(uio
, size
);
/*
 * Direct I/O write entry point for a zfs_uio_t backed by pinned user
 * pages: wrap the relevant pages in an ABD and delegate to
 * dmu_write_abd(), advancing the uio on success.
 *
 * NOTE(review): incomplete extract — the declaration of err, the
 * abd_free() of data, and the return statement are not visible here.
 */
374 dmu_write_uio_direct(dnode_t
*dn
, zfs_uio_t
*uio
, uint64_t size
, dmu_tx_t
*tx
)
376 offset_t offset
= zfs_uio_offset(uio
);
/* Index of the page containing the current uio offset. */
377 offset_t page_index
= (offset
- zfs_uio_soffset(uio
)) >> PAGESHIFT
;
380 ASSERT(uio
->uio_extflg
& UIO_DIRECT
);
381 ASSERT3U(page_index
, <, uio
->uio_dio
.npages
);
/* Borrow the pinned user pages directly — no bounce buffer. */
383 abd_t
*data
= abd_alloc_from_pages(&uio
->uio_dio
.pages
[page_index
],
384 offset
& (PAGESIZE
- 1), size
);
385 err
= dmu_write_abd(dn
, offset
, size
, data
, DMU_DIRECTIO
, tx
);
/* Advance the uio past the bytes just written. */
389 zfs_uioskip(uio
, size
);
/* Export the Direct I/O uio entry points to other kernel modules. */
395 EXPORT_SYMBOL(dmu_read_uio_direct
);
396 EXPORT_SYMBOL(dmu_write_uio_direct
);