Reduce dirty records memory usage
[zfs.git] / module / zfs / dmu_direct.c
blob 40b78b519f499938befc69e60641225b4103f0a8
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 #include <sys/dmu.h>
24 #include <sys/dmu_impl.h>
25 #include <sys/dbuf.h>
26 #include <sys/dnode.h>
27 #include <sys/zfs_context.h>
28 #include <sys/zfs_racct.h>
29 #include <sys/dsl_dataset.h>
30 #include <sys/dmu_objset.h>
32 static abd_t *
33 make_abd_for_dbuf(dmu_buf_impl_t *db, abd_t *data, uint64_t offset,
34 uint64_t size)
36 size_t buf_size = db->db.db_size;
37 abd_t *pre_buf = NULL, *post_buf = NULL, *mbuf = NULL;
38 size_t buf_off = 0;
40 ASSERT(MUTEX_HELD(&db->db_mtx));
42 if (offset > db->db.db_offset) {
43 size_t pre_size = offset - db->db.db_offset;
44 pre_buf = abd_alloc_for_io(pre_size, B_TRUE);
45 buf_size -= pre_size;
46 buf_off = 0;
47 } else {
48 buf_off = db->db.db_offset - offset;
49 size -= buf_off;
52 if (size < buf_size) {
53 size_t post_size = buf_size - size;
54 post_buf = abd_alloc_for_io(post_size, B_TRUE);
55 buf_size -= post_size;
58 ASSERT3U(buf_size, >, 0);
59 abd_t *buf = abd_get_offset_size(data, buf_off, buf_size);
61 if (pre_buf || post_buf) {
62 mbuf = abd_alloc_gang();
63 if (pre_buf)
64 abd_gang_add(mbuf, pre_buf, B_TRUE);
65 abd_gang_add(mbuf, buf, B_TRUE);
66 if (post_buf)
67 abd_gang_add(mbuf, post_buf, B_TRUE);
68 } else {
69 mbuf = buf;
72 return (mbuf);
75 static void
76 dmu_read_abd_done(zio_t *zio)
78 abd_free(zio->io_abd);
81 static void
82 dmu_write_direct_ready(zio_t *zio)
84 dmu_sync_ready(zio, NULL, zio->io_private);
87 static void
88 dmu_write_direct_done(zio_t *zio)
90 dmu_sync_arg_t *dsa = zio->io_private;
91 dbuf_dirty_record_t *dr = dsa->dsa_dr;
92 dmu_buf_impl_t *db = dr->dr_dbuf;
94 abd_free(zio->io_abd);
96 mutex_enter(&db->db_mtx);
97 ASSERT3P(db->db_buf, ==, NULL);
98 ASSERT3P(dr->dt.dl.dr_data, ==, NULL);
99 ASSERT3P(db->db.db_data, ==, NULL);
100 db->db_state = DB_UNCACHED;
101 mutex_exit(&db->db_mtx);
103 dmu_sync_done(zio, NULL, zio->io_private);
105 if (zio->io_error != 0) {
106 if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
107 ASSERT3U(zio->io_error, ==, EIO);
110 * In the event of an I/O error this block has been freed in
111 * zio_done() through zio_dva_unallocate(). Calling
112 * dmu_sync_done() above set dr_override_state to
113 * DR_NOT_OVERRIDDEN. In this case when dbuf_undirty() calls
114 * dbuf_unoverride(), it will skip doing zio_free() to free
115 * this block as that was already taken care of.
117 * Since we are undirtying the record in open-context, we must
118 * have a hold on the db, so it should never be evicted after
119 * calling dbuf_undirty().
121 mutex_enter(&db->db_mtx);
122 VERIFY3B(dbuf_undirty(db, dsa->dsa_tx), ==, B_FALSE);
123 mutex_exit(&db->db_mtx);
126 kmem_free(zio->io_bp, sizeof (blkptr_t));
127 zio->io_bp = NULL;
131 dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx)
133 objset_t *os = db->db_objset;
134 dsl_dataset_t *ds = dmu_objset_ds(os);
135 zbookmark_phys_t zb;
136 dbuf_dirty_record_t *dr_head;
138 SET_BOOKMARK(&zb, ds->ds_object,
139 db->db.db_object, db->db_level, db->db_blkid);
141 DB_DNODE_ENTER(db);
142 zio_prop_t zp;
143 dmu_write_policy(os, DB_DNODE(db), db->db_level,
144 WP_DMU_SYNC | WP_DIRECT_WR, &zp);
145 DB_DNODE_EXIT(db);
148 * Dirty this dbuf with DB_NOFILL since we will not have any data
149 * associated with the dbuf.
151 dmu_buf_will_clone_or_dio(&db->db, tx);
153 mutex_enter(&db->db_mtx);
155 uint64_t txg = dmu_tx_get_txg(tx);
156 ASSERT3U(txg, >, spa_last_synced_txg(os->os_spa));
157 ASSERT3U(txg, >, spa_syncing_txg(os->os_spa));
159 dr_head = list_head(&db->db_dirty_records);
160 ASSERT3U(dr_head->dr_txg, ==, txg);
161 dr_head->dt.dl.dr_diowrite = B_TRUE;
162 dr_head->dr_accounted = db->db.db_size;
164 blkptr_t *bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
165 if (db->db_blkptr != NULL) {
167 * Fill in bp with the current block pointer so that
168 * the nopwrite code can check if we're writing the same
169 * data that's already on disk.
171 *bp = *db->db_blkptr;
172 } else {
173 memset(bp, 0, sizeof (blkptr_t));
177 * Disable nopwrite if the current block pointer could change
178 * before this TXG syncs.
180 if (list_next(&db->db_dirty_records, dr_head) != NULL)
181 zp.zp_nopwrite = B_FALSE;
183 ASSERT0(dr_head->dt.dl.dr_has_raw_params);
184 ASSERT3S(dr_head->dt.dl.dr_override_state, ==, DR_NOT_OVERRIDDEN);
185 dr_head->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
187 mutex_exit(&db->db_mtx);
189 dmu_objset_willuse_space(os, dr_head->dr_accounted, tx);
191 dmu_sync_arg_t *dsa = kmem_zalloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
192 dsa->dsa_dr = dr_head;
193 dsa->dsa_tx = tx;
195 zio_t *zio = zio_write(pio, os->os_spa, txg, bp, data,
196 db->db.db_size, db->db.db_size, &zp,
197 dmu_write_direct_ready, NULL, dmu_write_direct_done, dsa,
198 ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb);
200 if (pio == NULL)
201 return (zio_wait(zio));
203 zio_nowait(zio);
205 return (0);
209 dmu_write_abd(dnode_t *dn, uint64_t offset, uint64_t size,
210 abd_t *data, uint32_t flags, dmu_tx_t *tx)
212 dmu_buf_t **dbp;
213 spa_t *spa = dn->dn_objset->os_spa;
214 int numbufs, err;
216 ASSERT(flags & DMU_DIRECTIO);
218 err = dmu_buf_hold_array_by_dnode(dn, offset,
219 size, B_FALSE, FTAG, &numbufs, &dbp, flags);
220 if (err)
221 return (err);
223 zio_t *pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
225 for (int i = 0; i < numbufs && err == 0; i++) {
226 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
228 abd_t *abd = abd_get_offset_size(data,
229 db->db.db_offset - offset, dn->dn_datablksz);
231 zfs_racct_write(spa, db->db.db_size, 1, flags);
232 err = dmu_write_direct(pio, db, abd, tx);
233 ASSERT0(err);
236 err = zio_wait(pio);
239 * The dbuf must be held until the Direct I/O write has completed in
240 * the event there was any errors and dbuf_undirty() was called.
242 dmu_buf_rele_array(dbp, numbufs, FTAG);
244 return (err);
248 dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size,
249 abd_t *data, uint32_t flags)
251 objset_t *os = dn->dn_objset;
252 spa_t *spa = os->os_spa;
253 dmu_buf_t **dbp;
254 int numbufs, err;
256 ASSERT(flags & DMU_DIRECTIO);
258 err = dmu_buf_hold_array_by_dnode(dn, offset,
259 size, B_FALSE, FTAG, &numbufs, &dbp, flags);
260 if (err)
261 return (err);
263 zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
265 for (int i = 0; i < numbufs; i++) {
266 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
267 abd_t *mbuf;
268 zbookmark_phys_t zb;
269 blkptr_t *bp;
271 mutex_enter(&db->db_mtx);
273 SET_BOOKMARK(&zb, dmu_objset_ds(os)->ds_object,
274 db->db.db_object, db->db_level, db->db_blkid);
277 * If there is another read for this dbuf, we will wait for
278 * that to complete first before checking the db_state below.
280 while (db->db_state == DB_READ)
281 cv_wait(&db->db_changed, &db->db_mtx);
283 err = dmu_buf_get_bp_from_dbuf(db, &bp);
284 if (err) {
285 mutex_exit(&db->db_mtx);
286 goto error;
290 * There is no need to read if this is a hole or the data is
291 * cached. This will not be considered a direct read for IO
292 * accounting in the same way that an ARC hit is not counted.
294 if (bp == NULL || BP_IS_HOLE(bp) || db->db_state == DB_CACHED) {
295 size_t aoff = offset < db->db.db_offset ?
296 db->db.db_offset - offset : 0;
297 size_t boff = offset > db->db.db_offset ?
298 offset - db->db.db_offset : 0;
299 size_t len = MIN(size - aoff, db->db.db_size - boff);
301 if (db->db_state == DB_CACHED) {
303 * We need to untransformed the ARC buf data
304 * before we copy it over.
306 err = dmu_buf_untransform_direct(db, spa);
307 ASSERT0(err);
308 abd_copy_from_buf_off(data,
309 (char *)db->db.db_data + boff, aoff, len);
310 } else {
311 abd_zero_off(data, aoff, len);
314 mutex_exit(&db->db_mtx);
315 continue;
318 mbuf = make_abd_for_dbuf(db, data, offset, size);
319 ASSERT3P(mbuf, !=, NULL);
322 * The dbuf mutex (db_mtx) must be held when creating the ZIO
323 * for the read. The BP returned from
324 * dmu_buf_get_bp_from_dbuf() could be from a pending block
325 * clone or a yet to be synced Direct I/O write that is in the
326 * dbuf's dirty record. When zio_read() is called, zio_create()
327 * will make a copy of the BP. However, if zio_read() is called
328 * without the mutex being held then the dirty record from the
329 * dbuf could be freed in dbuf_write_done() resulting in garbage
330 * being set for the zio BP.
332 zio_t *cio = zio_read(rio, spa, bp, mbuf, db->db.db_size,
333 dmu_read_abd_done, NULL, ZIO_PRIORITY_SYNC_READ,
334 ZIO_FLAG_CANFAIL | ZIO_FLAG_DIO_READ, &zb);
335 mutex_exit(&db->db_mtx);
337 zfs_racct_read(spa, db->db.db_size, 1, flags);
338 zio_nowait(cio);
341 dmu_buf_rele_array(dbp, numbufs, FTAG);
343 return (zio_wait(rio));
345 error:
346 dmu_buf_rele_array(dbp, numbufs, FTAG);
347 (void) zio_wait(rio);
348 return (err);
351 #ifdef _KERNEL
353 dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
355 offset_t offset = zfs_uio_offset(uio);
356 offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT;
357 int err;
359 ASSERT(uio->uio_extflg & UIO_DIRECT);
360 ASSERT3U(page_index, <, uio->uio_dio.npages);
362 abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index],
363 offset & (PAGESIZE - 1), size);
364 err = dmu_read_abd(dn, offset, size, data, DMU_DIRECTIO);
365 abd_free(data);
367 if (err == 0)
368 zfs_uioskip(uio, size);
370 return (err);
374 dmu_write_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
376 offset_t offset = zfs_uio_offset(uio);
377 offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT;
378 int err;
380 ASSERT(uio->uio_extflg & UIO_DIRECT);
381 ASSERT3U(page_index, <, uio->uio_dio.npages);
383 abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index],
384 offset & (PAGESIZE - 1), size);
385 err = dmu_write_abd(dn, offset, size, data, DMU_DIRECTIO, tx);
386 abd_free(data);
388 if (err == 0)
389 zfs_uioskip(uio, size);
391 return (err);
393 #endif /* _KERNEL */
395 EXPORT_SYMBOL(dmu_read_uio_direct);
396 EXPORT_SYMBOL(dmu_write_uio_direct);