module/zfs/dmu_direct.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or https://opensource.org/licenses/CDDL-1.0.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22
  23 #include <sys/dmu.h>
  24 #include <sys/dmu_impl.h>
  25 #include <sys/dbuf.h>
  26 #include <sys/dnode.h>
  27 #include <sys/zfs_context.h>
  28 #include <sys/zfs_racct.h>
  29 #include <sys/dsl_dataset.h>
  30 #include <sys/dmu_objset.h>
  31
  32 static abd_t *
  33 make_abd_for_dbuf(dmu_buf_impl_t *db, abd_t *data, uint64_t offset,
  34     uint64_t size)
  35 {
  36         size_t buf_size = db->db.db_size;
  37         abd_t *pre_buf = NULL, *post_buf = NULL, *mbuf = NULL;
  38         size_t buf_off = 0;
  39
  40         ASSERT(MUTEX_HELD(&db->db_mtx));
  41
  42         if (offset > db->db.db_offset) {
  43                 size_t pre_size = offset - db->db.db_offset;
  44                 pre_buf = abd_alloc_for_io(pre_size, B_TRUE);
  45                 buf_size -= pre_size;
  46                 buf_off = 0;
  47         } else {
  48                 buf_off = db->db.db_offset - offset;
  49                 size -= buf_off;
  50         }
  51
  52         if (size < buf_size) {
  53                 size_t post_size = buf_size - size;
  54                 post_buf = abd_alloc_for_io(post_size, B_TRUE);
  55                 buf_size -= post_size;
  56         }
  57
  58         ASSERT3U(buf_size, >, 0);
  59         abd_t *buf = abd_get_offset_size(data, buf_off, buf_size);
  60
  61         if (pre_buf || post_buf) {
  62                 mbuf = abd_alloc_gang();
  63                 if (pre_buf)
  64                         abd_gang_add(mbuf, pre_buf, B_TRUE);
  65                 abd_gang_add(mbuf, buf, B_TRUE);
  66                 if (post_buf)
  67                         abd_gang_add(mbuf, post_buf, B_TRUE);
  68         } else {
  69                 mbuf = buf;
  70         }
  71
  72         return (mbuf);
  73 }
  74
  75 static void
  76 dmu_read_abd_done(zio_t *zio)
  77 {
  78         abd_free(zio->io_abd);
  79 }
  80
  81 static void
  82 dmu_write_direct_ready(zio_t *zio)
  83 {
  84         dmu_sync_ready(zio, NULL, zio->io_private);
  85 }
  86
  87 static void
  88 dmu_write_direct_done(zio_t *zio)
  89 {
  90         dmu_sync_arg_t *dsa = zio->io_private;
  91         dbuf_dirty_record_t *dr = dsa->dsa_dr;
  92         dmu_buf_impl_t *db = dr->dr_dbuf;
  93
  94         abd_free(zio->io_abd);
  95
  96         mutex_enter(&db->db_mtx);
  97         ASSERT3P(db->db_buf, ==, NULL);
  98         ASSERT3P(dr->dt.dl.dr_data, ==, NULL);
  99         ASSERT3P(db->db.db_data, ==, NULL);
 100         db->db_state = DB_UNCACHED;
 101         mutex_exit(&db->db_mtx);
 102
 103         dmu_sync_done(zio, NULL, zio->io_private);
 104
 105         if (zio->io_error != 0) {
 106                 if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
 107                         ASSERT3U(zio->io_error, ==, EIO);
 108
 109                 /*
 110                  * In the event of an I/O error this block has been freed in
 111                  * zio_done() through zio_dva_unallocate(). Calling
 112                  * dmu_sync_done() above set dr_override_state to
 113                  * DR_NOT_OVERRIDDEN. In this case when dbuf_undirty() calls
 114                  * dbuf_unoverride(), it will skip doing zio_free() to free
 115                  * this block as that was already taken care of.
 116                  *
 117                  * Since we are undirtying the record in open-context, we must
 118                  * have a hold on the db, so it should never be evicted after
 119                  * calling dbuf_undirty().
 120                  */
 121                 mutex_enter(&db->db_mtx);
 122                 VERIFY3B(dbuf_undirty(db, dsa->dsa_tx), ==, B_FALSE);
 123                 mutex_exit(&db->db_mtx);
 124         }
 125
 126         kmem_free(zio->io_bp, sizeof (blkptr_t));
 127         zio->io_bp = NULL;
 128 }
 129
 130 int
 131 dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx)
 132 {
 133         objset_t *os = db->db_objset;
 134         dsl_dataset_t *ds = dmu_objset_ds(os);
 135         zbookmark_phys_t zb;
 136         dbuf_dirty_record_t *dr_head;
 137
 138         SET_BOOKMARK(&zb, ds->ds_object,
 139             db->db.db_object, db->db_level, db->db_blkid);
 140
 141         DB_DNODE_ENTER(db);
 142         zio_prop_t zp;
 143         dmu_write_policy(os, DB_DNODE(db), db->db_level,
 144             WP_DMU_SYNC | WP_DIRECT_WR, &zp);
 145         DB_DNODE_EXIT(db);
 146
 147         /*
 148          * Dirty this dbuf with DB_NOFILL since we will not have any data
 149          * associated with the dbuf.
 150          */
 151         dmu_buf_will_clone_or_dio(&db->db, tx);
 152
 153         mutex_enter(&db->db_mtx);
 154
 155         uint64_t txg = dmu_tx_get_txg(tx);
 156         ASSERT3U(txg, >, spa_last_synced_txg(os->os_spa));
 157         ASSERT3U(txg, >, spa_syncing_txg(os->os_spa));
 158
 159         dr_head = list_head(&db->db_dirty_records);
 160         ASSERT3U(dr_head->dr_txg, ==, txg);
 161         dr_head->dt.dl.dr_diowrite = B_TRUE;
 162         dr_head->dr_accounted = db->db.db_size;
 163
 164         blkptr_t *bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
 165         if (db->db_blkptr != NULL) {
 166                 /*
 167                  * Fill in bp with the current block pointer so that
 168                  * the nopwrite code can check if we're writing the same
 169                  * data that's already on disk.
 170                  */
 171                 *bp = *db->db_blkptr;
 172         } else {
 173                 memset(bp, 0, sizeof (blkptr_t));
 174         }
 175
 176         /*
 177          * Disable nopwrite if the current block pointer could change
 178          * before this TXG syncs.
 179          */
 180         if (list_next(&db->db_dirty_records, dr_head) != NULL)
 181                 zp.zp_nopwrite = B_FALSE;
 182
 183         ASSERT0(dr_head->dt.dl.dr_has_raw_params);
 184         ASSERT3S(dr_head->dt.dl.dr_override_state, ==, DR_NOT_OVERRIDDEN);
 185         dr_head->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
 186
 187         mutex_exit(&db->db_mtx);
 188
 189         dmu_objset_willuse_space(os, dr_head->dr_accounted, tx);
 190
 191         dmu_sync_arg_t *dsa = kmem_zalloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
 192         dsa->dsa_dr = dr_head;
 193         dsa->dsa_tx = tx;
 194
 195         zio_t *zio = zio_write(pio, os->os_spa, txg, bp, data,
 196             db->db.db_size, db->db.db_size, &zp,
 197             dmu_write_direct_ready, NULL, dmu_write_direct_done, dsa,
 198             ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb);
 199
 200         if (pio == NULL)
 201                 return (zio_wait(zio));
 202
 203         zio_nowait(zio);
 204
 205         return (0);
 206 }
 207
 208 int
 209 dmu_write_abd(dnode_t *dn, uint64_t offset, uint64_t size,
 210     abd_t *data, uint32_t flags, dmu_tx_t *tx)
 211 {
 212         dmu_buf_t **dbp;
 213         spa_t *spa = dn->dn_objset->os_spa;
 214         int numbufs, err;
 215
 216         ASSERT(flags & DMU_DIRECTIO);
 217
 218         err = dmu_buf_hold_array_by_dnode(dn, offset,
 219             size, B_FALSE, FTAG, &numbufs, &dbp, flags);
 220         if (err)
 221                 return (err);
 222
 223         zio_t *pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 224
 225         for (int i = 0; i < numbufs && err == 0; i++) {
 226                 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
 227
 228                 abd_t *abd = abd_get_offset_size(data,
 229                     db->db.db_offset - offset, dn->dn_datablksz);
 230
 231                 zfs_racct_write(spa, db->db.db_size, 1, flags);
 232                 err = dmu_write_direct(pio, db, abd, tx);
 233                 ASSERT0(err);
 234         }
 235
 236         err = zio_wait(pio);
 237
 238         /*
 239          * The dbuf must be held until the Direct I/O write has completed in
 240          * the event there was any errors and dbuf_undirty() was called.
 241          */
 242         dmu_buf_rele_array(dbp, numbufs, FTAG);
 243
 244         return (err);
 245 }
 246
 247 int
 248 dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size,
 249     abd_t *data, uint32_t flags)
 250 {
 251         objset_t *os = dn->dn_objset;
 252         spa_t *spa = os->os_spa;
 253         dmu_buf_t **dbp;
 254         int numbufs, err;
 255
 256         ASSERT(flags & DMU_DIRECTIO);
 257
 258         err = dmu_buf_hold_array_by_dnode(dn, offset,
 259             size, B_FALSE, FTAG, &numbufs, &dbp, flags);
 260         if (err)
 261                 return (err);
 262
 263         zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 264
 265         for (int i = 0; i < numbufs; i++) {
 266                 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
 267                 abd_t *mbuf;
 268                 zbookmark_phys_t zb;
 269                 blkptr_t *bp;
 270
 271                 mutex_enter(&db->db_mtx);
 272
 273                 SET_BOOKMARK(&zb, dmu_objset_ds(os)->ds_object,
 274                     db->db.db_object, db->db_level, db->db_blkid);
 275
 276                 /*
 277                  * If there is another read for this dbuf, we will wait for
 278                  * that to complete first before checking the db_state below.
 279                  */
 280                 while (db->db_state == DB_READ)
 281                         cv_wait(&db->db_changed, &db->db_mtx);
 282
 283                 err = dmu_buf_get_bp_from_dbuf(db, &bp);
 284                 if (err) {
 285                         mutex_exit(&db->db_mtx);
 286                         goto error;
 287                 }
 288
 289                 /*
 290                  * There is no need to read if this is a hole or the data is
 291                  * cached. This will not be considered a direct read for IO
 292                  * accounting in the same way that an ARC hit is not counted.
 293                  */
 294                 if (bp == NULL || BP_IS_HOLE(bp) || db->db_state == DB_CACHED) {
 295                         size_t aoff = offset < db->db.db_offset ?
 296                             db->db.db_offset - offset : 0;
 297                         size_t boff = offset > db->db.db_offset ?
 298                             offset - db->db.db_offset : 0;
 299                         size_t len = MIN(size - aoff, db->db.db_size - boff);
 300
 301                         if (db->db_state == DB_CACHED) {
 302                                 /*
 303                                  * We need to untransformed the ARC buf data
 304                                  * before we copy it over.
 305                                  */
 306                                 err = dmu_buf_untransform_direct(db, spa);
 307                                 ASSERT0(err);
 308                                 abd_copy_from_buf_off(data,
 309                                     (char *)db->db.db_data + boff, aoff, len);
 310                         } else {
 311                                 abd_zero_off(data, aoff, len);
 312                         }
 313
 314                         mutex_exit(&db->db_mtx);
 315                         continue;
 316                 }
 317
 318                 mbuf = make_abd_for_dbuf(db, data, offset, size);
 319                 ASSERT3P(mbuf, !=, NULL);
 320
 321                 /*
 322                  * The dbuf mutex (db_mtx) must be held when creating the ZIO
 323                  * for the read. The BP returned from
 324                  * dmu_buf_get_bp_from_dbuf() could be from a pending block
 325                  * clone or a yet to be synced Direct I/O write that is in the
 326                  * dbuf's dirty record. When zio_read() is called, zio_create()
 327                  * will make a copy of the BP. However, if zio_read() is called
 328                  * without the mutex being held then the dirty record from the
 329                  * dbuf could be freed in dbuf_write_done() resulting in garbage
 330                  * being set for the zio BP.
 331                  */
 332                 zio_t *cio = zio_read(rio, spa, bp, mbuf, db->db.db_size,
 333                     dmu_read_abd_done, NULL, ZIO_PRIORITY_SYNC_READ,
 334                     ZIO_FLAG_CANFAIL | ZIO_FLAG_DIO_READ, &zb);
 335                 mutex_exit(&db->db_mtx);
 336
 337                 zfs_racct_read(spa, db->db.db_size, 1, flags);
 338                 zio_nowait(cio);
 339         }
 340
 341         dmu_buf_rele_array(dbp, numbufs, FTAG);
 342
 343         return (zio_wait(rio));
 344
 345 error:
 346         dmu_buf_rele_array(dbp, numbufs, FTAG);
 347         (void) zio_wait(rio);
 348         return (err);
 349 }
 350
 351 #ifdef _KERNEL
 352 int
 353 dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
 354 {
 355         offset_t offset = zfs_uio_offset(uio);
 356         offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT;
 357         int err;
 358
 359         ASSERT(uio->uio_extflg & UIO_DIRECT);
 360         ASSERT3U(page_index, <, uio->uio_dio.npages);
 361
 362         abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index],
 363             offset & (PAGESIZE - 1), size);
 364         err = dmu_read_abd(dn, offset, size, data, DMU_DIRECTIO);
 365         abd_free(data);
 366
 367         if (err == 0)
 368                 zfs_uioskip(uio, size);
 369
 370         return (err);
 371 }
 372
 373 int
 374 dmu_write_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
 375 {
 376         offset_t offset = zfs_uio_offset(uio);
 377         offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT;
 378         int err;
 379
 380         ASSERT(uio->uio_extflg & UIO_DIRECT);
 381         ASSERT3U(page_index, <, uio->uio_dio.npages);
 382
 383         abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index],
 384             offset & (PAGESIZE - 1), size);
 385         err = dmu_write_abd(dn, offset, size, data, DMU_DIRECTIO, tx);
 386         abd_free(data);
 387
 388         if (err == 0)
 389                 zfs_uioskip(uio, size);
 390
 391         return (err);
 392 }
 393 #endif /* _KERNEL */
 394
 395 EXPORT_SYMBOL(dmu_read_uio_direct);
 396 EXPORT_SYMBOL(dmu_write_uio_direct);