/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/fssnap_if.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_bio.h>
#include <sys/atomic.h>
extern uint_t bypass_snapshot_throttle_key;

extern struct kmem_cache *lufs_sv;
extern struct kmem_cache *lufs_bp;
static void
makebusy(ml_unit_t *ul, buf_t *bp)
{
        sema_p(&bp->b_sem);
        if ((bp->b_flags & B_ERROR) == 0)
                return;
        if (bp->b_flags & B_READ)
                ldl_seterror(ul, "Error reading ufs log");
        else
                ldl_seterror(ul, "Error writing ufs log");
}

static int
logdone(buf_t *bp)
{
        bp->b_flags |= B_DONE;

        if (bp->b_flags & B_WRITE)
                sema_v(&bp->b_sem);
        else
                /* wakeup the thread waiting on this buf */
                sema_v(&bp->b_io);
        return (0);
}
static int
ldl_strategy_done(buf_t *cb)
{
        lufs_save_t     *sv;
        lufs_buf_t      *lbp;
        buf_t           *bp;

        ASSERT(SEMA_HELD(&cb->b_sem));
        ASSERT((cb->b_flags & B_DONE) == 0);

        /*
         * Compute address of the ``save'' struct
         */
        lbp = (lufs_buf_t *)cb;
        sv = (lufs_save_t *)lbp->lb_ptr;

        if (cb->b_flags & B_ERROR)
                sv->sv_error = 1;

        /*
         * If this is the last request, release the resources and
         * ``done'' the original buffer header.
         */
        if (atomic_add_long_nv(&sv->sv_nb_left, -cb->b_bcount)) {
                kmem_cache_free(lufs_bp, lbp);
                return (1);
        }
        /* Propagate any errors back to the original buffer header */
        bp = sv->sv_bp;
        if (sv->sv_error)
                bp->b_flags |= B_ERROR;
        kmem_cache_free(lufs_bp, lbp);
        kmem_cache_free(lufs_sv, sv);

        biodone(bp);
        return (0);
}
/*
 * Map the log logical block number to a physical disk block number
 */
static int
map_frag(
        ml_unit_t       *ul,
        daddr_t         lblkno,
        size_t          bcount,
        daddr_t         *pblkno,
        size_t          *pbcount)
{
        ic_extent_t     *ext = ul->un_ebp->ic_extents;
        uint32_t        e = ul->un_ebp->ic_nextents;
        uint32_t        s = 0;
        uint32_t        i = e >> 1;
        uint32_t        bno_off;

        /* binary search the extent list for the extent containing lblkno */
again:
        if (ext[i].ic_lbno <= lblkno) {
                if ((ext[i].ic_lbno + ext[i].ic_nbno) > lblkno) {
                        /* found it; clamp the count to the extent's end */
                        bno_off = lblkno - (uint32_t)ext[i].ic_lbno;
                        *pbcount = MIN(bcount, dbtob(ext[i].ic_nbno - bno_off));
                        *pblkno = ext[i].ic_pbno + bno_off;
                        return (0);
                }
                s = i;
        } else
                e = i;
        i = s + ((e - s) >> 1);
        goto again;
}
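
/*
 * Illustrative sketch (not part of the original source): how a logical log
 * block is resolved against an extent table like the one map_frag() searches
 * above.  The simple_map_frag() name and the linear scan are hypothetical;
 * the real routine binary-searches ic_extents and also clamps the byte count
 * to the end of the extent.  Guarded by a hypothetical LUFS_LOG_EXAMPLES
 * define so it is never built into the module.
 */
#ifdef LUFS_LOG_EXAMPLES
static int
simple_map_frag(ic_extent_t *ext, uint32_t nextents, daddr_t lblkno,
    daddr_t *pblkno)
{
        uint32_t i;

        for (i = 0; i < nextents; i++) {
                /* does this extent's logical range cover lblkno? */
                if (ext[i].ic_lbno <= lblkno &&
                    (ext[i].ic_lbno + ext[i].ic_nbno) > lblkno) {
                        /* same offset into the physical extent */
                        *pblkno = ext[i].ic_pbno + (lblkno - ext[i].ic_lbno);
                        return (0);
                }
        }
        return (ENOENT);
}
#endif /* LUFS_LOG_EXAMPLES */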
/*
 * The log is a set of extents (which typically will be only one, but
 * may be more if the disk was close to full when the log was created)
 * and hence the logical offsets into the log
 * have to be translated into their real device locations before
 * calling the device's strategy routine.  The translation may result
 * in several IO requests if this request spans extents.
 */
void
ldl_strategy(ml_unit_t *ul, buf_t *pb)
{
        lufs_save_t     *sv;
        lufs_buf_t      *lbp;
        buf_t           *cb;
        ufsvfs_t        *ufsvfsp = ul->un_ufsvfs;
        daddr_t         lblkno, pblkno;
        size_t          nb_left, pbcount;
        off_t           offset;
        dev_t           dev = ul->un_dev;
        int             error;
        int             read = pb->b_flags & B_READ;

        /*
         * Allocate and initialise the save structure,
         */
        sv = kmem_cache_alloc(lufs_sv, KM_SLEEP);
        sv->sv_error = 0;
        sv->sv_bp = pb;
        nb_left = pb->b_bcount;
        sv->sv_nb_left = nb_left;

        lblkno = pb->b_blkno;
        offset = 0;

        do {
                error = map_frag(ul, lblkno, nb_left, &pblkno, &pbcount);

                lbp = kmem_cache_alloc(lufs_bp, KM_SLEEP);
                bioinit(&lbp->lb_buf);
                lbp->lb_ptr = sv;

                cb = bioclone(pb, offset, pbcount, dev,
                    pblkno, ldl_strategy_done, &lbp->lb_buf, KM_SLEEP);

                offset += pbcount;
                lblkno += btodb(pbcount);
                nb_left -= pbcount;

                if (error) {
                        cb->b_flags |= B_ERROR;
                        cb->b_resid = cb->b_bcount;
                        biodone(cb);
                } else {
                        if (read) {
                                logstats.ls_ldlreads.value.ui64++;
                                ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
                                lwp_stat_update(LWP_STAT_INBLK, 1);
                        } else {
                                logstats.ls_ldlwrites.value.ui64++;
                                lwp_stat_update(LWP_STAT_OUBLK, 1);
                        }

                        /*
                         * write through the snapshot driver if necessary
                         * We do not want this write to be throttled because
                         * we are holding the un_log mutex here. If we
                         * are throttled in fssnap_translate, the fssnap_taskq
                         * thread which can wake us up can get blocked on
                         * the un_log mutex resulting in a deadlock.
                         */
                        if (ufsvfsp->vfs_snapshot) {
                                (void) tsd_set(bypass_snapshot_throttle_key,
                                    (void *)1);
                                fssnap_strategy(&ufsvfsp->vfs_snapshot, cb);
                                (void) tsd_set(bypass_snapshot_throttle_key,
                                    (void *)0);
                        } else {
                                (void) bdev_strategy(cb);
                        }
                }
        } while (nb_left);
}
static void
writelog(ml_unit_t *ul, buf_t *bp)
{
        ASSERT(SEMA_HELD(&bp->b_sem));

        /*
         * This is really a B_ASYNC write but we want Presto to
         * cache this write.  The iodone routine, logdone, processes
         * the buf correctly.
         */
        bp->b_flags = B_WRITE;
        bp->b_edev = ul->un_dev;
        bp->b_iodone = logdone;

        /*
         * return EIO for every IO if in hard error state
         */
        if (ul->un_flags & LDL_ERROR) {
                bp->b_flags |= B_ERROR;
                bp->b_error = EIO;
                (void) logdone(bp);
                return;
        }

        ldl_strategy(ul, bp);
}
static void
readlog(ml_unit_t *ul, buf_t *bp)
{
        ASSERT(SEMA_HELD(&bp->b_sem));
        ASSERT(bp->b_bcount);

        bp->b_flags = B_READ;
        bp->b_edev = ul->un_dev;
        bp->b_iodone = logdone;

        /* all IO returns errors when in error state */
        if (ul->un_flags & LDL_ERROR) {
                bp->b_flags |= B_ERROR;
                bp->b_error = EIO;
                (void) logdone(bp);
                (void) trans_wait(bp);
                return;
        }

        ldl_strategy(ul, bp);

        if (trans_wait(bp))
                ldl_seterror(ul, "Error reading ufs log");
}
/*
 * NOTE: writers are single threaded thru the log layer.
 * This means we can safely reference and change the cb and bp fields
 * that ldl_read does not reference w/o holding the cb_rwlock or
 * the bp makebusy lock.
 */
static void
push_dirty_bp(ml_unit_t *ul, buf_t *bp)
{
        buf_t           *newbp;
        cirbuf_t        *cb     = &ul->un_wrbuf;

        ASSERT(bp == cb->cb_bp && bp == cb->cb_dirty);
        ASSERT((bp->b_bcount & (DEV_BSIZE-1)) == 0);

        /*
         * async write the buf
         */
        writelog(ul, bp);

        /*
         * no longer filling any buf
         */
        cb->cb_dirty = NULL;

        /*
         * no extra buffer space; all done
         */
        if (bp->b_bcount == bp->b_bufsize)
                return;

        /*
         * give extra buffer space to a new bp
         *      try to take buf off of free list
         */
        if ((newbp = cb->cb_free) != NULL) {
                cb->cb_free = newbp->b_forw;
        } else {
                newbp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
                sema_init(&newbp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
                sema_init(&newbp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
        }
        newbp->b_flags = 0;
        newbp->b_bcount = 0;
        newbp->b_file = NULL;
        newbp->b_offset = -1;
        newbp->b_bufsize = bp->b_bufsize - bp->b_bcount;
        newbp->b_un.b_addr = bp->b_un.b_addr + bp->b_bcount;
        bp->b_bufsize = bp->b_bcount;

        /*
         * lock out readers and put new buf at LRU position
         */
        rw_enter(&cb->cb_rwlock, RW_WRITER);
        newbp->b_forw = bp->b_forw;
        newbp->b_back = bp;
        bp->b_forw->b_back = newbp;
        bp->b_forw = newbp;
        rw_exit(&cb->cb_rwlock);
}
static void
inval_range(ml_unit_t *ul, cirbuf_t *cb, off_t lof, off_t nb)
{
        buf_t   *bp;
        off_t   elof    = lof + nb;
        off_t   buflof;
        off_t   bufelof;

        /*
         * discard all bufs that overlap the range (lof, lof + nb)
         */
        rw_enter(&cb->cb_rwlock, RW_WRITER);
        bp = cb->cb_bp;
        do {
                if (bp == cb->cb_dirty || bp->b_bcount == 0) {
                        bp = bp->b_forw;
                        continue;
                }
                buflof = dbtob(bp->b_blkno);
                bufelof = buflof + bp->b_bcount;
                if ((buflof < lof && bufelof <= lof) ||
                    (buflof >= elof && bufelof > elof)) {
                        bp = bp->b_forw;
                        continue;
                }
                makebusy(ul, bp);
                bp->b_flags = 0;
                bp->b_bcount = 0;
                sema_v(&bp->b_sem);
                bp = bp->b_forw;
        } while (bp != cb->cb_bp);
        rw_exit(&cb->cb_rwlock);
}
/*
 * NOTE: writers are single threaded thru the log layer.
 * This means we can safely reference and change the cb and bp fields
 * that ldl_read does not reference w/o holding the cb_rwlock or
 * the bp makebusy lock.
 */
static buf_t *
get_write_bp(ml_unit_t *ul)
{
        cirbuf_t        *cb = &ul->un_wrbuf;
        buf_t           *bp;

        /*
         * cb_dirty is the buffer we are currently filling; if any
         */
        if ((bp = cb->cb_dirty) != NULL) {
                makebusy(ul, bp);
                return (bp);
        }
        /*
         * discard any bp that overlaps the current tail since we are
         * about to overwrite it.
         */
        inval_range(ul, cb, ul->un_tail_lof, 1);

        /*
         * steal LRU buf
         */
        rw_enter(&cb->cb_rwlock, RW_WRITER);
        bp = cb->cb_bp->b_forw;
        makebusy(ul, bp);

        cb->cb_dirty = bp;
        cb->cb_bp = bp;

        bp->b_bcount = 0;
        bp->b_blkno = btodb(ul->un_tail_lof);
        ASSERT(dbtob(bp->b_blkno) == ul->un_tail_lof);
        rw_exit(&cb->cb_rwlock);

        /*
         * NOTE:
         *      1. un_tail_lof never addresses >= un_eol_lof
         *      2. b_blkno + btodb(b_bufsize) may > un_eol_lof
         *         this case is handled in storebuf
         */
        return (bp);
}
void
alloc_wrbuf(cirbuf_t *cb, size_t bufsize)
{
        int     i;
        buf_t   *bp;

        /*
         * Clear previous allocation
         */
        if (cb->cb_nb)
                free_cirbuf(cb);

        bzero(cb, sizeof (*cb));
        rw_init(&cb->cb_rwlock, NULL, RW_DRIVER, NULL);

        rw_enter(&cb->cb_rwlock, RW_WRITER);

        /*
         * preallocate 3 bp's and put them on the free list.
         */
        for (i = 0; i < 3; ++i) {
                bp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
                sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
                sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
                bp->b_offset = -1;
                bp->b_forw = cb->cb_free;
                cb->cb_free = bp;
        }

        cb->cb_va = kmem_alloc(bufsize, KM_SLEEP);
        cb->cb_nb = bufsize;

        /*
         * first bp claims entire write buffer
         */
        bp = cb->cb_free;
        cb->cb_free = bp->b_forw;

        bp->b_forw = bp;
        bp->b_back = bp;
        cb->cb_bp = bp;
        bp->b_un.b_addr = cb->cb_va;
        bp->b_bufsize = cb->cb_nb;

        rw_exit(&cb->cb_rwlock);
}
void
alloc_rdbuf(cirbuf_t *cb, size_t bufsize, size_t blksize)
{
        caddr_t va;
        size_t  nb;
        buf_t   *bp;

        /*
         * Clear previous allocation
         */
        if (cb->cb_nb)
                free_cirbuf(cb);

        bzero(cb, sizeof (*cb));
        rw_init(&cb->cb_rwlock, NULL, RW_DRIVER, NULL);

        rw_enter(&cb->cb_rwlock, RW_WRITER);

        cb->cb_va = kmem_alloc(bufsize, KM_SLEEP);
        cb->cb_nb = bufsize;

        /*
         * preallocate N bufs that are hard-sized to blksize
         *      in other words, the read buffer pool is a linked list
         *      of statically sized bufs.
         */
        va = cb->cb_va;
        while ((nb = bufsize) != 0) {
                if (nb > blksize)
                        nb = blksize;
                bp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
                sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
                sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
                bp->b_un.b_addr = va;
                bp->b_bufsize = nb;
                if (cb->cb_bp) {
                        bp->b_forw = cb->cb_bp->b_forw;
                        bp->b_back = cb->cb_bp;
                        cb->cb_bp->b_forw->b_back = bp;
                        cb->cb_bp->b_forw = bp;
                } else
                        bp->b_forw = bp->b_back = bp;
                cb->cb_bp = bp;
                bufsize -= nb;
                va += nb;
        }

        rw_exit(&cb->cb_rwlock);
}
void
free_cirbuf(cirbuf_t *cb)
{
        buf_t   *bp;

        if (cb->cb_nb == 0)
                return;

        rw_enter(&cb->cb_rwlock, RW_WRITER);
        ASSERT(cb->cb_dirty == NULL);

        /*
         * free the active bufs
         */
        while ((bp = cb->cb_bp) != NULL) {
                if (bp == bp->b_forw)
                        cb->cb_bp = NULL;
                else
                        cb->cb_bp = bp->b_forw;
                bp->b_back->b_forw = bp->b_forw;
                bp->b_forw->b_back = bp->b_back;
                sema_destroy(&bp->b_sem);
                sema_destroy(&bp->b_io);
                kmem_free(bp, sizeof (buf_t));
        }

        /*
         * free the free bufs
         */
        while ((bp = cb->cb_free) != NULL) {
                cb->cb_free = bp->b_forw;
                sema_destroy(&bp->b_sem);
                sema_destroy(&bp->b_io);
                kmem_free(bp, sizeof (buf_t));
        }
        kmem_free(cb->cb_va, cb->cb_nb);
        cb->cb_va = NULL;
        cb->cb_nb = 0;
        rw_exit(&cb->cb_rwlock);
        rw_destroy(&cb->cb_rwlock);
}
static int
within_range(off_t lof, daddr_t blkno, ulong_t bcount)
{
        off_t   blof    = dbtob(blkno);

        return ((lof >= blof) && (lof < (blof + bcount)));
}
static buf_t *
find_bp(ml_unit_t *ul, cirbuf_t *cb, off_t lof)
{
        buf_t   *bp;

        /*
         * find a buf that contains the offset lof
         */
        rw_enter(&cb->cb_rwlock, RW_READER);
        bp = cb->cb_bp;
        do {
                if (bp->b_bcount &&
                    within_range(lof, bp->b_blkno, bp->b_bcount)) {
                        makebusy(ul, bp);
                        rw_exit(&cb->cb_rwlock);
                        return (bp);
                }
                bp = bp->b_forw;
        } while (bp != cb->cb_bp);
        rw_exit(&cb->cb_rwlock);

        return (NULL);
}
static off_t
find_read_lof(ml_unit_t *ul, cirbuf_t *cb, off_t lof)
{
        buf_t   *bp, *bpend;
        off_t   rlof;

        /*
         * we mustn't:
         *      o read past the tail
         *      o read data that may be being written.
         */
        rw_enter(&cb->cb_rwlock, RW_READER);
        bpend = bp = cb->cb_bp->b_forw;
        rlof = ul->un_tail_lof;
        do {
                if (bp->b_bcount) {
                        rlof = dbtob(bp->b_blkno);
                        break;
                }
                bp = bp->b_forw;
        } while (bp != bpend);
        rw_exit(&cb->cb_rwlock);

        if (lof <= rlof)
                /* lof is prior to the range represented by the write buf */
                return (rlof);
        else
                /* lof follows the range represented by the write buf */
                return ((off_t)ul->un_eol_lof);
}
static buf_t *
get_read_bp(ml_unit_t *ul, off_t lof)
{
        cirbuf_t        *cb;
        buf_t           *bp;
        off_t           rlof;

        /*
         * retrieve as much data as possible from the incore buffers
         */
        if ((bp = find_bp(ul, &ul->un_wrbuf, lof)) != NULL) {
                logstats.ls_lreadsinmem.value.ui64++;
                return (bp);
        }
        if ((bp = find_bp(ul, &ul->un_rdbuf, lof)) != NULL) {
                logstats.ls_lreadsinmem.value.ui64++;
                return (bp);
        }

        /*
         * steal the LRU buf
         */
        cb = &ul->un_rdbuf;
        rw_enter(&cb->cb_rwlock, RW_WRITER);
        bp = cb->cb_bp->b_forw;
        makebusy(ul, bp);
        bp->b_flags = 0;
        bp->b_bcount = 0;
        cb->cb_bp = bp;
        rw_exit(&cb->cb_rwlock);

        /*
         * don't read past the tail or the end-of-log
         */
        bp->b_blkno = btodb(lof);
        lof = dbtob(bp->b_blkno);
        rlof = find_read_lof(ul, &ul->un_wrbuf, lof);
        bp->b_bcount = MIN(bp->b_bufsize, rlof - lof);
        readlog(ul, bp);
        return (bp);
}
/*
 * NOTE: writers are single threaded thru the log layer.
 * This means we can safely reference and change the cb and bp fields
 * that ldl_read does not reference w/o holding the cb_rwlock or
 * the bp makebusy lock.
 */
static int
extend_write_bp(ml_unit_t *ul, cirbuf_t *cb, buf_t *bp)
{
        buf_t   *bpforw = bp->b_forw;

        ASSERT(bp == cb->cb_bp && bp == cb->cb_dirty);

        /*
         * there is no `next' bp; do nothing
         */
        if (bpforw == bp)
                return (0);

        /*
         * buffer space is not adjacent; do nothing
         */
        if ((bp->b_un.b_addr + bp->b_bufsize) != bpforw->b_un.b_addr)
                return (0);

        /*
         * locking protocol requires giving up any bp locks before
         * acquiring cb_rwlock.  This is okay because we hold
         * un_log_mutex.
         */
        sema_v(&bp->b_sem);

        /*
         * lock out ldl_read
         */
        rw_enter(&cb->cb_rwlock, RW_WRITER);

        /*
         * wait for current IO to finish w/next bp; if necessary
         */
        makebusy(ul, bpforw);

        /*
         * free the next bp and steal its space
         */
        bp->b_forw = bpforw->b_forw;
        bpforw->b_forw->b_back = bp;
        bp->b_bufsize += bpforw->b_bufsize;
        sema_v(&bpforw->b_sem);
        bpforw->b_forw = cb->cb_free;
        cb->cb_free = bpforw;

        makebusy(ul, bp);
        rw_exit(&cb->cb_rwlock);

        return (1);
}
static size_t
storebuf(ml_unit_t *ul, buf_t *bp, caddr_t va, size_t nb)
{
        size_t          copy_nb;
        size_t          nb_in_sec;
        sect_trailer_t  *st;
        size_t          nb_left = nb;
        cirbuf_t        *cb     = &ul->un_wrbuf;

        while (nb_left) {
                nb_in_sec = NB_LEFT_IN_SECTOR(bp->b_bcount);
                copy_nb = MIN(nb_left, nb_in_sec);

                bcopy(va, bp->b_un.b_addr + bp->b_bcount, copy_nb);
                bp->b_bcount += copy_nb;
                va += copy_nb;
                nb_left -= copy_nb;
                ul->un_tail_lof += copy_nb;

                if ((nb_in_sec -= copy_nb) == 0) {
                        st = (sect_trailer_t *)(bp->b_un.b_addr + bp->b_bcount);

                        st->st_tid = ul->un_logmap->mtm_tid;
                        st->st_ident = ul->un_tail_ident++;
                        bp->b_bcount += sizeof (sect_trailer_t);
                        ul->un_tail_lof += sizeof (sect_trailer_t);
                        /*
                         * log wrapped; async write this bp
                         */
                        if (ul->un_tail_lof == ul->un_eol_lof) {
                                ul->un_tail_lof = ul->un_bol_lof;
                                push_dirty_bp(ul, bp);
                                return (nb - nb_left);
                        }
                        /*
                         * out of bp space; get more or async write buf
                         */
                        if (bp->b_bcount == bp->b_bufsize) {
                                if (!extend_write_bp(ul, cb, bp)) {
                                        push_dirty_bp(ul, bp);
                                        return (nb - nb_left);
                                }
                        }
                }
        }
        return (nb);
}
static void
fetchzeroes(caddr_t dst_va, offset_t dst_mof, ulong_t dst_nb, mapentry_t *me)
{
        offset_t        src_mof = me->me_mof;
        size_t          src_nb  = me->me_nb;

        /*
         * adjust dst_va/dst_nb so they describe only the part of the
         * caller's buffer that this all-zero delta overlaps
         */
        if (src_mof > dst_mof) {
                ASSERT(src_mof < (dst_mof + dst_nb));
                dst_va += (src_mof - dst_mof);
                dst_nb -= (src_mof - dst_mof);
        } else {
                ASSERT(dst_mof < (src_mof + src_nb));
                src_nb -= (dst_mof - src_mof);
        }

        src_nb = MIN(src_nb, dst_nb);
        bzero(dst_va, src_nb);
}
/*
 * dst_va == NULL means don't copy anything
 */
static ulong_t
fetchbuf(
        ml_unit_t *ul,
        buf_t *bp,
        caddr_t dst_va,
        size_t dst_nb,
        off_t *dst_lofp)
{
        caddr_t copy_va;
        size_t  copy_nb;
        size_t  nb_sec;
        off_t   dst_lof         = *dst_lofp;
        ulong_t sav_dst_nb      = dst_nb;
        ulong_t src_nb          = bp->b_bcount;
        off_t   src_lof         = dbtob(bp->b_blkno);
        off_t   src_elof        = src_lof + src_nb;
        caddr_t src_va          = bp->b_un.b_addr;

        /*
         * copy from bp to dst_va
         */
        while (dst_nb) {
                /*
                 * compute address within bp
                 */
                copy_va = src_va + (dst_lof - src_lof);

                /*
                 * adjust copy size to amount of data in bp
                 */
                copy_nb = MIN(dst_nb, src_elof - dst_lof);

                /*
                 * adjust copy size to amount of data in sector
                 */
                nb_sec = NB_LEFT_IN_SECTOR(dst_lof);
                copy_nb = MIN(copy_nb, nb_sec);

                /*
                 * dst_va == NULL means don't do copy (see logseek())
                 */
                if (dst_va) {
                        bcopy(copy_va, dst_va, copy_nb);
                        dst_va += copy_nb;
                }
                dst_lof += copy_nb;
                dst_nb -= copy_nb;
                nb_sec -= copy_nb;

                /*
                 * advance over sector trailer
                 */
                if (nb_sec == 0)
                        dst_lof += sizeof (sect_trailer_t);

                /*
                 * exhausted this bp; return current lof for next read
                 */
                if (dst_lof == src_elof) {
                        sema_v(&bp->b_sem);
                        if (dst_lof == ul->un_eol_lof)
                                dst_lof = ul->un_bol_lof;
                        *dst_lofp = dst_lof;
                        return (sav_dst_nb - dst_nb);
                }
        }

        /*
         * copy complete - return current lof
         */
        sema_v(&bp->b_sem);
        *dst_lofp = dst_lof;
        return (sav_dst_nb);
}
void
ldl_round_commit(ml_unit_t *ul)
{
        int             wrapped = 0;
        buf_t           *bp;
        sect_trailer_t  *st;
        size_t          bcount;
        cirbuf_t        *cb     = &ul->un_wrbuf;

        /*
         * if nothing to write; then do nothing
         */
        if ((bp = cb->cb_dirty) == NULL)
                return;
        makebusy(ul, bp);

        /*
         * round up to sector boundary and set new tail
         *      don't readjust st_ident if buf is already rounded
         */
        bcount = P2ROUNDUP(bp->b_bcount, DEV_BSIZE);
        if (bcount == bp->b_bcount) {
                sema_v(&bp->b_sem);
                return;
        }
        bp->b_bcount = bcount;
        ul->un_tail_lof = dbtob(bp->b_blkno) + bcount;

        if (ul->un_tail_lof == ul->un_eol_lof) {
                ul->un_tail_lof = ul->un_bol_lof;
                ++wrapped;
        }
        ASSERT(ul->un_tail_lof != ul->un_head_lof);

        /*
         * fix up the sector trailer
         */
        st = (sect_trailer_t *)
            ((bp->b_un.b_addr + bcount) - sizeof (*st));
        st->st_tid = ul->un_logmap->mtm_tid;
        st->st_ident = ul->un_tail_ident++;

        /*
         * if tail wrapped or we have exhausted this buffer
         *      async write the buffer
         */
        if (wrapped || bcount == bp->b_bufsize)
                push_dirty_bp(ul, bp);
        else
                sema_v(&bp->b_sem);
}
void
ldl_push_commit(ml_unit_t *ul)
{
        buf_t           *bp;
        cirbuf_t        *cb     = &ul->un_wrbuf;

        /*
         * if nothing to write; then do nothing
         */
        if ((bp = cb->cb_dirty) == NULL)
                return;
        makebusy(ul, bp);
        push_dirty_bp(ul, bp);
}
int
ldl_need_commit(ml_unit_t *ul)
{
        return (ul->un_resv > (ul->un_maxresv - (ul->un_maxresv >> 2)));
}
int
ldl_has_space(ml_unit_t *ul, mapentry_t *me)
{
        off_t   nfb;
        off_t   nb;

        ASSERT(MUTEX_HELD(&ul->un_log_mutex));

        /*
         * Add up the size used by the deltas
         * round nb up to a sector length plus an extra sector
         *      w/o the extra sector we couldn't distinguish
         *      a full log (head == tail) from an empty log (head == tail)
         */
        for (nb = DEV_BSIZE; me; me = me->me_hash) {
                nb += sizeof (struct delta);
                if (me->me_dt != DT_CANCEL)
                        nb += me->me_nb;
        }
        nb = P2ROUNDUP(nb, DEV_BSIZE);

        if (ul->un_head_lof <= ul->un_tail_lof)
                nfb = (ul->un_head_lof - ul->un_bol_lof) +
                    (ul->un_eol_lof - ul->un_tail_lof);
        else
                nfb = ul->un_head_lof - ul->un_tail_lof;

        return (nb < nfb);
}
void
ldl_write(ml_unit_t *ul, caddr_t bufp, offset_t bufmof, struct mapentry *me)
{
        buf_t           *bp;
        caddr_t         va;
        size_t          nb;
        size_t          actual;

        ASSERT(MUTEX_HELD(&ul->un_log_mutex));

        /* Write the delta */

        nb = sizeof (struct delta);
        va = (caddr_t)&me->me_delta;
        bp = get_write_bp(ul);

        while (nb) {
                if (ul->un_flags & LDL_ERROR) {
                        sema_v(&bp->b_sem);
                        return;
                }
                actual = storebuf(ul, bp, va, nb);
                va += actual;
                nb -= actual;
                if (nb)
                        bp = get_write_bp(ul);
        }

        /* If a commit, cancel, or 0's; we're almost done */
        switch (me->me_dt) {
        case DT_COMMIT:
        case DT_CANCEL:
        case DT_ABZERO:
                /* roll needs to know where the next delta will go */
                me->me_lof = ul->un_tail_lof;
                return;
        default:
                break;
        }

        /* Now write the data */

        ASSERT(me->me_nb != 0);

        nb = me->me_nb;
        va = (me->me_mof - bufmof) + bufp;
        bp = get_write_bp(ul);

        /* Save where we will put the data */
        me->me_lof = ul->un_tail_lof;

        while (nb) {
                if (ul->un_flags & LDL_ERROR) {
                        sema_v(&bp->b_sem);
                        return;
                }
                actual = storebuf(ul, bp, va, nb);
                va += actual;
                nb -= actual;
                if (nb)
                        bp = get_write_bp(ul);
        }
}
void
ldl_waito(ml_unit_t *ul)
{
        buf_t           *bp;
        cirbuf_t        *cb     = &ul->un_wrbuf;

        rw_enter(&cb->cb_rwlock, RW_WRITER);
        /*
         * wait on the bufs that have outstanding writes
         */
        bp = cb->cb_bp;
        do {
                if ((bp->b_flags & B_DONE) == 0) {
                        makebusy(ul, bp);
                        sema_v(&bp->b_sem);
                }
                bp = bp->b_forw;
        } while (bp != cb->cb_bp);
        rw_exit(&cb->cb_rwlock);
}
/*
 * seek nb bytes from location lof
 */
static int
logseek(ml_unit_t *ul, off_t lof, size_t nb, off_t *lofp)
{
        buf_t   *bp;
        ulong_t actual;

        while (nb) {
                bp = get_read_bp(ul, lof);
                if (bp->b_flags & B_ERROR) {
                        sema_v(&bp->b_sem);
                        return (EIO);
                }
                actual = fetchbuf(ul, bp, NULL, nb, &lof);
                nb -= actual;
        }
        *lofp = lof;
        return (0);
}
int
ldl_read(
        ml_unit_t *ul,          /* Log unit */
        caddr_t va,             /* address of buffer to read into */
        offset_t mof,           /* mof of buffer */
        off_t nb,               /* length of buffer */
        mapentry_t *me)         /* Map entry list */
{
        buf_t   *bp;
        crb_t   *crb;
        caddr_t rva;            /* address to read into */
        size_t  rnb;            /* # of bytes to read */
        off_t   lof;            /* log device offset to read from */
        off_t   skip;
        ulong_t actual;
        int     error;
        caddr_t eva     = va + nb;      /* end of buffer */

        for (; me; me = me->me_agenext) {
                ASSERT(me->me_dt != DT_CANCEL);

                /*
                 * check for a cached roll buffer
                 */
                crb = me->me_crb;
                if (crb) {
                        if (mof > crb->c_mof) {
                                /*
                                 * This mapentry overlaps with the beginning of
                                 * the supplied buffer
                                 */
                                skip = mof - crb->c_mof;
                                bcopy(crb->c_buf + skip, va,
                                    MIN(nb, crb->c_nb - skip));
                        } else {
                                /*
                                 * This mapentry starts at or after
                                 * the supplied buffer.
                                 */
                                skip = crb->c_mof - mof;
                                bcopy(crb->c_buf, va + skip,
                                    MIN(crb->c_nb, nb - skip));
                        }
                        logstats.ls_lreadsinmem.value.ui64++;
                        continue;
                }

                /*
                 * check for a delta full of zeroes - there's no log data
                 */
                if (me->me_dt == DT_ABZERO) {
                        fetchzeroes(va, mof, nb, me);
                        continue;
                }

                if (mof > me->me_mof) {
                        rnb = (size_t)(mof - me->me_mof);
                        error = logseek(ul, me->me_lof, rnb, &lof);
                        if (error)
                                return (EIO);
                        rva = va;
                        rnb = me->me_nb - rnb;
                        rnb = ((rva + rnb) > eva) ? eva - rva : rnb;
                } else {
                        lof = me->me_lof;
                        rva = (me->me_mof - mof) + va;
                        rnb = ((rva + me->me_nb) > eva) ? eva - rva : me->me_nb;
                }

                while (rnb) {
                        bp = get_read_bp(ul, lof);
                        if (bp->b_flags & B_ERROR) {
                                sema_v(&bp->b_sem);
                                return (EIO);
                        }
                        ASSERT(((me->me_flags & ME_ROLL) == 0) ||
                            (bp != ul->un_wrbuf.cb_dirty));
                        actual = fetchbuf(ul, bp, rva, rnb, &lof);
                        rva += actual;
                        rnb -= actual;
                }
        }
        return (0);
}
void
ldl_savestate(ml_unit_t *ul)
{
        int             error;
        buf_t           *bp     = ul->un_bp;
        ml_odunit_t     *ud     = (void *)bp->b_un.b_addr;
        ml_odunit_t     *ud2    = (void *)(bp->b_un.b_addr + DEV_BSIZE);

        /*
         * Scan test is running; don't update intermediate state
         */
        if (ul->un_logmap && ul->un_logmap->mtm_trimlof)
                return;

        mutex_enter(&ul->un_state_mutex);
        bcopy(&ul->un_ondisk, ud, sizeof (*ud));
        ud->od_chksum = ud->od_head_ident + ud->od_tail_ident;
        bcopy(ud, ud2, sizeof (*ud));

        /* If a snapshot is enabled write through the snapshot driver. */
        if (ul->un_ufsvfs->vfs_snapshot)
                UFS_BWRITE2(ul->un_ufsvfs, bp);
        else
                UFS_BWRITE2(NULL, bp);
        logstats.ls_ldlwrites.value.ui64++;
        error = bp->b_flags & B_ERROR;
        mutex_exit(&ul->un_state_mutex);
        if (error)
                ldl_seterror(ul, "Error writing ufs log state");
}
/*
 * The head will be set to (new_lof - header) since ldl_sethead is
 * called with the new_lof of the data portion of a delta.
 */
void
ldl_sethead(ml_unit_t *ul, off_t data_lof, uint32_t tid)
{
        off_t           nb;
        off_t           new_lof;
        uint32_t        new_ident;
        daddr_t         beg_blkno;
        daddr_t         end_blkno;

        ASSERT(MUTEX_HELD(&ul->un_log_mutex));

        if (data_lof == -1) {
                /* log is empty */
                new_ident = lufs_hd_genid(ul);
                new_lof = ul->un_tail_lof;

        } else {
                /* compute header's lof */
                new_ident = ul->un_head_ident;
                new_lof = data_lof - sizeof (struct delta);

                /* whoops, header spans sectors; subtract out sector trailer */
                if (btodb(new_lof) != btodb(data_lof))
                        new_lof -= sizeof (sect_trailer_t);

                /* whoops, header wrapped the log; go to last sector */
                if (new_lof < ul->un_bol_lof) {
                        /* byte offset within the sector */
                        new_lof -= dbtob(btodb(new_lof));
                        /* add to last sector's lof */
                        new_lof += (ul->un_eol_lof - DEV_BSIZE);
                }
                ul->un_head_tid = tid;
        }

        /*
         * check for nop
         */
        if (new_lof == ul->un_head_lof)
                return;

        /*
         * invalidate the affected bufs and calculate new ident
         */
        if (new_lof > ul->un_head_lof) {
                nb = new_lof - ul->un_head_lof;
                inval_range(ul, &ul->un_wrbuf, ul->un_head_lof, nb);
                inval_range(ul, &ul->un_rdbuf, ul->un_head_lof, nb);

                end_blkno = btodb(new_lof);
                beg_blkno = btodb(ul->un_head_lof);
                new_ident += (end_blkno - beg_blkno);
        } else {
                nb = ul->un_eol_lof - ul->un_head_lof;
                inval_range(ul, &ul->un_wrbuf, ul->un_head_lof, nb);
                inval_range(ul, &ul->un_rdbuf, ul->un_head_lof, nb);

                end_blkno = btodb(ul->un_eol_lof);
                beg_blkno = btodb(ul->un_head_lof);
                new_ident += (end_blkno - beg_blkno);

                nb = new_lof - ul->un_bol_lof;
                inval_range(ul, &ul->un_wrbuf, ul->un_bol_lof, nb);
                inval_range(ul, &ul->un_rdbuf, ul->un_bol_lof, nb);

                end_blkno = btodb(new_lof);
                beg_blkno = btodb(ul->un_bol_lof);
                new_ident += (end_blkno - beg_blkno);
        }
        /*
         * don't update the head if there has been an error
         */
        if (ul->un_flags & LDL_ERROR)
                return;

        /* Fix up the head and ident */
        ASSERT(new_lof >= ul->un_bol_lof);
        ul->un_head_lof = new_lof;
        ul->un_head_ident = new_ident;
        if (data_lof == -1) {
                ul->un_tail_ident = ul->un_head_ident;
        }

        /* Commit to the database */
        ldl_savestate(ul);

        ASSERT(((ul->un_logmap->mtm_debug & MT_SCAN) == 0) ||
            ldl_sethead_debug(ul));
}
/*
 * The tail will be set to the sector following lof+nb
 *      lof + nb == size of the last delta + commit record
 *      this function is called once after the log scan has completed.
 */
void
ldl_settail(ml_unit_t *ul, off_t lof, size_t nb)
{
        off_t           new_lof;
        uint32_t        new_ident;
        daddr_t         beg_blkno;
        daddr_t         end_blkno;

        ASSERT(MUTEX_HELD(&ul->un_log_mutex));

        if (lof == -1) {
                ul->un_tail_lof = dbtob(btodb(ul->un_head_lof));
                ul->un_head_lof = ul->un_tail_lof;
                ul->un_head_ident = lufs_hd_genid(ul);
                ul->un_tail_ident = ul->un_head_ident;

                /* Commit to the database */
                ldl_savestate(ul);

                return;
        }

        /*
         * new_lof is the offset of the sector following the last commit
         */
        (void) logseek(ul, lof, nb, &new_lof);
        ASSERT(new_lof != dbtob(btodb(ul->un_head_lof)));

        /*
         * calculate new ident
         */
        if (new_lof > ul->un_head_lof) {
                end_blkno = btodb(new_lof);
                beg_blkno = btodb(ul->un_head_lof);
                new_ident = ul->un_head_ident + (end_blkno - beg_blkno);
        } else {
                end_blkno = btodb(ul->un_eol_lof);
                beg_blkno = btodb(ul->un_head_lof);
                new_ident = ul->un_head_ident + (end_blkno - beg_blkno);

                end_blkno = btodb(new_lof);
                beg_blkno = btodb(ul->un_bol_lof);
                new_ident += (end_blkno - beg_blkno);
        }

        /* Fix up the tail and ident */
        ul->un_tail_lof = new_lof;
        ul->un_tail_ident = new_ident;

        /* Commit to the database */
        ldl_savestate(ul);
}
static int
ldl_logscan_ident(ml_unit_t *ul, buf_t *bp, off_t lof)
{
        uint32_t        ident;
        size_t          nblk, i;
        sect_trailer_t  *st;

        /*
         * compute ident for first sector in the buffer
         */
        ident = ul->un_head_ident;
        if (bp->b_blkno >= btodb(ul->un_head_lof)) {
                ident += (bp->b_blkno - btodb(ul->un_head_lof));
        } else {
                ident += (btodb(ul->un_eol_lof) - btodb(ul->un_head_lof));
                ident += (bp->b_blkno - btodb(ul->un_bol_lof));
        }
        /*
         * truncate the buffer down to the last valid sector
         */
        nblk = btodb(bp->b_bcount);
        bp->b_bcount = 0;
        st = (sect_trailer_t *)(bp->b_un.b_addr + LDL_USABLE_BSIZE);
        for (i = 0; i < nblk; ++i) {
                if (st->st_ident != ident)
                        break;

                /* remember last valid tid for ldl_logscan_error() */
                ul->un_tid = st->st_tid;

                st = (sect_trailer_t *)(((caddr_t)st) + DEV_BSIZE);
                ++ident;
                bp->b_bcount += DEV_BSIZE;
        }
        /*
         * make sure that lof is still within range
         */
        return (within_range(lof, bp->b_blkno, bp->b_bcount));
}
ulong_t
ldl_logscan_nbcommit(off_t lof)
{
        /*
         * lof is the offset following the commit header.  However,
         * if the commit header fell on the end-of-sector, then lof
         * has already been advanced to the beginning of the next
         * sector.  So do nothing.  Otherwise, return the remaining
         * bytes in the sector.
         */
        if ((lof & (DEV_BSIZE - 1)) == 0)
                return (0);
        return (NB_LEFT_IN_SECTOR(lof));
}
int
ldl_logscan_read(ml_unit_t *ul, off_t *lofp, size_t nb, caddr_t va)
{
        buf_t   *bp;
        ulong_t actual;

        ASSERT(ul->un_head_lof != ul->un_tail_lof);

        /*
         * Check the log data doesn't go out of bounds
         */
        if (ul->un_head_lof < ul->un_tail_lof) {
                if (!WITHIN(*lofp, nb, ul->un_head_lof,
                    (ul->un_tail_lof - ul->un_head_lof))) {
                        return (EIO);
                }
        } else {
                if (OVERLAP(*lofp, nb, ul->un_tail_lof,
                    (ul->un_head_lof - ul->un_tail_lof))) {
                        return (EIO);
                }
        }

        while (nb) {
                bp = get_read_bp(ul, *lofp);
                if (bp->b_flags & B_ERROR) {
                        sema_v(&bp->b_sem);
                        return (EIO);
                }
                /*
                 * out-of-seq idents means partial transaction
                 *      panic, non-corrupting powerfail, ...
                 */
                if (!ldl_logscan_ident(ul, bp, *lofp)) {
                        sema_v(&bp->b_sem);
                        return (EIO);
                }
                /*
                 * copy the header into the caller's buf
                 */
                actual = fetchbuf(ul, bp, va, nb, lofp);
                if (va)
                        va += actual;
                nb -= actual;
        }
        return (0);
}
void
ldl_logscan_begin(ml_unit_t *ul)
{
        size_t  bufsize;

        ASSERT(ul->un_wrbuf.cb_dirty == NULL);

        /*
         * logscan has begun
         */
        ul->un_flags |= LDL_SCAN;

        /*
         * reset the circular bufs
         */
        bufsize = ldl_bufsize(ul);
        alloc_rdbuf(&ul->un_rdbuf, bufsize, bufsize);
        alloc_wrbuf(&ul->un_wrbuf, bufsize);

        /*
         * set the tail to reflect a full log
         */
        ul->un_tail_lof = dbtob(btodb(ul->un_head_lof)) - DEV_BSIZE;

        if (ul->un_tail_lof < ul->un_bol_lof)
                ul->un_tail_lof = ul->un_eol_lof - DEV_BSIZE;
        if (ul->un_tail_lof >= ul->un_eol_lof)
                ul->un_tail_lof = ul->un_bol_lof;

        /*
         * un_tid is used during error processing; it is initialized to
         * the tid of the delta at un_head_lof;
         */
        ul->un_tid = ul->un_head_tid;
}
void
ldl_logscan_end(ml_unit_t *ul)
{
        size_t  bufsize;

        /*
         * reset the circular bufs
         */
        bufsize = ldl_bufsize(ul);
        alloc_rdbuf(&ul->un_rdbuf, MAPBLOCKSIZE, MAPBLOCKSIZE);
        alloc_wrbuf(&ul->un_wrbuf, bufsize);

        /*
         * done with logscan
         */
        ul->un_flags &= ~LDL_SCAN;
}
int
ldl_need_roll(ml_unit_t *ul)
{
        off_t   busybytes;
        off_t   head;
        off_t   tail;
        off_t   bol;
        off_t   eol;
        off_t   nb;

        /*
         * snapshot the log state
         */
        head = ul->un_head_lof;
        tail = ul->un_tail_lof;
        bol = ul->un_bol_lof;
        eol = ul->un_eol_lof;
        nb = ul->un_logsize;

        /*
         * compute number of busy (inuse) bytes
         */
        if (head <= tail)
                busybytes = tail - head;
        else
                busybytes = (eol - head) + (tail - bol);

        /*
         * return TRUE if > 75% full
         */
        return (busybytes > (nb - (nb >> 2)));
}
void
ldl_seterror(ml_unit_t *ul, char *why)
{
        /*
         * already in error state; do nothing
         */
        if (ul->un_flags & LDL_ERROR)
                return;

        ul->un_flags |= LDL_ERROR;      /* incore */
        ul->un_badlog = 1;              /* ondisk (cleared by fsck) */

        /*
         * Commit to state sectors
         */
        uniqtime(&ul->un_timestamp);
        ldl_savestate(ul);

        cmn_err(CE_WARN, "%s", why);
        cmn_err(CE_WARN, "ufs log for %s changed state to Error",
            ul->un_ufsvfs->vfs_fs->fs_fsmnt);
        cmn_err(CE_WARN, "Please umount(1M) %s and run fsck(1M)",
            ul->un_ufsvfs->vfs_fs->fs_fsmnt);

        /*
         * If we aren't in the middle of scan (aka snarf); tell ufs
         * to hard lock itself.
         */
        if ((ul->un_flags & LDL_SCAN) == 0)
                ufs_trans_onerror();
}
size_t
ldl_bufsize(ml_unit_t *ul)
{
        size_t          bufsize;
        extern uint32_t ldl_minbufsize;

        /*
         * initial guess is the maxtransfer value for this log device
         *      increase if too small
         *      decrease if too large
         */
        bufsize = dbtob(btod(ul->un_maxtransfer));
        if (bufsize < ldl_minbufsize)
                bufsize = ldl_minbufsize;
        if (bufsize > maxphys)
                bufsize = maxphys;
        if (bufsize > ul->un_maxtransfer)
                bufsize = ul->un_maxtransfer;
        return (bufsize);
}