4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #include <sys/systm.h>
27 #include <sys/types.h>
28 #include <sys/vnode.h>
30 #include <sys/errno.h>
31 #include <sys/fssnap_if.h>
32 #include <sys/fs/ufs_inode.h>
33 #include <sys/fs/ufs_filio.h>
34 #include <sys/sysmacros.h>
35 #include <sys/modctl.h>
36 #include <sys/fs/ufs_log.h>
37 #include <sys/fs/ufs_bio.h>
38 #include <sys/fs/ufs_fsdir.h>
39 #include <sys/debug.h>
40 #include <sys/atomic.h>
42 #include <sys/inttypes.h>
44 #include <sys/mntent.h>
46 #include <sys/param.h>
47 #include <sys/kstat.h>
48 #include <sys/cmn_err.h>
51 #define LUFS_GENID_PRIME UINT64_C(4294967291)
52 #define LUFS_GENID_BASE UINT64_C(311)
53 #define LUFS_NEXT_ID(id) ((uint32_t)(((id) * LUFS_GENID_BASE) % \
56 extern kmutex_t ufs_scan_lock
;
58 static kmutex_t log_mutex
; /* general purpose log layer lock */
59 kmutex_t ml_scan
; /* Scan thread synchronization */
60 kcondvar_t ml_scan_cv
; /* Scan thread synchronization */
62 struct kmem_cache
*lufs_sv
;
63 struct kmem_cache
*lufs_bp
;
66 uint_t ldl_maxlogsize
= LDL_MAXLOGSIZE
;
67 uint_t ldl_minlogsize
= LDL_MINLOGSIZE
;
68 uint_t ldl_softlogcap
= LDL_SOFTLOGCAP
;
69 uint32_t ldl_divisor
= LDL_DIVISOR
;
70 uint32_t ldl_mintransfer
= LDL_MINTRANSFER
;
71 uint32_t ldl_maxtransfer
= LDL_MAXTRANSFER
;
72 uint32_t ldl_minbufsize
= LDL_MINBUFSIZE
;
73 uint32_t ldl_cgsizereq
= 0;
75 /* Generation of header ids */
76 static kmutex_t genid_mutex
;
77 static uint32_t last_loghead_ident
= UINT32_C(0);
80 * Logging delta and roll statistics
83 kstat_named_t ds_superblock_deltas
;
84 kstat_named_t ds_bitmap_deltas
;
85 kstat_named_t ds_suminfo_deltas
;
86 kstat_named_t ds_allocblk_deltas
;
87 kstat_named_t ds_ab0_deltas
;
88 kstat_named_t ds_dir_deltas
;
89 kstat_named_t ds_inode_deltas
;
90 kstat_named_t ds_fbiwrite_deltas
;
91 kstat_named_t ds_quota_deltas
;
92 kstat_named_t ds_shadow_deltas
;
94 kstat_named_t ds_superblock_rolled
;
95 kstat_named_t ds_bitmap_rolled
;
96 kstat_named_t ds_suminfo_rolled
;
97 kstat_named_t ds_allocblk_rolled
;
98 kstat_named_t ds_ab0_rolled
;
99 kstat_named_t ds_dir_rolled
;
100 kstat_named_t ds_inode_rolled
;
101 kstat_named_t ds_fbiwrite_rolled
;
102 kstat_named_t ds_quota_rolled
;
103 kstat_named_t ds_shadow_rolled
;
105 { "superblock_deltas", KSTAT_DATA_UINT64
},
106 { "bitmap_deltas", KSTAT_DATA_UINT64
},
107 { "suminfo_deltas", KSTAT_DATA_UINT64
},
108 { "allocblk_deltas", KSTAT_DATA_UINT64
},
109 { "ab0_deltas", KSTAT_DATA_UINT64
},
110 { "dir_deltas", KSTAT_DATA_UINT64
},
111 { "inode_deltas", KSTAT_DATA_UINT64
},
112 { "fbiwrite_deltas", KSTAT_DATA_UINT64
},
113 { "quota_deltas", KSTAT_DATA_UINT64
},
114 { "shadow_deltas", KSTAT_DATA_UINT64
},
116 { "superblock_rolled", KSTAT_DATA_UINT64
},
117 { "bitmap_rolled", KSTAT_DATA_UINT64
},
118 { "suminfo_rolled", KSTAT_DATA_UINT64
},
119 { "allocblk_rolled", KSTAT_DATA_UINT64
},
120 { "ab0_rolled", KSTAT_DATA_UINT64
},
121 { "dir_rolled", KSTAT_DATA_UINT64
},
122 { "inode_rolled", KSTAT_DATA_UINT64
},
123 { "fbiwrite_rolled", KSTAT_DATA_UINT64
},
124 { "quota_rolled", KSTAT_DATA_UINT64
},
125 { "shadow_rolled", KSTAT_DATA_UINT64
}
128 uint64_t delta_stats
[DT_MAX
];
129 uint64_t roll_stats
[DT_MAX
];
132 * General logging kstats
134 struct logstats logstats
= {
135 { "master_reads", KSTAT_DATA_UINT64
},
136 { "master_writes", KSTAT_DATA_UINT64
},
137 { "log_reads_inmem", KSTAT_DATA_UINT64
},
138 { "log_reads", KSTAT_DATA_UINT64
},
139 { "log_writes", KSTAT_DATA_UINT64
},
140 { "log_master_reads", KSTAT_DATA_UINT64
},
141 { "log_roll_reads", KSTAT_DATA_UINT64
},
142 { "log_roll_writes", KSTAT_DATA_UINT64
}
146 trans_not_done(struct buf
*cb
)
153 trans_wait_panic(struct buf
*cb
)
155 while ((cb
->b_flags
& B_DONE
) == 0)
160 trans_not_wait(struct buf
*cb
)
163 * In case of panic, busy wait for completion
166 trans_wait_panic(cb
);
170 return (geterror(cb
));
174 trans_wait(struct buf
*cb
)
177 * In case of panic, busy wait for completion and run md daemon queues
180 trans_wait_panic(cb
);
181 return (biowait(cb
));
185 setsum(int32_t *sp
, int32_t *lp
, int nb
)
190 nb
/= sizeof (int32_t);
197 checksum(int32_t *sp
, int32_t *lp
, int nb
)
210 lufs_unsnarf(ufsvfs_t
*ufsvfsp
)
215 ul
= ufsvfsp
->vfs_log
;
222 * Wait for a pending top_issue_sync which is
223 * dispatched (via taskq_dispatch()) but hasn't completed yet.
226 mutex_enter(&mtm
->mtm_lock
);
228 while (mtm
->mtm_taskq_sync_count
!= 0) {
229 cv_wait(&mtm
->mtm_cv
, &mtm
->mtm_lock
);
232 mutex_exit(&mtm
->mtm_lock
);
234 /* Roll committed transactions */
237 /* Kill the roll thread */
238 logmap_kill_roll(ul
);
240 /* release saved allocation info */
242 kmem_free(ul
->un_ebp
, ul
->un_nbeb
);
244 /* release circular bufs */
245 free_cirbuf(&ul
->un_rdbuf
);
246 free_cirbuf(&ul
->un_wrbuf
);
250 ul
->un_logmap
= map_put(ul
->un_logmap
);
252 ul
->un_deltamap
= map_put(ul
->un_deltamap
);
254 ul
->un_matamap
= map_put(ul
->un_matamap
);
256 mutex_destroy(&ul
->un_log_mutex
);
257 mutex_destroy(&ul
->un_state_mutex
);
259 /* release state buffer MUST BE LAST!! (contains our ondisk data) */
262 kmem_free(ul
, sizeof (*ul
));
264 ufsvfsp
->vfs_log
= NULL
;
268 lufs_snarf(ufsvfs_t
*ufsvfsp
, struct fs
*fs
, int ronly
)
273 ic_extent_block_t
*nebp
;
275 daddr_t bno
; /* in disk blocks */
278 /* LINTED: warning: logical expression always true: op "||" */
279 ASSERT(sizeof (ml_odunit_t
) < DEV_BSIZE
);
282 * Get the allocation table
283 * During a remount the superblock pointed to by the ufsvfsp
284 * is out of date. Hence the need for the ``new'' superblock
285 * pointer, fs, passed in as a parameter.
287 bp
= UFS_BREAD(ufsvfsp
, ufsvfsp
->vfs_dev
, logbtodb(fs
, fs
->fs_logbno
),
289 if (bp
->b_flags
& B_ERROR
) {
293 ebp
= (void *)bp
->b_un
.b_addr
;
294 if (!checksum(&ebp
->chksum
, (int32_t *)bp
->b_un
.b_addr
,
301 * It is possible to get log blocks with all zeros.
302 * We should also check for nextents to be zero in such case.
304 if (ebp
->type
!= LUFS_EXTENTS
|| ebp
->nextents
== 0) {
309 * Put allocation into memory. This requires conversion between
310 * on the ondisk format of the extent (type extent_t) and the
311 * in-core format of the extent (type ic_extent_t). The
312 * difference is the in-core form of the extent block stores
313 * the physical offset of the extent in disk blocks, which
314 * can require more than a 32-bit field.
316 nb
= (size_t)(sizeof (ic_extent_block_t
) +
317 ((ebp
->nextents
- 1) * sizeof (ic_extent_t
)));
318 nebp
= kmem_alloc(nb
, KM_SLEEP
);
319 nebp
->ic_nextents
= ebp
->nextents
;
320 nebp
->ic_nbytes
= ebp
->nbytes
;
321 nebp
->ic_nextbno
= ebp
->nextbno
;
322 for (i
= 0; i
< ebp
->nextents
; i
++) {
323 nebp
->ic_extents
[i
].ic_lbno
= ebp
->extents
[i
].lbno
;
324 nebp
->ic_extents
[i
].ic_nbno
= ebp
->extents
[i
].nbno
;
325 nebp
->ic_extents
[i
].ic_pbno
=
326 logbtodb(fs
, ebp
->extents
[i
].pbno
);
333 bno
= nebp
->ic_extents
[0].ic_pbno
;
334 bp
= UFS_BREAD(ufsvfsp
, ufsvfsp
->vfs_dev
, bno
, DEV_BSIZE
);
335 if (bp
->b_flags
& B_ERROR
) {
337 bp
= UFS_BREAD(ufsvfsp
, ufsvfsp
->vfs_dev
, bno
+ 1, DEV_BSIZE
);
338 if (bp
->b_flags
& B_ERROR
) {
346 * Put ondisk struct into an anonymous buffer
347 * This buffer will contain the memory for the ml_odunit struct
349 tbp
= ngeteblk(dbtob(LS_SECTORS
));
350 tbp
->b_edev
= bp
->b_edev
;
351 tbp
->b_dev
= bp
->b_dev
;
353 bcopy(bp
->b_un
.b_addr
, tbp
->b_un
.b_addr
, DEV_BSIZE
);
354 bcopy(bp
->b_un
.b_addr
, tbp
->b_un
.b_addr
+ DEV_BSIZE
, DEV_BSIZE
);
355 bp
->b_flags
|= (B_STALE
| B_AGE
);
360 * Verify the log state
362 * read/only mounts w/bad logs are allowed. umount will
363 * eventually roll the bad log until the first IO error.
364 * fsck will then repair the file system.
366 * read/write mounts with bad logs are not allowed.
369 ul
= (ml_unit_t
*)kmem_zalloc(sizeof (*ul
), KM_SLEEP
);
370 bcopy(bp
->b_un
.b_addr
, &ul
->un_ondisk
, sizeof (ml_odunit_t
));
371 if ((ul
->un_chksum
!= ul
->un_head_ident
+ ul
->un_tail_ident
) ||
372 (ul
->un_version
!= LUFS_VERSION_LATEST
) ||
373 (!ronly
&& ul
->un_badlog
)) {
374 kmem_free(ul
, sizeof (*ul
));
380 * Initialize the incore-only fields
383 ul
->un_flags
|= LDL_NOROLL
;
385 ul
->un_ufsvfs
= ufsvfsp
;
386 ul
->un_dev
= ufsvfsp
->vfs_dev
;
389 ul
->un_maxresv
= btodb(ul
->un_logsize
) * LDL_USABLE_BSIZE
;
390 ul
->un_deltamap
= map_get(ul
, deltamaptype
, DELTAMAP_NHASH
);
391 ul
->un_logmap
= map_get(ul
, logmaptype
, LOGMAP_NHASH
);
392 if (ul
->un_debug
& MT_MATAMAP
)
393 ul
->un_matamap
= map_get(ul
, matamaptype
, DELTAMAP_NHASH
);
394 mutex_init(&ul
->un_log_mutex
, NULL
, MUTEX_DEFAULT
, NULL
);
395 mutex_init(&ul
->un_state_mutex
, NULL
, MUTEX_DEFAULT
, NULL
);
398 * Acquire the ufs_scan_lock before linking the mtm data
399 * structure so that we keep ufs_sync() and ufs_update() away
400 * when they execute the ufs_scan_inodes() run while we're in
401 * progress of enabling/disabling logging.
403 mutex_enter(&ufs_scan_lock
);
404 ufsvfsp
->vfs_log
= ul
;
406 /* remember the state of the log before the log scan */
408 mutex_exit(&ufs_scan_lock
);
413 * If this is a read/only mount; ignore the error.
414 * At a later time umount/fsck will repair the fs.
417 if (ul
->un_flags
& LDL_ERROR
) {
420 * Acquire the ufs_scan_lock before de-linking
421 * the mtm data structure so that we keep ufs_sync()
422 * and ufs_update() away when they execute the
423 * ufs_scan_inodes() run while we're in progress of
424 * enabling/disabling logging.
426 mutex_enter(&ufs_scan_lock
);
427 lufs_unsnarf(ufsvfsp
);
428 mutex_exit(&ufs_scan_lock
);
431 ul
->un_flags
&= ~LDL_ERROR
;
434 logmap_start_roll(ul
);
439 lufs_hd_genid(const ml_unit_t
*up
)
443 mutex_enter(&genid_mutex
);
446 * The formula below implements an exponential, modular sequence.
448 * ID(N) = (SEED * (BASE^N)) % PRIME
450 * The numbers will be pseudo random. They depend on SEED, BASE, PRIME,
451 * but will sweep through almost all of the range 1....PRIME-1.
452 * Most importantly they will not repeat for PRIME-2 (4294967289)
453 * repetitions. If they would repeat that could possibly cause hangs,
454 * panics at mount/umount and failed mount operations.
456 id
= LUFS_NEXT_ID(last_loghead_ident
);
458 /* Checking if new identity used already */
459 if (up
!= NULL
&& up
->un_head_ident
== id
) {
460 DTRACE_PROBE1(head_ident_collision
, uint32_t, id
);
463 * The following preserves the algorithm for the fix for
464 * "panic: free: freeing free frag, dev:0x2000000018, blk:34605,
465 * cg:26, ino:148071,".
466 * If the header identities un_head_ident are equal to the
467 * present element in the sequence, the next element of the
468 * sequence is returned instead.
470 id
= LUFS_NEXT_ID(id
);
473 last_loghead_ident
= id
;
475 mutex_exit(&genid_mutex
);
481 lufs_genid_init(void)
486 mutex_init(&genid_mutex
, NULL
, MUTEX_DEFAULT
, NULL
);
488 /* Seed the algorithm */
494 seed
= (tv
.tv_nsec
<< 3);
497 last_loghead_ident
= (uint32_t)(seed
% LUFS_GENID_PRIME
);
498 } while (last_loghead_ident
== UINT32_C(0));
508 ml_odunit_t
*ud
, *ud2
;
511 /* LINTED: warning: logical expression always true: op "||" */
512 ASSERT(sizeof (ml_odunit_t
) < DEV_BSIZE
);
513 ASSERT(nb
>= ldl_minlogsize
);
515 bp
= UFS_GETBLK(ufsvfsp
, ufsvfsp
->vfs_dev
, bno
, dbtob(LS_SECTORS
));
516 bzero(bp
->b_un
.b_addr
, bp
->b_bcount
);
518 ud
= (void *)bp
->b_un
.b_addr
;
519 ud
->od_version
= LUFS_VERSION_LATEST
;
520 ud
->od_maxtransfer
= MIN(ufsvfsp
->vfs_iotransz
, ldl_maxtransfer
);
521 if (ud
->od_maxtransfer
< ldl_mintransfer
)
522 ud
->od_maxtransfer
= ldl_mintransfer
;
523 ud
->od_devbsize
= DEV_BSIZE
;
525 ud
->od_requestsize
= flp
->nbytes_actual
;
526 ud
->od_statesize
= dbtob(LS_SECTORS
);
527 ud
->od_logsize
= nb
- ud
->od_statesize
;
529 ud
->od_statebno
= INT32_C(0);
531 ud
->od_head_ident
= lufs_hd_genid(NULL
);
532 ud
->od_tail_ident
= ud
->od_head_ident
;
533 ud
->od_chksum
= ud
->od_head_ident
+ ud
->od_tail_ident
;
535 ud
->od_bol_lof
= dbtob(ud
->od_statebno
) + ud
->od_statesize
;
536 ud
->od_eol_lof
= ud
->od_bol_lof
+ ud
->od_logsize
;
537 ud
->od_head_lof
= ud
->od_bol_lof
;
538 ud
->od_tail_lof
= ud
->od_bol_lof
;
540 ASSERT(lufs_initialize_debug(ud
));
542 ud2
= (void *)(bp
->b_un
.b_addr
+ DEV_BSIZE
);
543 bcopy(ud
, ud2
, sizeof (*ud
));
545 UFS_BWRITE2(ufsvfsp
, bp
);
546 if (bp
->b_flags
& B_ERROR
) {
557 * Assumes the file system is write locked and is not logging
560 lufs_free(struct ufsvfs
*ufsvfsp
)
566 struct fs
*fs
= ufsvfsp
->vfs_fs
;
576 if (fs
->fs_logbno
== 0)
580 * Mark the file system as FSACTIVE and no log but honor the
581 * current value of fs_reclaim. The reclaim thread could have
582 * been active when lufs_disable() was called and if fs_reclaim
583 * is reset to zero here it could lead to lost inodes.
585 ufsvfsp
->vfs_ulockfs
.ul_sbowner
= curthread
;
586 mutex_enter(&ufsvfsp
->vfs_lock
);
587 clean
= fs
->fs_clean
;
588 logbno
= fs
->fs_logbno
;
589 fs
->fs_clean
= FSACTIVE
;
590 fs
->fs_logbno
= INT32_C(0);
591 ufs_sbwrite(ufsvfsp
);
592 mutex_exit(&ufsvfsp
->vfs_lock
);
593 ufsvfsp
->vfs_ulockfs
.ul_sbowner
= (kthread_id_t
)-1;
594 if (ufsvfsp
->vfs_bufp
->b_flags
& B_ERROR
) {
596 fs
->fs_clean
= clean
;
597 fs
->fs_logbno
= logbno
;
602 * fetch the allocation block
603 * superblock -> one block of extents -> log data
605 bp
= UFS_BREAD(ufsvfsp
, ufsvfsp
->vfs_dev
, logbtodb(fs
, logbno
),
607 if (bp
->b_flags
& B_ERROR
) {
613 * Free up the allocated space (dummy inode needed for free())
615 ip
= ufs_alloc_inode(ufsvfsp
, UFSROOTINO
);
616 ebp
= (void *)bp
->b_un
.b_addr
;
617 for (i
= 0, ep
= &ebp
->extents
[0]; i
< ebp
->nextents
; ++i
, ++ep
) {
618 fno
= logbtofrag(fs
, ep
->pbno
);
619 nfno
= dbtofsb(fs
, ep
->nbno
);
620 for (j
= 0; j
< nfno
; j
+= fs
->fs_frag
, fno
+= fs
->fs_frag
)
621 free(ip
, fno
, fs
->fs_bsize
, 0);
623 free(ip
, logbtofrag(fs
, logbno
), fs
->fs_bsize
, 0);
628 * Push the metadata dirtied during the allocations
630 ufsvfsp
->vfs_ulockfs
.ul_sbowner
= curthread
;
631 sbupdate(ufsvfsp
->vfs_vfs
);
632 ufsvfsp
->vfs_ulockfs
.ul_sbowner
= (kthread_id_t
)-1;
633 bflush(ufsvfsp
->vfs_dev
);
634 error
= bfinval(ufsvfsp
->vfs_dev
, 0);
639 * Free the dummy inode
647 * Free up all resources
658 * Assumes the file system is write locked and is not logging
661 lufs_alloc(struct ufsvfs
*ufsvfsp
, struct fiolog
*flp
, size_t minb
, cred_t
*cr
)
667 struct fs
*fs
= ufsvfsp
->vfs_fs
;
668 daddr_t fno
; /* in frags */
669 daddr_t bno
; /* in disk blocks */
670 int32_t logbno
= INT32_C(0); /* will be fs_logbno */
671 struct inode
*ip
= NULL
;
672 size_t nb
= flp
->nbytes_actual
;
676 * Mark the file system as FSACTIVE
678 ufsvfsp
->vfs_ulockfs
.ul_sbowner
= curthread
;
679 mutex_enter(&ufsvfsp
->vfs_lock
);
680 fs
->fs_clean
= FSACTIVE
;
681 ufs_sbwrite(ufsvfsp
);
682 mutex_exit(&ufsvfsp
->vfs_lock
);
683 ufsvfsp
->vfs_ulockfs
.ul_sbowner
= (kthread_id_t
)-1;
686 * Allocate the allocation block (need dummy shadow inode;
687 * we use a shadow inode so the quota sub-system ignores
688 * the block allocations.)
689 * superblock -> one block of extents -> log data
691 ip
= ufs_alloc_inode(ufsvfsp
, UFSROOTINO
);
692 ip
->i_mode
= IFSHAD
; /* make the dummy a shadow inode */
693 rw_enter(&ip
->i_contents
, RW_WRITER
);
694 fno
= contigpref(ufsvfsp
, nb
+ fs
->fs_bsize
, minb
);
695 error
= alloc(ip
, fno
, fs
->fs_bsize
, &fno
, cr
);
698 bno
= fsbtodb(fs
, fno
);
700 bp
= UFS_BREAD(ufsvfsp
, ufsvfsp
->vfs_dev
, bno
, fs
->fs_bsize
);
701 if (bp
->b_flags
& B_ERROR
) {
706 ebp
= (void *)bp
->b_un
.b_addr
;
707 ebp
->type
= LUFS_EXTENTS
;
708 ebp
->nextbno
= UINT32_C(0);
709 ebp
->nextents
= UINT32_C(0);
710 ebp
->chksum
= INT32_C(0);
711 if (fs
->fs_magic
== FS_MAGIC
)
714 logbno
= dbtofsb(fs
, bno
);
717 * Initialize the first extent
719 ep
= &ebp
->extents
[0];
720 error
= alloc(ip
, fno
+ fs
->fs_frag
, fs
->fs_bsize
, &fno
, cr
);
723 bno
= fsbtodb(fs
, fno
);
725 ep
->lbno
= UINT32_C(0);
726 if (fs
->fs_magic
== FS_MAGIC
)
727 ep
->pbno
= (uint32_t)bno
;
729 ep
->pbno
= (uint32_t)fno
;
730 ep
->nbno
= (uint32_t)fsbtodb(fs
, fs
->fs_frag
);
731 ebp
->nextents
= UINT32_C(1);
736 error
= alloc(ip
, fno
+ fs
->fs_frag
, fs
->fs_bsize
, &fno
, cr
);
743 bno
= fsbtodb(fs
, fno
);
744 if ((daddr_t
)((logbtodb(fs
, ep
->pbno
) + ep
->nbno
) == bno
))
745 ep
->nbno
+= (uint32_t)(fsbtodb(fs
, fs
->fs_frag
));
748 if ((caddr_t
)(nep
+ 1) >
749 (bp
->b_un
.b_addr
+ fs
->fs_bsize
)) {
750 free(ip
, fno
, fs
->fs_bsize
, 0);
753 nep
->lbno
= ep
->lbno
+ ep
->nbno
;
754 if (fs
->fs_magic
== FS_MAGIC
)
755 nep
->pbno
= (uint32_t)bno
;
757 nep
->pbno
= (uint32_t)fno
;
758 nep
->nbno
= (uint32_t)(fsbtodb(fs
, fs
->fs_frag
));
766 if (tb
< minb
) { /* Failed to reach minimum log size */
771 ebp
->nbytes
= (uint32_t)tb
;
772 setsum(&ebp
->chksum
, (int32_t *)bp
->b_un
.b_addr
, fs
->fs_bsize
);
773 UFS_BWRITE2(ufsvfsp
, bp
);
774 if (bp
->b_flags
& B_ERROR
) {
779 * Initialize the first two sectors of the log
781 error
= lufs_initialize(ufsvfsp
, logbtodb(fs
, ebp
->extents
[0].pbno
),
787 * We are done initializing the allocation block and the log
793 * Update the superblock and push the dirty metadata
795 ufsvfsp
->vfs_ulockfs
.ul_sbowner
= curthread
;
796 sbupdate(ufsvfsp
->vfs_vfs
);
797 ufsvfsp
->vfs_ulockfs
.ul_sbowner
= (kthread_id_t
)-1;
798 bflush(ufsvfsp
->vfs_dev
);
799 error
= bfinval(ufsvfsp
->vfs_dev
, 1);
802 if (ufsvfsp
->vfs_bufp
->b_flags
& B_ERROR
) {
808 * Everything is safely on disk; update log space pointer in sb
810 ufsvfsp
->vfs_ulockfs
.ul_sbowner
= curthread
;
811 mutex_enter(&ufsvfsp
->vfs_lock
);
812 fs
->fs_logbno
= (uint32_t)logbno
;
813 ufs_sbwrite(ufsvfsp
);
814 mutex_exit(&ufsvfsp
->vfs_lock
);
815 ufsvfsp
->vfs_ulockfs
.ul_sbowner
= (kthread_id_t
)-1;
818 * Free the dummy inode
820 rw_exit(&ip
->i_contents
);
823 /* inform user of real log size */
824 flp
->nbytes_actual
= tb
;
834 fs
->fs_logbno
= logbno
;
835 (void) lufs_free(ufsvfsp
);
838 rw_exit(&ip
->i_contents
);
848 lufs_disable(vnode_t
*vp
, struct fiolog
*flp
)
851 inode_t
*ip
= VTOI(vp
);
852 ufsvfs_t
*ufsvfsp
= ip
->i_ufsvfs
;
853 struct fs
*fs
= ufsvfsp
->vfs_fs
;
857 flp
->error
= FIOLOG_ENONE
;
860 * Logging is already disabled; done
862 if (fs
->fs_logbno
== 0 || ufsvfsp
->vfs_log
== NULL
)
866 * Readonly file system
869 flp
->error
= FIOLOG_EROFS
;
874 * File system must be write locked to disable logging
876 error
= ufs_fiolfss(vp
, &lf
);
880 if (!LOCKFS_IS_ULOCK(&lf
)) {
881 flp
->error
= FIOLOG_EULOCK
;
884 lf
.lf_lock
= LOCKFS_WLOCK
;
886 lf
.lf_comment
= NULL
;
887 error
= ufs_fiolfs(vp
, &lf
, 1);
889 flp
->error
= FIOLOG_EWLOCK
;
893 if (ufsvfsp
->vfs_log
== NULL
|| fs
->fs_logbno
== 0)
897 * WE ARE COMMITTED TO DISABLING LOGGING PAST THIS POINT
902 * Suspend the reclaim thread and force the delete thread to exit.
903 * When a nologging mount has completed there may still be
904 * work for reclaim to do so just suspend this thread until
905 * it's [deadlock-] safe for it to continue. The delete
906 * thread won't be needed as ufs_iinactive() calls
907 * ufs_delete() when logging is disabled.
908 * Freeze and drain reader ops.
909 * Commit any outstanding reader transactions (ufs_flush).
910 * Set the ``unmounted'' bit in the ufstrans struct.
911 * If debug, remove metadata from matamap.
912 * Disable matamap processing.
913 * NULL the trans ops table.
914 * Free all of the incore structs related to logging.
917 ufs_thread_suspend(&ufsvfsp
->vfs_reclaim
);
918 ufs_thread_exit(&ufsvfsp
->vfs_delete
);
920 vfs_lock_wait(ufsvfsp
->vfs_vfs
);
921 ulp
= &ufsvfsp
->vfs_ulockfs
;
922 mutex_enter(&ulp
->ul_lock
);
923 atomic_inc_ulong(&ufs_quiesce_pend
);
924 (void) ufs_quiesce(ulp
);
926 (void) ufs_flush(ufsvfsp
->vfs_vfs
);
928 TRANS_MATA_UMOUNT(ufsvfsp
);
929 ufsvfsp
->vfs_domatamap
= 0;
932 * Free all of the incore structs
933 * Acquire the ufs_scan_lock before de-linking the mtm data
934 * structure so that we keep ufs_sync() and ufs_update() away
935 * when they execute the ufs_scan_inodes() run while we're in
936 * progress of enabling/disabling logging.
938 mutex_enter(&ufs_scan_lock
);
939 (void) lufs_unsnarf(ufsvfsp
);
940 mutex_exit(&ufs_scan_lock
);
942 atomic_dec_ulong(&ufs_quiesce_pend
);
943 mutex_exit(&ulp
->ul_lock
);
944 vfs_setmntopt(ufsvfsp
->vfs_vfs
, MNTOPT_NOLOGGING
, NULL
, 0);
945 vfs_unlock(ufsvfsp
->vfs_vfs
);
947 fs
->fs_rolled
= FS_ALL_ROLLED
;
948 ufsvfsp
->vfs_nolog_si
= 0;
951 * Free the log space and mark the superblock as FSACTIVE
953 (void) lufs_free(ufsvfsp
);
956 * Allow the reclaim thread to continue.
958 ufs_thread_continue(&ufsvfsp
->vfs_reclaim
);
961 * Unlock the file system
963 lf
.lf_lock
= LOCKFS_ULOCK
;
965 error
= ufs_fiolfs(vp
, &lf
, 1);
967 flp
->error
= FIOLOG_ENOULOCK
;
972 lf
.lf_lock
= LOCKFS_ULOCK
;
974 (void) ufs_fiolfs(vp
, &lf
, 1);
982 lufs_enable(struct vnode
*vp
, struct fiolog
*flp
, cred_t
*cr
)
986 inode_t
*ip
= VTOI(vp
);
987 ufsvfs_t
*ufsvfsp
= ip
->i_ufsvfs
;
992 vfs_t
*vfsp
= ufsvfsp
->vfs_vfs
;
993 uint64_t tmp_nbytes_actual
;
994 uint64_t cg_minlogsize
;
996 static int minlogsizewarn
= 0;
997 static int maxlogsizewarn
= 0;
1000 * Check if logging is already enabled
1002 if (ufsvfsp
->vfs_log
) {
1003 flp
->error
= FIOLOG_ETRANS
;
1004 /* for root ensure logging option is set */
1005 vfs_setmntopt(vfsp
, MNTOPT_LOGGING
, NULL
, 0);
1008 fs
= ufsvfsp
->vfs_fs
;
1011 * Come back here to recheck if we had to disable the log.
1016 flp
->error
= FIOLOG_ENONE
;
1019 * The size of the ufs log is determined using the following rules:
1021 * 1) If no size is requested the log size is calculated as a
1022 * ratio of the total file system size. By default this is
1023 * 1MB of log per 1GB of file system. This calculation is then
1024 * capped at the log size specified by ldl_softlogcap.
1025 * 2) The log size requested may then be increased based on the
1026 * number of cylinder groups contained in the file system.
1027 * To prevent a hang the log has to be large enough to contain a
1028 * single transaction that alters every cylinder group in the file
1029 * system. This is calculated as cg_minlogsize.
1030 * 3) Finally a check is made that the log size requested is within
1031 * the limits of ldl_minlogsize and ldl_maxlogsize.
1035 * Adjust requested log size
1037 flp
->nbytes_actual
= flp
->nbytes_requested
;
1038 if (flp
->nbytes_actual
== 0) {
1040 (((uint64_t)fs
->fs_size
) / ldl_divisor
) << fs
->fs_fshift
;
1041 flp
->nbytes_actual
= (uint_t
)MIN(tmp_nbytes_actual
, INT_MAX
);
1043 * The 1MB per 1GB log size allocation only applies up to
1044 * ldl_softlogcap size of log.
1046 flp
->nbytes_actual
= MIN(flp
->nbytes_actual
, ldl_softlogcap
);
1049 cgsize
= ldl_cgsizereq
? ldl_cgsizereq
: LDL_CGSIZEREQ(fs
);
1052 * Determine the log size required based on the number of cylinder
1053 * groups in the file system. The log has to be at least this size
1054 * to prevent possible hangs due to log space exhaustion.
1056 cg_minlogsize
= cgsize
* fs
->fs_ncg
;
1059 * Ensure that the minimum log size isn't so small that it could lead
1060 * to a full log hang.
1062 if (ldl_minlogsize
< LDL_MINLOGSIZE
) {
1063 ldl_minlogsize
= LDL_MINLOGSIZE
;
1064 if (!minlogsizewarn
) {
1065 cmn_err(CE_WARN
, "ldl_minlogsize too small, increasing "
1066 "to 0x%x", LDL_MINLOGSIZE
);
1072 * Ensure that the maximum log size isn't greater than INT_MAX as the
1073 * logical log offset fields would overflow.
1075 if (ldl_maxlogsize
> INT_MAX
) {
1076 ldl_maxlogsize
= INT_MAX
;
1077 if (!maxlogsizewarn
) {
1078 cmn_err(CE_WARN
, "ldl_maxlogsize too large, reducing "
1079 "to 0x%x", INT_MAX
);
1084 if (cg_minlogsize
> ldl_maxlogsize
) {
1086 "%s: reducing calculated log size from 0x%x to "
1087 "ldl_maxlogsize (0x%x).", fs
->fs_fsmnt
, (int)cg_minlogsize
,
1091 cg_minlogsize
= MAX(cg_minlogsize
, ldl_minlogsize
);
1092 cg_minlogsize
= MIN(cg_minlogsize
, ldl_maxlogsize
);
1094 flp
->nbytes_actual
= MAX(flp
->nbytes_actual
, cg_minlogsize
);
1095 flp
->nbytes_actual
= MAX(flp
->nbytes_actual
, ldl_minlogsize
);
1096 flp
->nbytes_actual
= MIN(flp
->nbytes_actual
, ldl_maxlogsize
);
1097 flp
->nbytes_actual
= blkroundup(fs
, flp
->nbytes_actual
);
1100 * logging is enabled and the log is the right size; done
1102 ul
= ufsvfsp
->vfs_log
;
1103 if (ul
&& fs
->fs_logbno
&& (flp
->nbytes_actual
== ul
->un_requestsize
))
1107 * Readonly file system
1110 flp
->error
= FIOLOG_EROFS
;
1115 * File system must be write locked to enable logging
1117 error
= ufs_fiolfss(vp
, &lf
);
1121 if (!LOCKFS_IS_ULOCK(&lf
)) {
1122 flp
->error
= FIOLOG_EULOCK
;
1125 lf
.lf_lock
= LOCKFS_WLOCK
;
1127 lf
.lf_comment
= NULL
;
1128 error
= ufs_fiolfs(vp
, &lf
, 1);
1130 flp
->error
= FIOLOG_EWLOCK
;
1135 * Grab appropriate locks to synchronize with the rest
1138 vfs_lock_wait(vfsp
);
1139 ulp
= &ufsvfsp
->vfs_ulockfs
;
1140 mutex_enter(&ulp
->ul_lock
);
1143 * File system must be fairly consistent to enable logging
1145 if (fs
->fs_clean
!= FSLOG
&&
1146 fs
->fs_clean
!= FSACTIVE
&&
1147 fs
->fs_clean
!= FSSTABLE
&&
1148 fs
->fs_clean
!= FSCLEAN
) {
1149 flp
->error
= FIOLOG_ECLEAN
;
1154 * A write-locked file system is only active if there are
1155 * open deleted files; so remember to set FS_RECLAIM later.
1157 if (fs
->fs_clean
== FSACTIVE
)
1158 reclaim
= FS_RECLAIM
;
1161 * Logging is already enabled; must be changing the log's size
1163 if (fs
->fs_logbno
&& ufsvfsp
->vfs_log
) {
1165 * Before we can disable logging, we must give up our
1166 * lock. As a consequence of unlocking and disabling the
1167 * log, the fs structure may change. Because of this, when
1168 * disabling is complete, we will go back to recheck to
1169 * repeat all of the checks that we performed to get to
1170 * this point. Disabling sets fs->fs_logbno to 0, so this
1171 * will not put us into an infinite loop.
1173 mutex_exit(&ulp
->ul_lock
);
1176 lf
.lf_lock
= LOCKFS_ULOCK
;
1178 error
= ufs_fiolfs(vp
, &lf
, 1);
1180 flp
->error
= FIOLOG_ENOULOCK
;
1183 error
= lufs_disable(vp
, flp
);
1184 if (error
|| (flp
->error
!= FIOLOG_ENONE
))
1189 error
= lufs_alloc(ufsvfsp
, flp
, cg_minlogsize
, cr
);
1194 * Create all of the incore structs
1196 error
= lufs_snarf(ufsvfsp
, fs
, 0);
1201 * DON'T ``GOTO ERROUT'' PAST THIS POINT
1205 * Pretend we were just mounted with logging enabled
1206 * Get the ops vector
1207 * If debug, record metadata locations with log subsystem
1208 * Start the delete thread
1209 * Start the reclaim thread, if necessary
1211 vfs_setmntopt(vfsp
, MNTOPT_LOGGING
, NULL
, 0);
1213 TRANS_DOMATAMAP(ufsvfsp
);
1214 TRANS_MATA_MOUNT(ufsvfsp
);
1215 TRANS_MATA_SI(ufsvfsp
, fs
);
1216 ufs_thread_start(&ufsvfsp
->vfs_delete
, ufs_thread_delete
, vfsp
);
1217 if (fs
->fs_reclaim
& (FS_RECLAIM
|FS_RECLAIMING
)) {
1218 fs
->fs_reclaim
&= ~FS_RECLAIM
;
1219 fs
->fs_reclaim
|= FS_RECLAIMING
;
1220 ufs_thread_start(&ufsvfsp
->vfs_reclaim
,
1221 ufs_thread_reclaim
, vfsp
);
1223 fs
->fs_reclaim
|= reclaim
;
1225 mutex_exit(&ulp
->ul_lock
);
1229 * Unlock the file system
1231 lf
.lf_lock
= LOCKFS_ULOCK
;
1233 error
= ufs_fiolfs(vp
, &lf
, 1);
1235 flp
->error
= FIOLOG_ENOULOCK
;
1240 * There's nothing in the log yet (we've just allocated it)
1241 * so directly write out the super block.
1242 * Note, we have to force this sb out to disk
1243 * (not just to the log) so that if we crash we know we are logging
1245 mutex_enter(&ufsvfsp
->vfs_lock
);
1246 fs
->fs_clean
= FSLOG
;
1247 fs
->fs_rolled
= FS_NEED_ROLL
; /* Mark the fs as unrolled */
1248 UFS_BWRITE2(NULL
, ufsvfsp
->vfs_bufp
);
1249 mutex_exit(&ufsvfsp
->vfs_lock
);
1255 * Acquire the ufs_scan_lock before de-linking the mtm data
1256 * structure so that we keep ufs_sync() and ufs_update() away
1257 * when they execute the ufs_scan_inodes() run while we're in
1258 * progress of enabling/disabling logging.
1260 mutex_enter(&ufs_scan_lock
);
1261 (void) lufs_unsnarf(ufsvfsp
);
1262 mutex_exit(&ufs_scan_lock
);
1264 (void) lufs_free(ufsvfsp
);
1266 mutex_exit(&ulp
->ul_lock
);
1269 lf
.lf_lock
= LOCKFS_ULOCK
;
1271 (void) ufs_fiolfs(vp
, &lf
, 1);
1276 lufs_read_strategy(ml_unit_t
*ul
, buf_t
*bp
)
1278 mt_map_t
*logmap
= ul
->un_logmap
;
1279 offset_t mof
= ldbtob(bp
->b_blkno
);
1280 off_t nb
= bp
->b_bcount
;
1287 * get a linked list of overlapping deltas
1288 * returns with &mtm->mtm_rwlock held
1290 entire_range
= logmap_list_get(logmap
, mof
, nb
, &age
);
1293 * no overlapping deltas were found; read master
1296 rw_exit(&logmap
->mtm_rwlock
);
1297 if (ul
->un_flags
& LDL_ERROR
) {
1298 bp
->b_flags
|= B_ERROR
;
1302 ul
->un_ufsvfs
->vfs_iotstamp
= ddi_get_lbolt();
1303 logstats
.ls_lreads
.value
.ui64
++;
1304 (void) bdev_strategy(bp
);
1305 lwp_stat_update(LWP_STAT_INBLK
, 1);
1310 va
= bp_mapin_common(bp
, VM_SLEEP
);
1312 * if necessary, sync read the data from master
1313 * errors are returned in bp
1315 if (!entire_range
) {
1316 saviodone
= bp
->b_iodone
;
1317 bp
->b_iodone
= trans_not_done
;
1318 logstats
.ls_mreads
.value
.ui64
++;
1319 (void) bdev_strategy(bp
);
1320 lwp_stat_update(LWP_STAT_INBLK
, 1);
1321 if (trans_not_wait(bp
))
1322 ldl_seterror(ul
, "Error reading master");
1323 bp
->b_iodone
= saviodone
;
1327 * sync read the data from the log
1328 * errors are returned inline
1330 if (ldl_read(ul
, va
, mof
, nb
, age
)) {
1331 bp
->b_flags
|= B_ERROR
;
1338 logmap_list_put(logmap
, age
);
1343 if (ul
->un_flags
& LDL_ERROR
) {
1344 bp
->b_flags
|= B_ERROR
;
1351 lufs_write_strategy(ml_unit_t
*ul
, buf_t
*bp
)
1353 offset_t mof
= ldbtob(bp
->b_blkno
);
1354 off_t nb
= bp
->b_bcount
;
1358 ASSERT((nb
& DEV_BMASK
) == 0);
1359 ul
->un_logmap
->mtm_ref
= 1;
1362 * if there are deltas, move into log
1364 me
= deltamap_remove(ul
->un_deltamap
, mof
, nb
);
1367 va
= bp_mapin_common(bp
, VM_SLEEP
);
1369 ASSERT(((ul
->un_debug
& MT_WRITE_CHECK
) == 0) ||
1370 (ul
->un_matamap
== NULL
)||
1371 matamap_within(ul
->un_matamap
, mof
, nb
));
1376 if (ufs_crb_enable
) {
1377 logmap_add_buf(ul
, va
, mof
, me
,
1378 bp
->b_un
.b_addr
, nb
);
1380 logmap_add(ul
, va
, mof
, me
);
1383 if (ul
->un_flags
& LDL_ERROR
) {
1384 bp
->b_flags
|= B_ERROR
;
1390 if (ul
->un_flags
& LDL_ERROR
) {
1391 bp
->b_flags
|= B_ERROR
;
1398 * Check that we are not updating metadata, or if so then via B_PHYS.
1400 ASSERT((ul
->un_matamap
== NULL
) ||
1401 !(matamap_overlap(ul
->un_matamap
, mof
, nb
) &&
1402 ((bp
->b_flags
& B_PHYS
) == 0)));
1404 ul
->un_ufsvfs
->vfs_iotstamp
= ddi_get_lbolt();
1405 logstats
.ls_lwrites
.value
.ui64
++;
1407 /* If snapshots are enabled, write through the snapshot driver */
1408 if (ul
->un_ufsvfs
->vfs_snapshot
)
1409 fssnap_strategy(&ul
->un_ufsvfs
->vfs_snapshot
, bp
);
1411 (void) bdev_strategy(bp
);
1413 lwp_stat_update(LWP_STAT_OUBLK
, 1);
1417 lufs_strategy(ml_unit_t
*ul
, buf_t
*bp
)
1419 if (bp
->b_flags
& B_READ
)
1420 lufs_read_strategy(ul
, bp
);
1422 lufs_write_strategy(ul
, bp
);
1427 delta_stats_update(kstat_t
*ksp
, int rw
)
1429 if (rw
== KSTAT_WRITE
) {
1430 delta_stats
[DT_SB
] = dkstats
.ds_superblock_deltas
.value
.ui64
;
1431 delta_stats
[DT_CG
] = dkstats
.ds_bitmap_deltas
.value
.ui64
;
1432 delta_stats
[DT_SI
] = dkstats
.ds_suminfo_deltas
.value
.ui64
;
1433 delta_stats
[DT_AB
] = dkstats
.ds_allocblk_deltas
.value
.ui64
;
1434 delta_stats
[DT_ABZERO
] = dkstats
.ds_ab0_deltas
.value
.ui64
;
1435 delta_stats
[DT_DIR
] = dkstats
.ds_dir_deltas
.value
.ui64
;
1436 delta_stats
[DT_INODE
] = dkstats
.ds_inode_deltas
.value
.ui64
;
1437 delta_stats
[DT_FBI
] = dkstats
.ds_fbiwrite_deltas
.value
.ui64
;
1438 delta_stats
[DT_QR
] = dkstats
.ds_quota_deltas
.value
.ui64
;
1439 delta_stats
[DT_SHAD
] = dkstats
.ds_shadow_deltas
.value
.ui64
;
1441 roll_stats
[DT_SB
] = dkstats
.ds_superblock_rolled
.value
.ui64
;
1442 roll_stats
[DT_CG
] = dkstats
.ds_bitmap_rolled
.value
.ui64
;
1443 roll_stats
[DT_SI
] = dkstats
.ds_suminfo_rolled
.value
.ui64
;
1444 roll_stats
[DT_AB
] = dkstats
.ds_allocblk_rolled
.value
.ui64
;
1445 roll_stats
[DT_ABZERO
] = dkstats
.ds_ab0_rolled
.value
.ui64
;
1446 roll_stats
[DT_DIR
] = dkstats
.ds_dir_rolled
.value
.ui64
;
1447 roll_stats
[DT_INODE
] = dkstats
.ds_inode_rolled
.value
.ui64
;
1448 roll_stats
[DT_FBI
] = dkstats
.ds_fbiwrite_rolled
.value
.ui64
;
1449 roll_stats
[DT_QR
] = dkstats
.ds_quota_rolled
.value
.ui64
;
1450 roll_stats
[DT_SHAD
] = dkstats
.ds_shadow_rolled
.value
.ui64
;
1452 dkstats
.ds_superblock_deltas
.value
.ui64
= delta_stats
[DT_SB
];
1453 dkstats
.ds_bitmap_deltas
.value
.ui64
= delta_stats
[DT_CG
];
1454 dkstats
.ds_suminfo_deltas
.value
.ui64
= delta_stats
[DT_SI
];
1455 dkstats
.ds_allocblk_deltas
.value
.ui64
= delta_stats
[DT_AB
];
1456 dkstats
.ds_ab0_deltas
.value
.ui64
= delta_stats
[DT_ABZERO
];
1457 dkstats
.ds_dir_deltas
.value
.ui64
= delta_stats
[DT_DIR
];
1458 dkstats
.ds_inode_deltas
.value
.ui64
= delta_stats
[DT_INODE
];
1459 dkstats
.ds_fbiwrite_deltas
.value
.ui64
= delta_stats
[DT_FBI
];
1460 dkstats
.ds_quota_deltas
.value
.ui64
= delta_stats
[DT_QR
];
1461 dkstats
.ds_shadow_deltas
.value
.ui64
= delta_stats
[DT_SHAD
];
1463 dkstats
.ds_superblock_rolled
.value
.ui64
= roll_stats
[DT_SB
];
1464 dkstats
.ds_bitmap_rolled
.value
.ui64
= roll_stats
[DT_CG
];
1465 dkstats
.ds_suminfo_rolled
.value
.ui64
= roll_stats
[DT_SI
];
1466 dkstats
.ds_allocblk_rolled
.value
.ui64
= roll_stats
[DT_AB
];
1467 dkstats
.ds_ab0_rolled
.value
.ui64
= roll_stats
[DT_ABZERO
];
1468 dkstats
.ds_dir_rolled
.value
.ui64
= roll_stats
[DT_DIR
];
1469 dkstats
.ds_inode_rolled
.value
.ui64
= roll_stats
[DT_INODE
];
1470 dkstats
.ds_fbiwrite_rolled
.value
.ui64
= roll_stats
[DT_FBI
];
1471 dkstats
.ds_quota_rolled
.value
.ui64
= roll_stats
[DT_QR
];
1472 dkstats
.ds_shadow_rolled
.value
.ui64
= roll_stats
[DT_SHAD
];
1477 extern size_t ufs_crb_limit
;
1478 extern int ufs_max_crb_divisor
;
1485 /* Create kmem caches */
1486 lufs_sv
= kmem_cache_create("lufs_save", sizeof (lufs_save_t
), 0,
1487 NULL
, NULL
, NULL
, NULL
, NULL
, 0);
1488 lufs_bp
= kmem_cache_create("lufs_bufs", sizeof (lufs_buf_t
), 0,
1489 NULL
, NULL
, NULL
, NULL
, NULL
, 0);
1491 mutex_init(&log_mutex
, NULL
, MUTEX_DEFAULT
, NULL
);
1495 if (bio_lufs_strategy
== NULL
)
1496 bio_lufs_strategy
= (void (*) (void *, buf_t
*)) lufs_strategy
;
1499 * Initialise general logging and delta kstats
1501 ksp
= kstat_create("ufs_log", 0, "logstats", "ufs", KSTAT_TYPE_NAMED
,
1502 sizeof (logstats
) / sizeof (kstat_named_t
), KSTAT_FLAG_VIRTUAL
);
1504 ksp
->ks_data
= (void *) &logstats
;
1508 ksp
= kstat_create("ufs_log", 0, "deltastats", "ufs", KSTAT_TYPE_NAMED
,
1509 sizeof (dkstats
) / sizeof (kstat_named_t
), KSTAT_FLAG_VIRTUAL
);
1511 ksp
->ks_data
= (void *) &dkstats
;
1512 ksp
->ks_update
= delta_stats_update
;
1516 /* Initialize generation of logging ids */
1520 * Set up the maximum amount of kmem that the crbs (system wide)
1523 ufs_crb_limit
= kmem_maxavail() / ufs_max_crb_divisor
;