/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */
#include <sys/sysmacros.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/t_lock.h>
#include <sys/uio.h>
#include <sys/kmem.h>
#include <sys/thread.h>
#include <sys/vfs.h>
#include <sys/errno.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_panic.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/cmn_err.h>
#include <sys/file.h>
#include <sys/debug.h>

extern kmutex_t		ufsvfs_mutex;
extern struct ufsvfs	*ufs_instances;
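
/*
 * Overview: this file supplies the UFS logging ("trans") entry points:
 * hard-lock handling for errored logs, transaction-wrapped superblock
 * and inode updates, the delta push callbacks, the DEBUG-only metadata
 * map (matamap), and the reservation/split logic that keeps large
 * writes and truncates from overrunning the log.
 */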

/*
 * hlock any file systems w/errored logs
 */
int
ufs_trans_hlock()
{
        struct ufsvfs   *ufsvfsp;
        struct lockfs   lockfs;
        int             error;
        int             retry = 0;

        /*
         * find fs's that paniced or have errored logging devices
         */
        mutex_enter(&ufsvfs_mutex);
        for (ufsvfsp = ufs_instances; ufsvfsp; ufsvfsp = ufsvfsp->vfs_next) {
                /*
                 * not mounted; continue
                 */
                if ((ufsvfsp->vfs_vfs == NULL) ||
                    (ufsvfsp->vfs_validfs == UT_UNMOUNTED))
                        continue;
                /*
                 * disallow unmounts (hlock occurs below)
                 */
                if (TRANS_ISERROR(ufsvfsp))
                        ufsvfsp->vfs_validfs = UT_HLOCKING;
        }
        mutex_exit(&ufsvfs_mutex);

        /*
         * hlock the fs's that paniced or have errored logging devices
         */
again:
        mutex_enter(&ufsvfs_mutex);
        for (ufsvfsp = ufs_instances; ufsvfsp; ufsvfsp = ufsvfsp->vfs_next)
                if (ufsvfsp->vfs_validfs == UT_HLOCKING)
                        break;
        mutex_exit(&ufsvfs_mutex);
        if (ufsvfsp == NULL)
                return (retry);
        /*
         * hlock the file system
         */
        (void) ufs_fiolfss(ufsvfsp->vfs_root, &lockfs);
        if (!LOCKFS_IS_ELOCK(&lockfs)) {
                lockfs.lf_lock = LOCKFS_HLOCK;
                lockfs.lf_flags = 0;
                lockfs.lf_comlen = 0;
                lockfs.lf_comment = NULL;
                error = ufs_fiolfs(ufsvfsp->vfs_root, &lockfs, 0);
                /*
                 * retry after awhile; another app currently doing lockfs
                 */
                if (error == EBUSY || error == EINVAL)
                        retry = 1;
        } else {
                if (ufsfx_get_failure_qlen() > 0) {
                        if (mutex_tryenter(&ufs_fix.uq_mutex)) {
                                ufs_fix.uq_lowat = ufs_fix.uq_ne;
                                cv_broadcast(&ufs_fix.uq_cv);
                                mutex_exit(&ufs_fix.uq_mutex);
                        }
                }
                retry = 1;
        }

        /*
         * allow unmounts
         */
        ufsvfsp->vfs_validfs = UT_MOUNTED;
        goto again;
}
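
/*
 * Note on the handshake above: setting vfs_validfs to UT_HLOCKING in
 * the first scan keeps the file system from being unmounted while the
 * hlock is applied; each pass of the "again" loop hlocks one such file
 * system, restores vfs_validfs to UT_MOUNTED, and rescans until no
 * candidates remain.
 */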

/*ARGSUSED*/
void
ufs_trans_onerror()
{
        mutex_enter(&ufs_hlock.uq_mutex);
        ufs_hlock.uq_ne = ufs_hlock.uq_lowat;
        cv_broadcast(&ufs_hlock.uq_cv);
        mutex_exit(&ufs_hlock.uq_mutex);
}
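
/*
 * Note on the wrappers below (the convention as read from this file):
 * T_DONTBLOCK serves as an "already inside a transaction" marker on the
 * current thread.  When it is set, the wrappers perform the underlying
 * operation directly; otherwise they set the flag, bracket the
 * operation with TRANS_BEGIN_ASYNC()/TRANS_END_ASYNC(), and clear the
 * flag again.
 */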

void
ufs_trans_sbupdate(struct ufsvfs *ufsvfsp, struct vfs *vfsp, top_t topid)
{
        if (curthread->t_flag & T_DONTBLOCK) {
                sbupdate(vfsp);
                return;
        } else {

                if (panicstr && TRANS_ISTRANS(ufsvfsp))
                        return;

                curthread->t_flag |= T_DONTBLOCK;
                TRANS_BEGIN_ASYNC(ufsvfsp, topid, TOP_SBUPDATE_SIZE);
                sbupdate(vfsp);
                TRANS_END_ASYNC(ufsvfsp, topid, TOP_SBUPDATE_SIZE);
                curthread->t_flag &= ~T_DONTBLOCK;
        }
}

void
ufs_trans_iupdat(struct inode *ip, int waitfor)
{
        struct ufsvfs   *ufsvfsp;

        if (curthread->t_flag & T_DONTBLOCK) {
                rw_enter(&ip->i_contents, RW_READER);
                ufs_iupdat(ip, waitfor);
                rw_exit(&ip->i_contents);
                return;
        } else {
                ufsvfsp = ip->i_ufsvfs;

                if (panicstr && TRANS_ISTRANS(ufsvfsp))
                        return;

                curthread->t_flag |= T_DONTBLOCK;
                TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IUPDAT, TOP_IUPDAT_SIZE(ip));
                rw_enter(&ip->i_contents, RW_READER);
                ufs_iupdat(ip, waitfor);
                rw_exit(&ip->i_contents);
                TRANS_END_ASYNC(ufsvfsp, TOP_IUPDAT, TOP_IUPDAT_SIZE(ip));
                curthread->t_flag &= ~T_DONTBLOCK;
        }
}

void
ufs_trans_sbwrite(struct ufsvfs *ufsvfsp, top_t topid)
{
        if (curthread->t_flag & T_DONTBLOCK) {
                mutex_enter(&ufsvfsp->vfs_lock);
                ufs_sbwrite(ufsvfsp);
                mutex_exit(&ufsvfsp->vfs_lock);
                return;
        } else {

                if (panicstr && TRANS_ISTRANS(ufsvfsp))
                        return;

                curthread->t_flag |= T_DONTBLOCK;
                TRANS_BEGIN_ASYNC(ufsvfsp, topid, TOP_SBWRITE_SIZE);
                mutex_enter(&ufsvfsp->vfs_lock);
                ufs_sbwrite(ufsvfsp);
                mutex_exit(&ufsvfsp->vfs_lock);
                TRANS_END_ASYNC(ufsvfsp, topid, TOP_SBWRITE_SIZE);
                curthread->t_flag &= ~T_DONTBLOCK;
        }
}
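
/*
 * The ufs_trans_push_*() routines below are delta push callbacks of the
 * kind registered via TRANS_DELTA() (see ufs_trans_quota() later in
 * this file).  Each rewrites or logs the current copy of one kind of
 * metadata; judging from the code here, returning ENOENT signals that
 * there is nothing left to push for the delta.
 */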

/*ARGSUSED*/
int
ufs_trans_push_si(ufsvfs_t *ufsvfsp, delta_t dtyp, int ignore)
{
        struct fs       *fs;

        fs = ufsvfsp->vfs_fs;
        mutex_enter(&ufsvfsp->vfs_lock);
        TRANS_LOG(ufsvfsp, (char *)fs->fs_u.fs_csp,
            ldbtob(fsbtodb(fs, fs->fs_csaddr)), fs->fs_cssize,
            (caddr_t)fs->fs_u.fs_csp, fs->fs_cssize);
        mutex_exit(&ufsvfsp->vfs_lock);
        return (0);
}

/*ARGSUSED*/
int
ufs_trans_push_buf(ufsvfs_t *ufsvfsp, delta_t dtyp, daddr_t bno)
{
        struct buf      *bp;

        bp = (struct buf *)UFS_GETBLK(ufsvfsp, ufsvfsp->vfs_dev, bno, 1);
        if (bp == NULL)
                return (ENOENT);

        if (bp->b_flags & B_DELWRI) {
                /*
                 * Do not use brwrite() here since the buffer is already
                 * marked for retry or not by the code that called
                 * TRANS_BUF().
                 */
                UFS_BWRITE(ufsvfsp, bp);
                return (0);
        }
        /*
         * If we did not find the real buf for this block above then
         * clear the dev so the buf won't be found by mistake
         * for this block later.  We had to allocate at least a 1 byte
         * buffer to keep brelse happy.
         */
        if (bp->b_bufsize == 1) {
                bp->b_dev = (o_dev_t)NODEV;
                bp->b_edev = NODEV;
                bp->b_flags = 0;
        }
        brelse(bp);
        return (ENOENT);
}

/*ARGSUSED*/
int
ufs_trans_push_inode(ufsvfs_t *ufsvfsp, delta_t dtyp, ino_t ino)
{
        int             error;
        struct inode    *ip;

        /*
         * Grab the quota lock (if the file system has not been forcibly
         * unmounted).
         */
        if (ufsvfsp)
                rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);

        error = ufs_iget(ufsvfsp->vfs_vfs, ino, &ip, kcred);

        if (ufsvfsp)
                rw_exit(&ufsvfsp->vfs_dqrwlock);
        if (error)
                return (ENOENT);

        if (ip->i_flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) {
                rw_enter(&ip->i_contents, RW_READER);
                ufs_iupdat(ip, 1);
                rw_exit(&ip->i_contents);
                VN_RELE(ITOV(ip));
                return (0);
        }
        VN_RELE(ITOV(ip));
        return (ENOENT);
}

#ifdef DEBUG
/*
 * These routines maintain the metadata map (matamap)
 */
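
/*
 * Note: the matamap is a debug-only record of which device offsets are
 * known to hold metadata.  The TRANS_MATAADD()/top_matadel() calls in
 * this section populate and prune it so that DEBUG kernels can
 * sanity-check that deltas land on tracked metadata; that purpose is
 * inferred from how the map is built here.
 */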

/*
 * update the metadata map at mount
 */
static int
ufs_trans_mata_mount_scan(struct inode *ip, void *arg)
{
        /*
         * wrong file system; keep looking
         */
        if (ip->i_ufsvfs != (struct ufsvfs *)arg)
                return (0);

        /*
         * load the metadata map
         */
        rw_enter(&ip->i_contents, RW_WRITER);
        ufs_trans_mata_iget(ip);
        rw_exit(&ip->i_contents);
        return (0);
}

void
ufs_trans_mata_mount(struct ufsvfs *ufsvfsp)
{
        struct fs       *fs = ufsvfsp->vfs_fs;
        ino_t           ino;
        int             i;

        /*
         * put static metadata into matamap
         *      superblock
         *      cylinder groups
         *      inode groups
         *      existing inodes
         */
        TRANS_MATAADD(ufsvfsp, ldbtob(SBLOCK), fs->fs_sbsize);

        for (ino = i = 0; i < fs->fs_ncg; ++i, ino += fs->fs_ipg) {
                TRANS_MATAADD(ufsvfsp,
                    ldbtob(fsbtodb(fs, cgtod(fs, i))), fs->fs_cgsize);
                TRANS_MATAADD(ufsvfsp,
                    ldbtob(fsbtodb(fs, itod(fs, ino))),
                    fs->fs_ipg * sizeof (struct dinode));
        }
        (void) ufs_scan_inodes(0, ufs_trans_mata_mount_scan, ufsvfsp, ufsvfsp);
}

/*
 * clear the metadata map at umount
 */
void
ufs_trans_mata_umount(struct ufsvfs *ufsvfsp)
{
        top_mataclr(ufsvfsp);
}

/*
 * summary info (may be extended during growfs test)
 */
void
ufs_trans_mata_si(struct ufsvfs *ufsvfsp, struct fs *fs)
{
        TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, fs->fs_csaddr)),
            fs->fs_cssize);
}

/*
 * scan an allocation block (either inode or true block)
 */
static void
ufs_trans_mata_direct(
        struct inode *ip,
        daddr_t *fragsp,
        daddr32_t *blkp,
        unsigned int nblk)
{
        int             i;
        daddr_t         frag;
        ulong_t         nb;
        struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
        struct fs       *fs = ufsvfsp->vfs_fs;

        for (i = 0; i < nblk && *fragsp; ++i, ++blkp)
                if ((frag = *blkp) != 0) {
                        if (*fragsp > fs->fs_frag) {
                                nb = fs->fs_bsize;
                                *fragsp -= fs->fs_frag;
                        } else {
                                nb = *fragsp * fs->fs_fsize;
                                *fragsp = 0;
                        }
                        TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, frag)), nb);
                }
}
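
/*
 * In ufs_trans_mata_direct() above, *fragsp counts the fragments still
 * unaccounted for in this inode: a full block consumes fs_frag of them
 * (fs_bsize bytes), and the final partial block consumes whatever
 * remains (*fragsp * fs_fsize bytes).
 */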

/*
 * scan an indirect allocation block (either inode or true block)
 */
static void
ufs_trans_mata_indir(
        struct inode *ip,
        daddr_t *fragsp,
        daddr_t frag,
        int level)
{
        struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
        struct fs *fs = ufsvfsp->vfs_fs;
        int ne = fs->fs_bsize / (int)sizeof (daddr32_t);
        int i;
        struct buf *bp;
        daddr32_t *blkp;
        o_mode_t ifmt = ip->i_mode & IFMT;

        bp = UFS_BREAD(ufsvfsp, ip->i_dev, fsbtodb(fs, frag), fs->fs_bsize);
        if (bp->b_flags & B_ERROR) {
                brelse(bp);
                return;
        }
        blkp = bp->b_un.b_daddr;

        if (level || (ifmt == IFDIR) || (ifmt == IFSHAD) ||
            (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod))
                ufs_trans_mata_direct(ip, fragsp, blkp, ne);

        if (level)
                for (i = 0; i < ne && *fragsp; ++i, ++blkp)
                        ufs_trans_mata_indir(ip, fragsp, *blkp, level - 1);
        brelse(bp);
}

/*
 * put appropriate metadata into matamap for this inode
 */
void
ufs_trans_mata_iget(struct inode *ip)
{
        int             i;
        daddr_t         frags = dbtofsb(ip->i_fs, ip->i_blocks);
        o_mode_t        ifmt = ip->i_mode & IFMT;

        if (frags && ((ifmt == IFDIR) || (ifmt == IFSHAD) ||
            (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod)))
                ufs_trans_mata_direct(ip, &frags, &ip->i_db[0], NDADDR);

        if (frags)
                ufs_trans_mata_direct(ip, &frags, &ip->i_ib[0], NIADDR);

        for (i = 0; i < NIADDR && frags; ++i)
                if (ip->i_ib[i])
                        ufs_trans_mata_indir(ip, &frags, ip->i_ib[i], i);
}

/*
 * freeing possible metadata (block of user data)
 */
void
ufs_trans_mata_free(struct ufsvfs *ufsvfsp, offset_t mof, off_t nb)
{
        top_matadel(ufsvfsp, mof, nb);
}

/*
 * allocating metadata
 */
void
ufs_trans_mata_alloc(
        struct ufsvfs *ufsvfsp,
        struct inode *ip,
        daddr_t frag,
        ulong_t nb,
        int indir)
{
        struct fs       *fs = ufsvfsp->vfs_fs;
        o_mode_t        ifmt = ip->i_mode & IFMT;

        if (indir || ((ifmt == IFDIR) || (ifmt == IFSHAD) ||
            (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod)))
                TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, frag)), nb);
}

#endif /* DEBUG */

/*
 * ufs_trans_dir is used to declare a directory delta
 */
int
ufs_trans_dir(struct inode *ip, off_t offset)
{
        daddr_t bn;
        int     contig = 0, error;

        ASSERT(ip);
        ASSERT(RW_WRITE_HELD(&ip->i_contents));
        error = bmap_read(ip, (u_offset_t)offset, &bn, &contig);
        if (error || (bn == UFS_HOLE)) {
                cmn_err(CE_WARN, "ufs_trans_dir - could not get block"
                    " number error = %d bn = %d\n", error, (int)bn);
                if (error == 0) /* treat UFS_HOLE as an I/O error */
                        error = EIO;
                return (error);
        }
        TRANS_DELTA(ip->i_ufsvfs, ldbtob(bn), DIRBLKSIZ, DT_DIR, 0, 0);
        return (error);
}
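
/*
 * A note on the call above: the TRANS_DELTA() arguments declare a DT_DIR
 * delta of DIRBLKSIZ bytes covering the directory block that holds
 * "offset"; the trailing 0, 0 appear to be an empty push-callback and
 * argument pair, in contrast to the ufs_trans_push_quota registration
 * later in this file.
 */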

/*ARGSUSED*/
int
ufs_trans_push_quota(ufsvfs_t *ufsvfsp, delta_t dtyp, struct dquot *dqp)
{
        /*
         * Lock the quota subsystem (ufsvfsp can be NULL
         * if the DQ_ERROR is set).
         */
        if (ufsvfsp)
                rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
        mutex_enter(&dqp->dq_lock);

        /*
         * If this transaction has been cancelled by closedq_scan_inode(),
         * then bail out now.  We don't call dqput() in this case because
         * it has already been done.
         */
        if ((dqp->dq_flags & DQ_TRANS) == 0) {
                mutex_exit(&dqp->dq_lock);
                if (ufsvfsp)
                        rw_exit(&ufsvfsp->vfs_dqrwlock);
                return (0);
        }
        if (dqp->dq_flags & DQ_ERROR) {
                /*
                 * Paranoia to make sure that there is at least one
                 * reference to the dquot struct.  We are done with
                 * the dquot (due to an error) so clear logging
                 * specific markers.
                 */
                ASSERT(dqp->dq_cnt >= 1);
                dqp->dq_flags &= ~DQ_TRANS;
                dqput(dqp);
                mutex_exit(&dqp->dq_lock);
                if (ufsvfsp)
                        rw_exit(&ufsvfsp->vfs_dqrwlock);
                return (1);
        }
        if (dqp->dq_flags & (DQ_MOD | DQ_BLKS | DQ_FILES)) {
                ASSERT((dqp->dq_mof != UFS_HOLE) && (dqp->dq_mof != 0));
                TRANS_LOG(ufsvfsp, (caddr_t)&dqp->dq_dqb,
                    dqp->dq_mof, (int)sizeof (struct dqblk), NULL, 0);
                /*
                 * Paranoia to make sure that there is at least one
                 * reference to the dquot struct.  Clear the
                 * modification flag because the operation is now in
                 * the log.  Also clear the logging specific markers
                 * that were set in ufs_trans_quota().
                 */
                ASSERT(dqp->dq_cnt >= 1);
                dqp->dq_flags &= ~(DQ_MOD | DQ_TRANS);
                dqput(dqp);
        }

        /*
         * At this point, the logging specific flag should be clear,
         * but add paranoia just in case something has gone wrong.
         */
        ASSERT((dqp->dq_flags & DQ_TRANS) == 0);
        mutex_exit(&dqp->dq_lock);
        if (ufsvfsp)
                rw_exit(&ufsvfsp->vfs_dqrwlock);
        return (0);
}

/*
 * ufs_trans_quota takes in a uid, allocates the disk space, places the
 * quota record into the metamap, then declares the delta.
 */
/*ARGSUSED*/
void
ufs_trans_quota(struct dquot *dqp)
{
        struct inode    *qip = dqp->dq_ufsvfsp->vfs_qinod;

        ASSERT(qip);
        ASSERT(MUTEX_HELD(&dqp->dq_lock));
        ASSERT(dqp->dq_flags & DQ_MOD);
        ASSERT(dqp->dq_mof != 0);
        ASSERT(dqp->dq_mof != UFS_HOLE);

        /*
         * Mark this dquot to indicate that we are starting a logging
         * file system operation for this dquot.  Also increment the
         * reference count so that the dquot does not get reused while
         * it is on the mapentry_t list.  DQ_TRANS is cleared and the
         * reference count is decremented by ufs_trans_push_quota.
         *
         * If the file system is force-unmounted while there is a
         * pending quota transaction, then closedq_scan_inode() will
         * clear the DQ_TRANS flag and decrement the reference count.
         *
         * Since deltamap_add() drops multiple transactions to the
         * same dq_mof and ufs_trans_push_quota() won't get called,
         * we use DQ_TRANS to prevent repeat transactions from
         * incrementing the reference count (or calling TRANS_DELTA()).
         */
        if ((dqp->dq_flags & DQ_TRANS) == 0) {
                dqp->dq_flags |= DQ_TRANS;
                dqp->dq_cnt++;
                TRANS_DELTA(qip->i_ufsvfs, dqp->dq_mof, sizeof (struct dqblk),
                    DT_QR, ufs_trans_push_quota, (ulong_t)dqp);
        }
}

void
ufs_trans_dqrele(struct dquot *dqp)
{
        struct ufsvfs   *ufsvfsp = dqp->dq_ufsvfsp;

        curthread->t_flag |= T_DONTBLOCK;
        TRANS_BEGIN_ASYNC(ufsvfsp, TOP_QUOTA, TOP_QUOTA_SIZE);
        rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
        dqrele(dqp);
        rw_exit(&ufsvfsp->vfs_dqrwlock);
        TRANS_END_ASYNC(ufsvfsp, TOP_QUOTA, TOP_QUOTA_SIZE);
        curthread->t_flag &= ~T_DONTBLOCK;
}

int ufs_trans_max_resv = TOP_MAX_RESV;  /* will be adjusted for testing */
long ufs_trans_avgbfree = 0;            /* will be adjusted for testing */
#define TRANS_MAX_WRITE (1024 * 1024)
size_t ufs_trans_max_resid = TRANS_MAX_WRITE;
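
/*
 * ufs_trans_max_resv caps the log reservation for a single transaction
 * and ufs_trans_max_resid caps the bytes moved per chunk (1 MB by
 * default).  The *_resv() routines below split requests that would
 * exceed these limits into multiple transactions.
 */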

/*
 * Calculate the log reservation for the given write or truncate
 */
static ulong_t
ufs_log_amt(struct inode *ip, offset_t offset, ssize_t resid, int trunc)
{
        long            ncg, last2blk;
        long            niblk = 0;
        u_offset_t      writeend, offblk;
        int             resv;
        daddr_t         nblk, maxfblk;
        long            avgbfree;
        struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
        struct fs       *fs = ufsvfsp->vfs_fs;
        long            fni = NINDIR(fs);
        int             bsize = fs->fs_bsize;

        /*
         * Assume that the request will fit in 1 or 2 cg's,
         * resv is the amount of log space to reserve (in bytes).
         */
        resv = SIZECG(ip) * 2 + INODESIZE + 1024;

        /*
         * get max position of write in fs blocks
         */
        writeend = offset + resid;
        maxfblk = lblkno(fs, writeend);
        offblk = lblkno(fs, offset);
        /*
         * request size in fs blocks
         */
        nblk = lblkno(fs, blkroundup(fs, resid));
        /*
         * Adjust for sparse files
         */
        if (trunc)
                nblk = MIN(nblk, ip->i_blocks);

        /*
         * Adjust avgbfree (for testing)
         */
        avgbfree = (ufs_trans_avgbfree) ? 1 : ufsvfsp->vfs_avgbfree + 1;

        /*
         * Calculate maximum number of blocks of triple indirect
         * pointers to write.
         */
        last2blk = NDADDR + fni + fni * fni;
        if (maxfblk > last2blk) {
                long nl2ptr;
                long n3blk;

                if (offblk > last2blk)
                        n3blk = maxfblk - offblk;
                else
                        n3blk = maxfblk - last2blk;
                niblk += roundup(n3blk * sizeof (daddr_t), bsize) / bsize + 1;
                nl2ptr = roundup(niblk, fni) / fni + 1;
                niblk += roundup(nl2ptr * sizeof (daddr_t), bsize) / bsize + 2;
                maxfblk -= n3blk;
        }
        /*
         * calculate maximum number of blocks of double indirect
         * pointers to write.
         */
        if (maxfblk > NDADDR + fni) {
                long n2blk;

                if (offblk > NDADDR + fni)
                        n2blk = maxfblk - offblk;
                else
                        n2blk = maxfblk - NDADDR + fni;
                niblk += roundup(n2blk * sizeof (daddr_t), bsize) / bsize + 2;
                maxfblk -= n2blk;
        }
        /*
         * Add in indirect pointer block write
         */
        if (maxfblk > NDADDR) {
                niblk += 1;
        }
        /*
         * Calculate deltas for indirect pointer writes
         */
        resv += niblk * (fs->fs_bsize + sizeof (struct delta));
        /*
         * maximum number of cg's needed for request
         */
        ncg = nblk / avgbfree;
        if (ncg > fs->fs_ncg)
                ncg = fs->fs_ncg;

        /*
         * maximum amount of log space needed for request
         */
        if (ncg > 2)
                resv += (ncg - 2) * SIZECG(ip);

        return (resv);
}
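
/*
 * Worked example (illustrative only; the numbers assume an 8 KB-block
 * file system, so fni = NINDIR(fs) = 8192 / sizeof (daddr32_t) = 2048,
 * and NDADDR = 12): direct plus single and double indirect pointers
 * cover 12 + 2048 + 2048*2048 file system blocks, so last2blk is about
 * 4.2 million blocks (~32 GB of file offset).  Only writes extending
 * beyond that point pay the triple-indirect accounting above.
 */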

/*
 * Calculate the amount of log space that needs to be reserved for this
 * trunc request.  If the amount of log space is too large, then
 * calculate the size that the request needs to be split into.
 */
void
ufs_trans_trunc_resv(
        struct inode *ip,
        u_offset_t length,
        int *resvp,
        u_offset_t *residp)
{
        ulong_t         resv;
        u_offset_t      size, offset, resid;
        int             nchunks, flag;

        /*
         * *resvp is the amount of log space to reserve (in bytes).
         * when nonzero, *residp is the number of bytes to truncate.
         */
        *residp = 0;

        if (length < ip->i_size) {
                size = ip->i_size - length;
        } else {
                resv = SIZECG(ip) * 2 + INODESIZE + 1024;
                /*
                 * truncate up, doesn't really use much space,
                 * the default above should be sufficient.
                 */
                goto done;
        }

        offset = length;
        resid = size;
        nchunks = 1;
        flag = 0;

        /*
         * If this request takes too much log space, it will be split into
         * "nchunks".  If this split is not enough, linearly increment the
         * nchunks in the next iteration.
         */
        for (; (resv = ufs_log_amt(ip, offset, resid, 1)) > ufs_trans_max_resv;
            offset = length + (nchunks - 1) * resid) {
                if (!flag) {
                        nchunks = roundup(resv, ufs_trans_max_resv) /
                            ufs_trans_max_resv;
                        flag = 1;
                } else {
                        nchunks++;
                }
                resid = size / nchunks;
        }

        if (nchunks > 1) {
                *residp = resid;
        }
done:
        *resvp = resv;
}

int
ufs_trans_itrunc(struct inode *ip, u_offset_t length, int flags, cred_t *cr)
{
        int             err, issync, resv;
        u_offset_t      resid;
        int             do_block = 0;
        struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
        struct fs       *fs = ufsvfsp->vfs_fs;

        /*
         * Not logging; just do the trunc
         */
        if (!TRANS_ISTRANS(ufsvfsp)) {
                rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
                rw_enter(&ip->i_contents, RW_WRITER);
                err = ufs_itrunc(ip, length, flags, cr);
                rw_exit(&ip->i_contents);
                rw_exit(&ufsvfsp->vfs_dqrwlock);
                return (err);
        }

        /*
         * within the lockfs protocol but *not* part of a transaction
         */
        do_block = curthread->t_flag & T_DONTBLOCK;
        curthread->t_flag |= T_DONTBLOCK;

        /*
         * Trunc the file (in pieces, if necessary)
         */
again:
        ufs_trans_trunc_resv(ip, length, &resv, &resid);
        TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ITRUNC, resv);
        rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
        rw_enter(&ip->i_contents, RW_WRITER);
        if (resid) {
                /*
                 * resid is only set if we have to truncate in chunks
                 */
                ASSERT(length + resid < ip->i_size);

                /*
                 * Partially trunc file down to desired size (length).
                 * Only retain I_FREE on the last partial trunc.
                 * Round up size to a block boundary, to ensure the truncate
                 * doesn't have to allocate blocks.  This is done both for
                 * performance and to fix a bug where if the block can't be
                 * allocated then the inode delete fails, but the inode
                 * is still freed with attached blocks and non-zero size
                 * (bug 4348738).
                 */
                err = ufs_itrunc(ip, blkroundup(fs, (ip->i_size - resid)),
                    flags & ~I_FREE, cr);
                ASSERT(ip->i_size != length);
        } else
                err = ufs_itrunc(ip, length, flags, cr);
        if (!do_block)
                curthread->t_flag &= ~T_DONTBLOCK;
        rw_exit(&ip->i_contents);
        rw_exit(&ufsvfsp->vfs_dqrwlock);
        TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ITRUNC, resv);

        if ((err == 0) && resid) {
                ufsvfsp->vfs_avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
                goto again;
        }
        return (err);
}
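
/*
 * When the truncate is split into chunks, each pass above shortens the
 * file by "resid" bytes inside its own transaction and then recomputes
 * the reservation with a refreshed vfs_avgbfree, so the final pass
 * (resid == 0) applies I_FREE and the exact target length.
 */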

/*
 * Calculate the amount of log space that needs to be reserved for this
 * write request.  If the amount of log space is too large, then
 * calculate the size that the request needs to be split into.
 * First try fixed chunks of size ufs_trans_max_resid.  If that
 * is too big, iterate down to the largest size that will fit.
 * Pagein the pages in the first chunk here, so that the pagein is
 * avoided later when the transaction is open.
 */
void
ufs_trans_write_resv(
        struct inode *ip,
        struct uio *uio,
        int *resvp,
        int *residp)
{
        ulong_t         resv;
        offset_t        offset;
        ssize_t         resid;
        int             nchunks;

        *residp = 0;
        offset = uio->uio_offset;
        resid = MIN(uio->uio_resid, ufs_trans_max_resid);
        resv = ufs_log_amt(ip, offset, resid, 0);
        if (resv <= ufs_trans_max_resv) {
                uio_prefaultpages(resid, uio);
                if (resid != uio->uio_resid)
                        *residp = resid;
                *resvp = resv;
                return;
        }

        resid = uio->uio_resid;
        nchunks = 1;
        for (; (resv = ufs_log_amt(ip, offset, resid, 0)) > ufs_trans_max_resv;
            offset = uio->uio_offset + (nchunks - 1) * resid) {
                nchunks++;
                resid = uio->uio_resid / nchunks;
        }

        uio_prefaultpages(resid, uio);
        /*
         * If this request takes too much log space, it will be split
         */
        if (nchunks > 1)
                *residp = resid;
        *resvp = resv;
}

/*
 * Issue write request.
 *
 * Split a large request into smaller chunks.
 */
int
ufs_trans_write(
        struct inode *ip,
        struct uio *uio,
        int ioflag,
        cred_t *cr,
        int resv,
        long resid)
{
        long            realresid;
        int             err;
        struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;

        /*
         * since the write is too big and would "HOG THE LOG" it needs to
         * be broken up and done in pieces.  NOTE, the caller will
         * issue the EOT after the request has been completed
         */
        realresid = uio->uio_resid;

again:
        /*
         * Perform partial request (uiomove will update uio for us)
         * Request is split up into "resid" size chunks until
         * "realresid" bytes have been transferred.
         */
        uio->uio_resid = MIN(resid, realresid);
        realresid -= uio->uio_resid;
        err = wrip(ip, uio, ioflag, cr);

        /*
         * Error or request is done; caller issues final EOT
         */
        if (err || uio->uio_resid || (realresid == 0)) {
                uio->uio_resid += realresid;
                return (err);
        }

        /*
         * Generate EOT for this part of the request
         */
        rw_exit(&ip->i_contents);
        rw_exit(&ufsvfsp->vfs_dqrwlock);
        if (ioflag & (FSYNC|FDSYNC)) {
                TRANS_END_SYNC(ufsvfsp, err, TOP_WRITE_SYNC, resv);
        } else {
                TRANS_END_ASYNC(ufsvfsp, TOP_WRITE, resv);
        }

        /*
         * Make sure the input buffer is resident before starting
         * the next transaction.
         */
        uio_prefaultpages(MIN(resid, realresid), uio);

        /*
         * Generate BOT for next part of the request
         */
        if (ioflag & (FSYNC|FDSYNC)) {
                int error;

                TRANS_BEGIN_SYNC(ufsvfsp, TOP_WRITE_SYNC, resv, error);
                ASSERT(!error);
        } else {
                TRANS_BEGIN_ASYNC(ufsvfsp, TOP_WRITE, resv);
        }
        rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
        rw_enter(&ip->i_contents, RW_WRITER);

        /*
         * Error during EOT (probably device error while writing commit rec)
         */
        if (err)
                return (err);
        goto again;
}
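
/*
 * Note on the loop above: the caller enters ufs_trans_write() with a
 * transaction already open and i_contents/vfs_dqrwlock held.  Between
 * chunks this routine drops the locks, closes the current transaction
 * (EOT), prefaults the next chunk, opens a fresh transaction (BOT), and
 * reacquires the locks, so no single transaction ever covers more than
 * "resid" bytes of the write.
 */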