fs/xfs/xfs_vnodeops.c

   1 /*
   2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18
  19 #include "xfs.h"
  20 #include "xfs_fs.h"
  21 #include "xfs_types.h"
  22 #include "xfs_bit.h"
  23 #include "xfs_log.h"
  24 #include "xfs_inum.h"
  25 #include "xfs_trans.h"
  26 #include "xfs_sb.h"
  27 #include "xfs_ag.h"
  28 #include "xfs_dir2.h"
  29 #include "xfs_mount.h"
  30 #include "xfs_da_btree.h"
  31 #include "xfs_bmap_btree.h"
  32 #include "xfs_ialloc_btree.h"
  33 #include "xfs_dinode.h"
  34 #include "xfs_inode.h"
  35 #include "xfs_inode_item.h"
  36 #include "xfs_itable.h"
  37 #include "xfs_ialloc.h"
  38 #include "xfs_alloc.h"
  39 #include "xfs_bmap.h"
  40 #include "xfs_acl.h"
  41 #include "xfs_attr.h"
  42 #include "xfs_rw.h"
  43 #include "xfs_error.h"
  44 #include "xfs_quota.h"
  45 #include "xfs_utils.h"
  46 #include "xfs_rtalloc.h"
  47 #include "xfs_trans_space.h"
  48 #include "xfs_log_priv.h"
  49 #include "xfs_filestream.h"
  50 #include "xfs_vnodeops.h"
  51 #include "xfs_trace.h"
  52
  53 /*
  54  * The maximum pathlen is 1024 bytes. Since the minimum file system
  55  * blocksize is 512 bytes, we can get a max of 2 extents back from
  56  * bmapi.
  57  */
  58 #define SYMLINK_MAPS 2
  59
  60 STATIC int
  61 xfs_readlink_bmap(
  62         xfs_inode_t     *ip,
  63         char            *link)
  64 {
  65         xfs_mount_t     *mp = ip->i_mount;
  66         int             pathlen = ip->i_d.di_size;
  67         int             nmaps = SYMLINK_MAPS;
  68         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
  69         xfs_daddr_t     d;
  70         int             byte_cnt;
  71         int             n;
  72         xfs_buf_t       *bp;
  73         int             error = 0;
  74
  75         error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, pathlen), mval, &nmaps,
  76                                0);
  77         if (error)
  78                 goto out;
  79
  80         for (n = 0; n < nmaps; n++) {
  81                 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
  82                 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
  83
  84                 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt),
  85                                   XBF_LOCK | XBF_MAPPED | XBF_DONT_BLOCK);
  86                 if (!bp)
  87                         return XFS_ERROR(ENOMEM);
  88                 error = bp->b_error;
  89                 if (error) {
  90                         xfs_buf_ioerror_alert(bp, __func__);
  91                         xfs_buf_relse(bp);
  92                         goto out;
  93                 }
  94                 if (pathlen < byte_cnt)
  95                         byte_cnt = pathlen;
  96                 pathlen -= byte_cnt;
  97
  98                 memcpy(link, bp->b_addr, byte_cnt);
  99                 xfs_buf_relse(bp);
 100         }
 101
 102         link[ip->i_d.di_size] = '\0';
 103         error = 0;
 104
 105  out:
 106         return error;
 107 }
 108
 109 int
 110 xfs_readlink(
 111         xfs_inode_t     *ip,
 112         char            *link)
 113 {
 114         xfs_mount_t     *mp = ip->i_mount;
 115         xfs_fsize_t     pathlen;
 116         int             error = 0;
 117
 118         trace_xfs_readlink(ip);
 119
 120         if (XFS_FORCED_SHUTDOWN(mp))
 121                 return XFS_ERROR(EIO);
 122
 123         xfs_ilock(ip, XFS_ILOCK_SHARED);
 124
 125         pathlen = ip->i_d.di_size;
 126         if (!pathlen)
 127                 goto out;
 128
 129         if (pathlen < 0 || pathlen > MAXPATHLEN) {
 130                 xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)",
 131                          __func__, (unsigned long long) ip->i_ino,
 132                          (long long) pathlen);
 133                 ASSERT(0);
 134                 error = XFS_ERROR(EFSCORRUPTED);
 135                 goto out;
 136         }
 137
 138
 139         if (ip->i_df.if_flags & XFS_IFINLINE) {
 140                 memcpy(link, ip->i_df.if_u1.if_data, pathlen);
 141                 link[pathlen] = '\0';
 142         } else {
 143                 error = xfs_readlink_bmap(ip, link);
 144         }
 145
 146  out:
 147         xfs_iunlock(ip, XFS_ILOCK_SHARED);
 148         return error;
 149 }
 150
 151 /*
 152  * Flags for xfs_free_eofblocks
 153  */
 154 #define XFS_FREE_EOF_TRYLOCK    (1<<0)
 155
 156 /*
 157  * This is called by xfs_inactive to free any blocks beyond eof
 158  * when the link count isn't zero and by xfs_dm_punch_hole() when
 159  * punching a hole to EOF.
 160  */
 161 STATIC int
 162 xfs_free_eofblocks(
 163         xfs_mount_t     *mp,
 164         xfs_inode_t     *ip,
 165         int             flags)
 166 {
 167         xfs_trans_t     *tp;
 168         int             error;
 169         xfs_fileoff_t   end_fsb;
 170         xfs_fileoff_t   last_fsb;
 171         xfs_filblks_t   map_len;
 172         int             nimaps;
 173         xfs_bmbt_irec_t imap;
 174
 175         /*
 176          * Figure out if there are any blocks beyond the end
 177          * of the file.  If not, then there is nothing to do.
 178          */
 179         end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
 180         last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
 181         if (last_fsb <= end_fsb)
 182                 return 0;
 183         map_len = last_fsb - end_fsb;
 184
 185         nimaps = 1;
 186         xfs_ilock(ip, XFS_ILOCK_SHARED);
 187         error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
 188         xfs_iunlock(ip, XFS_ILOCK_SHARED);
 189
 190         if (!error && (nimaps != 0) &&
 191             (imap.br_startblock != HOLESTARTBLOCK ||
 192              ip->i_delayed_blks)) {
 193                 /*
 194                  * Attach the dquots to the inode up front.
 195                  */
 196                 error = xfs_qm_dqattach(ip, 0);
 197                 if (error)
 198                         return error;
 199
 200                 /*
 201                  * There are blocks after the end of file.
 202                  * Free them up now by truncating the file to
 203                  * its current size.
 204                  */
 205                 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
 206
 207                 if (flags & XFS_FREE_EOF_TRYLOCK) {
 208                         if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
 209                                 xfs_trans_cancel(tp, 0);
 210                                 return 0;
 211                         }
 212                 } else {
 213                         xfs_ilock(ip, XFS_IOLOCK_EXCL);
 214                 }
 215
 216                 error = xfs_trans_reserve(tp, 0,
 217                                           XFS_ITRUNCATE_LOG_RES(mp),
 218                                           0, XFS_TRANS_PERM_LOG_RES,
 219                                           XFS_ITRUNCATE_LOG_COUNT);
 220                 if (error) {
 221                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
 222                         xfs_trans_cancel(tp, 0);
 223                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 224                         return error;
 225                 }
 226
 227                 xfs_ilock(ip, XFS_ILOCK_EXCL);
 228                 xfs_trans_ijoin(tp, ip, 0);
 229
 230                 /*
 231                  * Do not update the on-disk file size.  If we update the
 232                  * on-disk file size and then the system crashes before the
 233                  * contents of the file are flushed to disk then the files
 234                  * may be full of holes (ie NULL files bug).
 235                  */
 236                 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
 237                                               XFS_ISIZE(ip));
 238                 if (error) {
 239                         /*
 240                          * If we get an error at this point we simply don't
 241                          * bother truncating the file.
 242                          */
 243                         xfs_trans_cancel(tp,
 244                                          (XFS_TRANS_RELEASE_LOG_RES |
 245                                           XFS_TRANS_ABORT));
 246                 } else {
 247                         error = xfs_trans_commit(tp,
 248                                                 XFS_TRANS_RELEASE_LOG_RES);
 249                 }
 250                 xfs_iunlock(ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL);
 251         }
 252         return error;
 253 }
 254
 255 /*
 256  * Free a symlink that has blocks associated with it.
 257  */
 258 STATIC int
 259 xfs_inactive_symlink_rmt(
 260         xfs_inode_t     *ip,
 261         xfs_trans_t     **tpp)
 262 {
 263         xfs_buf_t       *bp;
 264         int             committed;
 265         int             done;
 266         int             error;
 267         xfs_fsblock_t   first_block;
 268         xfs_bmap_free_t free_list;
 269         int             i;
 270         xfs_mount_t     *mp;
 271         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
 272         int             nmaps;
 273         xfs_trans_t     *ntp;
 274         int             size;
 275         xfs_trans_t     *tp;
 276
 277         tp = *tpp;
 278         mp = ip->i_mount;
 279         ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
 280         /*
 281          * We're freeing a symlink that has some
 282          * blocks allocated to it.  Free the
 283          * blocks here.  We know that we've got
 284          * either 1 or 2 extents and that we can
 285          * free them all in one bunmapi call.
 286          */
 287         ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
 288         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
 289                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
 290                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
 291                 xfs_trans_cancel(tp, 0);
 292                 *tpp = NULL;
 293                 return error;
 294         }
 295         /*
 296          * Lock the inode, fix the size, and join it to the transaction.
 297          * Hold it so in the normal path, we still have it locked for
 298          * the second transaction.  In the error paths we need it
 299          * held so the cancel won't rele it, see below.
 300          */
 301         xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 302         size = (int)ip->i_d.di_size;
 303         ip->i_d.di_size = 0;
 304         xfs_trans_ijoin(tp, ip, 0);
 305         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 306         /*
 307          * Find the block(s) so we can inval and unmap them.
 308          */
 309         done = 0;
 310         xfs_bmap_init(&free_list, &first_block);
 311         nmaps = ARRAY_SIZE(mval);
 312         error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, size),
 313                                 mval, &nmaps, 0);
 314         if (error)
 315                 goto error0;
 316         /*
 317          * Invalidate the block(s).
 318          */
 319         for (i = 0; i < nmaps; i++) {
 320                 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
 321                         XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
 322                         XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
 323                 if (!bp) {
 324                         error = ENOMEM;
 325                         goto error1;
 326                 }
 327                 xfs_trans_binval(tp, bp);
 328         }
 329         /*
 330          * Unmap the dead block(s) to the free_list.
 331          */
 332         if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
 333                         &first_block, &free_list, &done)))
 334                 goto error1;
 335         ASSERT(done);
 336         /*
 337          * Commit the first transaction.  This logs the EFI and the inode.
 338          */
 339         if ((error = xfs_bmap_finish(&tp, &free_list, &committed)))
 340                 goto error1;
 341         /*
 342          * The transaction must have been committed, since there were
 343          * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
 344          * The new tp has the extent freeing and EFDs.
 345          */
 346         ASSERT(committed);
 347         /*
 348          * The first xact was committed, so add the inode to the new one.
 349          * Mark it dirty so it will be logged and moved forward in the log as
 350          * part of every commit.
 351          */
 352         xfs_trans_ijoin(tp, ip, 0);
 353         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 354         /*
 355          * Get a new, empty transaction to return to our caller.
 356          */
 357         ntp = xfs_trans_dup(tp);
 358         /*
 359          * Commit the transaction containing extent freeing and EFDs.
 360          * If we get an error on the commit here or on the reserve below,
 361          * we need to unlock the inode since the new transaction doesn't
 362          * have the inode attached.
 363          */
 364         error = xfs_trans_commit(tp, 0);
 365         tp = ntp;
 366         if (error) {
 367                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
 368                 goto error0;
 369         }
 370         /*
 371          * transaction commit worked ok so we can drop the extra ticket
 372          * reference that we gained in xfs_trans_dup()
 373          */
 374         xfs_log_ticket_put(tp->t_ticket);
 375
 376         /*
 377          * Remove the memory for extent descriptions (just bookkeeping).
 378          */
 379         if (ip->i_df.if_bytes)
 380                 xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
 381         ASSERT(ip->i_df.if_bytes == 0);
 382         /*
 383          * Put an itruncate log reservation in the new transaction
 384          * for our caller.
 385          */
 386         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
 387                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
 388                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
 389                 goto error0;
 390         }
 391         /*
 392          * Return with the inode locked but not joined to the transaction.
 393          */
 394         *tpp = tp;
 395         return 0;
 396
 397  error1:
 398         xfs_bmap_cancel(&free_list);
 399  error0:
 400         /*
 401          * Have to come here with the inode locked and either
 402          * (held and in the transaction) or (not in the transaction).
 403          * If the inode isn't held then cancel would iput it, but
 404          * that's wrong since this is inactive and the vnode ref
 405          * count is 0 already.
 406          * Cancel won't do anything to the inode if held, but it still
 407          * needs to be locked until the cancel is done, if it was
 408          * joined to the transaction.
 409          */
 410         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
 411         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 412         *tpp = NULL;
 413         return error;
 414
 415 }
 416
 417 STATIC int
 418 xfs_inactive_symlink_local(
 419         xfs_inode_t     *ip,
 420         xfs_trans_t     **tpp)
 421 {
 422         int             error;
 423
 424         ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
 425         /*
 426          * We're freeing a symlink which fit into
 427          * the inode.  Just free the memory used
 428          * to hold the old symlink.
 429          */
 430         error = xfs_trans_reserve(*tpp, 0,
 431                                   XFS_ITRUNCATE_LOG_RES(ip->i_mount),
 432                                   0, XFS_TRANS_PERM_LOG_RES,
 433                                   XFS_ITRUNCATE_LOG_COUNT);
 434
 435         if (error) {
 436                 xfs_trans_cancel(*tpp, 0);
 437                 *tpp = NULL;
 438                 return error;
 439         }
 440         xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 441
 442         /*
 443          * Zero length symlinks _can_ exist.
 444          */
 445         if (ip->i_df.if_bytes > 0) {
 446                 xfs_idata_realloc(ip,
 447                                   -(ip->i_df.if_bytes),
 448                                   XFS_DATA_FORK);
 449                 ASSERT(ip->i_df.if_bytes == 0);
 450         }
 451         return 0;
 452 }
 453
 454 STATIC int
 455 xfs_inactive_attrs(
 456         xfs_inode_t     *ip,
 457         xfs_trans_t     **tpp)
 458 {
 459         xfs_trans_t     *tp;
 460         int             error;
 461         xfs_mount_t     *mp;
 462
 463         ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
 464         tp = *tpp;
 465         mp = ip->i_mount;
 466         ASSERT(ip->i_d.di_forkoff != 0);
 467         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 468         xfs_iunlock(ip, XFS_ILOCK_EXCL);
 469         if (error)
 470                 goto error_unlock;
 471
 472         error = xfs_attr_inactive(ip);
 473         if (error)
 474                 goto error_unlock;
 475
 476         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
 477         error = xfs_trans_reserve(tp, 0,
 478                                   XFS_IFREE_LOG_RES(mp),
 479                                   0, XFS_TRANS_PERM_LOG_RES,
 480                                   XFS_INACTIVE_LOG_COUNT);
 481         if (error)
 482                 goto error_cancel;
 483
 484         xfs_ilock(ip, XFS_ILOCK_EXCL);
 485         xfs_trans_ijoin(tp, ip, 0);
 486         xfs_idestroy_fork(ip, XFS_ATTR_FORK);
 487
 488         ASSERT(ip->i_d.di_anextents == 0);
 489
 490         *tpp = tp;
 491         return 0;
 492
 493 error_cancel:
 494         ASSERT(XFS_FORCED_SHUTDOWN(mp));
 495         xfs_trans_cancel(tp, 0);
 496 error_unlock:
 497         *tpp = NULL;
 498         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 499         return error;
 500 }
 501
 502 int
 503 xfs_release(
 504         xfs_inode_t     *ip)
 505 {
 506         xfs_mount_t     *mp = ip->i_mount;
 507         int             error;
 508
 509         if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0))
 510                 return 0;
 511
 512         /* If this is a read-only mount, don't do this (would generate I/O) */
 513         if (mp->m_flags & XFS_MOUNT_RDONLY)
 514                 return 0;
 515
 516         if (!XFS_FORCED_SHUTDOWN(mp)) {
 517                 int truncated;
 518
 519                 /*
 520                  * If we are using filestreams, and we have an unlinked
 521                  * file that we are processing the last close on, then nothing
 522                  * will be able to reopen and write to this file. Purge this
 523                  * inode from the filestreams cache so that it doesn't delay
 524                  * teardown of the inode.
 525                  */
 526                 if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
 527                         xfs_filestream_deassociate(ip);
 528
 529                 /*
 530                  * If we previously truncated this file and removed old data
 531                  * in the process, we want to initiate "early" writeout on
 532                  * the last close.  This is an attempt to combat the notorious
 533                  * NULL files problem which is particularly noticeable from a
 534                  * truncate down, buffered (re-)write (delalloc), followed by
 535                  * a crash.  What we are effectively doing here is
 536                  * significantly reducing the time window where we'd otherwise
 537                  * be exposed to that problem.
 538                  */
 539                 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
 540                 if (truncated) {
 541                         xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
 542                         if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0)
 543                                 xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
 544                 }
 545         }
 546
 547         if (ip->i_d.di_nlink == 0)
 548                 return 0;
 549
 550         if ((S_ISREG(ip->i_d.di_mode) &&
 551              (VFS_I(ip)->i_size > 0 ||
 552               (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
 553              (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
 554             (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
 555
 556                 /*
 557                  * If we can't get the iolock just skip truncating the blocks
 558                  * past EOF because we could deadlock with the mmap_sem
 559                  * otherwise.  We'll get another chance to drop them once the
 560                  * last reference to the inode is dropped, so we'll never leak
 561                  * blocks permanently.
 562                  *
 563                  * Further, check if the inode is being opened, written and
 564                  * closed frequently and we have delayed allocation blocks
 565                  * outstanding (e.g. streaming writes from the NFS server),
 566                  * truncating the blocks past EOF will cause fragmentation to
 567                  * occur.
 568                  *
 569                  * In this case don't do the truncation, either, but we have to
 570                  * be careful how we detect this case. Blocks beyond EOF show
 571                  * up as i_delayed_blks even when the inode is clean, so we
 572                  * need to truncate them away first before checking for a dirty
 573                  * release. Hence on the first dirty close we will still remove
 574                  * the speculative allocation, but after that we will leave it
 575                  * in place.
 576                  */
 577                 if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
 578                         return 0;
 579
 580                 error = xfs_free_eofblocks(mp, ip,
 581                                            XFS_FREE_EOF_TRYLOCK);
 582                 if (error)
 583                         return error;
 584
 585                 /* delalloc blocks after truncation means it really is dirty */
 586                 if (ip->i_delayed_blks)
 587                         xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
 588         }
 589         return 0;
 590 }
 591
 592 /*
 593  * xfs_inactive
 594  *
 595  * This is called when the vnode reference count for the vnode
 596  * goes to zero.  If the file has been unlinked, then it must
 597  * now be truncated.  Also, we clear all of the read-ahead state
 598  * kept for the inode here since the file is now closed.
 599  */
 600 int
 601 xfs_inactive(
 602         xfs_inode_t     *ip)
 603 {
 604         xfs_bmap_free_t free_list;
 605         xfs_fsblock_t   first_block;
 606         int             committed;
 607         xfs_trans_t     *tp;
 608         xfs_mount_t     *mp;
 609         int             error;
 610         int             truncate;
 611
 612         /*
 613          * If the inode is already free, then there can be nothing
 614          * to clean up here.
 615          */
 616         if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) {
 617                 ASSERT(ip->i_df.if_real_bytes == 0);
 618                 ASSERT(ip->i_df.if_broot_bytes == 0);
 619                 return VN_INACTIVE_CACHE;
 620         }
 621
 622         /*
 623          * Only do a truncate if it's a regular file with
 624          * some actual space in it.  It's OK to look at the
 625          * inode's fields without the lock because we're the
 626          * only one with a reference to the inode.
 627          */
 628         truncate = ((ip->i_d.di_nlink == 0) &&
 629             ((ip->i_d.di_size != 0) || XFS_ISIZE(ip) != 0 ||
 630              (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
 631             S_ISREG(ip->i_d.di_mode));
 632
 633         mp = ip->i_mount;
 634
 635         error = 0;
 636
 637         /* If this is a read-only mount, don't do this (would generate I/O) */
 638         if (mp->m_flags & XFS_MOUNT_RDONLY)
 639                 goto out;
 640
 641         if (ip->i_d.di_nlink != 0) {
 642                 if ((S_ISREG(ip->i_d.di_mode) &&
 643                     (VFS_I(ip)->i_size > 0 ||
 644                      (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
 645                     (ip->i_df.if_flags & XFS_IFEXTENTS) &&
 646                     (!(ip->i_d.di_flags &
 647                                 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
 648                      ip->i_delayed_blks != 0))) {
 649                         error = xfs_free_eofblocks(mp, ip, 0);
 650                         if (error)
 651                                 return VN_INACTIVE_CACHE;
 652                 }
 653                 goto out;
 654         }
 655
 656         ASSERT(ip->i_d.di_nlink == 0);
 657
 658         error = xfs_qm_dqattach(ip, 0);
 659         if (error)
 660                 return VN_INACTIVE_CACHE;
 661
 662         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
 663         if (truncate) {
 664                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
 665
 666                 error = xfs_trans_reserve(tp, 0,
 667                                           XFS_ITRUNCATE_LOG_RES(mp),
 668                                           0, XFS_TRANS_PERM_LOG_RES,
 669                                           XFS_ITRUNCATE_LOG_COUNT);
 670                 if (error) {
 671                         /* Don't call itruncate_cleanup */
 672                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
 673                         xfs_trans_cancel(tp, 0);
 674                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 675                         return VN_INACTIVE_CACHE;
 676                 }
 677
 678                 xfs_ilock(ip, XFS_ILOCK_EXCL);
 679                 xfs_trans_ijoin(tp, ip, 0);
 680
 681                 ip->i_d.di_size = 0;
 682                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 683
 684                 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
 685                 if (error) {
 686                         xfs_trans_cancel(tp,
 687                                 XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
 688                         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 689                         return VN_INACTIVE_CACHE;
 690                 }
 691
 692                 ASSERT(ip->i_d.di_nextents == 0);
 693         } else if (S_ISLNK(ip->i_d.di_mode)) {
 694
 695                 /*
 696                  * If we get an error while cleaning up a
 697                  * symlink we bail out.
 698                  */
 699                 error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
 700                         xfs_inactive_symlink_rmt(ip, &tp) :
 701                         xfs_inactive_symlink_local(ip, &tp);
 702
 703                 if (error) {
 704                         ASSERT(tp == NULL);
 705                         return VN_INACTIVE_CACHE;
 706                 }
 707
 708                 xfs_trans_ijoin(tp, ip, 0);
 709         } else {
 710                 error = xfs_trans_reserve(tp, 0,
 711                                           XFS_IFREE_LOG_RES(mp),
 712                                           0, XFS_TRANS_PERM_LOG_RES,
 713                                           XFS_INACTIVE_LOG_COUNT);
 714                 if (error) {
 715                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
 716                         xfs_trans_cancel(tp, 0);
 717                         return VN_INACTIVE_CACHE;
 718                 }
 719
 720                 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 721                 xfs_trans_ijoin(tp, ip, 0);
 722         }
 723
 724         /*
 725          * If there are attributes associated with the file
 726          * then blow them away now.  The code calls a routine
 727          * that recursively deconstructs the attribute fork.
 728          * We need to just commit the current transaction
 729          * because we can't use it for xfs_attr_inactive().
 730          */
 731         if (ip->i_d.di_anextents > 0) {
 732                 error = xfs_inactive_attrs(ip, &tp);
 733                 /*
 734                  * If we got an error, the transaction is already
 735                  * cancelled, and the inode is unlocked. Just get out.
 736                  */
 737                  if (error)
 738                          return VN_INACTIVE_CACHE;
 739         } else if (ip->i_afp) {
 740                 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
 741         }
 742
 743         /*
 744          * Free the inode.
 745          */
 746         xfs_bmap_init(&free_list, &first_block);
 747         error = xfs_ifree(tp, ip, &free_list);
 748         if (error) {
 749                 /*
 750                  * If we fail to free the inode, shut down.  The cancel
 751                  * might do that, we need to make sure.  Otherwise the
 752                  * inode might be lost for a long time or forever.
 753                  */
 754                 if (!XFS_FORCED_SHUTDOWN(mp)) {
 755                         xfs_notice(mp, "%s: xfs_ifree returned error %d",
 756                                 __func__, error);
 757                         xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
 758                 }
 759                 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
 760         } else {
 761                 /*
 762                  * Credit the quota account(s). The inode is gone.
 763                  */
 764                 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
 765
 766                 /*
 767                  * Just ignore errors at this point.  There is nothing we can
 768                  * do except to try to keep going. Make sure it's not a silent
 769                  * error.
 770                  */
 771                 error = xfs_bmap_finish(&tp,  &free_list, &committed);
 772                 if (error)
 773                         xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
 774                                 __func__, error);
 775                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 776                 if (error)
 777                         xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
 778                                 __func__, error);
 779         }
 780
 781         /*
 782          * Release the dquots held by inode, if any.
 783          */
 784         xfs_qm_dqdetach(ip);
 785         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 786
 787  out:
 788         return VN_INACTIVE_CACHE;
 789 }
 790
 791 /*
  792  * Looks up an inode from "name". If ci_name is not NULL, then a CI match
  793  * is allowed, otherwise it has to be an exact match. If a CI match is found,
  794  * ci_name->name will point to the actual name (caller must free) or
  795  * will be set to NULL if an exact match is found.
 796  */
 797 int
 798 xfs_lookup(
 799         xfs_inode_t             *dp,
 800         struct xfs_name         *name,
 801         xfs_inode_t             **ipp,
 802         struct xfs_name         *ci_name)
 803 {
 804         xfs_ino_t               inum;
 805         int                     error;
 806         uint                    lock_mode;
 807
 808         trace_xfs_lookup(dp, name);
 809
 810         if (XFS_FORCED_SHUTDOWN(dp->i_mount))
 811                 return XFS_ERROR(EIO);
 812
 813         lock_mode = xfs_ilock_map_shared(dp);
 814         error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
 815         xfs_iunlock_map_shared(dp, lock_mode);
 816
 817         if (error)
 818                 goto out;
 819
 820         error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
 821         if (error)
 822                 goto out_free_name;
 823
 824         return 0;
 825
 826 out_free_name:
 827         if (ci_name)
 828                 kmem_free(ci_name->name);
 829 out:
 830         *ipp = NULL;
 831         return error;
 832 }
 833
 834 int
 835 xfs_create(
 836         xfs_inode_t             *dp,
 837         struct xfs_name         *name,
 838         umode_t                 mode,
 839         xfs_dev_t               rdev,
 840         xfs_inode_t             **ipp)
 841 {
 842         int                     is_dir = S_ISDIR(mode);
 843         struct xfs_mount        *mp = dp->i_mount;
 844         struct xfs_inode        *ip = NULL;
 845         struct xfs_trans        *tp = NULL;
 846         int                     error;
 847         xfs_bmap_free_t         free_list;
 848         xfs_fsblock_t           first_block;
 849         boolean_t               unlock_dp_on_error = B_FALSE;
 850         uint                    cancel_flags;
 851         int                     committed;
 852         prid_t                  prid;
 853         struct xfs_dquot        *udqp = NULL;
 854         struct xfs_dquot        *gdqp = NULL;
 855         uint                    resblks;
 856         uint                    log_res;
 857         uint                    log_count;
 858
 859         trace_xfs_create(dp, name);
 860
 861         if (XFS_FORCED_SHUTDOWN(mp))
 862                 return XFS_ERROR(EIO);
 863
 864         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
 865                 prid = xfs_get_projid(dp);
 866         else
 867                 prid = XFS_PROJID_DEFAULT;
 868
 869         /*
 870          * Make sure that we have allocated dquot(s) on disk.
 871          */
 872         error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
 873                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
 874         if (error)
 875                 return error;
 876
 877         if (is_dir) {
 878                 rdev = 0;
 879                 resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
 880                 log_res = XFS_MKDIR_LOG_RES(mp);
 881                 log_count = XFS_MKDIR_LOG_COUNT;
 882                 tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
 883         } else {
 884                 resblks = XFS_CREATE_SPACE_RES(mp, name->len);
 885                 log_res = XFS_CREATE_LOG_RES(mp);
 886                 log_count = XFS_CREATE_LOG_COUNT;
 887                 tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
 888         }
 889
 890         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
 891
 892         /*
 893          * Initially assume that the file does not exist and
 894          * reserve the resources for that case.  If that is not
 895          * the case we'll drop the one we have and get a more
 896          * appropriate transaction later.
 897          */
 898         error = xfs_trans_reserve(tp, resblks, log_res, 0,
 899                         XFS_TRANS_PERM_LOG_RES, log_count);
 900         if (error == ENOSPC) {
 901                 /* flush outstanding delalloc blocks and retry */
 902                 xfs_flush_inodes(dp);
 903                 error = xfs_trans_reserve(tp, resblks, log_res, 0,
 904                                 XFS_TRANS_PERM_LOG_RES, log_count);
 905         }
 906         if (error == ENOSPC) {
 907                 /* No space at all so try a "no-allocation" reservation */
 908                 resblks = 0;
 909                 error = xfs_trans_reserve(tp, 0, log_res, 0,
 910                                 XFS_TRANS_PERM_LOG_RES, log_count);
 911         }
 912         if (error) {
 913                 cancel_flags = 0;
 914                 goto out_trans_cancel;
 915         }
 916
 917         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
 918         unlock_dp_on_error = B_TRUE;
 919
 920         /*
 921          * Check for directory link count overflow.
 922          */
 923         if (is_dir && dp->i_d.di_nlink >= XFS_MAXLINK) {
 924                 error = XFS_ERROR(EMLINK);
 925                 goto out_trans_cancel;
 926         }
 927
 928         xfs_bmap_init(&free_list, &first_block);
 929
 930         /*
 931          * Reserve disk quota and the inode.
 932          */
 933         error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
 934         if (error)
 935                 goto out_trans_cancel;
 936
 937         error = xfs_dir_canenter(tp, dp, name, resblks);
 938         if (error)
 939                 goto out_trans_cancel;
 940
 941         /*
 942          * A newly created regular or special file just has one directory
 943          * entry pointing to them, but a directory also the "." entry
 944          * pointing to itself.
 945          */
 946         error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
 947                                prid, resblks > 0, &ip, &committed);
 948         if (error) {
 949                 if (error == ENOSPC)
 950                         goto out_trans_cancel;
 951                 goto out_trans_abort;
 952         }
 953
 954         /*
 955          * Now we join the directory inode to the transaction.  We do not do it
 956          * earlier because xfs_dir_ialloc might commit the previous transaction
 957          * (and release all the locks).  An error from here on will result in
 958          * the transaction cancel unlocking dp so don't do it explicitly in the
 959          * error path.
 960          */
 961         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
 962         unlock_dp_on_error = B_FALSE;
 963
 964         error = xfs_dir_createname(tp, dp, name, ip->i_ino,
 965                                         &first_block, &free_list, resblks ?
 966                                         resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
 967         if (error) {
 968                 ASSERT(error != ENOSPC);
 969                 goto out_trans_abort;
 970         }
 971         xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 972         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 973
 974         if (is_dir) {
 975                 error = xfs_dir_init(tp, ip, dp);
 976                 if (error)
 977                         goto out_bmap_cancel;
 978
 979                 error = xfs_bumplink(tp, dp);
 980                 if (error)
 981                         goto out_bmap_cancel;
 982         }
 983
 984         /*
 985          * If this is a synchronous mount, make sure that the
 986          * create transaction goes to disk before returning to
 987          * the user.
 988          */
 989         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
 990                 xfs_trans_set_sync(tp);
 991
 992         /*
 993          * Attach the dquot(s) to the inodes and modify them incore.
 994          * These ids of the inode couldn't have changed since the new
 995          * inode has been locked ever since it was created.
 996          */
 997         xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
 998
 999         error = xfs_bmap_finish(&tp, &free_list, &committed);
1000         if (error)
1001                 goto out_bmap_cancel;
1002
1003         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1004         if (error)
1005                 goto out_release_inode;
1006
1007         xfs_qm_dqrele(udqp);
1008         xfs_qm_dqrele(gdqp);
1009
1010         *ipp = ip;
1011         return 0;
1012
1013  out_bmap_cancel:
1014         xfs_bmap_cancel(&free_list);
1015  out_trans_abort:
1016         cancel_flags |= XFS_TRANS_ABORT;
1017  out_trans_cancel:
1018         xfs_trans_cancel(tp, cancel_flags);
1019  out_release_inode:
1020         /*
1021          * Wait until after the current transaction is aborted to
1022          * release the inode.  This prevents recursive transactions
1023          * and deadlocks from xfs_inactive.
1024          */
1025         if (ip)
1026                 IRELE(ip);
1027
1028         xfs_qm_dqrele(udqp);
1029         xfs_qm_dqrele(gdqp);
1030
1031         if (unlock_dp_on_error)
1032                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1033         return error;
1034 }
1035
#ifdef DEBUG
/*
 * Debug-only statistics updated by xfs_lock_inodes().
 */
int	xfs_locked_n;		/* calls that took every lock without a retry */
int	xfs_small_retries;	/* calls that needed fewer than 5 retries */
int	xfs_middle_retries;	/* calls that needed 5 to 99 retries */
int	xfs_lots_retries;	/* calls that needed 100 or more retries */
int	xfs_lock_delays;	/* times a retry backed off with delay(1) */
#endif /* DEBUG */
1043
1044 /*
1045  * Bump the subclass so xfs_lock_inodes() acquires each lock with
1046  * a different value
1047  */
1048 static inline int
1049 xfs_lock_inumorder(int lock_mode, int subclass)
1050 {
1051         if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
1052                 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
1053         if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
1054                 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
1055
1056         return lock_mode;
1057 }
1058
1059 /*
1060  * The following routine will lock n inodes in exclusive mode.
1061  * We assume the caller calls us with the inodes in i_ino order.
1062  *
1063  * We need to detect deadlock where an inode that we lock
1064  * is in the AIL and we start waiting for another inode that is locked
1065  * by a thread in a long running transaction (such as truncate). This can
1066  * result in deadlock since the long running trans might need to wait
1067  * for the inode we just locked in order to push the tail and free space
1068  * in the log.
1069  */
1070 void
1071 xfs_lock_inodes(
1072         xfs_inode_t     **ips,
1073         int             inodes,
1074         uint            lock_mode)
1075 {
1076         int             attempts = 0, i, j, try_lock;
1077         xfs_log_item_t  *lp;
1078
1079         ASSERT(ips && (inodes >= 2)); /* we need at least two */
1080
1081         try_lock = 0;
1082         i = 0;
1083
1084 again:
1085         for (; i < inodes; i++) {
1086                 ASSERT(ips[i]);
1087
1088                 if (i && (ips[i] == ips[i-1]))  /* Already locked */
1089                         continue;
1090
1091                 /*
1092                  * If try_lock is not set yet, make sure all locked inodes
1093                  * are not in the AIL.
1094                  * If any are, set try_lock to be used later.
1095                  */
1096
1097                 if (!try_lock) {
1098                         for (j = (i - 1); j >= 0 && !try_lock; j--) {
1099                                 lp = (xfs_log_item_t *)ips[j]->i_itemp;
1100                                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
1101                                         try_lock++;
1102                                 }
1103                         }
1104                 }
1105
1106                 /*
1107                  * If any of the previous locks we have locked is in the AIL,
1108                  * we must TRY to get the second and subsequent locks. If
1109                  * we can't get any, we must release all we have
1110                  * and try again.
1111                  */
1112
1113                 if (try_lock) {
1114                         /* try_lock must be 0 if i is 0. */
1115                         /*
1116                          * try_lock means we have an inode locked
1117                          * that is in the AIL.
1118                          */
1119                         ASSERT(i != 0);
1120                         if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
1121                                 attempts++;
1122
1123                                 /*
1124                                  * Unlock all previous guys and try again.
1125                                  * xfs_iunlock will try to push the tail
1126                                  * if the inode is in the AIL.
1127                                  */
1128
1129                                 for(j = i - 1; j >= 0; j--) {
1130
1131                                         /*
1132                                          * Check to see if we've already
1133                                          * unlocked this one.
1134                                          * Not the first one going back,
1135                                          * and the inode ptr is the same.
1136                                          */
1137                                         if ((j != (i - 1)) && ips[j] ==
1138                                                                 ips[j+1])
1139                                                 continue;
1140
1141                                         xfs_iunlock(ips[j], lock_mode);
1142                                 }
1143
1144                                 if ((attempts % 5) == 0) {
1145                                         delay(1); /* Don't just spin the CPU */
1146 #ifdef DEBUG
1147                                         xfs_lock_delays++;
1148 #endif
1149                                 }
1150                                 i = 0;
1151                                 try_lock = 0;
1152                                 goto again;
1153                         }
1154                 } else {
1155                         xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
1156                 }
1157         }
1158
1159 #ifdef DEBUG
1160         if (attempts) {
1161                 if (attempts < 5) xfs_small_retries++;
1162                 else if (attempts < 100) xfs_middle_retries++;
1163                 else xfs_lots_retries++;
1164         } else {
1165                 xfs_locked_n++;
1166         }
1167 #endif
1168 }
1169
1170 /*
1171  * xfs_lock_two_inodes() can only be used to lock one type of lock
1172  * at a time - the iolock or the ilock, but not both at once. If
1173  * we lock both at once, lockdep will report false positives saying
1174  * we have violated locking orders.
1175  */
1176 void
1177 xfs_lock_two_inodes(
1178         xfs_inode_t             *ip0,
1179         xfs_inode_t             *ip1,
1180         uint                    lock_mode)
1181 {
1182         xfs_inode_t             *temp;
1183         int                     attempts = 0;
1184         xfs_log_item_t          *lp;
1185
1186         if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
1187                 ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
1188         ASSERT(ip0->i_ino != ip1->i_ino);
1189
1190         if (ip0->i_ino > ip1->i_ino) {
1191                 temp = ip0;
1192                 ip0 = ip1;
1193                 ip1 = temp;
1194         }
1195
1196  again:
1197         xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));
1198
1199         /*
1200          * If the first lock we have locked is in the AIL, we must TRY to get
1201          * the second lock. If we can't get it, we must release the first one
1202          * and try again.
1203          */
1204         lp = (xfs_log_item_t *)ip0->i_itemp;
1205         if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
1206                 if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
1207                         xfs_iunlock(ip0, lock_mode);
1208                         if ((++attempts % 5) == 0)
1209                                 delay(1); /* Don't just spin the CPU */
1210                         goto again;
1211                 }
1212         } else {
1213                 xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
1214         }
1215 }
1216
1217 int
1218 xfs_remove(
1219         xfs_inode_t             *dp,
1220         struct xfs_name         *name,
1221         xfs_inode_t             *ip)
1222 {
1223         xfs_mount_t             *mp = dp->i_mount;
1224         xfs_trans_t             *tp = NULL;
1225         int                     is_dir = S_ISDIR(ip->i_d.di_mode);
1226         int                     error = 0;
1227         xfs_bmap_free_t         free_list;
1228         xfs_fsblock_t           first_block;
1229         int                     cancel_flags;
1230         int                     committed;
1231         int                     link_zero;
1232         uint                    resblks;
1233         uint                    log_count;
1234
1235         trace_xfs_remove(dp, name);
1236
1237         if (XFS_FORCED_SHUTDOWN(mp))
1238                 return XFS_ERROR(EIO);
1239
1240         error = xfs_qm_dqattach(dp, 0);
1241         if (error)
1242                 goto std_return;
1243
1244         error = xfs_qm_dqattach(ip, 0);
1245         if (error)
1246                 goto std_return;
1247
1248         if (is_dir) {
1249                 tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
1250                 log_count = XFS_DEFAULT_LOG_COUNT;
1251         } else {
1252                 tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
1253                 log_count = XFS_REMOVE_LOG_COUNT;
1254         }
1255         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1256
1257         /*
1258          * We try to get the real space reservation first,
1259          * allowing for directory btree deletion(s) implying
1260          * possible bmap insert(s).  If we can't get the space
1261          * reservation then we use 0 instead, and avoid the bmap
1262          * btree insert(s) in the directory code by, if the bmap
1263          * insert tries to happen, instead trimming the LAST
1264          * block from the directory.
1265          */
1266         resblks = XFS_REMOVE_SPACE_RES(mp);
1267         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
1268                                   XFS_TRANS_PERM_LOG_RES, log_count);
1269         if (error == ENOSPC) {
1270                 resblks = 0;
1271                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
1272                                           XFS_TRANS_PERM_LOG_RES, log_count);
1273         }
1274         if (error) {
1275                 ASSERT(error != ENOSPC);
1276                 cancel_flags = 0;
1277                 goto out_trans_cancel;
1278         }
1279
1280         xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
1281
1282         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1283         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1284
1285         /*
1286          * If we're removing a directory perform some additional validation.
1287          */
1288         if (is_dir) {
1289                 ASSERT(ip->i_d.di_nlink >= 2);
1290                 if (ip->i_d.di_nlink != 2) {
1291                         error = XFS_ERROR(ENOTEMPTY);
1292                         goto out_trans_cancel;
1293                 }
1294                 if (!xfs_dir_isempty(ip)) {
1295                         error = XFS_ERROR(ENOTEMPTY);
1296                         goto out_trans_cancel;
1297                 }
1298         }
1299
1300         xfs_bmap_init(&free_list, &first_block);
1301         error = xfs_dir_removename(tp, dp, name, ip->i_ino,
1302                                         &first_block, &free_list, resblks);
1303         if (error) {
1304                 ASSERT(error != ENOENT);
1305                 goto out_bmap_cancel;
1306         }
1307         xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1308
1309         if (is_dir) {
1310                 /*
1311                  * Drop the link from ip's "..".
1312                  */
1313                 error = xfs_droplink(tp, dp);
1314                 if (error)
1315                         goto out_bmap_cancel;
1316
1317                 /*
1318                  * Drop the "." link from ip to self.
1319                  */
1320                 error = xfs_droplink(tp, ip);
1321                 if (error)
1322                         goto out_bmap_cancel;
1323         } else {
1324                 /*
1325                  * When removing a non-directory we need to log the parent
1326                  * inode here.  For a directory this is done implicitly
1327                  * by the xfs_droplink call for the ".." entry.
1328                  */
1329                 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1330         }
1331
1332         /*
1333          * Drop the link from dp to ip.
1334          */
1335         error = xfs_droplink(tp, ip);
1336         if (error)
1337                 goto out_bmap_cancel;
1338
1339         /*
1340          * Determine if this is the last link while
1341          * we are in the transaction.
1342          */
1343         link_zero = (ip->i_d.di_nlink == 0);
1344
1345         /*
1346          * If this is a synchronous mount, make sure that the
1347          * remove transaction goes to disk before returning to
1348          * the user.
1349          */
1350         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
1351                 xfs_trans_set_sync(tp);
1352
1353         error = xfs_bmap_finish(&tp, &free_list, &committed);
1354         if (error)
1355                 goto out_bmap_cancel;
1356
1357         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1358         if (error)
1359                 goto std_return;
1360
1361         /*
1362          * If we are using filestreams, kill the stream association.
1363          * If the file is still open it may get a new one but that
1364          * will get killed on last close in xfs_close() so we don't
1365          * have to worry about that.
1366          */
1367         if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
1368                 xfs_filestream_deassociate(ip);
1369
1370         return 0;
1371
1372  out_bmap_cancel:
1373         xfs_bmap_cancel(&free_list);
1374         cancel_flags |= XFS_TRANS_ABORT;
1375  out_trans_cancel:
1376         xfs_trans_cancel(tp, cancel_flags);
1377  std_return:
1378         return error;
1379 }
1380
1381 int
1382 xfs_link(
1383         xfs_inode_t             *tdp,
1384         xfs_inode_t             *sip,
1385         struct xfs_name         *target_name)
1386 {
1387         xfs_mount_t             *mp = tdp->i_mount;
1388         xfs_trans_t             *tp;
1389         int                     error;
1390         xfs_bmap_free_t         free_list;
1391         xfs_fsblock_t           first_block;
1392         int                     cancel_flags;
1393         int                     committed;
1394         int                     resblks;
1395
1396         trace_xfs_link(tdp, target_name);
1397
1398         ASSERT(!S_ISDIR(sip->i_d.di_mode));
1399
1400         if (XFS_FORCED_SHUTDOWN(mp))
1401                 return XFS_ERROR(EIO);
1402
1403         error = xfs_qm_dqattach(sip, 0);
1404         if (error)
1405                 goto std_return;
1406
1407         error = xfs_qm_dqattach(tdp, 0);
1408         if (error)
1409                 goto std_return;
1410
1411         tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
1412         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1413         resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
1414         error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
1415                         XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
1416         if (error == ENOSPC) {
1417                 resblks = 0;
1418                 error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
1419                                 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
1420         }
1421         if (error) {
1422                 cancel_flags = 0;
1423                 goto error_return;
1424         }
1425
1426         xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
1427
1428         xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
1429         xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
1430
1431         /*
1432          * If the source has too many links, we can't make any more to it.
1433          */
1434         if (sip->i_d.di_nlink >= XFS_MAXLINK) {
1435                 error = XFS_ERROR(EMLINK);
1436                 goto error_return;
1437         }
1438
1439         /*
1440          * If we are using project inheritance, we only allow hard link
1441          * creation in our tree when the project IDs are the same; else
1442          * the tree quota mechanism could be circumvented.
1443          */
1444         if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
1445                      (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
1446                 error = XFS_ERROR(EXDEV);
1447                 goto error_return;
1448         }
1449
1450         error = xfs_dir_canenter(tp, tdp, target_name, resblks);
1451         if (error)
1452                 goto error_return;
1453
1454         xfs_bmap_init(&free_list, &first_block);
1455
1456         error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
1457                                         &first_block, &free_list, resblks);
1458         if (error)
1459                 goto abort_return;
1460         xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1461         xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
1462
1463         error = xfs_bumplink(tp, sip);
1464         if (error)
1465                 goto abort_return;
1466
1467         /*
1468          * If this is a synchronous mount, make sure that the
1469          * link transaction goes to disk before returning to
1470          * the user.
1471          */
1472         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
1473                 xfs_trans_set_sync(tp);
1474         }
1475
1476         error = xfs_bmap_finish (&tp, &free_list, &committed);
1477         if (error) {
1478                 xfs_bmap_cancel(&free_list);
1479                 goto abort_return;
1480         }
1481
1482         return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1483
1484  abort_return:
1485         cancel_flags |= XFS_TRANS_ABORT;
1486  error_return:
1487         xfs_trans_cancel(tp, cancel_flags);
1488  std_return:
1489         return error;
1490 }
1491
1492 int
1493 xfs_symlink(
1494         xfs_inode_t             *dp,
1495         struct xfs_name         *link_name,
1496         const char              *target_path,
1497         umode_t                 mode,
1498         xfs_inode_t             **ipp)
1499 {
1500         xfs_mount_t             *mp = dp->i_mount;
1501         xfs_trans_t             *tp;
1502         xfs_inode_t             *ip;
1503         int                     error;
1504         int                     pathlen;
1505         xfs_bmap_free_t         free_list;
1506         xfs_fsblock_t           first_block;
1507         boolean_t               unlock_dp_on_error = B_FALSE;
1508         uint                    cancel_flags;
1509         int                     committed;
1510         xfs_fileoff_t           first_fsb;
1511         xfs_filblks_t           fs_blocks;
1512         int                     nmaps;
1513         xfs_bmbt_irec_t         mval[SYMLINK_MAPS];
1514         xfs_daddr_t             d;
1515         const char              *cur_chunk;
1516         int                     byte_cnt;
1517         int                     n;
1518         xfs_buf_t               *bp;
1519         prid_t                  prid;
1520         struct xfs_dquot        *udqp, *gdqp;
1521         uint                    resblks;
1522
1523         *ipp = NULL;
1524         error = 0;
1525         ip = NULL;
1526         tp = NULL;
1527
1528         trace_xfs_symlink(dp, link_name);
1529
1530         if (XFS_FORCED_SHUTDOWN(mp))
1531                 return XFS_ERROR(EIO);
1532
1533         /*
1534          * Check component lengths of the target path name.
1535          */
1536         pathlen = strlen(target_path);
1537         if (pathlen >= MAXPATHLEN)      /* total string too long */
1538                 return XFS_ERROR(ENAMETOOLONG);
1539
1540         udqp = gdqp = NULL;
1541         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
1542                 prid = xfs_get_projid(dp);
1543         else
1544                 prid = XFS_PROJID_DEFAULT;
1545
1546         /*
1547          * Make sure that we have allocated dquot(s) on disk.
1548          */
1549         error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
1550                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
1551         if (error)
1552                 goto std_return;
1553
1554         tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
1555         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1556         /*
1557          * The symlink will fit into the inode data fork?
1558          * There can't be any attributes so we get the whole variable part.
1559          */
1560         if (pathlen <= XFS_LITINO(mp))
1561                 fs_blocks = 0;
1562         else
1563                 fs_blocks = XFS_B_TO_FSB(mp, pathlen);
1564         resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
1565         error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
1566                         XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
1567         if (error == ENOSPC && fs_blocks == 0) {
1568                 resblks = 0;
1569                 error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
1570                                 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
1571         }
1572         if (error) {
1573                 cancel_flags = 0;
1574                 goto error_return;
1575         }
1576
1577         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1578         unlock_dp_on_error = B_TRUE;
1579
1580         /*
1581          * Check whether the directory allows new symlinks or not.
1582          */
1583         if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
1584                 error = XFS_ERROR(EPERM);
1585                 goto error_return;
1586         }
1587
1588         /*
1589          * Reserve disk quota : blocks and inode.
1590          */
1591         error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
1592         if (error)
1593                 goto error_return;
1594
1595         /*
1596          * Check for ability to enter directory entry, if no space reserved.
1597          */
1598         error = xfs_dir_canenter(tp, dp, link_name, resblks);
1599         if (error)
1600                 goto error_return;
1601         /*
1602          * Initialize the bmap freelist prior to calling either
1603          * bmapi or the directory create code.
1604          */
1605         xfs_bmap_init(&free_list, &first_block);
1606
1607         /*
1608          * Allocate an inode for the symlink.
1609          */
1610         error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
1611                                prid, resblks > 0, &ip, NULL);
1612         if (error) {
1613                 if (error == ENOSPC)
1614                         goto error_return;
1615                 goto error1;
1616         }
1617
1618         /*
1619          * An error after we've joined dp to the transaction will result in the
1620          * transaction cancel unlocking dp so don't do it explicitly in the
1621          * error path.
1622          */
1623         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1624         unlock_dp_on_error = B_FALSE;
1625
1626         /*
1627          * Also attach the dquot(s) to it, if applicable.
1628          */
1629         xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
1630
1631         if (resblks)
1632                 resblks -= XFS_IALLOC_SPACE_RES(mp);
1633         /*
1634          * If the symlink will fit into the inode, write it inline.
1635          */
1636         if (pathlen <= XFS_IFORK_DSIZE(ip)) {
1637                 xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
1638                 memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
1639                 ip->i_d.di_size = pathlen;
1640
1641                 /*
1642                  * The inode was initially created in extent format.
1643                  */
1644                 ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
1645                 ip->i_df.if_flags |= XFS_IFINLINE;
1646
1647                 ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
1648                 xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
1649
1650         } else {
1651                 first_fsb = 0;
1652                 nmaps = SYMLINK_MAPS;
1653
1654                 error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks,
1655                                   XFS_BMAPI_METADATA, &first_block, resblks,
1656                                   mval, &nmaps, &free_list);
1657                 if (error)
1658                         goto error2;
1659
1660                 if (resblks)
1661                         resblks -= fs_blocks;
1662                 ip->i_d.di_size = pathlen;
1663                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1664
1665                 cur_chunk = target_path;
1666                 for (n = 0; n < nmaps; n++) {
1667                         d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
1668                         byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
1669                         bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
1670                                                BTOBB(byte_cnt), 0);
1671                         if (!bp) {
1672                                 error = ENOMEM;
1673                                 goto error2;
1674                         }
1675                         if (pathlen < byte_cnt) {
1676                                 byte_cnt = pathlen;
1677                         }
1678                         pathlen -= byte_cnt;
1679
1680                         memcpy(bp->b_addr, cur_chunk, byte_cnt);
1681                         cur_chunk += byte_cnt;
1682
1683                         xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
1684                 }
1685         }
1686
1687         /*
1688          * Create the directory entry for the symlink.
1689          */
1690         error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
1691                                         &first_block, &free_list, resblks);
1692         if (error)
1693                 goto error2;
1694         xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1695         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1696
1697         /*
1698          * If this is a synchronous mount, make sure that the
1699          * symlink transaction goes to disk before returning to
1700          * the user.
1701          */
1702         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
1703                 xfs_trans_set_sync(tp);
1704         }
1705
1706         error = xfs_bmap_finish(&tp, &free_list, &committed);
1707         if (error) {
1708                 goto error2;
1709         }
1710         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1711         xfs_qm_dqrele(udqp);
1712         xfs_qm_dqrele(gdqp);
1713
1714         *ipp = ip;
1715         return 0;
1716
1717  error2:
1718         IRELE(ip);
1719  error1:
1720         xfs_bmap_cancel(&free_list);
1721         cancel_flags |= XFS_TRANS_ABORT;
1722  error_return:
1723         xfs_trans_cancel(tp, cancel_flags);
1724         xfs_qm_dqrele(udqp);
1725         xfs_qm_dqrele(gdqp);
1726
1727         if (unlock_dp_on_error)
1728                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1729  std_return:
1730         return error;
1731 }
1732
1733 int
1734 xfs_set_dmattrs(
1735         xfs_inode_t     *ip,
1736         u_int           evmask,
1737         u_int16_t       state)
1738 {
1739         xfs_mount_t     *mp = ip->i_mount;
1740         xfs_trans_t     *tp;
1741         int             error;
1742
1743         if (!capable(CAP_SYS_ADMIN))
1744                 return XFS_ERROR(EPERM);
1745
1746         if (XFS_FORCED_SHUTDOWN(mp))
1747                 return XFS_ERROR(EIO);
1748
1749         tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
1750         error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
1751         if (error) {
1752                 xfs_trans_cancel(tp, 0);
1753                 return error;
1754         }
1755         xfs_ilock(ip, XFS_ILOCK_EXCL);
1756         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1757
1758         ip->i_d.di_dmevmask = evmask;
1759         ip->i_d.di_dmstate  = state;
1760
1761         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1762         error = xfs_trans_commit(tp, 0);
1763
1764         return error;
1765 }
1766
1767 /*
1768  * xfs_alloc_file_space()
1769  *      This routine allocates disk space for the given file.
1770  *
1771  *      If alloc_type == 0, this request is for an ALLOCSP type
1772  *      request which will change the file size.  In this case, no
1773  *      DMAPI event will be generated by the call.  A TRUNCATE event
1774  *      will be generated later by xfs_setattr.
1775  *
1776  *      If alloc_type != 0, this request is for a RESVSP type
1777  *      request, and a DMAPI DM_EVENT_WRITE will be generated if the
1778  *      lower block boundary byte address is less than the file's
1779  *      length.
1780  *
1781  * RETURNS:
1782  *       0 on success
1783  *      errno on error
1784  *
1785  */
1786 STATIC int
1787 xfs_alloc_file_space(
1788         xfs_inode_t             *ip,
1789         xfs_off_t               offset,
1790         xfs_off_t               len,
1791         int                     alloc_type,
1792         int                     attr_flags)
1793 {
1794         xfs_mount_t             *mp = ip->i_mount;
1795         xfs_off_t               count;
1796         xfs_filblks_t           allocated_fsb;
1797         xfs_filblks_t           allocatesize_fsb;
1798         xfs_extlen_t            extsz, temp;
1799         xfs_fileoff_t           startoffset_fsb;
1800         xfs_fsblock_t           firstfsb;
1801         int                     nimaps;
1802         int                     quota_flag;
1803         int                     rt;
1804         xfs_trans_t             *tp;
1805         xfs_bmbt_irec_t         imaps[1], *imapp;
1806         xfs_bmap_free_t         free_list;
1807         uint                    qblocks, resblks, resrtextents;
1808         int                     committed;
1809         int                     error;
1810
1811         trace_xfs_alloc_file_space(ip);
1812
1813         if (XFS_FORCED_SHUTDOWN(mp))
1814                 return XFS_ERROR(EIO);
1815
1816         error = xfs_qm_dqattach(ip, 0);
1817         if (error)
1818                 return error;
1819
1820         if (len <= 0)
1821                 return XFS_ERROR(EINVAL);
1822
1823         rt = XFS_IS_REALTIME_INODE(ip);
1824         extsz = xfs_get_extsz_hint(ip);
1825
1826         count = len;
1827         imapp = &imaps[0];
1828         nimaps = 1;
1829         startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
1830         allocatesize_fsb = XFS_B_TO_FSB(mp, count);
1831
1832         /*
1833          * Allocate file space until done or until there is an error
1834          */
1835         while (allocatesize_fsb && !error) {
1836                 xfs_fileoff_t   s, e;
1837
1838                 /*
1839                  * Determine space reservations for data/realtime.
1840                  */
1841                 if (unlikely(extsz)) {
1842                         s = startoffset_fsb;
1843                         do_div(s, extsz);
1844                         s *= extsz;
1845                         e = startoffset_fsb + allocatesize_fsb;
1846                         if ((temp = do_mod(startoffset_fsb, extsz)))
1847                                 e += temp;
1848                         if ((temp = do_mod(e, extsz)))
1849                                 e += extsz - temp;
1850                 } else {
1851                         s = 0;
1852                         e = allocatesize_fsb;
1853                 }
1854
1855                 /*
1856                  * The transaction reservation is limited to a 32-bit block
1857                  * count, hence we need to limit the number of blocks we are
1858                  * trying to reserve to avoid an overflow. We can't allocate
1859                  * more than @nimaps extents, and an extent is limited on disk
1860                  * to MAXEXTLEN (21 bits), so use that to enforce the limit.
1861                  */
1862                 resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
1863                 if (unlikely(rt)) {
1864                         resrtextents = qblocks = resblks;
1865                         resrtextents /= mp->m_sb.sb_rextsize;
1866                         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
1867                         quota_flag = XFS_QMOPT_RES_RTBLKS;
1868                 } else {
1869                         resrtextents = 0;
1870                         resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
1871                         quota_flag = XFS_QMOPT_RES_REGBLKS;
1872                 }
1873
1874                 /*
1875                  * Allocate and setup the transaction.
1876                  */
1877                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
1878                 error = xfs_trans_reserve(tp, resblks,
1879                                           XFS_WRITE_LOG_RES(mp), resrtextents,
1880                                           XFS_TRANS_PERM_LOG_RES,
1881                                           XFS_WRITE_LOG_COUNT);
1882                 /*
1883                  * Check for running out of space
1884                  */
1885                 if (error) {
1886                         /*
1887                          * Free the transaction structure.
1888                          */
1889                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1890                         xfs_trans_cancel(tp, 0);
1891                         break;
1892                 }
1893                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1894                 error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
1895                                                       0, quota_flag);
1896                 if (error)
1897                         goto error1;
1898
1899                 xfs_trans_ijoin(tp, ip, 0);
1900
1901                 xfs_bmap_init(&free_list, &firstfsb);
1902                 error = xfs_bmapi_write(tp, ip, startoffset_fsb,
1903                                         allocatesize_fsb, alloc_type, &firstfsb,
1904                                         0, imapp, &nimaps, &free_list);
1905                 if (error) {
1906                         goto error0;
1907                 }
1908
1909                 /*
1910                  * Complete the transaction
1911                  */
1912                 error = xfs_bmap_finish(&tp, &free_list, &committed);
1913                 if (error) {
1914                         goto error0;
1915                 }
1916
1917                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1918                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1919                 if (error) {
1920                         break;
1921                 }
1922
1923                 allocated_fsb = imapp->br_blockcount;
1924
1925                 if (nimaps == 0) {
1926                         error = XFS_ERROR(ENOSPC);
1927                         break;
1928                 }
1929
1930                 startoffset_fsb += allocated_fsb;
1931                 allocatesize_fsb -= allocated_fsb;
1932         }
1933
1934         return error;
1935
1936 error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
1937         xfs_bmap_cancel(&free_list);
1938         xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
1939
1940 error1: /* Just cancel transaction */
1941         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1942         xfs_iunlock(ip, XFS_ILOCK_EXCL);
1943         return error;
1944 }
1945
1946 /*
1947  * Zero file bytes between startoff and endoff inclusive.
1948  * The iolock is held exclusive and no blocks are buffered.
1949  *
1950  * This function is used by xfs_free_file_space() to zero
1951  * partial blocks when the range to free is not block aligned.
1952  * When unreserving space with boundaries that are not block
1953  * aligned we round up the start and round down the end
1954  * boundaries and then use this function to zero the parts of
1955  * the blocks that got dropped during the rounding.
1956  */
1957 STATIC int
1958 xfs_zero_remaining_bytes(
1959         xfs_inode_t             *ip,
1960         xfs_off_t               startoff,
1961         xfs_off_t               endoff)
1962 {
1963         xfs_bmbt_irec_t         imap;
1964         xfs_fileoff_t           offset_fsb;
1965         xfs_off_t               lastoffset;
1966         xfs_off_t               offset;
1967         xfs_buf_t               *bp;
1968         xfs_mount_t             *mp = ip->i_mount;
1969         int                     nimap;
1970         int                     error = 0;
1971
1972         /*
1973          * Avoid doing I/O beyond eof - it's not necessary
1974          * since nothing can read beyond eof.  The space will
1975          * be zeroed when the file is extended anyway.
1976          */
1977         if (startoff >= XFS_ISIZE(ip))
1978                 return 0;
1979
1980         if (endoff > XFS_ISIZE(ip))
1981                 endoff = XFS_ISIZE(ip);
1982
1983         bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
1984                                         mp->m_rtdev_targp : mp->m_ddev_targp,
1985                                 mp->m_sb.sb_blocksize, XBF_DONT_BLOCK);
1986         if (!bp)
1987                 return XFS_ERROR(ENOMEM);
1988
1989         xfs_buf_unlock(bp);
1990
1991         for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
1992                 offset_fsb = XFS_B_TO_FSBT(mp, offset);
1993                 nimap = 1;
1994                 error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
1995                 if (error || nimap < 1)
1996                         break;
1997                 ASSERT(imap.br_blockcount >= 1);
1998                 ASSERT(imap.br_startoff == offset_fsb);
1999                 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
2000                 if (lastoffset > endoff)
2001                         lastoffset = endoff;
2002                 if (imap.br_startblock == HOLESTARTBLOCK)
2003                         continue;
2004                 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
2005                 if (imap.br_state == XFS_EXT_UNWRITTEN)
2006                         continue;
2007                 XFS_BUF_UNDONE(bp);
2008                 XFS_BUF_UNWRITE(bp);
2009                 XFS_BUF_READ(bp);
2010                 XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
2011                 xfsbdstrat(mp, bp);
2012                 error = xfs_buf_iowait(bp);
2013                 if (error) {
2014                         xfs_buf_ioerror_alert(bp,
2015                                         "xfs_zero_remaining_bytes(read)");
2016                         break;
2017                 }
2018                 memset(bp->b_addr +
2019                         (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
2020                       0, lastoffset - offset + 1);
2021                 XFS_BUF_UNDONE(bp);
2022                 XFS_BUF_UNREAD(bp);
2023                 XFS_BUF_WRITE(bp);
2024                 xfsbdstrat(mp, bp);
2025                 error = xfs_buf_iowait(bp);
2026                 if (error) {
2027                         xfs_buf_ioerror_alert(bp,
2028                                         "xfs_zero_remaining_bytes(write)");
2029                         break;
2030                 }
2031         }
2032         xfs_buf_free(bp);
2033         return error;
2034 }
2035
2036 /*
2037  * xfs_free_file_space()
2038  *      This routine frees disk space for the given file.
2039  *
2040  *      This routine is only called by xfs_change_file_space
2041  *      for an UNRESVSP type call.
2042  *
2043  * RETURNS:
2044  *       0 on success
2045  *      errno on error
2046  *
2047  */
2048 STATIC int
2049 xfs_free_file_space(
2050         xfs_inode_t             *ip,
2051         xfs_off_t               offset,
2052         xfs_off_t               len,
2053         int                     attr_flags)
2054 {
2055         int                     committed;
2056         int                     done;
2057         xfs_fileoff_t           endoffset_fsb;
2058         int                     error;
2059         xfs_fsblock_t           firstfsb;
2060         xfs_bmap_free_t         free_list;
2061         xfs_bmbt_irec_t         imap;
2062         xfs_off_t               ioffset;
2063         xfs_extlen_t            mod=0;
2064         xfs_mount_t             *mp;
2065         int                     nimap;
2066         uint                    resblks;
2067         uint                    rounding;
2068         int                     rt;
2069         xfs_fileoff_t           startoffset_fsb;
2070         xfs_trans_t             *tp;
2071         int                     need_iolock = 1;
2072
2073         mp = ip->i_mount;
2074
2075         trace_xfs_free_file_space(ip);
2076
2077         error = xfs_qm_dqattach(ip, 0);
2078         if (error)
2079                 return error;
2080
2081         error = 0;
2082         if (len <= 0)   /* if nothing being freed */
2083                 return error;
2084         rt = XFS_IS_REALTIME_INODE(ip);
2085         startoffset_fsb = XFS_B_TO_FSB(mp, offset);
2086         endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
2087
2088         if (attr_flags & XFS_ATTR_NOLOCK)
2089                 need_iolock = 0;
2090         if (need_iolock) {
2091                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
2092                 /* wait for the completion of any pending DIOs */
2093                 inode_dio_wait(VFS_I(ip));
2094         }
2095
2096         rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
2097         ioffset = offset & ~(rounding - 1);
2098
2099         if (VN_CACHED(VFS_I(ip)) != 0) {
2100                 error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED);
2101                 if (error)
2102                         goto out_unlock_iolock;
2103         }
2104
2105         /*
2106          * Need to zero the stuff we're not freeing, on disk.
2107          * If it's a realtime file & can't use unwritten extents then we
2108          * actually need to zero the extent edges.  Otherwise xfs_bunmapi
2109          * will take care of it for us.
2110          */
2111         if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
2112                 nimap = 1;
2113                 error = xfs_bmapi_read(ip, startoffset_fsb, 1,
2114                                         &imap, &nimap, 0);
2115                 if (error)
2116                         goto out_unlock_iolock;
2117                 ASSERT(nimap == 0 || nimap == 1);
2118                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
2119                         xfs_daddr_t     block;
2120
2121                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
2122                         block = imap.br_startblock;
2123                         mod = do_div(block, mp->m_sb.sb_rextsize);
2124                         if (mod)
2125                                 startoffset_fsb += mp->m_sb.sb_rextsize - mod;
2126                 }
2127                 nimap = 1;
2128                 error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
2129                                         &imap, &nimap, 0);
2130                 if (error)
2131                         goto out_unlock_iolock;
2132                 ASSERT(nimap == 0 || nimap == 1);
2133                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
2134                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
2135                         mod++;
2136                         if (mod && (mod != mp->m_sb.sb_rextsize))
2137                                 endoffset_fsb -= mod;
2138                 }
2139         }
2140         if ((done = (endoffset_fsb <= startoffset_fsb)))
2141                 /*
2142                  * One contiguous piece to clear
2143                  */
2144                 error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
2145         else {
2146                 /*
2147                  * Some full blocks, possibly two pieces to clear
2148                  */
2149                 if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
2150                         error = xfs_zero_remaining_bytes(ip, offset,
2151                                 XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
2152                 if (!error &&
2153                     XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
2154                         error = xfs_zero_remaining_bytes(ip,
2155                                 XFS_FSB_TO_B(mp, endoffset_fsb),
2156                                 offset + len - 1);
2157         }
2158
2159         /*
2160          * free file space until done or until there is an error
2161          */
2162         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
2163         while (!error && !done) {
2164
2165                 /*
2166                  * allocate and setup the transaction. Allow this
2167                  * transaction to dip into the reserve blocks to ensure
2168                  * the freeing of the space succeeds at ENOSPC.
2169                  */
2170                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
2171                 tp->t_flags |= XFS_TRANS_RESERVE;
2172                 error = xfs_trans_reserve(tp,
2173                                           resblks,
2174                                           XFS_WRITE_LOG_RES(mp),
2175                                           0,
2176                                           XFS_TRANS_PERM_LOG_RES,
2177                                           XFS_WRITE_LOG_COUNT);
2178
2179                 /*
2180                  * check for running out of space
2181                  */
2182                 if (error) {
2183                         /*
2184                          * Free the transaction structure.
2185                          */
2186                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
2187                         xfs_trans_cancel(tp, 0);
2188                         break;
2189                 }
2190                 xfs_ilock(ip, XFS_ILOCK_EXCL);
2191                 error = xfs_trans_reserve_quota(tp, mp,
2192                                 ip->i_udquot, ip->i_gdquot,
2193                                 resblks, 0, XFS_QMOPT_RES_REGBLKS);
2194                 if (error)
2195                         goto error1;
2196
2197                 xfs_trans_ijoin(tp, ip, 0);
2198
2199                 /*
2200                  * issue the bunmapi() call to free the blocks
2201                  */
2202                 xfs_bmap_init(&free_list, &firstfsb);
2203                 error = xfs_bunmapi(tp, ip, startoffset_fsb,
2204                                   endoffset_fsb - startoffset_fsb,
2205                                   0, 2, &firstfsb, &free_list, &done);
2206                 if (error) {
2207                         goto error0;
2208                 }
2209
2210                 /*
2211                  * complete the transaction
2212                  */
2213                 error = xfs_bmap_finish(&tp, &free_list, &committed);
2214                 if (error) {
2215                         goto error0;
2216                 }
2217
2218                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2219                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2220         }
2221
2222  out_unlock_iolock:
2223         if (need_iolock)
2224                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
2225         return error;
2226
2227  error0:
2228         xfs_bmap_cancel(&free_list);
2229  error1:
2230         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
2231         xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
2232                     XFS_ILOCK_EXCL);
2233         return error;
2234 }
2235
2236 /*
2237  * xfs_change_file_space()
2238  *      This routine allocates or frees disk space for the given file.
2239  *      The user specified parameters are checked for alignment and size
2240  *      limitations.
2241  *
2242  * RETURNS:
2243  *       0 on success
2244  *      errno on error
2245  *
2246  */
2247 int
2248 xfs_change_file_space(
2249         xfs_inode_t     *ip,
2250         int             cmd,
2251         xfs_flock64_t   *bf,
2252         xfs_off_t       offset,
2253         int             attr_flags)
2254 {
2255         xfs_mount_t     *mp = ip->i_mount;
2256         int             clrprealloc;
2257         int             error;
2258         xfs_fsize_t     fsize;
2259         int             setprealloc;
2260         xfs_off_t       startoffset;
2261         xfs_off_t       llen;
2262         xfs_trans_t     *tp;
2263         struct iattr    iattr;
2264         int             prealloc_type;
2265
2266         if (!S_ISREG(ip->i_d.di_mode))
2267                 return XFS_ERROR(EINVAL);
2268
2269         switch (bf->l_whence) {
2270         case 0: /*SEEK_SET*/
2271                 break;
2272         case 1: /*SEEK_CUR*/
2273                 bf->l_start += offset;
2274                 break;
2275         case 2: /*SEEK_END*/
2276                 bf->l_start += XFS_ISIZE(ip);
2277                 break;
2278         default:
2279                 return XFS_ERROR(EINVAL);
2280         }
2281
2282         llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;
2283
2284         if (   (bf->l_start < 0)
2285             || (bf->l_start > XFS_MAXIOFFSET(mp))
2286             || (bf->l_start + llen < 0)
2287             || (bf->l_start + llen > XFS_MAXIOFFSET(mp)))
2288                 return XFS_ERROR(EINVAL);
2289
2290         bf->l_whence = 0;
2291
2292         startoffset = bf->l_start;
2293         fsize = XFS_ISIZE(ip);
2294
2295         /*
2296          * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
2297          * file space.
2298          * These calls do NOT zero the data space allocated to the file,
2299          * nor do they change the file size.
2300          *
2301          * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
2302          * space.
2303          * These calls cause the new file data to be zeroed and the file
2304          * size to be changed.
2305          */
2306         setprealloc = clrprealloc = 0;
2307         prealloc_type = XFS_BMAPI_PREALLOC;
2308
2309         switch (cmd) {
2310         case XFS_IOC_ZERO_RANGE:
2311                 prealloc_type |= XFS_BMAPI_CONVERT;
2312                 xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0);
2313                 /* FALLTHRU */
2314         case XFS_IOC_RESVSP:
2315         case XFS_IOC_RESVSP64:
2316                 error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
2317                                                 prealloc_type, attr_flags);
2318                 if (error)
2319                         return error;
2320                 setprealloc = 1;
2321                 break;
2322
2323         case XFS_IOC_UNRESVSP:
2324         case XFS_IOC_UNRESVSP64:
2325                 if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
2326                                                                 attr_flags)))
2327                         return error;
2328                 break;
2329
2330         case XFS_IOC_ALLOCSP:
2331         case XFS_IOC_ALLOCSP64:
2332         case XFS_IOC_FREESP:
2333         case XFS_IOC_FREESP64:
2334                 if (startoffset > fsize) {
2335                         error = xfs_alloc_file_space(ip, fsize,
2336                                         startoffset - fsize, 0, attr_flags);
2337                         if (error)
2338                                 break;
2339                 }
2340
2341                 iattr.ia_valid = ATTR_SIZE;
2342                 iattr.ia_size = startoffset;
2343
2344                 error = xfs_setattr_size(ip, &iattr, attr_flags);
2345
2346                 if (error)
2347                         return error;
2348
2349                 clrprealloc = 1;
2350                 break;
2351
2352         default:
2353                 ASSERT(0);
2354                 return XFS_ERROR(EINVAL);
2355         }
2356
2357         /*
2358          * update the inode timestamp, mode, and prealloc flag bits
2359          */
2360         tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
2361
2362         if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
2363                                       0, 0, 0))) {
2364                 /* ASSERT(0); */
2365                 xfs_trans_cancel(tp, 0);
2366                 return error;
2367         }
2368
2369         xfs_ilock(ip, XFS_ILOCK_EXCL);
2370         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2371
2372         if ((attr_flags & XFS_ATTR_DMI) == 0) {
2373                 ip->i_d.di_mode &= ~S_ISUID;
2374
2375                 /*
2376                  * Note that we don't have to worry about mandatory
2377                  * file locking being disabled here because we only
2378                  * clear the S_ISGID bit if the Group execute bit is
2379                  * on, but if it was on then mandatory locking wouldn't
2380                  * have been enabled.
2381                  */
2382                 if (ip->i_d.di_mode & S_IXGRP)
2383                         ip->i_d.di_mode &= ~S_ISGID;
2384
2385                 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2386         }
2387         if (setprealloc)
2388                 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
2389         else if (clrprealloc)
2390                 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
2391
2392         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2393         if (attr_flags & XFS_ATTR_SYNC)
2394                 xfs_trans_set_sync(tp);
2395         return xfs_trans_commit(tp, 0);
2396 }