fs/xfs/xfs_vnodeops.c

   1 /*
   2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18
  19 #include "xfs.h"
  20 #include "xfs_fs.h"
  21 #include "xfs_types.h"
  22 #include "xfs_bit.h"
  23 #include "xfs_log.h"
  24 #include "xfs_inum.h"
  25 #include "xfs_trans.h"
  26 #include "xfs_sb.h"
  27 #include "xfs_ag.h"
  28 #include "xfs_dir2.h"
  29 #include "xfs_dmapi.h"
  30 #include "xfs_mount.h"
  31 #include "xfs_da_btree.h"
  32 #include "xfs_bmap_btree.h"
  33 #include "xfs_alloc_btree.h"
  34 #include "xfs_ialloc_btree.h"
  35 #include "xfs_dir2_sf.h"
  36 #include "xfs_attr_sf.h"
  37 #include "xfs_dinode.h"
  38 #include "xfs_inode.h"
  39 #include "xfs_inode_item.h"
  40 #include "xfs_itable.h"
  41 #include "xfs_btree.h"
  42 #include "xfs_ialloc.h"
  43 #include "xfs_alloc.h"
  44 #include "xfs_bmap.h"
  45 #include "xfs_acl.h"
  46 #include "xfs_attr.h"
  47 #include "xfs_rw.h"
  48 #include "xfs_error.h"
  49 #include "xfs_quota.h"
  50 #include "xfs_utils.h"
  51 #include "xfs_rtalloc.h"
  52 #include "xfs_trans_space.h"
  53 #include "xfs_log_priv.h"
  54 #include "xfs_filestream.h"
  55 #include "xfs_vnodeops.h"
  56 #include "xfs_trace.h"
  57
  58 int
  59 xfs_setattr(
  60         struct xfs_inode        *ip,
  61         struct iattr            *iattr,
  62         int                     flags)
  63 {
  64         xfs_mount_t             *mp = ip->i_mount;
  65         struct inode            *inode = VFS_I(ip);
  66         int                     mask = iattr->ia_valid;
  67         xfs_trans_t             *tp;
  68         int                     code;
  69         uint                    lock_flags;
  70         uint                    commit_flags=0;
  71         uid_t                   uid=0, iuid=0;
  72         gid_t                   gid=0, igid=0;
  73         struct xfs_dquot        *udqp, *gdqp, *olddquot1, *olddquot2;
  74         int                     need_iolock = 1;
  75
  76         xfs_itrace_entry(ip);
  77
  78         if (mp->m_flags & XFS_MOUNT_RDONLY)
  79                 return XFS_ERROR(EROFS);
  80
  81         if (XFS_FORCED_SHUTDOWN(mp))
  82                 return XFS_ERROR(EIO);
  83
  84         code = -inode_change_ok(inode, iattr);
  85         if (code)
  86                 return code;
  87
  88         olddquot1 = olddquot2 = NULL;
  89         udqp = gdqp = NULL;
  90
  91         /*
  92          * If disk quotas is on, we make sure that the dquots do exist on disk,
  93          * before we start any other transactions. Trying to do this later
  94          * is messy. We don't care to take a readlock to look at the ids
  95          * in inode here, because we can't hold it across the trans_reserve.
  96          * If the IDs do change before we take the ilock, we're covered
  97          * because the i_*dquot fields will get updated anyway.
  98          */
  99         if (XFS_IS_QUOTA_ON(mp) && (mask & (ATTR_UID|ATTR_GID))) {
 100                 uint    qflags = 0;
 101
 102                 if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) {
 103                         uid = iattr->ia_uid;
 104                         qflags |= XFS_QMOPT_UQUOTA;
 105                 } else {
 106                         uid = ip->i_d.di_uid;
 107                 }
 108                 if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) {
 109                         gid = iattr->ia_gid;
 110                         qflags |= XFS_QMOPT_GQUOTA;
 111                 }  else {
 112                         gid = ip->i_d.di_gid;
 113                 }
 114
 115                 /*
 116                  * We take a reference when we initialize udqp and gdqp,
 117                  * so it is important that we never blindly double trip on
 118                  * the same variable. See xfs_create() for an example.
 119                  */
 120                 ASSERT(udqp == NULL);
 121                 ASSERT(gdqp == NULL);
 122                 code = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_d.di_projid,
 123                                          qflags, &udqp, &gdqp);
 124                 if (code)
 125                         return code;
 126         }
 127
 128         /*
 129          * For the other attributes, we acquire the inode lock and
 130          * first do an error checking pass.
 131          */
 132         tp = NULL;
 133         lock_flags = XFS_ILOCK_EXCL;
 134         if (flags & XFS_ATTR_NOLOCK)
 135                 need_iolock = 0;
 136         if (!(mask & ATTR_SIZE)) {
 137                 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
 138                 commit_flags = 0;
 139                 code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp),
 140                                          0, 0, 0);
 141                 if (code) {
 142                         lock_flags = 0;
 143                         goto error_return;
 144                 }
 145         } else {
 146                 if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
 147                     !(flags & XFS_ATTR_DMI)) {
 148                         int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
 149                         code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, ip,
 150                                 iattr->ia_size, 0, dmflags, NULL);
 151                         if (code) {
 152                                 lock_flags = 0;
 153                                 goto error_return;
 154                         }
 155                 }
 156                 if (need_iolock)
 157                         lock_flags |= XFS_IOLOCK_EXCL;
 158         }
 159
 160         xfs_ilock(ip, lock_flags);
 161
 162         /*
 163          * Change file ownership.  Must be the owner or privileged.
 164          */
 165         if (mask & (ATTR_UID|ATTR_GID)) {
 166                 /*
 167                  * These IDs could have changed since we last looked at them.
 168                  * But, we're assured that if the ownership did change
 169                  * while we didn't have the inode locked, inode's dquot(s)
 170                  * would have changed also.
 171                  */
 172                 iuid = ip->i_d.di_uid;
 173                 igid = ip->i_d.di_gid;
 174                 gid = (mask & ATTR_GID) ? iattr->ia_gid : igid;
 175                 uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
 176
 177                 /*
 178                  * Do a quota reservation only if uid/gid is actually
 179                  * going to change.
 180                  */
 181                 if (XFS_IS_QUOTA_RUNNING(mp) &&
 182                     ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
 183                      (XFS_IS_GQUOTA_ON(mp) && igid != gid))) {
 184                         ASSERT(tp);
 185                         code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
 186                                                 capable(CAP_FOWNER) ?
 187                                                 XFS_QMOPT_FORCE_RES : 0);
 188                         if (code)       /* out of quota */
 189                                 goto error_return;
 190                 }
 191         }
 192
 193         /*
 194          * Truncate file.  Must have write permission and not be a directory.
 195          */
 196         if (mask & ATTR_SIZE) {
 197                 /* Short circuit the truncate case for zero length files */
 198                 if (iattr->ia_size == 0 &&
 199                     ip->i_size == 0 && ip->i_d.di_nextents == 0) {
 200                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
 201                         lock_flags &= ~XFS_ILOCK_EXCL;
 202                         if (mask & ATTR_CTIME)
 203                                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 204                         code = 0;
 205                         goto error_return;
 206                 }
 207
 208                 if (S_ISDIR(ip->i_d.di_mode)) {
 209                         code = XFS_ERROR(EISDIR);
 210                         goto error_return;
 211                 } else if (!S_ISREG(ip->i_d.di_mode)) {
 212                         code = XFS_ERROR(EINVAL);
 213                         goto error_return;
 214                 }
 215
 216                 /*
 217                  * Make sure that the dquots are attached to the inode.
 218                  */
 219                 code = xfs_qm_dqattach_locked(ip, 0);
 220                 if (code)
 221                         goto error_return;
 222
 223                 /*
 224                  * Now we can make the changes.  Before we join the inode
 225                  * to the transaction, if ATTR_SIZE is set then take care of
 226                  * the part of the truncation that must be done without the
 227                  * inode lock.  This needs to be done before joining the inode
 228                  * to the transaction, because the inode cannot be unlocked
 229                  * once it is a part of the transaction.
 230                  */
 231                 if (iattr->ia_size > ip->i_size) {
 232                         /*
 233                          * Do the first part of growing a file: zero any data
 234                          * in the last block that is beyond the old EOF.  We
 235                          * need to do this before the inode is joined to the
 236                          * transaction to modify the i_size.
 237                          */
 238                         code = xfs_zero_eof(ip, iattr->ia_size, ip->i_size);
 239                 }
 240                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
 241
 242                 /*
 243                  * We are going to log the inode size change in this
 244                  * transaction so any previous writes that are beyond the on
 245                  * disk EOF and the new EOF that have not been written out need
 246                  * to be written here. If we do not write the data out, we
 247                  * expose ourselves to the null files problem.
 248                  *
 249                  * Only flush from the on disk size to the smaller of the in
 250                  * memory file size or the new size as that's the range we
 251                  * really care about here and prevents waiting for other data
 252                  * not within the range we care about here.
 253                  */
 254                 if (!code &&
 255                     ip->i_size != ip->i_d.di_size &&
 256                     iattr->ia_size > ip->i_d.di_size) {
 257                         code = xfs_flush_pages(ip,
 258                                         ip->i_d.di_size, iattr->ia_size,
 259                                         XBF_ASYNC, FI_NONE);
 260                 }
 261
 262                 /* wait for all I/O to complete */
 263                 xfs_ioend_wait(ip);
 264
 265                 if (!code)
 266                         code = xfs_itruncate_data(ip, iattr->ia_size);
 267                 if (code) {
 268                         ASSERT(tp == NULL);
 269                         lock_flags &= ~XFS_ILOCK_EXCL;
 270                         ASSERT(lock_flags == XFS_IOLOCK_EXCL || !need_iolock);
 271                         goto error_return;
 272                 }
 273                 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
 274                 if ((code = xfs_trans_reserve(tp, 0,
 275                                              XFS_ITRUNCATE_LOG_RES(mp), 0,
 276                                              XFS_TRANS_PERM_LOG_RES,
 277                                              XFS_ITRUNCATE_LOG_COUNT))) {
 278                         xfs_trans_cancel(tp, 0);
 279                         if (need_iolock)
 280                                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 281                         return code;
 282                 }
 283                 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
 284                 xfs_ilock(ip, XFS_ILOCK_EXCL);
 285
 286                 xfs_trans_ijoin(tp, ip, lock_flags);
 287                 xfs_trans_ihold(tp, ip);
 288
 289                 /*
 290                  * Only change the c/mtime if we are changing the size
 291                  * or we are explicitly asked to change it. This handles
 292                  * the semantic difference between truncate() and ftruncate()
 293                  * as implemented in the VFS.
 294                  *
 295                  * The regular truncate() case without ATTR_CTIME and ATTR_MTIME
 296                  * is a special case where we need to update the times despite
 297                  * not having these flags set.  For all other operations the
 298                  * VFS set these flags explicitly if it wants a timestamp
 299                  * update.
 300                  */
 301                 if (iattr->ia_size != ip->i_size &&
 302                     (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
 303                         iattr->ia_ctime = iattr->ia_mtime =
 304                                 current_fs_time(inode->i_sb);
 305                         mask |= ATTR_CTIME | ATTR_MTIME;
 306                 }
 307
 308                 if (iattr->ia_size > ip->i_size) {
 309                         ip->i_d.di_size = iattr->ia_size;
 310                         ip->i_size = iattr->ia_size;
 311                         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 312                 } else if (iattr->ia_size <= ip->i_size ||
 313                            (iattr->ia_size == 0 && ip->i_d.di_nextents)) {
 314                         /*
 315                          * signal a sync transaction unless
 316                          * we're truncating an already unlinked
 317                          * file on a wsync filesystem
 318                          */
 319                         code = xfs_itruncate_finish(&tp, ip, iattr->ia_size,
 320                                             XFS_DATA_FORK,
 321                                             ((ip->i_d.di_nlink != 0 ||
 322                                               !(mp->m_flags & XFS_MOUNT_WSYNC))
 323                                              ? 1 : 0));
 324                         if (code)
 325                                 goto abort_return;
 326                         /*
 327                          * Truncated "down", so we're removing references
 328                          * to old data here - if we now delay flushing for
 329                          * a long time, we expose ourselves unduly to the
 330                          * notorious NULL files problem.  So, we mark this
 331                          * vnode and flush it when the file is closed, and
 332                          * do not wait the usual (long) time for writeout.
 333                          */
 334                         xfs_iflags_set(ip, XFS_ITRUNCATED);
 335                 }
 336         } else if (tp) {
 337                 xfs_trans_ijoin(tp, ip, lock_flags);
 338                 xfs_trans_ihold(tp, ip);
 339         }
 340
 341         /*
 342          * Change file ownership.  Must be the owner or privileged.
 343          */
 344         if (mask & (ATTR_UID|ATTR_GID)) {
 345                 /*
 346                  * CAP_FSETID overrides the following restrictions:
 347                  *
 348                  * The set-user-ID and set-group-ID bits of a file will be
 349                  * cleared upon successful return from chown()
 350                  */
 351                 if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
 352                     !capable(CAP_FSETID)) {
 353                         ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
 354                 }
 355
 356                 /*
 357                  * Change the ownerships and register quota modifications
 358                  * in the transaction.
 359                  */
 360                 if (iuid != uid) {
 361                         if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) {
 362                                 ASSERT(mask & ATTR_UID);
 363                                 ASSERT(udqp);
 364                                 olddquot1 = xfs_qm_vop_chown(tp, ip,
 365                                                         &ip->i_udquot, udqp);
 366                         }
 367                         ip->i_d.di_uid = uid;
 368                         inode->i_uid = uid;
 369                 }
 370                 if (igid != gid) {
 371                         if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) {
 372                                 ASSERT(!XFS_IS_PQUOTA_ON(mp));
 373                                 ASSERT(mask & ATTR_GID);
 374                                 ASSERT(gdqp);
 375                                 olddquot2 = xfs_qm_vop_chown(tp, ip,
 376                                                         &ip->i_gdquot, gdqp);
 377                         }
 378                         ip->i_d.di_gid = gid;
 379                         inode->i_gid = gid;
 380                 }
 381         }
 382
 383         /*
 384          * Change file access modes.
 385          */
 386         if (mask & ATTR_MODE) {
 387                 umode_t mode = iattr->ia_mode;
 388
 389                 if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
 390                         mode &= ~S_ISGID;
 391
 392                 ip->i_d.di_mode &= S_IFMT;
 393                 ip->i_d.di_mode |= mode & ~S_IFMT;
 394
 395                 inode->i_mode &= S_IFMT;
 396                 inode->i_mode |= mode & ~S_IFMT;
 397         }
 398
 399         /*
 400          * Change file access or modified times.
 401          */
 402         if (mask & ATTR_ATIME) {
 403                 inode->i_atime = iattr->ia_atime;
 404                 ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
 405                 ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
 406                 ip->i_update_core = 1;
 407         }
 408         if (mask & ATTR_CTIME) {
 409                 inode->i_ctime = iattr->ia_ctime;
 410                 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
 411                 ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
 412                 ip->i_update_core = 1;
 413         }
 414         if (mask & ATTR_MTIME) {
 415                 inode->i_mtime = iattr->ia_mtime;
 416                 ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
 417                 ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
 418                 ip->i_update_core = 1;
 419         }
 420
 421         /*
 422          * And finally, log the inode core if any attribute in it
 423          * has been changed.
 424          */
 425         if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE|
 426                     ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
 427                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 428
 429         XFS_STATS_INC(xs_ig_attrchg);
 430
 431         /*
 432          * If this is a synchronous mount, make sure that the
 433          * transaction goes to disk before returning to the user.
 434          * This is slightly sub-optimal in that truncates require
 435          * two sync transactions instead of one for wsync filesystems.
 436          * One for the truncate and one for the timestamps since we
 437          * don't want to change the timestamps unless we're sure the
 438          * truncate worked.  Truncates are less than 1% of the laddis
 439          * mix so this probably isn't worth the trouble to optimize.
 440          */
 441         code = 0;
 442         if (mp->m_flags & XFS_MOUNT_WSYNC)
 443                 xfs_trans_set_sync(tp);
 444
 445         code = xfs_trans_commit(tp, commit_flags);
 446
 447         xfs_iunlock(ip, lock_flags);
 448
 449         /*
 450          * Release any dquot(s) the inode had kept before chown.
 451          */
 452         xfs_qm_dqrele(olddquot1);
 453         xfs_qm_dqrele(olddquot2);
 454         xfs_qm_dqrele(udqp);
 455         xfs_qm_dqrele(gdqp);
 456
 457         if (code)
 458                 return code;
 459
 460         /*
 461          * XXX(hch): Updating the ACL entries is not atomic vs the i_mode
 462          *           update.  We could avoid this with linked transactions
 463          *           and passing down the transaction pointer all the way
 464          *           to attr_set.  No previous user of the generic
 465          *           Posix ACL code seems to care about this issue either.
 466          */
 467         if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) {
 468                 code = -xfs_acl_chmod(inode);
 469                 if (code)
 470                         return XFS_ERROR(code);
 471         }
 472
 473         if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) &&
 474             !(flags & XFS_ATTR_DMI)) {
 475                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL,
 476                                         NULL, DM_RIGHT_NULL, NULL, NULL,
 477                                         0, 0, AT_DELAY_FLAG(flags));
 478         }
 479         return 0;
 480
 481  abort_return:
 482         commit_flags |= XFS_TRANS_ABORT;
 483         /* FALLTHROUGH */
 484  error_return:
 485         xfs_qm_dqrele(udqp);
 486         xfs_qm_dqrele(gdqp);
 487         if (tp) {
 488                 xfs_trans_cancel(tp, commit_flags);
 489         }
 490         if (lock_flags != 0) {
 491                 xfs_iunlock(ip, lock_flags);
 492         }
 493         return code;
 494 }
 495
 496 /*
 497  * The maximum pathlen is 1024 bytes. Since the minimum file system
 498  * blocksize is 512 bytes, we can get a max of 2 extents back from
 499  * bmapi.
 500  */
 501 #define SYMLINK_MAPS 2
 502
 503 STATIC int
 504 xfs_readlink_bmap(
 505         xfs_inode_t     *ip,
 506         char            *link)
 507 {
 508         xfs_mount_t     *mp = ip->i_mount;
 509         int             pathlen = ip->i_d.di_size;
 510         int             nmaps = SYMLINK_MAPS;
 511         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
 512         xfs_daddr_t     d;
 513         int             byte_cnt;
 514         int             n;
 515         xfs_buf_t       *bp;
 516         int             error = 0;
 517
 518         error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0,
 519                         mval, &nmaps, NULL, NULL);
 520         if (error)
 521                 goto out;
 522
 523         for (n = 0; n < nmaps; n++) {
 524                 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
 525                 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
 526
 527                 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt),
 528                                   XBF_LOCK | XBF_MAPPED | XBF_DONT_BLOCK);
 529                 error = XFS_BUF_GETERROR(bp);
 530                 if (error) {
 531                         xfs_ioerror_alert("xfs_readlink",
 532                                   ip->i_mount, bp, XFS_BUF_ADDR(bp));
 533                         xfs_buf_relse(bp);
 534                         goto out;
 535                 }
 536                 if (pathlen < byte_cnt)
 537                         byte_cnt = pathlen;
 538                 pathlen -= byte_cnt;
 539
 540                 memcpy(link, XFS_BUF_PTR(bp), byte_cnt);
 541                 xfs_buf_relse(bp);
 542         }
 543
 544         link[ip->i_d.di_size] = '\0';
 545         error = 0;
 546
 547  out:
 548         return error;
 549 }
 550
 551 int
 552 xfs_readlink(
 553         xfs_inode_t     *ip,
 554         char            *link)
 555 {
 556         xfs_mount_t     *mp = ip->i_mount;
 557         int             pathlen;
 558         int             error = 0;
 559
 560         xfs_itrace_entry(ip);
 561
 562         if (XFS_FORCED_SHUTDOWN(mp))
 563                 return XFS_ERROR(EIO);
 564
 565         xfs_ilock(ip, XFS_ILOCK_SHARED);
 566
 567         ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK);
 568         ASSERT(ip->i_d.di_size <= MAXPATHLEN);
 569
 570         pathlen = ip->i_d.di_size;
 571         if (!pathlen)
 572                 goto out;
 573
 574         if (ip->i_df.if_flags & XFS_IFINLINE) {
 575                 memcpy(link, ip->i_df.if_u1.if_data, pathlen);
 576                 link[pathlen] = '\0';
 577         } else {
 578                 error = xfs_readlink_bmap(ip, link);
 579         }
 580
 581  out:
 582         xfs_iunlock(ip, XFS_ILOCK_SHARED);
 583         return error;
 584 }
 585
 586 /*
 587  * Flags for xfs_free_eofblocks
 588  */
 589 #define XFS_FREE_EOF_TRYLOCK    (1<<0)
 590
 591 /*
 592  * This is called by xfs_inactive to free any blocks beyond eof
 593  * when the link count isn't zero and by xfs_dm_punch_hole() when
 594  * punching a hole to EOF.
 595  */
 596 STATIC int
 597 xfs_free_eofblocks(
 598         xfs_mount_t     *mp,
 599         xfs_inode_t     *ip,
 600         int             flags)
 601 {
 602         xfs_trans_t     *tp;
 603         int             error;
 604         xfs_fileoff_t   end_fsb;
 605         xfs_fileoff_t   last_fsb;
 606         xfs_filblks_t   map_len;
 607         int             nimaps;
 608         xfs_bmbt_irec_t imap;
 609
 610         /*
 611          * Figure out if there are any blocks beyond the end
 612          * of the file.  If not, then there is nothing to do.
 613          */
 614         end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
 615         last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
 616         map_len = last_fsb - end_fsb;
 617         if (map_len <= 0)
 618                 return 0;
 619
 620         nimaps = 1;
 621         xfs_ilock(ip, XFS_ILOCK_SHARED);
 622         error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
 623                           NULL, 0, &imap, &nimaps, NULL, NULL);
 624         xfs_iunlock(ip, XFS_ILOCK_SHARED);
 625
 626         if (!error && (nimaps != 0) &&
 627             (imap.br_startblock != HOLESTARTBLOCK ||
 628              ip->i_delayed_blks)) {
 629                 /*
 630                  * Attach the dquots to the inode up front.
 631                  */
 632                 error = xfs_qm_dqattach(ip, 0);
 633                 if (error)
 634                         return error;
 635
 636                 /*
 637                  * There are blocks after the end of file.
 638                  * Free them up now by truncating the file to
 639                  * its current size.
 640                  */
 641                 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
 642
 643                 /*
 644                  * Do the xfs_itruncate_start() call before
 645                  * reserving any log space because
 646                  * itruncate_start will call into the buffer
 647                  * cache and we can't
 648                  * do that within a transaction.
 649                  */
 650                 if (flags & XFS_FREE_EOF_TRYLOCK) {
 651                         if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
 652                                 xfs_trans_cancel(tp, 0);
 653                                 return 0;
 654                         }
 655                 } else {
 656                         xfs_ilock(ip, XFS_IOLOCK_EXCL);
 657                 }
 658                 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
 659                                     ip->i_size);
 660                 if (error) {
 661                         xfs_trans_cancel(tp, 0);
 662                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 663                         return error;
 664                 }
 665
 666                 error = xfs_trans_reserve(tp, 0,
 667                                           XFS_ITRUNCATE_LOG_RES(mp),
 668                                           0, XFS_TRANS_PERM_LOG_RES,
 669                                           XFS_ITRUNCATE_LOG_COUNT);
 670                 if (error) {
 671                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
 672                         xfs_trans_cancel(tp, 0);
 673                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 674                         return error;
 675                 }
 676
 677                 xfs_ilock(ip, XFS_ILOCK_EXCL);
 678                 xfs_trans_ijoin(tp, ip,
 679                                 XFS_IOLOCK_EXCL |
 680                                 XFS_ILOCK_EXCL);
 681                 xfs_trans_ihold(tp, ip);
 682
 683                 error = xfs_itruncate_finish(&tp, ip,
 684                                              ip->i_size,
 685                                              XFS_DATA_FORK,
 686                                              0);
 687                 /*
 688                  * If we get an error at this point we
 689                  * simply don't bother truncating the file.
 690                  */
 691                 if (error) {
 692                         xfs_trans_cancel(tp,
 693                                          (XFS_TRANS_RELEASE_LOG_RES |
 694                                           XFS_TRANS_ABORT));
 695                 } else {
 696                         error = xfs_trans_commit(tp,
 697                                                 XFS_TRANS_RELEASE_LOG_RES);
 698                 }
 699                 xfs_iunlock(ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL);
 700         }
 701         return error;
 702 }
 703
/*
 * Free a symlink that has blocks associated with it.
 *
 * ip:  the symlink inode being inactivated; its di_size must exceed the
 *      inline data fork size (asserted below).
 * tpp: in/out transaction pointer.  On entry *tpp is the transaction the
 *      caller allocated; the log reservation is made here.  On success it
 *      is replaced by a new transaction that already carries an itruncate
 *      log reservation.
 *
 * Returns 0 on success, with the inode locked (XFS_IOLOCK_EXCL |
 * XFS_ILOCK_EXCL) but not joined to the transaction returned in *tpp.
 * On failure the current transaction is cancelled, *tpp is set to NULL,
 * the inode is not left locked by this function, and the error is returned.
 */
STATIC int
xfs_inactive_symlink_rmt(
        xfs_inode_t     *ip,
        xfs_trans_t     **tpp)
{
        xfs_buf_t       *bp;
        int             committed;
        int             done;
        int             error;
        xfs_fsblock_t   first_block;
        xfs_bmap_free_t free_list;
        int             i;
        xfs_mount_t     *mp;
        xfs_bmbt_irec_t mval[SYMLINK_MAPS];
        int             nmaps;
        xfs_trans_t     *ntp;
        int             size;
        xfs_trans_t     *tp;

        tp = *tpp;
        mp = ip->i_mount;
        ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
        /*
         * We're freeing a symlink that has some
         * blocks allocated to it.  Free the
         * blocks here.  We know that we've got
         * either 1 or 2 extents and that we can
         * free them all in one bunmapi call.
         */
        ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
        /*
         * The inode has not been locked yet, so a reservation failure
         * only needs to cancel the transaction.
         */
        if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
                        XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
                ASSERT(XFS_FORCED_SHUTDOWN(mp));
                xfs_trans_cancel(tp, 0);
                *tpp = NULL;
                return error;
        }
        /*
         * Lock the inode, fix the size, and join it to the transaction.
         * Hold it so in the normal path, we still have it locked for
         * the second transaction.  In the error paths we need it
         * held so the cancel won't rele it, see below.
         */
        xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
        /*
         * Remember the old byte length before zeroing di_size; it bounds
         * the mapping lookup and the unmap below.
         */
        size = (int)ip->i_d.di_size;
        ip->i_d.di_size = 0;
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
        xfs_trans_ihold(tp, ip);
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
        /*
         * Find the block(s) so we can inval and unmap them.
         */
        done = 0;
        xfs_bmap_init(&free_list, &first_block);
        nmaps = ARRAY_SIZE(mval);
        if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
                        XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
                        &free_list, NULL)))
                goto error0;
        /*
         * Invalidate the block(s).
         */
        for (i = 0; i < nmaps; i++) {
                bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
                        XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
                        XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
                xfs_trans_binval(tp, bp);
        }
        /*
         * Unmap the dead block(s) to the free_list.
         *
         * NOTE(review): the length passed here is the byte count "size",
         * whereas the xfs_bmapi() call above converts it with
         * XFS_B_TO_FSB().  Confirm which unit xfs_bunmapi() expects.
         */
        if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
                        &first_block, &free_list, NULL, &done)))
                goto error1;
        ASSERT(done);
        /*
         * Commit the first transaction.  This logs the EFI and the inode.
         */
        if ((error = xfs_bmap_finish(&tp, &free_list, &committed)))
                goto error1;
        /*
         * The transaction must have been committed, since there were
         * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
         * The new tp has the extent freeing and EFDs.
         */
        ASSERT(committed);
        /*
         * The first xact was committed, so add the inode to the new one.
         * Mark it dirty so it will be logged and moved forward in the log as
         * part of every commit.
         */
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
        xfs_trans_ihold(tp, ip);
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
        /*
         * Get a new, empty transaction to return to our caller.
         */
        ntp = xfs_trans_dup(tp);
        /*
         * Commit the transaction containing extent freeing and EFDs.
         * If we get an error on the commit here or on the reserve below,
         * we need to unlock the inode since the new transaction doesn't
         * have the inode attached.
         */
        error = xfs_trans_commit(tp, 0);
        /*
         * From here on tp is the duplicated transaction, so the error0
         * path below cancels that one rather than the committed one.
         */
        tp = ntp;
        if (error) {
                ASSERT(XFS_FORCED_SHUTDOWN(mp));
                goto error0;
        }
        /*
         * transaction commit worked ok so we can drop the extra ticket
         * reference that we gained in xfs_trans_dup()
         */
        xfs_log_ticket_put(tp->t_ticket);

        /*
         * Remove the memory for extent descriptions (just bookkeeping).
         */
        if (ip->i_df.if_bytes)
                xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
        ASSERT(ip->i_df.if_bytes == 0);
        /*
         * Put an itruncate log reservation in the new transaction
         * for our caller.
         */
        if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
                        XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
                ASSERT(XFS_FORCED_SHUTDOWN(mp));
                goto error0;
        }
        /*
         * Return with the inode locked but not joined to the transaction.
         */
        *tpp = tp;
        return 0;

        /*
         * error1 additionally discards the pending free list; it then
         * falls through to the common cancel-and-unlock path.
         */
 error1:
        xfs_bmap_cancel(&free_list);
 error0:
        /*
         * Have to come here with the inode locked and either
         * (held and in the transaction) or (not in the transaction).
         * If the inode isn't held then cancel would iput it, but
         * that's wrong since this is inactive and the vnode ref
         * count is 0 already.
         * Cancel won't do anything to the inode if held, but it still
         * needs to be locked until the cancel is done, if it was
         * joined to the transaction.
         */
        xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
        xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
        *tpp = NULL;
        return error;

}
 863
 864 STATIC int
 865 xfs_inactive_symlink_local(
 866         xfs_inode_t     *ip,
 867         xfs_trans_t     **tpp)
 868 {
 869         int             error;
 870
 871         ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
 872         /*
 873          * We're freeing a symlink which fit into
 874          * the inode.  Just free the memory used
 875          * to hold the old symlink.
 876          */
 877         error = xfs_trans_reserve(*tpp, 0,
 878                                   XFS_ITRUNCATE_LOG_RES(ip->i_mount),
 879                                   0, XFS_TRANS_PERM_LOG_RES,
 880                                   XFS_ITRUNCATE_LOG_COUNT);
 881
 882         if (error) {
 883                 xfs_trans_cancel(*tpp, 0);
 884                 *tpp = NULL;
 885                 return error;
 886         }
 887         xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 888
 889         /*
 890          * Zero length symlinks _can_ exist.
 891          */
 892         if (ip->i_df.if_bytes > 0) {
 893                 xfs_idata_realloc(ip,
 894                                   -(ip->i_df.if_bytes),
 895                                   XFS_DATA_FORK);
 896                 ASSERT(ip->i_df.if_bytes == 0);
 897         }
 898         return 0;
 899 }
 900
 901 STATIC int
 902 xfs_inactive_attrs(
 903         xfs_inode_t     *ip,
 904         xfs_trans_t     **tpp)
 905 {
 906         xfs_trans_t     *tp;
 907         int             error;
 908         xfs_mount_t     *mp;
 909
 910         ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
 911         tp = *tpp;
 912         mp = ip->i_mount;
 913         ASSERT(ip->i_d.di_forkoff != 0);
 914         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 915         xfs_iunlock(ip, XFS_ILOCK_EXCL);
 916         if (error)
 917                 goto error_unlock;
 918
 919         error = xfs_attr_inactive(ip);
 920         if (error)
 921                 goto error_unlock;
 922
 923         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
 924         error = xfs_trans_reserve(tp, 0,
 925                                   XFS_IFREE_LOG_RES(mp),
 926                                   0, XFS_TRANS_PERM_LOG_RES,
 927                                   XFS_INACTIVE_LOG_COUNT);
 928         if (error)
 929                 goto error_cancel;
 930
 931         xfs_ilock(ip, XFS_ILOCK_EXCL);
 932         xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 933         xfs_trans_ihold(tp, ip);
 934         xfs_idestroy_fork(ip, XFS_ATTR_FORK);
 935
 936         ASSERT(ip->i_d.di_anextents == 0);
 937
 938         *tpp = tp;
 939         return 0;
 940
 941 error_cancel:
 942         ASSERT(XFS_FORCED_SHUTDOWN(mp));
 943         xfs_trans_cancel(tp, 0);
 944 error_unlock:
 945         *tpp = NULL;
 946         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 947         return error;
 948 }
 949
 950 int
 951 xfs_release(
 952         xfs_inode_t     *ip)
 953 {
 954         xfs_mount_t     *mp = ip->i_mount;
 955         int             error;
 956
 957         if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0))
 958                 return 0;
 959
 960         /* If this is a read-only mount, don't do this (would generate I/O) */
 961         if (mp->m_flags & XFS_MOUNT_RDONLY)
 962                 return 0;
 963
 964         if (!XFS_FORCED_SHUTDOWN(mp)) {
 965                 int truncated;
 966
 967                 /*
 968                  * If we are using filestreams, and we have an unlinked
 969                  * file that we are processing the last close on, then nothing
 970                  * will be able to reopen and write to this file. Purge this
 971                  * inode from the filestreams cache so that it doesn't delay
 972                  * teardown of the inode.
 973                  */
 974                 if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
 975                         xfs_filestream_deassociate(ip);
 976
 977                 /*
 978                  * If we previously truncated this file and removed old data
 979                  * in the process, we want to initiate "early" writeout on
 980                  * the last close.  This is an attempt to combat the notorious
 981                  * NULL files problem which is particularly noticable from a
 982                  * truncate down, buffered (re-)write (delalloc), followed by
 983                  * a crash.  What we are effectively doing here is
 984                  * significantly reducing the time window where we'd otherwise
 985                  * be exposed to that problem.
 986                  */
 987                 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
 988                 if (truncated && VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0)
 989                         xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
 990         }
 991
 992         if (ip->i_d.di_nlink != 0) {
 993                 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
 994                      ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
 995                        ip->i_delayed_blks > 0)) &&
 996                      (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
 997                     (!(ip->i_d.di_flags &
 998                                 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
 999
1000                         /*
1001                          * If we can't get the iolock just skip truncating
1002                          * the blocks past EOF because we could deadlock
1003                          * with the mmap_sem otherwise.  We'll get another
1004                          * chance to drop them once the last reference to
1005                          * the inode is dropped, so we'll never leak blocks
1006                          * permanently.
1007                          */
1008                         error = xfs_free_eofblocks(mp, ip,
1009                                                    XFS_FREE_EOF_TRYLOCK);
1010                         if (error)
1011                                 return error;
1012                 }
1013         }
1014
1015         return 0;
1016 }
1017
/*
 * xfs_inactive
 *
 * This is called when the vnode reference count for the vnode
 * goes to zero.  If the file has been unlinked, then it must
 * now be truncated.  Also, we clear all of the read-ahead state
 * kept for the inode here since the file is now closed.
 *
 * For a still-linked inode this only trims blocks past EOF.  For an
 * unlinked inode it truncates the data (regular files) or frees the
 * symlink storage, removes any attribute fork, and then frees the
 * inode itself with xfs_ifree().
 *
 * Every path returns VN_INACTIVE_CACHE; internal errors are not
 * propagated to the caller.
 */
int
xfs_inactive(
        xfs_inode_t     *ip)
{
        xfs_bmap_free_t free_list;
        xfs_fsblock_t   first_block;
        int             committed;
        xfs_trans_t     *tp;
        xfs_mount_t     *mp;
        int             error;
        int             truncate;

        xfs_itrace_entry(ip);

        /*
         * If the inode is already free, then there can be nothing
         * to clean up here.
         */
        if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) {
                ASSERT(ip->i_df.if_real_bytes == 0);
                ASSERT(ip->i_df.if_broot_bytes == 0);
                return VN_INACTIVE_CACHE;
        }

        /*
         * Only do a truncate if it's a regular file with
         * some actual space in it.  It's OK to look at the
         * inode's fields without the lock because we're the
         * only one with a reference to the inode.
         */
        truncate = ((ip->i_d.di_nlink == 0) &&
            ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
             (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
            ((ip->i_d.di_mode & S_IFMT) == S_IFREG));

        mp = ip->i_mount;

        /* Send the DMAPI destroy event for unlinked inodes, if enabled. */
        if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY))
                XFS_SEND_DESTROY(mp, ip, DM_RIGHT_NULL);

        error = 0;

        /* If this is a read-only mount, don't do this (would generate I/O) */
        if (mp->m_flags & XFS_MOUNT_RDONLY)
                goto out;

        /*
         * Still linked: the only work is trimming blocks past EOF.
         * Unlike xfs_release(), PREALLOC/APPEND inodes are still trimmed
         * here when they have delayed allocation blocks, and the iolock
         * is not merely trylocked (flags argument is 0).
         */
        if (ip->i_d.di_nlink != 0) {
                if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
                     ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
                       ip->i_delayed_blks > 0)) &&
                      (ip->i_df.if_flags & XFS_IFEXTENTS) &&
                     (!(ip->i_d.di_flags &
                                (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
                      (ip->i_delayed_blks != 0)))) {
                        error = xfs_free_eofblocks(mp, ip, 0);
                        if (error)
                                return VN_INACTIVE_CACHE;
                }
                goto out;
        }

        ASSERT(ip->i_d.di_nlink == 0);

        error = xfs_qm_dqattach(ip, 0);
        if (error)
                return VN_INACTIVE_CACHE;

        /*
         * Each of the three branches below ends with tp reserved and the
         * inode locked IOLOCK|ILOCK exclusive, joined to tp and held.
         */
        tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
        if (truncate) {
                /*
                 * Do the xfs_itruncate_start() call before
                 * reserving any log space because itruncate_start
                 * will call into the buffer cache and we can't
                 * do that within a transaction.
                 */
                xfs_ilock(ip, XFS_IOLOCK_EXCL);

                error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
                if (error) {
                        xfs_trans_cancel(tp, 0);
                        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
                        return VN_INACTIVE_CACHE;
                }

                error = xfs_trans_reserve(tp, 0,
                                          XFS_ITRUNCATE_LOG_RES(mp),
                                          0, XFS_TRANS_PERM_LOG_RES,
                                          XFS_ITRUNCATE_LOG_COUNT);
                if (error) {
                        /* Don't call itruncate_cleanup */
                        ASSERT(XFS_FORCED_SHUTDOWN(mp));
                        xfs_trans_cancel(tp, 0);
                        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
                        return VN_INACTIVE_CACHE;
                }

                xfs_ilock(ip, XFS_ILOCK_EXCL);
                xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
                xfs_trans_ihold(tp, ip);

                /*
                 * normally, we have to run xfs_itruncate_finish sync.
                 * But if filesystem is wsync and we're in the inactive
                 * path, then we know that nlink == 0, and that the
                 * xaction that made nlink == 0 is permanently committed
                 * since xfs_remove runs as a synchronous transaction.
                 */
                error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK,
                                (!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0));

                if (error) {
                        xfs_trans_cancel(tp,
                                XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
                        xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
                        return VN_INACTIVE_CACHE;
                }
        } else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) {

                /*
                 * If we get an error while cleaning up a
                 * symlink we bail out.
                 *
                 * Both helpers make the log reservation and lock the
                 * inode themselves; on failure they have already
                 * cancelled the transaction and set tp to NULL.
                 */
                error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
                        xfs_inactive_symlink_rmt(ip, &tp) :
                        xfs_inactive_symlink_local(ip, &tp);

                if (error) {
                        ASSERT(tp == NULL);
                        return VN_INACTIVE_CACHE;
                }

                xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
                xfs_trans_ihold(tp, ip);
        } else {
                /*
                 * Nothing to truncate and not a symlink: just reserve
                 * for the inode free and take both locks.
                 */
                error = xfs_trans_reserve(tp, 0,
                                          XFS_IFREE_LOG_RES(mp),
                                          0, XFS_TRANS_PERM_LOG_RES,
                                          XFS_INACTIVE_LOG_COUNT);
                if (error) {
                        ASSERT(XFS_FORCED_SHUTDOWN(mp));
                        xfs_trans_cancel(tp, 0);
                        return VN_INACTIVE_CACHE;
                }

                xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
                xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
                xfs_trans_ihold(tp, ip);
        }

        /*
         * If there are attributes associated with the file
         * then blow them away now.  The code calls a routine
         * that recursively deconstructs the attribute fork.
         * We need to just commit the current transaction
         * because we can't use it for xfs_attr_inactive().
         */
        if (ip->i_d.di_anextents > 0) {
                error = xfs_inactive_attrs(ip, &tp);
                /*
                 * If we got an error, the transaction is already
                 * cancelled, and the inode is unlocked. Just get out.
                 */
                 if (error)
                         return VN_INACTIVE_CACHE;
        } else if (ip->i_afp) {
                /* No attr extents on disk; only the in-core fork remains. */
                xfs_idestroy_fork(ip, XFS_ATTR_FORK);
        }

        /*
         * Free the inode.
         */
        xfs_bmap_init(&free_list, &first_block);
        error = xfs_ifree(tp, ip, &free_list);
        if (error) {
                /*
                 * If we fail to free the inode, shut down.  The cancel
                 * might do that, we need to make sure.  Otherwise the
                 * inode might be lost for a long time or forever.
                 */
                if (!XFS_FORCED_SHUTDOWN(mp)) {
                        cmn_err(CE_NOTE,
                "xfs_inactive:  xfs_ifree() returned an error = %d on %s",
                                error, mp->m_fsname);
                        xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
                }
                xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
        } else {
                /*
                 * Credit the quota account(s). The inode is gone.
                 */
                xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);

                /*
                 * Just ignore errors at this point.  There is nothing we can
                 * do except to try to keep going. Make sure it's not a silent
                 * error.
                 */
                error = xfs_bmap_finish(&tp,  &free_list, &committed);
                if (error)
                        xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: "
                                "xfs_bmap_finish() returned error %d", error);
                error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
                if (error)
                        xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: "
                                "xfs_trans_commit() returned error %d", error);
        }

        /*
         * Release the dquots held by inode, if any.
         * This and the unlock below run on both the xfs_ifree() success
         * and failure paths.
         */
        xfs_qm_dqdetach(ip);
        xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);

 out:
        return VN_INACTIVE_CACHE;
}
1242
1243 /*
1244  * Lookups up an inode from "name". If ci_name is not NULL, then a CI match
1245  * is allowed, otherwise it has to be an exact match. If a CI match is found,
1246  * ci_name->name will point to a the actual name (caller must free) or
1247  * will be set to NULL if an exact match is found.
1248  */
1249 int
1250 xfs_lookup(
1251         xfs_inode_t             *dp,
1252         struct xfs_name         *name,
1253         xfs_inode_t             **ipp,
1254         struct xfs_name         *ci_name)
1255 {
1256         xfs_ino_t               inum;
1257         int                     error;
1258         uint                    lock_mode;
1259
1260         xfs_itrace_entry(dp);
1261
1262         if (XFS_FORCED_SHUTDOWN(dp->i_mount))
1263                 return XFS_ERROR(EIO);
1264
1265         lock_mode = xfs_ilock_map_shared(dp);
1266         error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
1267         xfs_iunlock_map_shared(dp, lock_mode);
1268
1269         if (error)
1270                 goto out;
1271
1272         error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp, 0);
1273         if (error)
1274                 goto out_free_name;
1275
1276         return 0;
1277
1278 out_free_name:
1279         if (ci_name)
1280                 kmem_free(ci_name->name);
1281 out:
1282         *ipp = NULL;
1283         return error;
1284 }
1285
/*
 * Create a new inode named "name" in directory dp.
 *
 * dp:    parent directory inode.
 * name:  name of the new directory entry.
 * mode:  file mode; S_ISDIR(mode) selects the mkdir reservations, a link
 *        count of 2 and initialisation of the new directory.
 * rdev:  device number handed to xfs_dir_ialloc(); forced to 0 for
 *        directories.
 * ipp:   on success receives the new inode, with a reference taken via
 *        IHOLD() for the caller.  Not written on failure.
 * credp: credentials handed to xfs_dir_ialloc().
 *
 * Returns 0 on success or an error code.  Once the early shutdown and
 * DM_EVENT_CREATE checks have passed, every exit goes through std_return,
 * which sends the DM_EVENT_POSTCREATE event (if enabled) carrying the
 * final error value.
 */
int
xfs_create(
        xfs_inode_t             *dp,
        struct xfs_name         *name,
        mode_t                  mode,
        xfs_dev_t               rdev,
        xfs_inode_t             **ipp,
        cred_t                  *credp)
{
        int                     is_dir = S_ISDIR(mode);
        struct xfs_mount        *mp = dp->i_mount;
        struct xfs_inode        *ip = NULL;
        struct xfs_trans        *tp = NULL;
        int                     error;
        xfs_bmap_free_t         free_list;
        xfs_fsblock_t           first_block;
        boolean_t               unlock_dp_on_error = B_FALSE;
        uint                    cancel_flags;
        int                     committed;
        xfs_prid_t              prid;
        struct xfs_dquot        *udqp = NULL;
        struct xfs_dquot        *gdqp = NULL;
        uint                    resblks;
        uint                    log_res;
        uint                    log_count;

        xfs_itrace_entry(dp);

        if (XFS_FORCED_SHUTDOWN(mp))
                return XFS_ERROR(EIO);

        /* Send the DMAPI pre-create event; a failure aborts the create. */
        if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
                error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
                                dp, DM_RIGHT_NULL, NULL,
                                DM_RIGHT_NULL, name->name, NULL,
                                mode, 0, 0);

                if (error)
                        return error;
        }

        /* Inherit the parent's project id if it asks for that. */
        if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
                prid = dp->i_d.di_projid;
        else
                prid = dfltprid;

        /*
         * Make sure that we have allocated dquot(s) on disk.
         */
        error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
                        XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
        if (error)
                goto std_return;

        /* Pick the space/log reservation and transaction type. */
        if (is_dir) {
                rdev = 0;
                resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
                log_res = XFS_MKDIR_LOG_RES(mp);
                log_count = XFS_MKDIR_LOG_COUNT;
                tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
        } else {
                resblks = XFS_CREATE_SPACE_RES(mp, name->len);
                log_res = XFS_CREATE_LOG_RES(mp);
                log_count = XFS_CREATE_LOG_COUNT;
                tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
        }

        cancel_flags = XFS_TRANS_RELEASE_LOG_RES;

        /*
         * Initially assume that the file does not exist and
         * reserve the resources for that case.  If that is not
         * the case we'll drop the one we have and get a more
         * appropriate transaction later.
         */
        error = xfs_trans_reserve(tp, resblks, log_res, 0,
                        XFS_TRANS_PERM_LOG_RES, log_count);
        if (error == ENOSPC) {
                /* flush outstanding delalloc blocks and retry */
                xfs_flush_inodes(dp);
                error = xfs_trans_reserve(tp, resblks, log_res, 0,
                                XFS_TRANS_PERM_LOG_RES, log_count);
        }
        if (error == ENOSPC) {
                /* No space at all so try a "no-allocation" reservation */
                resblks = 0;
                error = xfs_trans_reserve(tp, 0, log_res, 0,
                                XFS_TRANS_PERM_LOG_RES, log_count);
        }
        if (error) {
                /* No reservation was obtained, so there is none to release. */
                cancel_flags = 0;
                goto out_trans_cancel;
        }

        /*
         * dp is locked here but not yet joined to the transaction, so the
         * error path has to unlock it explicitly until the join below.
         */
        xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
        unlock_dp_on_error = B_TRUE;

        /*
         * Check for directory link count overflow.
         */
        if (is_dir && dp->i_d.di_nlink >= XFS_MAXLINK) {
                error = XFS_ERROR(EMLINK);
                goto out_trans_cancel;
        }

        xfs_bmap_init(&free_list, &first_block);

        /*
         * Reserve disk quota and the inode.
         */
        error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
        if (error)
                goto out_trans_cancel;

        error = xfs_dir_canenter(tp, dp, name, resblks);
        if (error)
                goto out_trans_cancel;

        /*
         * A newly created regular or special file just has one directory
         * entry pointing to them, but a directory also the "." entry
         * pointing to itself.
         */
        error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, credp,
                               prid, resblks > 0, &ip, &committed);
        if (error) {
                /* ENOSPC is cancelled without XFS_TRANS_ABORT; others abort. */
                if (error == ENOSPC)
                        goto out_trans_cancel;
                goto out_trans_abort;
        }

        /*
         * At this point, we've gotten a newly allocated inode.
         * It is locked (and joined to the transaction).
         */
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));

        /*
         * Now we join the directory inode to the transaction.  We do not do it
         * earlier because xfs_dir_ialloc might commit the previous transaction
         * (and release all the locks).  An error from here on will result in
         * the transaction cancel unlocking dp so don't do it explicitly in the
         * error path.
         */
        IHOLD(dp);
        xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
        unlock_dp_on_error = B_FALSE;

        /*
         * Add the directory entry.  The space budget passed on is what is
         * left of resblks after subtracting the inode allocation reserve.
         */
        error = xfs_dir_createname(tp, dp, name, ip->i_ino,
                                        &first_block, &free_list, resblks ?
                                        resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
        if (error) {
                ASSERT(error != ENOSPC);
                goto out_trans_abort;
        }
        xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);

        /* A new directory is initialised and bumps the parent's link count. */
        if (is_dir) {
                error = xfs_dir_init(tp, ip, dp);
                if (error)
                        goto out_bmap_cancel;

                error = xfs_bumplink(tp, dp);
                if (error)
                        goto out_bmap_cancel;
        }

        /*
         * If this is a synchronous mount, make sure that the
         * create transaction goes to disk before returning to
         * the user.
         */
        if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
                xfs_trans_set_sync(tp);

        /*
         * Attach the dquot(s) to the inodes and modify them incore.
         * These ids of the inode couldn't have changed since the new
         * inode has been locked ever since it was created.
         */
        xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);

        /*
         * xfs_trans_commit normally decrements the vnode ref count
         * when it unlocks the inode. Since we want to return the
         * vnode to the caller, we bump the vnode ref count now.
         */
        IHOLD(ip);

        error = xfs_bmap_finish(&tp, &free_list, &committed);
        if (error)
                goto out_abort_rele;

        error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
        if (error) {
                /* Drop the reference taken above for the caller. */
                IRELE(ip);
                goto out_dqrele;
        }

        xfs_qm_dqrele(udqp);
        xfs_qm_dqrele(gdqp);

        *ipp = ip;

        /* Fallthrough to std_return with error = 0  */
 std_return:
        if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
                XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, dp, DM_RIGHT_NULL,
                                ip, DM_RIGHT_NULL, name->name, NULL, mode,
                                error, 0);
        }

        return error;

        /*
         * Error ladder: each label falls through to the ones below it,
         * adding one more piece of cleanup the higher it is entered.
         */
 out_bmap_cancel:
        xfs_bmap_cancel(&free_list);
 out_trans_abort:
        cancel_flags |= XFS_TRANS_ABORT;
 out_trans_cancel:
        xfs_trans_cancel(tp, cancel_flags);
 out_dqrele:
        xfs_qm_dqrele(udqp);
        xfs_qm_dqrele(gdqp);

        if (unlock_dp_on_error)
                xfs_iunlock(dp, XFS_ILOCK_EXCL);

        goto std_return;

 out_abort_rele:
        /*
         * Wait until after the current transaction is aborted to
         * release the inode.  This prevents recursive transactions
         * and deadlocks from xfs_inactive.
         */
        xfs_bmap_cancel(&free_list);
        cancel_flags |= XFS_TRANS_ABORT;
        xfs_trans_cancel(tp, cancel_flags);
        IRELE(ip);
        unlock_dp_on_error = B_FALSE;
        goto out_dqrele;
}
1529
/*
 * Debug-only global counters, compiled in only when DEBUG is defined.
 * NOTE(review): presumably updated by the inode locking/retry code that
 * follows (xfs_lock_inodes and friends) -- confirm against their users.
 */
#ifdef DEBUG
int xfs_locked_n;
int xfs_small_retries;
int xfs_middle_retries;
int xfs_lots_retries;
int xfs_lock_delays;
#endif
1537
1538 /*
1539  * Bump the subclass so xfs_lock_inodes() acquires each lock with
1540  * a different value
1541  */
1542 static inline int
1543 xfs_lock_inumorder(int lock_mode, int subclass)
1544 {
1545         if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
1546                 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
1547         if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
1548                 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
1549
1550         return lock_mode;
1551 }
1552
1553 /*
1554  * The following routine will lock n inodes in exclusive mode.
1555  * We assume the caller calls us with the inodes in i_ino order.
1556  *
1557  * We need to detect deadlock where an inode that we lock
1558  * is in the AIL and we start waiting for another inode that is locked
1559  * by a thread in a long running transaction (such as truncate). This can
1560  * result in deadlock since the long running trans might need to wait
1561  * for the inode we just locked in order to push the tail and free space
1562  * in the log.
1563  */
1564 void
1565 xfs_lock_inodes(
1566         xfs_inode_t     **ips,
1567         int             inodes,
1568         uint            lock_mode)
1569 {
1570         int             attempts = 0, i, j, try_lock;
1571         xfs_log_item_t  *lp;
1572
1573         ASSERT(ips && (inodes >= 2)); /* we need at least two */
1574
1575         try_lock = 0;
1576         i = 0;
1577
1578 again:
1579         for (; i < inodes; i++) {
1580                 ASSERT(ips[i]);
1581
1582                 if (i && (ips[i] == ips[i-1]))  /* Already locked */
1583                         continue;
1584
1585                 /*
1586                  * If try_lock is not set yet, make sure all locked inodes
1587                  * are not in the AIL.
1588                  * If any are, set try_lock to be used later.
1589                  */
1590
1591                 if (!try_lock) {
1592                         for (j = (i - 1); j >= 0 && !try_lock; j--) {
1593                                 lp = (xfs_log_item_t *)ips[j]->i_itemp;
1594                                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
1595                                         try_lock++;
1596                                 }
1597                         }
1598                 }
1599
1600                 /*
1601                  * If any of the previous locks we have locked is in the AIL,
1602                  * we must TRY to get the second and subsequent locks. If
1603                  * we can't get any, we must release all we have
1604                  * and try again.
1605                  */
1606
1607                 if (try_lock) {
1608                         /* try_lock must be 0 if i is 0. */
1609                         /*
1610                          * try_lock means we have an inode locked
1611                          * that is in the AIL.
1612                          */
1613                         ASSERT(i != 0);
1614                         if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
1615                                 attempts++;
1616
1617                                 /*
1618                                  * Unlock all previous guys and try again.
1619                                  * xfs_iunlock will try to push the tail
1620                                  * if the inode is in the AIL.
1621                                  */
1622
1623                                 for(j = i - 1; j >= 0; j--) {
1624
1625                                         /*
1626                                          * Check to see if we've already
1627                                          * unlocked this one.
1628                                          * Not the first one going back,
1629                                          * and the inode ptr is the same.
1630                                          */
1631                                         if ((j != (i - 1)) && ips[j] ==
1632                                                                 ips[j+1])
1633                                                 continue;
1634
1635                                         xfs_iunlock(ips[j], lock_mode);
1636                                 }
1637
1638                                 if ((attempts % 5) == 0) {
1639                                         delay(1); /* Don't just spin the CPU */
1640 #ifdef DEBUG
1641                                         xfs_lock_delays++;
1642 #endif
1643                                 }
1644                                 i = 0;
1645                                 try_lock = 0;
1646                                 goto again;
1647                         }
1648                 } else {
1649                         xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
1650                 }
1651         }
1652
1653 #ifdef DEBUG
1654         if (attempts) {
1655                 if (attempts < 5) xfs_small_retries++;
1656                 else if (attempts < 100) xfs_middle_retries++;
1657                 else xfs_lots_retries++;
1658         } else {
1659                 xfs_locked_n++;
1660         }
1661 #endif
1662 }
1663
1664 /*
1665  * xfs_lock_two_inodes() can only be used to lock one type of lock
1666  * at a time - the iolock or the ilock, but not both at once. If
1667  * we lock both at once, lockdep will report false positives saying
1668  * we have violated locking orders.
1669  */
1670 void
1671 xfs_lock_two_inodes(
1672         xfs_inode_t             *ip0,
1673         xfs_inode_t             *ip1,
1674         uint                    lock_mode)
1675 {
1676         xfs_inode_t             *temp;
1677         int                     attempts = 0;
1678         xfs_log_item_t          *lp;
1679
1680         if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
1681                 ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
1682         ASSERT(ip0->i_ino != ip1->i_ino);
1683
1684         if (ip0->i_ino > ip1->i_ino) {
1685                 temp = ip0;
1686                 ip0 = ip1;
1687                 ip1 = temp;
1688         }
1689
1690  again:
1691         xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));
1692
1693         /*
1694          * If the first lock we have locked is in the AIL, we must TRY to get
1695          * the second lock. If we can't get it, we must release the first one
1696          * and try again.
1697          */
1698         lp = (xfs_log_item_t *)ip0->i_itemp;
1699         if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
1700                 if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
1701                         xfs_iunlock(ip0, lock_mode);
1702                         if ((++attempts % 5) == 0)
1703                                 delay(1); /* Don't just spin the CPU */
1704                         goto again;
1705                 }
1706         } else {
1707                 xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
1708         }
1709 }
1710
1711 int
1712 xfs_remove(
1713         xfs_inode_t             *dp,
1714         struct xfs_name         *name,
1715         xfs_inode_t             *ip)
1716 {
1717         xfs_mount_t             *mp = dp->i_mount;
1718         xfs_trans_t             *tp = NULL;
1719         int                     is_dir = S_ISDIR(ip->i_d.di_mode);
1720         int                     error = 0;
1721         xfs_bmap_free_t         free_list;
1722         xfs_fsblock_t           first_block;
1723         int                     cancel_flags;
1724         int                     committed;
1725         int                     link_zero;
1726         uint                    resblks;
1727         uint                    log_count;
1728
1729         xfs_itrace_entry(dp);
1730         xfs_itrace_entry(ip);
1731
1732         if (XFS_FORCED_SHUTDOWN(mp))
1733                 return XFS_ERROR(EIO);
1734
1735         if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
1736                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dp, DM_RIGHT_NULL,
1737                                         NULL, DM_RIGHT_NULL, name->name, NULL,
1738                                         ip->i_d.di_mode, 0, 0);
1739                 if (error)
1740                         return error;
1741         }
1742
1743         error = xfs_qm_dqattach(dp, 0);
1744         if (error)
1745                 goto std_return;
1746
1747         error = xfs_qm_dqattach(ip, 0);
1748         if (error)
1749                 goto std_return;
1750
1751         if (is_dir) {
1752                 tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
1753                 log_count = XFS_DEFAULT_LOG_COUNT;
1754         } else {
1755                 tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
1756                 log_count = XFS_REMOVE_LOG_COUNT;
1757         }
1758         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1759
1760         /*
1761          * We try to get the real space reservation first,
1762          * allowing for directory btree deletion(s) implying
1763          * possible bmap insert(s).  If we can't get the space
1764          * reservation then we use 0 instead, and avoid the bmap
1765          * btree insert(s) in the directory code by, if the bmap
1766          * insert tries to happen, instead trimming the LAST
1767          * block from the directory.
1768          */
1769         resblks = XFS_REMOVE_SPACE_RES(mp);
1770         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
1771                                   XFS_TRANS_PERM_LOG_RES, log_count);
1772         if (error == ENOSPC) {
1773                 resblks = 0;
1774                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
1775                                           XFS_TRANS_PERM_LOG_RES, log_count);
1776         }
1777         if (error) {
1778                 ASSERT(error != ENOSPC);
1779                 cancel_flags = 0;
1780                 goto out_trans_cancel;
1781         }
1782
1783         xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
1784
1785         /*
1786          * At this point, we've gotten both the directory and the entry
1787          * inodes locked.
1788          */
1789         IHOLD(ip);
1790         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1791
1792         IHOLD(dp);
1793         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1794
1795         /*
1796          * If we're removing a directory perform some additional validation.
1797          */
1798         if (is_dir) {
1799                 ASSERT(ip->i_d.di_nlink >= 2);
1800                 if (ip->i_d.di_nlink != 2) {
1801                         error = XFS_ERROR(ENOTEMPTY);
1802                         goto out_trans_cancel;
1803                 }
1804                 if (!xfs_dir_isempty(ip)) {
1805                         error = XFS_ERROR(ENOTEMPTY);
1806                         goto out_trans_cancel;
1807                 }
1808         }
1809
1810         xfs_bmap_init(&free_list, &first_block);
1811         error = xfs_dir_removename(tp, dp, name, ip->i_ino,
1812                                         &first_block, &free_list, resblks);
1813         if (error) {
1814                 ASSERT(error != ENOENT);
1815                 goto out_bmap_cancel;
1816         }
1817         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1818
1819         if (is_dir) {
1820                 /*
1821                  * Drop the link from ip's "..".
1822                  */
1823                 error = xfs_droplink(tp, dp);
1824                 if (error)
1825                         goto out_bmap_cancel;
1826
1827                 /*
1828                  * Drop the "." link from ip to self.
1829                  */
1830                 error = xfs_droplink(tp, ip);
1831                 if (error)
1832                         goto out_bmap_cancel;
1833         } else {
1834                 /*
1835                  * When removing a non-directory we need to log the parent
1836                  * inode here.  For a directory this is done implicitly
1837                  * by the xfs_droplink call for the ".." entry.
1838                  */
1839                 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1840         }
1841
1842         /*
1843          * Drop the link from dp to ip.
1844          */
1845         error = xfs_droplink(tp, ip);
1846         if (error)
1847                 goto out_bmap_cancel;
1848
1849         /*
1850          * Determine if this is the last link while
1851          * we are in the transaction.
1852          */
1853         link_zero = (ip->i_d.di_nlink == 0);
1854
1855         /*
1856          * If this is a synchronous mount, make sure that the
1857          * remove transaction goes to disk before returning to
1858          * the user.
1859          */
1860         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
1861                 xfs_trans_set_sync(tp);
1862
1863         error = xfs_bmap_finish(&tp, &free_list, &committed);
1864         if (error)
1865                 goto out_bmap_cancel;
1866
1867         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1868         if (error)
1869                 goto std_return;
1870
1871         /*
1872          * If we are using filestreams, kill the stream association.
1873          * If the file is still open it may get a new one but that
1874          * will get killed on last close in xfs_close() so we don't
1875          * have to worry about that.
1876          */
1877         if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
1878                 xfs_filestream_deassociate(ip);
1879
1880  std_return:
1881         if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
1882                 XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, dp, DM_RIGHT_NULL,
1883                                 NULL, DM_RIGHT_NULL, name->name, NULL,
1884                                 ip->i_d.di_mode, error, 0);
1885         }
1886
1887         return error;
1888
1889  out_bmap_cancel:
1890         xfs_bmap_cancel(&free_list);
1891         cancel_flags |= XFS_TRANS_ABORT;
1892  out_trans_cancel:
1893         xfs_trans_cancel(tp, cancel_flags);
1894         goto std_return;
1895 }
1896
1897 int
1898 xfs_link(
1899         xfs_inode_t             *tdp,
1900         xfs_inode_t             *sip,
1901         struct xfs_name         *target_name)
1902 {
1903         xfs_mount_t             *mp = tdp->i_mount;
1904         xfs_trans_t             *tp;
1905         int                     error;
1906         xfs_bmap_free_t         free_list;
1907         xfs_fsblock_t           first_block;
1908         int                     cancel_flags;
1909         int                     committed;
1910         int                     resblks;
1911
1912         xfs_itrace_entry(tdp);
1913         xfs_itrace_entry(sip);
1914
1915         ASSERT(!S_ISDIR(sip->i_d.di_mode));
1916
1917         if (XFS_FORCED_SHUTDOWN(mp))
1918                 return XFS_ERROR(EIO);
1919
1920         if (DM_EVENT_ENABLED(tdp, DM_EVENT_LINK)) {
1921                 error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
1922                                         tdp, DM_RIGHT_NULL,
1923                                         sip, DM_RIGHT_NULL,
1924                                         target_name->name, NULL, 0, 0, 0);
1925                 if (error)
1926                         return error;
1927         }
1928
1929         /* Return through std_return after this point. */
1930
1931         error = xfs_qm_dqattach(sip, 0);
1932         if (error)
1933                 goto std_return;
1934
1935         error = xfs_qm_dqattach(tdp, 0);
1936         if (error)
1937                 goto std_return;
1938
1939         tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
1940         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1941         resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
1942         error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
1943                         XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
1944         if (error == ENOSPC) {
1945                 resblks = 0;
1946                 error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
1947                                 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
1948         }
1949         if (error) {
1950                 cancel_flags = 0;
1951                 goto error_return;
1952         }
1953
1954         xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
1955
1956         /*
1957          * Increment vnode ref counts since xfs_trans_commit &
1958          * xfs_trans_cancel will both unlock the inodes and
1959          * decrement the associated ref counts.
1960          */
1961         IHOLD(sip);
1962         IHOLD(tdp);
1963         xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
1964         xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
1965
1966         /*
1967          * If the source has too many links, we can't make any more to it.
1968          */
1969         if (sip->i_d.di_nlink >= XFS_MAXLINK) {
1970                 error = XFS_ERROR(EMLINK);
1971                 goto error_return;
1972         }
1973
1974         /*
1975          * If we are using project inheritance, we only allow hard link
1976          * creation in our tree when the project IDs are the same; else
1977          * the tree quota mechanism could be circumvented.
1978          */
1979         if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
1980                      (tdp->i_d.di_projid != sip->i_d.di_projid))) {
1981                 error = XFS_ERROR(EXDEV);
1982                 goto error_return;
1983         }
1984
1985         error = xfs_dir_canenter(tp, tdp, target_name, resblks);
1986         if (error)
1987                 goto error_return;
1988
1989         xfs_bmap_init(&free_list, &first_block);
1990
1991         error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
1992                                         &first_block, &free_list, resblks);
1993         if (error)
1994                 goto abort_return;
1995         xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1996         xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
1997
1998         error = xfs_bumplink(tp, sip);
1999         if (error)
2000                 goto abort_return;
2001
2002         /*
2003          * If this is a synchronous mount, make sure that the
2004          * link transaction goes to disk before returning to
2005          * the user.
2006          */
2007         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2008                 xfs_trans_set_sync(tp);
2009         }
2010
2011         error = xfs_bmap_finish (&tp, &free_list, &committed);
2012         if (error) {
2013                 xfs_bmap_cancel(&free_list);
2014                 goto abort_return;
2015         }
2016
2017         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2018         if (error)
2019                 goto std_return;
2020
2021         /* Fall through to std_return with error = 0. */
2022 std_return:
2023         if (DM_EVENT_ENABLED(sip, DM_EVENT_POSTLINK)) {
2024                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
2025                                 tdp, DM_RIGHT_NULL,
2026                                 sip, DM_RIGHT_NULL,
2027                                 target_name->name, NULL, 0, error, 0);
2028         }
2029         return error;
2030
2031  abort_return:
2032         cancel_flags |= XFS_TRANS_ABORT;
2033         /* FALLTHROUGH */
2034
2035  error_return:
2036         xfs_trans_cancel(tp, cancel_flags);
2037         goto std_return;
2038 }
2039
2040 int
2041 xfs_symlink(
2042         xfs_inode_t             *dp,
2043         struct xfs_name         *link_name,
2044         const char              *target_path,
2045         mode_t                  mode,
2046         xfs_inode_t             **ipp,
2047         cred_t                  *credp)
2048 {
2049         xfs_mount_t             *mp = dp->i_mount;
2050         xfs_trans_t             *tp;
2051         xfs_inode_t             *ip;
2052         int                     error;
2053         int                     pathlen;
2054         xfs_bmap_free_t         free_list;
2055         xfs_fsblock_t           first_block;
2056         boolean_t               unlock_dp_on_error = B_FALSE;
2057         uint                    cancel_flags;
2058         int                     committed;
2059         xfs_fileoff_t           first_fsb;
2060         xfs_filblks_t           fs_blocks;
2061         int                     nmaps;
2062         xfs_bmbt_irec_t         mval[SYMLINK_MAPS];
2063         xfs_daddr_t             d;
2064         const char              *cur_chunk;
2065         int                     byte_cnt;
2066         int                     n;
2067         xfs_buf_t               *bp;
2068         xfs_prid_t              prid;
2069         struct xfs_dquot        *udqp, *gdqp;
2070         uint                    resblks;
2071
2072         *ipp = NULL;
2073         error = 0;
2074         ip = NULL;
2075         tp = NULL;
2076
2077         xfs_itrace_entry(dp);
2078
2079         if (XFS_FORCED_SHUTDOWN(mp))
2080                 return XFS_ERROR(EIO);
2081
2082         /*
2083          * Check component lengths of the target path name.
2084          */
2085         pathlen = strlen(target_path);
2086         if (pathlen >= MAXPATHLEN)      /* total string too long */
2087                 return XFS_ERROR(ENAMETOOLONG);
2088
2089         if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
2090                 error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp,
2091                                         DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
2092                                         link_name->name,
2093                                         (unsigned char *)target_path, 0, 0, 0);
2094                 if (error)
2095                         return error;
2096         }
2097
2098         /* Return through std_return after this point. */
2099
2100         udqp = gdqp = NULL;
2101         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
2102                 prid = dp->i_d.di_projid;
2103         else
2104                 prid = (xfs_prid_t)dfltprid;
2105
2106         /*
2107          * Make sure that we have allocated dquot(s) on disk.
2108          */
2109         error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
2110                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2111         if (error)
2112                 goto std_return;
2113
2114         tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
2115         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2116         /*
2117          * The symlink will fit into the inode data fork?
2118          * There can't be any attributes so we get the whole variable part.
2119          */
2120         if (pathlen <= XFS_LITINO(mp))
2121                 fs_blocks = 0;
2122         else
2123                 fs_blocks = XFS_B_TO_FSB(mp, pathlen);
2124         resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
2125         error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
2126                         XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
2127         if (error == ENOSPC && fs_blocks == 0) {
2128                 resblks = 0;
2129                 error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
2130                                 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
2131         }
2132         if (error) {
2133                 cancel_flags = 0;
2134                 goto error_return;
2135         }
2136
2137         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
2138         unlock_dp_on_error = B_TRUE;
2139
2140         /*
2141          * Check whether the directory allows new symlinks or not.
2142          */
2143         if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
2144                 error = XFS_ERROR(EPERM);
2145                 goto error_return;
2146         }
2147
2148         /*
2149          * Reserve disk quota : blocks and inode.
2150          */
2151         error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
2152         if (error)
2153                 goto error_return;
2154
2155         /*
2156          * Check for ability to enter directory entry, if no space reserved.
2157          */
2158         error = xfs_dir_canenter(tp, dp, link_name, resblks);
2159         if (error)
2160                 goto error_return;
2161         /*
2162          * Initialize the bmap freelist prior to calling either
2163          * bmapi or the directory create code.
2164          */
2165         xfs_bmap_init(&free_list, &first_block);
2166
2167         /*
2168          * Allocate an inode for the symlink.
2169          */
2170         error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT),
2171                                1, 0, credp, prid, resblks > 0, &ip, NULL);
2172         if (error) {
2173                 if (error == ENOSPC)
2174                         goto error_return;
2175                 goto error1;
2176         }
2177
2178         /*
2179          * An error after we've joined dp to the transaction will result in the
2180          * transaction cancel unlocking dp so don't do it explicitly in the
2181          * error path.
2182          */
2183         IHOLD(dp);
2184         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2185         unlock_dp_on_error = B_FALSE;
2186
2187         /*
2188          * Also attach the dquot(s) to it, if applicable.
2189          */
2190         xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
2191
2192         if (resblks)
2193                 resblks -= XFS_IALLOC_SPACE_RES(mp);
2194         /*
2195          * If the symlink will fit into the inode, write it inline.
2196          */
2197         if (pathlen <= XFS_IFORK_DSIZE(ip)) {
2198                 xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
2199                 memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
2200                 ip->i_d.di_size = pathlen;
2201
2202                 /*
2203                  * The inode was initially created in extent format.
2204                  */
2205                 ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
2206                 ip->i_df.if_flags |= XFS_IFINLINE;
2207
2208                 ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
2209                 xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
2210
2211         } else {
2212                 first_fsb = 0;
2213                 nmaps = SYMLINK_MAPS;
2214
2215                 error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
2216                                   XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
2217                                   &first_block, resblks, mval, &nmaps,
2218                                   &free_list, NULL);
2219                 if (error) {
2220                         goto error1;
2221                 }
2222
2223                 if (resblks)
2224                         resblks -= fs_blocks;
2225                 ip->i_d.di_size = pathlen;
2226                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2227
2228                 cur_chunk = target_path;
2229                 for (n = 0; n < nmaps; n++) {
2230                         d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
2231                         byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
2232                         bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
2233                                                BTOBB(byte_cnt), 0);
2234                         ASSERT(bp && !XFS_BUF_GETERROR(bp));
2235                         if (pathlen < byte_cnt) {
2236                                 byte_cnt = pathlen;
2237                         }
2238                         pathlen -= byte_cnt;
2239
2240                         memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt);
2241                         cur_chunk += byte_cnt;
2242
2243                         xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
2244                 }
2245         }
2246
2247         /*
2248          * Create the directory entry for the symlink.
2249          */
2250         error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
2251                                         &first_block, &free_list, resblks);
2252         if (error)
2253                 goto error1;
2254         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2255         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2256
2257         /*
2258          * If this is a synchronous mount, make sure that the
2259          * symlink transaction goes to disk before returning to
2260          * the user.
2261          */
2262         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2263                 xfs_trans_set_sync(tp);
2264         }
2265
2266         /*
2267          * xfs_trans_commit normally decrements the vnode ref count
2268          * when it unlocks the inode. Since we want to return the
2269          * vnode to the caller, we bump the vnode ref count now.
2270          */
2271         IHOLD(ip);
2272
2273         error = xfs_bmap_finish(&tp, &free_list, &committed);
2274         if (error) {
2275                 goto error2;
2276         }
2277         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2278         xfs_qm_dqrele(udqp);
2279         xfs_qm_dqrele(gdqp);
2280
2281         /* Fall through to std_return with error = 0 or errno from
2282          * xfs_trans_commit     */
2283 std_return:
2284         if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTSYMLINK)) {
2285                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
2286                                         dp, DM_RIGHT_NULL,
2287                                         error ? NULL : ip,
2288                                         DM_RIGHT_NULL, link_name->name,
2289                                         (unsigned char *)target_path,
2290                                         0, error, 0);
2291         }
2292
2293         if (!error)
2294                 *ipp = ip;
2295         return error;
2296
2297  error2:
2298         IRELE(ip);
2299  error1:
2300         xfs_bmap_cancel(&free_list);
2301         cancel_flags |= XFS_TRANS_ABORT;
2302  error_return:
2303         xfs_trans_cancel(tp, cancel_flags);
2304         xfs_qm_dqrele(udqp);
2305         xfs_qm_dqrele(gdqp);
2306
2307         if (unlock_dp_on_error)
2308                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2309
2310         goto std_return;
2311 }
2312
2313 int
2314 xfs_set_dmattrs(
2315         xfs_inode_t     *ip,
2316         u_int           evmask,
2317         u_int16_t       state)
2318 {
2319         xfs_mount_t     *mp = ip->i_mount;
2320         xfs_trans_t     *tp;
2321         int             error;
2322
2323         if (!capable(CAP_SYS_ADMIN))
2324                 return XFS_ERROR(EPERM);
2325
2326         if (XFS_FORCED_SHUTDOWN(mp))
2327                 return XFS_ERROR(EIO);
2328
2329         tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
2330         error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
2331         if (error) {
2332                 xfs_trans_cancel(tp, 0);
2333                 return error;
2334         }
2335         xfs_ilock(ip, XFS_ILOCK_EXCL);
2336         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2337
2338         ip->i_d.di_dmevmask = evmask;
2339         ip->i_d.di_dmstate  = state;
2340
2341         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2342         IHOLD(ip);
2343         error = xfs_trans_commit(tp, 0);
2344
2345         return error;
2346 }
2347
2348 /*
2349  * xfs_alloc_file_space()
2350  *      This routine allocates disk space for the given file.
2351  *
2352  *      If alloc_type == 0, this request is for an ALLOCSP type
2353  *      request which will change the file size.  In this case, no
2354  *      DMAPI event will be generated by the call.  A TRUNCATE event
2355  *      will be generated later by xfs_setattr.
2356  *
2357  *      If alloc_type != 0, this request is for a RESVSP type
2358  *      request, and a DMAPI DM_EVENT_WRITE will be generated if the
2359  *      lower block boundary byte address is less than the file's
2360  *      length.
2361  *
2362  * RETURNS:
2363  *       0 on success
2364  *      errno on error
2365  *
2366  */
2367 STATIC int
2368 xfs_alloc_file_space(
2369         xfs_inode_t             *ip,
2370         xfs_off_t               offset,
2371         xfs_off_t               len,
2372         int                     alloc_type,
2373         int                     attr_flags)
2374 {
2375         xfs_mount_t             *mp = ip->i_mount;
2376         xfs_off_t               count;
2377         xfs_filblks_t           allocated_fsb;
2378         xfs_filblks_t           allocatesize_fsb;
2379         xfs_extlen_t            extsz, temp;
2380         xfs_fileoff_t           startoffset_fsb;
2381         xfs_fsblock_t           firstfsb;
2382         int                     nimaps;
2383         int                     bmapi_flag;
2384         int                     quota_flag;
2385         int                     rt;
2386         xfs_trans_t             *tp;
2387         xfs_bmbt_irec_t         imaps[1], *imapp;
2388         xfs_bmap_free_t         free_list;
2389         uint                    qblocks, resblks, resrtextents;
2390         int                     committed;
2391         int                     error;
2392
2393         xfs_itrace_entry(ip);
2394
2395         if (XFS_FORCED_SHUTDOWN(mp))
2396                 return XFS_ERROR(EIO);
2397
2398         error = xfs_qm_dqattach(ip, 0);
2399         if (error)
2400                 return error;
2401
2402         if (len <= 0)
2403                 return XFS_ERROR(EINVAL);
2404
2405         rt = XFS_IS_REALTIME_INODE(ip);
2406         extsz = xfs_get_extsz_hint(ip);
2407
2408         count = len;
2409         imapp = &imaps[0];
2410         nimaps = 1;
2411         bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
2412         startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
2413         allocatesize_fsb = XFS_B_TO_FSB(mp, count);
2414
2415         /*      Generate a DMAPI event if needed.       */
2416         if (alloc_type != 0 && offset < ip->i_size &&
2417                         (attr_flags & XFS_ATTR_DMI) == 0  &&
2418                         DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
2419                 xfs_off_t           end_dmi_offset;
2420
2421                 end_dmi_offset = offset+len;
2422                 if (end_dmi_offset > ip->i_size)
2423                         end_dmi_offset = ip->i_size;
2424                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, offset,
2425                                       end_dmi_offset - offset, 0, NULL);
2426                 if (error)
2427                         return error;
2428         }
2429
2430         /*
2431          * Allocate file space until done or until there is an error
2432          */
2433 retry:
2434         while (allocatesize_fsb && !error) {
2435                 xfs_fileoff_t   s, e;
2436
2437                 /*
2438                  * Determine space reservations for data/realtime.
2439                  */
2440                 if (unlikely(extsz)) {
2441                         s = startoffset_fsb;
2442                         do_div(s, extsz);
2443                         s *= extsz;
2444                         e = startoffset_fsb + allocatesize_fsb;
2445                         if ((temp = do_mod(startoffset_fsb, extsz)))
2446                                 e += temp;
2447                         if ((temp = do_mod(e, extsz)))
2448                                 e += extsz - temp;
2449                 } else {
2450                         s = 0;
2451                         e = allocatesize_fsb;
2452                 }
2453
2454                 if (unlikely(rt)) {
2455                         resrtextents = qblocks = (uint)(e - s);
2456                         resrtextents /= mp->m_sb.sb_rextsize;
2457                         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
2458                         quota_flag = XFS_QMOPT_RES_RTBLKS;
2459                 } else {
2460                         resrtextents = 0;
2461                         resblks = qblocks = \
2462                                 XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
2463                         quota_flag = XFS_QMOPT_RES_REGBLKS;
2464                 }
2465
2466                 /*
2467                  * Allocate and setup the transaction.
2468                  */
2469                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
2470                 error = xfs_trans_reserve(tp, resblks,
2471                                           XFS_WRITE_LOG_RES(mp), resrtextents,
2472                                           XFS_TRANS_PERM_LOG_RES,
2473                                           XFS_WRITE_LOG_COUNT);
2474                 /*
2475                  * Check for running out of space
2476                  */
2477                 if (error) {
2478                         /*
2479                          * Free the transaction structure.
2480                          */
2481                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
2482                         xfs_trans_cancel(tp, 0);
2483                         break;
2484                 }
2485                 xfs_ilock(ip, XFS_ILOCK_EXCL);
2486                 error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
2487                                                       0, quota_flag);
2488                 if (error)
2489                         goto error1;
2490
2491                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2492                 xfs_trans_ihold(tp, ip);
2493
2494                 /*
2495                  * Issue the xfs_bmapi() call to allocate the blocks
2496                  */
2497                 xfs_bmap_init(&free_list, &firstfsb);
2498                 error = xfs_bmapi(tp, ip, startoffset_fsb,
2499                                   allocatesize_fsb, bmapi_flag,
2500                                   &firstfsb, 0, imapp, &nimaps,
2501                                   &free_list, NULL);
2502                 if (error) {
2503                         goto error0;
2504                 }
2505
2506                 /*
2507                  * Complete the transaction
2508                  */
2509                 error = xfs_bmap_finish(&tp, &free_list, &committed);
2510                 if (error) {
2511                         goto error0;
2512                 }
2513
2514                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2515                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2516                 if (error) {
2517                         break;
2518                 }
2519
2520                 allocated_fsb = imapp->br_blockcount;
2521
2522                 if (nimaps == 0) {
2523                         error = XFS_ERROR(ENOSPC);
2524                         break;
2525                 }
2526
2527                 startoffset_fsb += allocated_fsb;
2528                 allocatesize_fsb -= allocated_fsb;
2529         }
2530 dmapi_enospc_check:
2531         if (error == ENOSPC && (attr_flags & XFS_ATTR_DMI) == 0 &&
2532             DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) {
2533                 error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
2534                                 ip, DM_RIGHT_NULL,
2535                                 ip, DM_RIGHT_NULL,
2536                                 NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
2537                 if (error == 0)
2538                         goto retry;     /* Maybe DMAPI app. has made space */
2539                 /* else fall through with error from XFS_SEND_DATA */
2540         }
2541
2542         return error;
2543
2544 error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
2545         xfs_bmap_cancel(&free_list);
2546         xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
2547
2548 error1: /* Just cancel transaction */
2549         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
2550         xfs_iunlock(ip, XFS_ILOCK_EXCL);
2551         goto dmapi_enospc_check;
2552 }
2553
2554 /*
2555  * Zero file bytes between startoff and endoff inclusive.
2556  * The iolock is held exclusive and no blocks are buffered.
2557  *
2558  * This function is used by xfs_free_file_space() to zero
2559  * partial blocks when the range to free is not block aligned.
2560  * When unreserving space with boundaries that are not block
2561  * aligned we round up the start and round down the end
2562  * boundaries and then use this function to zero the parts of
2563  * the blocks that got dropped during the rounding.
2564  */
2565 STATIC int
2566 xfs_zero_remaining_bytes(
2567         xfs_inode_t             *ip,
2568         xfs_off_t               startoff,
2569         xfs_off_t               endoff)
2570 {
2571         xfs_bmbt_irec_t         imap;
2572         xfs_fileoff_t           offset_fsb;
2573         xfs_off_t               lastoffset;
2574         xfs_off_t               offset;
2575         xfs_buf_t               *bp;
2576         xfs_mount_t             *mp = ip->i_mount;
2577         int                     nimap;
2578         int                     error = 0;
2579
2580         /*
2581          * Avoid doing I/O beyond eof - it's not necessary
2582          * since nothing can read beyond eof.  The space will
2583          * be zeroed when the file is extended anyway.
2584          */
2585         if (startoff >= ip->i_size)
2586                 return 0;
2587
2588         if (endoff > ip->i_size)
2589                 endoff = ip->i_size;
2590
2591         bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
2592                                 XFS_IS_REALTIME_INODE(ip) ?
2593                                 mp->m_rtdev_targp : mp->m_ddev_targp);
2594         if (!bp)
2595                 return XFS_ERROR(ENOMEM);
2596
2597         for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
2598                 offset_fsb = XFS_B_TO_FSBT(mp, offset);
2599                 nimap = 1;
2600                 error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0,
2601                         NULL, 0, &imap, &nimap, NULL, NULL);
2602                 if (error || nimap < 1)
2603                         break;
2604                 ASSERT(imap.br_blockcount >= 1);
2605                 ASSERT(imap.br_startoff == offset_fsb);
2606                 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
2607                 if (lastoffset > endoff)
2608                         lastoffset = endoff;
2609                 if (imap.br_startblock == HOLESTARTBLOCK)
2610                         continue;
2611                 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
2612                 if (imap.br_state == XFS_EXT_UNWRITTEN)
2613                         continue;
2614                 XFS_BUF_UNDONE(bp);
2615                 XFS_BUF_UNWRITE(bp);
2616                 XFS_BUF_READ(bp);
2617                 XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
2618                 xfsbdstrat(mp, bp);
2619                 error = xfs_iowait(bp);
2620                 if (error) {
2621                         xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
2622                                           mp, bp, XFS_BUF_ADDR(bp));
2623                         break;
2624                 }
2625                 memset(XFS_BUF_PTR(bp) +
2626                         (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
2627                       0, lastoffset - offset + 1);
2628                 XFS_BUF_UNDONE(bp);
2629                 XFS_BUF_UNREAD(bp);
2630                 XFS_BUF_WRITE(bp);
2631                 xfsbdstrat(mp, bp);
2632                 error = xfs_iowait(bp);
2633                 if (error) {
2634                         xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
2635                                           mp, bp, XFS_BUF_ADDR(bp));
2636                         break;
2637                 }
2638         }
2639         xfs_buf_free(bp);
2640         return error;
2641 }
2642
2643 /*
2644  * xfs_free_file_space()
2645  *      This routine frees disk space for the given file.
2646  *
2647  *      This routine is only called by xfs_change_file_space
2648  *      for an UNRESVSP type call.
2649  *
2650  * RETURNS:
2651  *       0 on success
2652  *      errno on error
2653  *
2654  */
2655 STATIC int
2656 xfs_free_file_space(
2657         xfs_inode_t             *ip,
2658         xfs_off_t               offset,
2659         xfs_off_t               len,
2660         int                     attr_flags)
2661 {
2662         int                     committed;
2663         int                     done;
2664         xfs_off_t               end_dmi_offset;
2665         xfs_fileoff_t           endoffset_fsb;
2666         int                     error;
2667         xfs_fsblock_t           firstfsb;
2668         xfs_bmap_free_t         free_list;
2669         xfs_bmbt_irec_t         imap;
2670         xfs_off_t               ioffset;
2671         xfs_extlen_t            mod=0;
2672         xfs_mount_t             *mp;
2673         int                     nimap;
2674         uint                    resblks;
2675         uint                    rounding;
2676         int                     rt;
2677         xfs_fileoff_t           startoffset_fsb;
2678         xfs_trans_t             *tp;
2679         int                     need_iolock = 1;
2680
2681         mp = ip->i_mount;
2682
2683         xfs_itrace_entry(ip);
2684
2685         error = xfs_qm_dqattach(ip, 0);
2686         if (error)
2687                 return error;
2688
2689         error = 0;
2690         if (len <= 0)   /* if nothing being freed */
2691                 return error;
2692         rt = XFS_IS_REALTIME_INODE(ip);
2693         startoffset_fsb = XFS_B_TO_FSB(mp, offset);
2694         end_dmi_offset = offset + len;
2695         endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
2696
2697         if (offset < ip->i_size && (attr_flags & XFS_ATTR_DMI) == 0 &&
2698             DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
2699                 if (end_dmi_offset > ip->i_size)
2700                         end_dmi_offset = ip->i_size;
2701                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip,
2702                                 offset, end_dmi_offset - offset,
2703                                 AT_DELAY_FLAG(attr_flags), NULL);
2704                 if (error)
2705                         return error;
2706         }
2707
2708         if (attr_flags & XFS_ATTR_NOLOCK)
2709                 need_iolock = 0;
2710         if (need_iolock) {
2711                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
2712                 /* wait for the completion of any pending DIOs */
2713                 xfs_ioend_wait(ip);
2714         }
2715
2716         rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
2717         ioffset = offset & ~(rounding - 1);
2718
2719         if (VN_CACHED(VFS_I(ip)) != 0) {
2720                 error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED);
2721                 if (error)
2722                         goto out_unlock_iolock;
2723         }
2724
2725         /*
2726          * Need to zero the stuff we're not freeing, on disk.
2727          * If it's a realtime file & can't use unwritten extents then we
2728          * actually need to zero the extent edges.  Otherwise xfs_bunmapi
2729          * will take care of it for us.
2730          */
2731         if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
2732                 nimap = 1;
2733                 error = xfs_bmapi(NULL, ip, startoffset_fsb,
2734                         1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
2735                 if (error)
2736                         goto out_unlock_iolock;
2737                 ASSERT(nimap == 0 || nimap == 1);
2738                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
2739                         xfs_daddr_t     block;
2740
2741                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
2742                         block = imap.br_startblock;
2743                         mod = do_div(block, mp->m_sb.sb_rextsize);
2744                         if (mod)
2745                                 startoffset_fsb += mp->m_sb.sb_rextsize - mod;
2746                 }
2747                 nimap = 1;
2748                 error = xfs_bmapi(NULL, ip, endoffset_fsb - 1,
2749                         1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
2750                 if (error)
2751                         goto out_unlock_iolock;
2752                 ASSERT(nimap == 0 || nimap == 1);
2753                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
2754                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
2755                         mod++;
2756                         if (mod && (mod != mp->m_sb.sb_rextsize))
2757                                 endoffset_fsb -= mod;
2758                 }
2759         }
2760         if ((done = (endoffset_fsb <= startoffset_fsb)))
2761                 /*
2762                  * One contiguous piece to clear
2763                  */
2764                 error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
2765         else {
2766                 /*
2767                  * Some full blocks, possibly two pieces to clear
2768                  */
2769                 if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
2770                         error = xfs_zero_remaining_bytes(ip, offset,
2771                                 XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
2772                 if (!error &&
2773                     XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
2774                         error = xfs_zero_remaining_bytes(ip,
2775                                 XFS_FSB_TO_B(mp, endoffset_fsb),
2776                                 offset + len - 1);
2777         }
2778
2779         /*
2780          * free file space until done or until there is an error
2781          */
2782         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
2783         while (!error && !done) {
2784
2785                 /*
2786                  * allocate and setup the transaction. Allow this
2787                  * transaction to dip into the reserve blocks to ensure
2788                  * the freeing of the space succeeds at ENOSPC.
2789                  */
2790                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
2791                 tp->t_flags |= XFS_TRANS_RESERVE;
2792                 error = xfs_trans_reserve(tp,
2793                                           resblks,
2794                                           XFS_WRITE_LOG_RES(mp),
2795                                           0,
2796                                           XFS_TRANS_PERM_LOG_RES,
2797                                           XFS_WRITE_LOG_COUNT);
2798
2799                 /*
2800                  * check for running out of space
2801                  */
2802                 if (error) {
2803                         /*
2804                          * Free the transaction structure.
2805                          */
2806                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
2807                         xfs_trans_cancel(tp, 0);
2808                         break;
2809                 }
2810                 xfs_ilock(ip, XFS_ILOCK_EXCL);
2811                 error = xfs_trans_reserve_quota(tp, mp,
2812                                 ip->i_udquot, ip->i_gdquot,
2813                                 resblks, 0, XFS_QMOPT_RES_REGBLKS);
2814                 if (error)
2815                         goto error1;
2816
2817                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2818                 xfs_trans_ihold(tp, ip);
2819
2820                 /*
2821                  * issue the bunmapi() call to free the blocks
2822                  */
2823                 xfs_bmap_init(&free_list, &firstfsb);
2824                 error = xfs_bunmapi(tp, ip, startoffset_fsb,
2825                                   endoffset_fsb - startoffset_fsb,
2826                                   0, 2, &firstfsb, &free_list, NULL, &done);
2827                 if (error) {
2828                         goto error0;
2829                 }
2830
2831                 /*
2832                  * complete the transaction
2833                  */
2834                 error = xfs_bmap_finish(&tp, &free_list, &committed);
2835                 if (error) {
2836                         goto error0;
2837                 }
2838
2839                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2840                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2841         }
2842
2843  out_unlock_iolock:
2844         if (need_iolock)
2845                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
2846         return error;
2847
2848  error0:
2849         xfs_bmap_cancel(&free_list);
2850  error1:
2851         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
2852         xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
2853                     XFS_ILOCK_EXCL);
2854         return error;
2855 }
2856
2857 /*
2858  * xfs_change_file_space()
2859  *      This routine allocates or frees disk space for the given file.
2860  *      The user specified parameters are checked for alignment and size
2861  *      limitations.
2862  *
2863  * RETURNS:
2864  *       0 on success
2865  *      errno on error
2866  *
2867  */
2868 int
2869 xfs_change_file_space(
2870         xfs_inode_t     *ip,
2871         int             cmd,
2872         xfs_flock64_t   *bf,
2873         xfs_off_t       offset,
2874         int             attr_flags)
2875 {
2876         xfs_mount_t     *mp = ip->i_mount;
2877         int             clrprealloc;
2878         int             error;
2879         xfs_fsize_t     fsize;
2880         int             setprealloc;
2881         xfs_off_t       startoffset;
2882         xfs_off_t       llen;
2883         xfs_trans_t     *tp;
2884         struct iattr    iattr;
2885
2886         xfs_itrace_entry(ip);
2887
2888         if (!S_ISREG(ip->i_d.di_mode))
2889                 return XFS_ERROR(EINVAL);
2890
2891         switch (bf->l_whence) {
2892         case 0: /*SEEK_SET*/
2893                 break;
2894         case 1: /*SEEK_CUR*/
2895                 bf->l_start += offset;
2896                 break;
2897         case 2: /*SEEK_END*/
2898                 bf->l_start += ip->i_size;
2899                 break;
2900         default:
2901                 return XFS_ERROR(EINVAL);
2902         }
2903
2904         llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;
2905
2906         if (   (bf->l_start < 0)
2907             || (bf->l_start > XFS_MAXIOFFSET(mp))
2908             || (bf->l_start + llen < 0)
2909             || (bf->l_start + llen > XFS_MAXIOFFSET(mp)))
2910                 return XFS_ERROR(EINVAL);
2911
2912         bf->l_whence = 0;
2913
2914         startoffset = bf->l_start;
2915         fsize = ip->i_size;
2916
2917         /*
2918          * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
2919          * file space.
2920          * These calls do NOT zero the data space allocated to the file,
2921          * nor do they change the file size.
2922          *
2923          * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
2924          * space.
2925          * These calls cause the new file data to be zeroed and the file
2926          * size to be changed.
2927          */
2928         setprealloc = clrprealloc = 0;
2929
2930         switch (cmd) {
2931         case XFS_IOC_RESVSP:
2932         case XFS_IOC_RESVSP64:
2933                 error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
2934                                                                 1, attr_flags);
2935                 if (error)
2936                         return error;
2937                 setprealloc = 1;
2938                 break;
2939
2940         case XFS_IOC_UNRESVSP:
2941         case XFS_IOC_UNRESVSP64:
2942                 if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
2943                                                                 attr_flags)))
2944                         return error;
2945                 break;
2946
2947         case XFS_IOC_ALLOCSP:
2948         case XFS_IOC_ALLOCSP64:
2949         case XFS_IOC_FREESP:
2950         case XFS_IOC_FREESP64:
2951                 if (startoffset > fsize) {
2952                         error = xfs_alloc_file_space(ip, fsize,
2953                                         startoffset - fsize, 0, attr_flags);
2954                         if (error)
2955                                 break;
2956                 }
2957
2958                 iattr.ia_valid = ATTR_SIZE;
2959                 iattr.ia_size = startoffset;
2960
2961                 error = xfs_setattr(ip, &iattr, attr_flags);
2962
2963                 if (error)
2964                         return error;
2965
2966                 clrprealloc = 1;
2967                 break;
2968
2969         default:
2970                 ASSERT(0);
2971                 return XFS_ERROR(EINVAL);
2972         }
2973
2974         /*
2975          * update the inode timestamp, mode, and prealloc flag bits
2976          */
2977         tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
2978
2979         if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
2980                                       0, 0, 0))) {
2981                 /* ASSERT(0); */
2982                 xfs_trans_cancel(tp, 0);
2983                 return error;
2984         }
2985
2986         xfs_ilock(ip, XFS_ILOCK_EXCL);
2987
2988         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2989         xfs_trans_ihold(tp, ip);
2990
2991         if ((attr_flags & XFS_ATTR_DMI) == 0) {
2992                 ip->i_d.di_mode &= ~S_ISUID;
2993
2994                 /*
2995                  * Note that we don't have to worry about mandatory
2996                  * file locking being disabled here because we only
2997                  * clear the S_ISGID bit if the Group execute bit is
2998                  * on, but if it was on then mandatory locking wouldn't
2999                  * have been enabled.
3000                  */
3001                 if (ip->i_d.di_mode & S_IXGRP)
3002                         ip->i_d.di_mode &= ~S_ISGID;
3003
3004                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3005         }
3006         if (setprealloc)
3007                 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
3008         else if (clrprealloc)
3009                 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
3010
3011         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3012         xfs_trans_set_sync(tp);
3013
3014         error = xfs_trans_commit(tp, 0);
3015
3016         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3017
3018         return error;
3019 }