fs/xfs/xfs_buf_item.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
   4  * All Rights Reserved.
   5  */
   6 #include "xfs.h"
   7 #include "xfs_fs.h"
   8 #include "xfs_shared.h"
   9 #include "xfs_format.h"
  10 #include "xfs_log_format.h"
  11 #include "xfs_trans_resv.h"
  12 #include "xfs_bit.h"
  13 #include "xfs_mount.h"
  14 #include "xfs_trans.h"
  15 #include "xfs_trans_priv.h"
  16 #include "xfs_buf_item.h"
  17 #include "xfs_inode.h"
  18 #include "xfs_inode_item.h"
  19 #include "xfs_quota.h"
  20 #include "xfs_dquot_item.h"
  21 #include "xfs_dquot.h"
  22 #include "xfs_trace.h"
  23 #include "xfs_log.h"
  24 #include "xfs_log_priv.h"
  25 #include "xfs_error.h"
  26
  27
/*
 * Cache handle for buffer log items.
 * NOTE(review): presumably the kmem cache that struct xfs_buf_log_item
 * objects are allocated from; it is set up and used outside this section -
 * confirm there.
 */
struct kmem_cache       *xfs_buf_item_cache;
  29
  30 static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
  31 {
  32         return container_of(lip, struct xfs_buf_log_item, bli_item);
  33 }
  34
  35 /* Is this log iovec plausibly large enough to contain the buffer log format? */
  36 bool
  37 xfs_buf_log_check_iovec(
  38         struct xfs_log_iovec            *iovec)
  39 {
  40         struct xfs_buf_log_format       *blfp = iovec->i_addr;
  41         char                            *bmp_end;
  42         char                            *item_end;
  43
  44         if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->i_len)
  45                 return false;
  46
  47         item_end = (char *)iovec->i_addr + iovec->i_len;
  48         bmp_end = (char *)&blfp->blf_data_map[blfp->blf_map_size];
  49         return bmp_end <= item_end;
  50 }
  51
  52 static inline int
  53 xfs_buf_log_format_size(
  54         struct xfs_buf_log_format *blfp)
  55 {
  56         return offsetof(struct xfs_buf_log_format, blf_data_map) +
  57                         (blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
  58 }
  59
  60 static inline bool
  61 xfs_buf_item_straddle(
  62         struct xfs_buf          *bp,
  63         uint                    offset,
  64         int                     first_bit,
  65         int                     nbits)
  66 {
  67         void                    *first, *last;
  68
  69         first = xfs_buf_offset(bp, offset + (first_bit << XFS_BLF_SHIFT));
  70         last = xfs_buf_offset(bp,
  71                         offset + ((first_bit + nbits) << XFS_BLF_SHIFT));
  72
  73         if (last - first != nbits * XFS_BLF_CHUNK)
  74                 return true;
  75         return false;
  76 }
  77
  78 /*
  79  * Return the number of log iovecs and space needed to log the given buf log
  80  * item segment.
  81  *
  82  * It calculates this as 1 iovec for the buf log format structure and 1 for each
  83  * stretch of non-contiguous chunks to be logged.  Contiguous chunks are logged
  84  * in a single iovec.
  85  */
  86 STATIC void
  87 xfs_buf_item_size_segment(
  88         struct xfs_buf_log_item         *bip,
  89         struct xfs_buf_log_format       *blfp,
  90         uint                            offset,
  91         int                             *nvecs,
  92         int                             *nbytes)
  93 {
  94         struct xfs_buf                  *bp = bip->bli_buf;
  95         int                             first_bit;
  96         int                             nbits;
  97         int                             next_bit;
  98         int                             last_bit;
  99
 100         first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
 101         if (first_bit == -1)
 102                 return;
 103
 104         (*nvecs)++;
 105         *nbytes += xfs_buf_log_format_size(blfp);
 106
 107         do {
 108                 nbits = xfs_contig_bits(blfp->blf_data_map,
 109                                         blfp->blf_map_size, first_bit);
 110                 ASSERT(nbits > 0);
 111
 112                 /*
 113                  * Straddling a page is rare because we don't log contiguous
 114                  * chunks of unmapped buffers anywhere.
 115                  */
 116                 if (nbits > 1 &&
 117                     xfs_buf_item_straddle(bp, offset, first_bit, nbits))
 118                         goto slow_scan;
 119
 120                 (*nvecs)++;
 121                 *nbytes += nbits * XFS_BLF_CHUNK;
 122
 123                 /*
 124                  * This takes the bit number to start looking from and
 125                  * returns the next set bit from there.  It returns -1
 126                  * if there are no more bits set or the start bit is
 127                  * beyond the end of the bitmap.
 128                  */
 129                 first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
 130                                         (uint)first_bit + nbits + 1);
 131         } while (first_bit != -1);
 132
 133         return;
 134
 135 slow_scan:
 136         /* Count the first bit we jumped out of the above loop from */
 137         (*nvecs)++;
 138         *nbytes += XFS_BLF_CHUNK;
 139         last_bit = first_bit;
 140         while (last_bit != -1) {
 141                 /*
 142                  * This takes the bit number to start looking from and
 143                  * returns the next set bit from there.  It returns -1
 144                  * if there are no more bits set or the start bit is
 145                  * beyond the end of the bitmap.
 146                  */
 147                 next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
 148                                         last_bit + 1);
 149                 /*
 150                  * If we run out of bits, leave the loop,
 151                  * else if we find a new set of bits bump the number of vecs,
 152                  * else keep scanning the current set of bits.
 153                  */
 154                 if (next_bit == -1) {
 155                         break;
 156                 } else if (next_bit != last_bit + 1 ||
 157                            xfs_buf_item_straddle(bp, offset, first_bit, nbits)) {
 158                         last_bit = next_bit;
 159                         first_bit = next_bit;
 160                         (*nvecs)++;
 161                         nbits = 1;
 162                 } else {
 163                         last_bit++;
 164                         nbits++;
 165                 }
 166                 *nbytes += XFS_BLF_CHUNK;
 167         }
 168 }
 169
 170 /*
 171  * Return the number of log iovecs and space needed to log the given buf log
 172  * item.
 173  *
 174  * Discontiguous buffers need a format structure per region that is being
 175  * logged. This makes the changes in the buffer appear to log recovery as though
 176  * they came from separate buffers, just like would occur if multiple buffers
 177  * were used instead of a single discontiguous buffer. This enables
 178  * discontiguous buffers to be in-memory constructs, completely transparent to
 179  * what ends up on disk.
 180  *
 181  * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
 182  * format structures. If the item has previously been logged and has dirty
 183  * regions, we do not relog them in stale buffers. This has the effect of
 184  * reducing the size of the relogged item by the amount of dirty data tracked
 185  * by the log item. This can result in the committing transaction reducing the
 186  * amount of space being consumed by the CIL.
 187  */
 188 STATIC void
 189 xfs_buf_item_size(
 190         struct xfs_log_item     *lip,
 191         int                     *nvecs,
 192         int                     *nbytes)
 193 {
 194         struct xfs_buf_log_item *bip = BUF_ITEM(lip);
 195         struct xfs_buf          *bp = bip->bli_buf;
 196         int                     i;
 197         int                     bytes;
 198         uint                    offset = 0;
 199
 200         ASSERT(atomic_read(&bip->bli_refcount) > 0);
 201         if (bip->bli_flags & XFS_BLI_STALE) {
 202                 /*
 203                  * The buffer is stale, so all we need to log is the buf log
 204                  * format structure with the cancel flag in it as we are never
 205                  * going to replay the changes tracked in the log item.
 206                  */
 207                 trace_xfs_buf_item_size_stale(bip);
 208                 ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
 209                 *nvecs += bip->bli_format_count;
 210                 for (i = 0; i < bip->bli_format_count; i++) {
 211                         *nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]);
 212                 }
 213                 return;
 214         }
 215
 216         ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
 217
 218         if (bip->bli_flags & XFS_BLI_ORDERED) {
 219                 /*
 220                  * The buffer has been logged just to order it. It is not being
 221                  * included in the transaction commit, so no vectors are used at
 222                  * all.
 223                  */
 224                 trace_xfs_buf_item_size_ordered(bip);
 225                 *nvecs = XFS_LOG_VEC_ORDERED;
 226                 return;
 227         }
 228
 229         /*
 230          * The vector count is based on the number of buffer vectors we have
 231          * dirty bits in. This will only be greater than one when we have a
 232          * compound buffer with more than one segment dirty. Hence for compound
 233          * buffers we need to track which segment the dirty bits correspond to,
 234          * and when we move from one segment to the next increment the vector
 235          * count for the extra buf log format structure that will need to be
 236          * written.
 237          */
 238         bytes = 0;
 239         for (i = 0; i < bip->bli_format_count; i++) {
 240                 xfs_buf_item_size_segment(bip, &bip->bli_formats[i], offset,
 241                                           nvecs, &bytes);
 242                 offset += BBTOB(bp->b_maps[i].bm_len);
 243         }
 244
 245         /*
 246          * Round up the buffer size required to minimise the number of memory
 247          * allocations that need to be done as this item grows when relogged by
 248          * repeated modifications.
 249          */
 250         *nbytes = round_up(bytes, 512);
 251         trace_xfs_buf_item_size(bip);
 252 }
 253
 254 static inline void
 255 xfs_buf_item_copy_iovec(
 256         struct xfs_log_vec      *lv,
 257         struct xfs_log_iovec    **vecp,
 258         struct xfs_buf          *bp,
 259         uint                    offset,
 260         int                     first_bit,
 261         uint                    nbits)
 262 {
 263         offset += first_bit * XFS_BLF_CHUNK;
 264         xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK,
 265                         xfs_buf_offset(bp, offset),
 266                         nbits * XFS_BLF_CHUNK);
 267 }
 268
 269 static void
 270 xfs_buf_item_format_segment(
 271         struct xfs_buf_log_item *bip,
 272         struct xfs_log_vec      *lv,
 273         struct xfs_log_iovec    **vecp,
 274         uint                    offset,
 275         struct xfs_buf_log_format *blfp)
 276 {
 277         struct xfs_buf          *bp = bip->bli_buf;
 278         uint                    base_size;
 279         int                     first_bit;
 280         int                     last_bit;
 281         int                     next_bit;
 282         uint                    nbits;
 283
 284         /* copy the flags across from the base format item */
 285         blfp->blf_flags = bip->__bli_format.blf_flags;
 286
 287         /*
 288          * Base size is the actual size of the ondisk structure - it reflects
 289          * the actual size of the dirty bitmap rather than the size of the in
 290          * memory structure.
 291          */
 292         base_size = xfs_buf_log_format_size(blfp);
 293
 294         first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
 295         if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) {
 296                 /*
 297                  * If the map is not be dirty in the transaction, mark
 298                  * the size as zero and do not advance the vector pointer.
 299                  */
 300                 return;
 301         }
 302
 303         blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size);
 304         blfp->blf_size = 1;
 305
 306         if (bip->bli_flags & XFS_BLI_STALE) {
 307                 /*
 308                  * The buffer is stale, so all we need to log
 309                  * is the buf log format structure with the
 310                  * cancel flag in it.
 311                  */
 312                 trace_xfs_buf_item_format_stale(bip);
 313                 ASSERT(blfp->blf_flags & XFS_BLF_CANCEL);
 314                 return;
 315         }
 316
 317
 318         /*
 319          * Fill in an iovec for each set of contiguous chunks.
 320          */
 321         do {
 322                 ASSERT(first_bit >= 0);
 323                 nbits = xfs_contig_bits(blfp->blf_data_map,
 324                                         blfp->blf_map_size, first_bit);
 325                 ASSERT(nbits > 0);
 326
 327                 /*
 328                  * Straddling a page is rare because we don't log contiguous
 329                  * chunks of unmapped buffers anywhere.
 330                  */
 331                 if (nbits > 1 &&
 332                     xfs_buf_item_straddle(bp, offset, first_bit, nbits))
 333                         goto slow_scan;
 334
 335                 xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
 336                                         first_bit, nbits);
 337                 blfp->blf_size++;
 338
 339                 /*
 340                  * This takes the bit number to start looking from and
 341                  * returns the next set bit from there.  It returns -1
 342                  * if there are no more bits set or the start bit is
 343                  * beyond the end of the bitmap.
 344                  */
 345                 first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
 346                                         (uint)first_bit + nbits + 1);
 347         } while (first_bit != -1);
 348
 349         return;
 350
 351 slow_scan:
 352         ASSERT(bp->b_addr == NULL);
 353         last_bit = first_bit;
 354         nbits = 1;
 355         for (;;) {
 356                 /*
 357                  * This takes the bit number to start looking from and
 358                  * returns the next set bit from there.  It returns -1
 359                  * if there are no more bits set or the start bit is
 360                  * beyond the end of the bitmap.
 361                  */
 362                 next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
 363                                         (uint)last_bit + 1);
 364                 /*
 365                  * If we run out of bits fill in the last iovec and get out of
 366                  * the loop.  Else if we start a new set of bits then fill in
 367                  * the iovec for the series we were looking at and start
 368                  * counting the bits in the new one.  Else we're still in the
 369                  * same set of bits so just keep counting and scanning.
 370                  */
 371                 if (next_bit == -1) {
 372                         xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
 373                                                 first_bit, nbits);
 374                         blfp->blf_size++;
 375                         break;
 376                 } else if (next_bit != last_bit + 1 ||
 377                            xfs_buf_item_straddle(bp, offset, first_bit, nbits)) {
 378                         xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
 379                                                 first_bit, nbits);
 380                         blfp->blf_size++;
 381                         first_bit = next_bit;
 382                         last_bit = next_bit;
 383                         nbits = 1;
 384                 } else {
 385                         last_bit++;
 386                         nbits++;
 387                 }
 388         }
 389 }
 390
 391 /*
 392  * This is called to fill in the vector of log iovecs for the
 393  * given log buf item.  It fills the first entry with a buf log
 394  * format structure, and the rest point to contiguous chunks
 395  * within the buffer.
 396  */
 397 STATIC void
 398 xfs_buf_item_format(
 399         struct xfs_log_item     *lip,
 400         struct xfs_log_vec      *lv)
 401 {
 402         struct xfs_buf_log_item *bip = BUF_ITEM(lip);
 403         struct xfs_buf          *bp = bip->bli_buf;
 404         struct xfs_log_iovec    *vecp = NULL;
 405         uint                    offset = 0;
 406         int                     i;
 407
 408         ASSERT(atomic_read(&bip->bli_refcount) > 0);
 409         ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
 410                (bip->bli_flags & XFS_BLI_STALE));
 411         ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
 412                (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
 413                 && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
 414         ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) ||
 415                (bip->bli_flags & XFS_BLI_STALE));
 416
 417
 418         /*
 419          * If it is an inode buffer, transfer the in-memory state to the
 420          * format flags and clear the in-memory state.
 421          *
 422          * For buffer based inode allocation, we do not transfer
 423          * this state if the inode buffer allocation has not yet been committed
 424          * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
 425          * correct replay of the inode allocation.
 426          *
 427          * For icreate item based inode allocation, the buffers aren't written
 428          * to the journal during allocation, and hence we should always tag the
 429          * buffer as an inode buffer so that the correct unlinked list replay
 430          * occurs during recovery.
 431          */
 432         if (bip->bli_flags & XFS_BLI_INODE_BUF) {
 433                 if (xfs_has_v3inodes(lip->li_log->l_mp) ||
 434                     !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
 435                       xfs_log_item_in_current_chkpt(lip)))
 436                         bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
 437                 bip->bli_flags &= ~XFS_BLI_INODE_BUF;
 438         }
 439
 440         for (i = 0; i < bip->bli_format_count; i++) {
 441                 xfs_buf_item_format_segment(bip, lv, &vecp, offset,
 442                                             &bip->bli_formats[i]);
 443                 offset += BBTOB(bp->b_maps[i].bm_len);
 444         }
 445
 446         /*
 447          * Check to make sure everything is consistent.
 448          */
 449         trace_xfs_buf_item_format(bip);
 450 }
 451
 452 /*
 453  * This is called to pin the buffer associated with the buf log item in memory
 454  * so it cannot be written out.
 455  *
 456  * We take a reference to the buffer log item here so that the BLI life cycle
 457  * extends at least until the buffer is unpinned via xfs_buf_item_unpin() and
 458  * inserted into the AIL.
 459  *
 460  * We also need to take a reference to the buffer itself as the BLI unpin
 461  * processing requires accessing the buffer after the BLI has dropped the final
 462  * BLI reference. See xfs_buf_item_unpin() for an explanation.
 463  * If unpins race to drop the final BLI reference and only the
 464  * BLI owns a reference to the buffer, then the loser of the race can have the
 465  * buffer fgreed from under it (e.g. on shutdown). Taking a buffer reference per
 466  * pin count ensures the life cycle of the buffer extends for as
 467  * long as we hold the buffer pin reference in xfs_buf_item_unpin().
 468  */
 469 STATIC void
 470 xfs_buf_item_pin(
 471         struct xfs_log_item     *lip)
 472 {
 473         struct xfs_buf_log_item *bip = BUF_ITEM(lip);
 474
 475         ASSERT(atomic_read(&bip->bli_refcount) > 0);
 476         ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
 477                (bip->bli_flags & XFS_BLI_ORDERED) ||
 478                (bip->bli_flags & XFS_BLI_STALE));
 479
 480         trace_xfs_buf_item_pin(bip);
 481
 482         xfs_buf_hold(bip->bli_buf);
 483         atomic_inc(&bip->bli_refcount);
 484         atomic_inc(&bip->bli_buf->b_pin_count);
 485 }
 486
 487 /*
 488  * This is called to unpin the buffer associated with the buf log item which was
 489  * previously pinned with a call to xfs_buf_item_pin().  We enter this function
 490  * with a buffer pin count, a buffer reference and a BLI reference.
 491  *
 492  * We must drop the BLI reference before we unpin the buffer because the AIL
 493  * doesn't acquire a BLI reference whenever it accesses it. Therefore if the
 494  * refcount drops to zero, the bli could still be AIL resident and the buffer
 495  * submitted for I/O at any point before we return. This can result in IO
 496  * completion freeing the buffer while we are still trying to access it here.
 497  * This race condition can also occur in shutdown situations where we abort and
 498  * unpin buffers from contexts other that journal IO completion.
 499  *
 500  * Hence we have to hold a buffer reference per pin count to ensure that the
 501  * buffer cannot be freed until we have finished processing the unpin operation.
 502  * The reference is taken in xfs_buf_item_pin(), and we must hold it until we
 503  * are done processing the buffer state. In the case of an abort (remove =
 504  * true) then we re-use the current pin reference as the IO reference we hand
 505  * off to IO failure handling.
 506  */
 507 STATIC void
 508 xfs_buf_item_unpin(
 509         struct xfs_log_item     *lip,
 510         int                     remove)
 511 {
 512         struct xfs_buf_log_item *bip = BUF_ITEM(lip);
 513         struct xfs_buf          *bp = bip->bli_buf;
 514         int                     stale = bip->bli_flags & XFS_BLI_STALE;
 515         int                     freed;
 516
 517         ASSERT(bp->b_log_item == bip);
 518         ASSERT(atomic_read(&bip->bli_refcount) > 0);
 519
 520         trace_xfs_buf_item_unpin(bip);
 521
 522         freed = atomic_dec_and_test(&bip->bli_refcount);
 523         if (atomic_dec_and_test(&bp->b_pin_count))
 524                 wake_up_all(&bp->b_waiters);
 525
 526         /*
 527          * Nothing to do but drop the buffer pin reference if the BLI is
 528          * still active.
 529          */
 530         if (!freed) {
 531                 xfs_buf_rele(bp);
 532                 return;
 533         }
 534
 535         if (stale) {
 536                 ASSERT(bip->bli_flags & XFS_BLI_STALE);
 537                 ASSERT(xfs_buf_islocked(bp));
 538                 ASSERT(bp->b_flags & XBF_STALE);
 539                 ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
 540                 ASSERT(list_empty(&lip->li_trans));
 541                 ASSERT(!bp->b_transp);
 542
 543                 trace_xfs_buf_item_unpin_stale(bip);
 544
 545                 /*
 546                  * The buffer has been locked and referenced since it was marked
 547                  * stale so we own both lock and reference exclusively here. We
 548                  * do not need the pin reference any more, so drop it now so
 549                  * that we only have one reference to drop once item completion
 550                  * processing is complete.
 551                  */
 552                 xfs_buf_rele(bp);
 553
 554                 /*
 555                  * If we get called here because of an IO error, we may or may
 556                  * not have the item on the AIL. xfs_trans_ail_delete() will
 557                  * take care of that situation. xfs_trans_ail_delete() drops
 558                  * the AIL lock.
 559                  */
 560                 if (bip->bli_flags & XFS_BLI_STALE_INODE) {
 561                         xfs_buf_item_done(bp);
 562                         xfs_buf_inode_iodone(bp);
 563                         ASSERT(list_empty(&bp->b_li_list));
 564                 } else {
 565                         xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR);
 566                         xfs_buf_item_relse(bp);
 567                         ASSERT(bp->b_log_item == NULL);
 568                 }
 569                 xfs_buf_relse(bp);
 570                 return;
 571         }
 572
 573         if (remove) {
 574                 /*
 575                  * We need to simulate an async IO failures here to ensure that
 576                  * the correct error completion is run on this buffer. This
 577                  * requires a reference to the buffer and for the buffer to be
 578                  * locked. We can safely pass ownership of the pin reference to
 579                  * the IO to ensure that nothing can free the buffer while we
 580                  * wait for the lock and then run the IO failure completion.
 581                  */
 582                 xfs_buf_lock(bp);
 583                 bp->b_flags |= XBF_ASYNC;
 584                 xfs_buf_ioend_fail(bp);
 585                 return;
 586         }
 587
 588         /*
 589          * BLI has no more active references - it will be moved to the AIL to
 590          * manage the remaining BLI/buffer life cycle. There is nothing left for
 591          * us to do here so drop the pin reference to the buffer.
 592          */
 593         xfs_buf_rele(bp);
 594 }
 595
 596 STATIC uint
 597 xfs_buf_item_push(
 598         struct xfs_log_item     *lip,
 599         struct list_head        *buffer_list)
 600 {
 601         struct xfs_buf_log_item *bip = BUF_ITEM(lip);
 602         struct xfs_buf          *bp = bip->bli_buf;
 603         uint                    rval = XFS_ITEM_SUCCESS;
 604
 605         if (xfs_buf_ispinned(bp))
 606                 return XFS_ITEM_PINNED;
 607         if (!xfs_buf_trylock(bp)) {
 608                 /*
 609                  * If we have just raced with a buffer being pinned and it has
 610                  * been marked stale, we could end up stalling until someone else
 611                  * issues a log force to unpin the stale buffer. Check for the
 612                  * race condition here so xfsaild recognizes the buffer is pinned
 613                  * and queues a log force to move it along.
 614                  */
 615                 if (xfs_buf_ispinned(bp))
 616                         return XFS_ITEM_PINNED;
 617                 return XFS_ITEM_LOCKED;
 618         }
 619
 620         ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
 621
 622         trace_xfs_buf_item_push(bip);
 623
 624         /* has a previous flush failed due to IO errors? */
 625         if (bp->b_flags & XBF_WRITE_FAIL) {
 626                 xfs_buf_alert_ratelimited(bp, "XFS: Failing async write",
 627             "Failing async write on buffer block 0x%llx. Retrying async write.",
 628                                           (long long)xfs_buf_daddr(bp));
 629         }
 630
 631         if (!xfs_buf_delwri_queue(bp, buffer_list))
 632                 rval = XFS_ITEM_FLUSHING;
 633         xfs_buf_unlock(bp);
 634         return rval;
 635 }
 636
 637 /*
 638  * Drop the buffer log item refcount and take appropriate action. This helper
 639  * determines whether the bli must be freed or not, since a decrement to zero
 640  * does not necessarily mean the bli is unused.
 641  *
 642  * Return true if the bli is freed, false otherwise.
 643  */
 644 bool
 645 xfs_buf_item_put(
 646         struct xfs_buf_log_item *bip)
 647 {
 648         struct xfs_log_item     *lip = &bip->bli_item;
 649         bool                    aborted;
 650         bool                    dirty;
 651
 652         /* drop the bli ref and return if it wasn't the last one */
 653         if (!atomic_dec_and_test(&bip->bli_refcount))
 654                 return false;
 655
 656         /*
 657          * We dropped the last ref and must free the item if clean or aborted.
 658          * If the bli is dirty and non-aborted, the buffer was clean in the
 659          * transaction but still awaiting writeback from previous changes. In
 660          * that case, the bli is freed on buffer writeback completion.
 661          */
 662         aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) ||
 663                         xlog_is_shutdown(lip->li_log);
 664         dirty = bip->bli_flags & XFS_BLI_DIRTY;
 665         if (dirty && !aborted)
 666                 return false;
 667
 668         /*
 669          * The bli is aborted or clean. An aborted item may be in the AIL
 670          * regardless of dirty state.  For example, consider an aborted
 671          * transaction that invalidated a dirty bli and cleared the dirty
 672          * state.
 673          */
 674         if (aborted)
 675                 xfs_trans_ail_delete(lip, 0);
 676         xfs_buf_item_relse(bip->bli_buf);
 677         return true;
 678 }
 679
 680 /*
 681  * Release the buffer associated with the buf log item.  If there is no dirty
 682  * logged data associated with the buffer recorded in the buf log item, then
 683  * free the buf log item and remove the reference to it in the buffer.
 684  *
 685  * This call ignores the recursion count.  It is only called when the buffer
 686  * should REALLY be unlocked, regardless of the recursion count.
 687  *
 688  * We unconditionally drop the transaction's reference to the log item. If the
 689  * item was logged, then another reference was taken when it was pinned, so we
 690  * can safely drop the transaction reference now.  This also allows us to avoid
 691  * potential races with the unpin code freeing the bli by not referencing the
 692  * bli after we've dropped the reference count.
 693  *
 694  * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
 695  * if necessary but do not unlock the buffer.  This is for support of
 696  * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
 697  * free the item.
 698  */
 699 STATIC void
 700 xfs_buf_item_release(
 701         struct xfs_log_item     *lip)
 702 {
 703         struct xfs_buf_log_item *bip = BUF_ITEM(lip);
 704         struct xfs_buf          *bp = bip->bli_buf;
 705         bool                    released;
 706         bool                    hold = bip->bli_flags & XFS_BLI_HOLD;
 707         bool                    stale = bip->bli_flags & XFS_BLI_STALE;
 708 #if defined(DEBUG) || defined(XFS_WARN)
 709         bool                    ordered = bip->bli_flags & XFS_BLI_ORDERED;
 710         bool                    dirty = bip->bli_flags & XFS_BLI_DIRTY;
 711         bool                    aborted = test_bit(XFS_LI_ABORTED,
 712                                                    &lip->li_flags);
 713 #endif
 714
 715         trace_xfs_buf_item_release(bip);
 716
 717         /*
 718          * The bli dirty state should match whether the blf has logged segments
 719          * except for ordered buffers, where only the bli should be dirty.
 720          */
 721         ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) ||
 722                (ordered && dirty && !xfs_buf_item_dirty_format(bip)));
 723         ASSERT(!stale || (bip->__bli_format.blf_flags & XFS_BLF_CANCEL));
 724
 725         /*
 726          * Clear the buffer's association with this transaction and
 727          * per-transaction state from the bli, which has been copied above.
 728          */
 729         bp->b_transp = NULL;
 730         bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
 731
 732         /*
 733          * Unref the item and unlock the buffer unless held or stale. Stale
 734          * buffers remain locked until final unpin unless the bli is freed by
 735          * the unref call. The latter implies shutdown because buffer
 736          * invalidation dirties the bli and transaction.
 737          */
 738         released = xfs_buf_item_put(bip);
 739         if (hold || (stale && !released))
 740                 return;
 741         ASSERT(!stale || aborted);
 742         xfs_buf_relse(bp);
 743 }
 744
 745 STATIC void
 746 xfs_buf_item_committing(
 747         struct xfs_log_item     *lip,
 748         xfs_csn_t               seq)
 749 {
 750         return xfs_buf_item_release(lip);
 751 }
 752
 753 /*
 754  * This is called to find out where the oldest active copy of the
 755  * buf log item in the on disk log resides now that the last log
 756  * write of it completed at the given lsn.
 757  * We always re-log all the dirty data in a buffer, so usually the
 758  * latest copy in the on disk log is the only one that matters.  For
 759  * those cases we simply return the given lsn.
 760  *
 761  * The one exception to this is for buffers full of newly allocated
 762  * inodes.  These buffers are only relogged with the XFS_BLI_INODE_BUF
 763  * flag set, indicating that only the di_next_unlinked fields from the
 764  * inodes in the buffers will be replayed during recovery.  If the
 765  * original newly allocated inode images have not yet been flushed
 766  * when the buffer is so relogged, then we need to make sure that we
 767  * keep the old images in the 'active' portion of the log.  We do this
 768  * by returning the original lsn of that transaction here rather than
 769  * the current one.
 770  */
 771 STATIC xfs_lsn_t
 772 xfs_buf_item_committed(
 773         struct xfs_log_item     *lip,
 774         xfs_lsn_t               lsn)
 775 {
 776         struct xfs_buf_log_item *bip = BUF_ITEM(lip);
 777
 778         trace_xfs_buf_item_committed(bip);
 779
 780         if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0)
 781                 return lip->li_lsn;
 782         return lsn;
 783 }
 784
 785 #ifdef DEBUG_EXPENSIVE
 786 static int
 787 xfs_buf_item_precommit(
 788         struct xfs_trans        *tp,
 789         struct xfs_log_item     *lip)
 790 {
 791         struct xfs_buf_log_item *bip = BUF_ITEM(lip);
 792         struct xfs_buf          *bp = bip->bli_buf;
 793         struct xfs_mount        *mp = bp->b_mount;
 794         xfs_failaddr_t          fa;
 795
 796         if (!bp->b_ops || !bp->b_ops->verify_struct)
 797                 return 0;
 798         if (bip->bli_flags & XFS_BLI_STALE)
 799                 return 0;
 800
 801         fa = bp->b_ops->verify_struct(bp);
 802         if (fa) {
 803                 xfs_buf_verifier_error(bp, -EFSCORRUPTED, bp->b_ops->name,
 804                                 bp->b_addr, BBTOB(bp->b_length), fa);
 805                 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 806                 ASSERT(fa == NULL);
 807         }
 808
 809         return 0;
 810 }
 811 #else
 812 # define xfs_buf_item_precommit NULL
 813 #endif
 814
 815 static const struct xfs_item_ops xfs_buf_item_ops = {
 816         .iop_size       = xfs_buf_item_size,
 817         .iop_precommit  = xfs_buf_item_precommit,
 818         .iop_format     = xfs_buf_item_format,
 819         .iop_pin        = xfs_buf_item_pin,
 820         .iop_unpin      = xfs_buf_item_unpin,
 821         .iop_release    = xfs_buf_item_release,
 822         .iop_committing = xfs_buf_item_committing,
 823         .iop_committed  = xfs_buf_item_committed,
 824         .iop_push       = xfs_buf_item_push,
 825 };
 826
 827 STATIC void
 828 xfs_buf_item_get_format(
 829         struct xfs_buf_log_item *bip,
 830         int                     count)
 831 {
 832         ASSERT(bip->bli_formats == NULL);
 833         bip->bli_format_count = count;
 834
 835         if (count == 1) {
 836                 bip->bli_formats = &bip->__bli_format;
 837                 return;
 838         }
 839
 840         bip->bli_formats = kzalloc(count * sizeof(struct xfs_buf_log_format),
 841                                 GFP_KERNEL | __GFP_NOFAIL);
 842 }
 843
 844 STATIC void
 845 xfs_buf_item_free_format(
 846         struct xfs_buf_log_item *bip)
 847 {
 848         if (bip->bli_formats != &bip->__bli_format) {
 849                 kfree(bip->bli_formats);
 850                 bip->bli_formats = NULL;
 851         }
 852 }
 853
 854 /*
 855  * Allocate a new buf log item to go with the given buffer.
 856  * Set the buffer's b_log_item field to point to the new
 857  * buf log item.
 858  */
 859 int
 860 xfs_buf_item_init(
 861         struct xfs_buf  *bp,
 862         struct xfs_mount *mp)
 863 {
 864         struct xfs_buf_log_item *bip = bp->b_log_item;
 865         int                     chunks;
 866         int                     map_size;
 867         int                     i;
 868
 869         /*
 870          * Check to see if there is already a buf log item for
 871          * this buffer. If we do already have one, there is
 872          * nothing to do here so return.
 873          */
 874         ASSERT(bp->b_mount == mp);
 875         if (bip) {
 876                 ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
 877                 ASSERT(!bp->b_transp);
 878                 ASSERT(bip->bli_buf == bp);
 879                 return 0;
 880         }
 881
 882         bip = kmem_cache_zalloc(xfs_buf_item_cache, GFP_KERNEL | __GFP_NOFAIL);
 883         xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
 884         bip->bli_buf = bp;
 885
 886         /*
 887          * chunks is the number of XFS_BLF_CHUNK size pieces the buffer
 888          * can be divided into. Make sure not to truncate any pieces.
 889          * map_size is the size of the bitmap needed to describe the
 890          * chunks of the buffer.
 891          *
 892          * Discontiguous buffer support follows the layout of the underlying
 893          * buffer. This makes the implementation as simple as possible.
 894          */
 895         xfs_buf_item_get_format(bip, bp->b_map_count);
 896
 897         for (i = 0; i < bip->bli_format_count; i++) {
 898                 chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
 899                                       XFS_BLF_CHUNK);
 900                 map_size = DIV_ROUND_UP(chunks, NBWORD);
 901
 902                 if (map_size > XFS_BLF_DATAMAP_SIZE) {
 903                         kmem_cache_free(xfs_buf_item_cache, bip);
 904                         xfs_err(mp,
 905         "buffer item dirty bitmap (%u uints) too small to reflect %u bytes!",
 906                                         map_size,
 907                                         BBTOB(bp->b_maps[i].bm_len));
 908                         return -EFSCORRUPTED;
 909                 }
 910
 911                 bip->bli_formats[i].blf_type = XFS_LI_BUF;
 912                 bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn;
 913                 bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len;
 914                 bip->bli_formats[i].blf_map_size = map_size;
 915         }
 916
 917         bp->b_log_item = bip;
 918         xfs_buf_hold(bp);
 919         return 0;
 920 }
 921
 922
 923 /*
 924  * Mark bytes first through last inclusive as dirty in the buf
 925  * item's bitmap.
 926  */
 927 static void
 928 xfs_buf_item_log_segment(
 929         uint                    first,
 930         uint                    last,
 931         uint                    *map)
 932 {
 933         uint            first_bit;
 934         uint            last_bit;
 935         uint            bits_to_set;
 936         uint            bits_set;
 937         uint            word_num;
 938         uint            *wordp;
 939         uint            bit;
 940         uint            end_bit;
 941         uint            mask;
 942
 943         ASSERT(first < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);
 944         ASSERT(last < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);
 945
 946         /*
 947          * Convert byte offsets to bit numbers.
 948          */
 949         first_bit = first >> XFS_BLF_SHIFT;
 950         last_bit = last >> XFS_BLF_SHIFT;
 951
 952         /*
 953          * Calculate the total number of bits to be set.
 954          */
 955         bits_to_set = last_bit - first_bit + 1;
 956
 957         /*
 958          * Get a pointer to the first word in the bitmap
 959          * to set a bit in.
 960          */
 961         word_num = first_bit >> BIT_TO_WORD_SHIFT;
 962         wordp = &map[word_num];
 963
 964         /*
 965          * Calculate the starting bit in the first word.
 966          */
 967         bit = first_bit & (uint)(NBWORD - 1);
 968
 969         /*
 970          * First set any bits in the first word of our range.
 971          * If it starts at bit 0 of the word, it will be
 972          * set below rather than here.  That is what the variable
 973          * bit tells us. The variable bits_set tracks the number
 974          * of bits that have been set so far.  End_bit is the number
 975          * of the last bit to be set in this word plus one.
 976          */
 977         if (bit) {
 978                 end_bit = min(bit + bits_to_set, (uint)NBWORD);
 979                 mask = ((1U << (end_bit - bit)) - 1) << bit;
 980                 *wordp |= mask;
 981                 wordp++;
 982                 bits_set = end_bit - bit;
 983         } else {
 984                 bits_set = 0;
 985         }
 986
 987         /*
 988          * Now set bits a whole word at a time that are between
 989          * first_bit and last_bit.
 990          */
 991         while ((bits_to_set - bits_set) >= NBWORD) {
 992                 *wordp = 0xffffffff;
 993                 bits_set += NBWORD;
 994                 wordp++;
 995         }
 996
 997         /*
 998          * Finally, set any bits left to be set in one last partial word.
 999          */
1000         end_bit = bits_to_set - bits_set;
1001         if (end_bit) {
1002                 mask = (1U << end_bit) - 1;
1003                 *wordp |= mask;
1004         }
1005 }
1006
1007 /*
1008  * Mark bytes first through last inclusive as dirty in the buf
1009  * item's bitmap.
1010  */
1011 void
1012 xfs_buf_item_log(
1013         struct xfs_buf_log_item *bip,
1014         uint                    first,
1015         uint                    last)
1016 {
1017         int                     i;
1018         uint                    start;
1019         uint                    end;
1020         struct xfs_buf          *bp = bip->bli_buf;
1021
1022         /*
1023          * walk each buffer segment and mark them dirty appropriately.
1024          */
1025         start = 0;
1026         for (i = 0; i < bip->bli_format_count; i++) {
1027                 if (start > last)
1028                         break;
1029                 end = start + BBTOB(bp->b_maps[i].bm_len) - 1;
1030
1031                 /* skip to the map that includes the first byte to log */
1032                 if (first > end) {
1033                         start += BBTOB(bp->b_maps[i].bm_len);
1034                         continue;
1035                 }
1036
1037                 /*
1038                  * Trim the range to this segment and mark it in the bitmap.
1039                  * Note that we must convert buffer offsets to segment relative
1040                  * offsets (e.g., the first byte of each segment is byte 0 of
1041                  * that segment).
1042                  */
1043                 if (first < start)
1044                         first = start;
1045                 if (end > last)
1046                         end = last;
1047                 xfs_buf_item_log_segment(first - start, end - start,
1048                                          &bip->bli_formats[i].blf_data_map[0]);
1049
1050                 start += BBTOB(bp->b_maps[i].bm_len);
1051         }
1052 }
1053
1054
1055 /*
1056  * Return true if the buffer has any ranges logged/dirtied by a transaction,
1057  * false otherwise.
1058  */
1059 bool
1060 xfs_buf_item_dirty_format(
1061         struct xfs_buf_log_item *bip)
1062 {
1063         int                     i;
1064
1065         for (i = 0; i < bip->bli_format_count; i++) {
1066                 if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
1067                              bip->bli_formats[i].blf_map_size))
1068                         return true;
1069         }
1070
1071         return false;
1072 }
1073
1074 STATIC void
1075 xfs_buf_item_free(
1076         struct xfs_buf_log_item *bip)
1077 {
1078         xfs_buf_item_free_format(bip);
1079         kvfree(bip->bli_item.li_lv_shadow);
1080         kmem_cache_free(xfs_buf_item_cache, bip);
1081 }
1082
1083 /*
1084  * xfs_buf_item_relse() is called when the buf log item is no longer needed.
1085  */
1086 void
1087 xfs_buf_item_relse(
1088         struct xfs_buf  *bp)
1089 {
1090         struct xfs_buf_log_item *bip = bp->b_log_item;
1091
1092         trace_xfs_buf_item_relse(bp, _RET_IP_);
1093         ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
1094
1095         if (atomic_read(&bip->bli_refcount))
1096                 return;
1097         bp->b_log_item = NULL;
1098         xfs_buf_rele(bp);
1099         xfs_buf_item_free(bip);
1100 }
1101
1102 void
1103 xfs_buf_item_done(
1104         struct xfs_buf          *bp)
1105 {
1106         /*
1107          * If we are forcibly shutting down, this may well be off the AIL
1108          * already. That's because we simulate the log-committed callbacks to
1109          * unpin these buffers. Or we may never have put this item on AIL
1110          * because of the transaction was aborted forcibly.
1111          * xfs_trans_ail_delete() takes care of these.
1112          *
1113          * Either way, AIL is useless if we're forcing a shutdown.
1114          *
1115          * Note that log recovery writes might have buffer items that are not on
1116          * the AIL even when the file system is not shut down.
1117          */
1118         xfs_trans_ail_delete(&bp->b_log_item->bli_item,
1119                              (bp->b_flags & _XBF_LOGRECOVERY) ? 0 :
1120                              SHUTDOWN_CORRUPT_INCORE);
1121         xfs_buf_item_relse(bp);
1122 }