fs/xfs/libxfs/xfs_defer.c

   1 // SPDX-License-Identifier: GPL-2.0+
   2 /*
   3  * Copyright (C) 2016 Oracle.  All Rights Reserved.
   4  * Author: Darrick J. Wong <darrick.wong@oracle.com>
   5  */
   6 #include "xfs.h"
   7 #include "xfs_fs.h"
   8 #include "xfs_shared.h"
   9 #include "xfs_format.h"
  10 #include "xfs_log_format.h"
  11 #include "xfs_trans_resv.h"
  12 #include "xfs_bit.h"
  13 #include "xfs_sb.h"
  14 #include "xfs_mount.h"
  15 #include "xfs_defer.h"
  16 #include "xfs_trans.h"
  17 #include "xfs_buf_item.h"
  18 #include "xfs_inode.h"
  19 #include "xfs_inode_item.h"
  20 #include "xfs_trace.h"
  21
  22 /*
  23  * Deferred Operations in XFS
  24  *
  25  * Due to the way locking rules work in XFS, certain transactions (block
  26  * mapping and unmapping, typically) have permanent reservations so that
  27  * we can roll the transaction to adhere to AG locking order rules and
  28  * to unlock buffers between metadata updates.  Prior to rmap/reflink,
  29  * the mapping code had a mechanism to perform these deferrals for
  30  * extents that were going to be freed; this code makes that facility
  31  * more generic.
  32  *
  33  * When adding the reverse mapping and reflink features, it became
  34  * necessary to perform complex remapping multi-transactions to comply
  35  * with AG locking order rules, and to be able to spread a single
  36  * refcount update operation (an operation on an n-block extent can
  37  * update as many as n records!) among multiple transactions.  XFS can
  38  * roll a transaction to facilitate this, but using this facility
  39  * requires us to log "intent" items in case log recovery needs to
  40  * redo the operation, and to log "done" items to indicate that redo
  41  * is not necessary.
  42  *
  43  * Deferred work is tracked in xfs_defer_pending items.  Each pending
  44  * item tracks one type of deferred work.  Incoming work items (which
  45  * have not yet had an intent logged) are attached to a pending item
  46  * on the dop_intake list, where they wait for the caller to finish
  47  * the deferred operations.
  48  *
  49  * Finishing a set of deferred operations is an involved process.  To
  50  * start, we define "rolling a deferred-op transaction" as follows:
  51  *
  52  * > For each xfs_defer_pending item on the dop_intake list,
  53  *   - Sort the work items in AG order.  XFS locking
  54  *     order rules require us to lock buffers in AG order.
  55  *   - Create a log intent item for that type.
  56  *   - Attach it to the pending item.
  57  *   - Move the pending item from the dop_intake list to the
  58  *     dop_pending list.
  59  * > Roll the transaction.
  60  *
  61  * NOTE: To avoid exceeding the transaction reservation, we limit the
  62  * number of items that we attach to a given xfs_defer_pending.
  63  *
  64  * The actual finishing process looks like this:
  65  *
  66  * > For each xfs_defer_pending in the dop_pending list,
  67  *   - Roll the deferred-op transaction as above.
  68  *   - Create a log done item for that type, and attach it to the
  69  *     log intent item.
  70  *   - For each work item attached to the log intent item,
  71  *     * Perform the described action.
  72  *     * Attach the work item to the log done item.
  73  *     * If the result of doing the work was -EAGAIN, ->finish_item
  74  *       wants a new transaction.  See the "Requesting a Fresh
  75  *       Transaction while Finishing Deferred Work" section below for
  76  *       details.
  77  *
  78  * The key here is that we must log an intent item for all pending
  79  * work items every time we roll the transaction, and that we must log
  80  * a done item as soon as the work is completed.  With this mechanism
  81  * we can perform complex remapping operations, chaining intent items
  82  * as needed.
  83  *
  84  * Requesting a Fresh Transaction while Finishing Deferred Work
  85  *
  86  * If ->finish_item decides that it needs a fresh transaction to
  87  * finish the work, it must ask its caller (xfs_defer_finish) for a
  88  * continuation.  The most likely cause of this circumstance is the
  89  * refcount adjust functions deciding that they've logged enough items
  90  * to be at risk of exceeding the transaction reservation.
  91  *
  92  * To get a fresh transaction, we want to log the existing log done
  93  * item to prevent the log intent item from replaying, immediately log
  94  * a new log intent item with the unfinished work items, roll the
  95  * transaction, and re-call ->finish_item wherever it left off.  The
  96  * log done item and the new log intent item must be in the same
  97  * transaction or atomicity cannot be guaranteed; defer_finish ensures
  98  * that this happens.
  99  *
 100  * This requires some coordination between ->finish_item and
 101  * defer_finish.  Upon deciding to request a new transaction,
 102  * ->finish_item should update the current work item to reflect the
 103  * unfinished work.  Next, it should reset the log done item's list
 104  * count to the number of items finished, and return -EAGAIN.
 105  * defer_finish sees the -EAGAIN, logs the new log intent item
 106  * with the remaining work items, and leaves the xfs_defer_pending
 107  * item at the head of the dop_pending list.  Then it rolls the
 108  * transaction and picks up processing where it left off.  It is
 109  * required that ->finish_item must be careful to leave enough
 110  * transaction reservation to fit the new log intent item.
 111  *
 112  * This is an example of remapping the extent (E, E+B) into file X at
 113  * offset A and dealing with the extent (C, C+B) already being mapped
 114  * there:
 115  * +-------------------------------------------------+
 116  * | Unmap file X startblock C offset A length B     | t0
 117  * | Intent to reduce refcount for extent (C, B)     |
 118  * | Intent to remove rmap (X, C, A, B)              |
 119  * | Intent to free extent (D, 1) (bmbt block)       |
 120  * | Intent to map (X, A, B) at startblock E         |
 121  * +-------------------------------------------------+
 122  * | Map file X startblock E offset A length B       | t1
 123  * | Done mapping (X, E, A, B)                       |
 124  * | Intent to increase refcount for extent (E, B)   |
 125  * | Intent to add rmap (X, E, A, B)                 |
 126  * +-------------------------------------------------+
 127  * | Reduce refcount for extent (C, B)               | t2
 128  * | Done reducing refcount for extent (C, 9)        |
 129  * | Intent to reduce refcount for extent (C+9, B-9) |
 130  * | (ran out of space after 9 refcount updates)     |
 131  * +-------------------------------------------------+
 132  * | Reduce refcount for extent (C+9, B-9)           | t3
 133  * | Done reducing refcount for extent (C+9, B-9)    |
 134  * | Increase refcount for extent (E, B)             |
 135  * | Done increasing refcount for extent (E, B)      |
 136  * | Intent to free extent (C, B)                    |
 137  * | Intent to free extent (F, 1) (refcountbt block) |
 138  * | Intent to remove rmap (F, 1, REFC)              |
 139  * +-------------------------------------------------+
 140  * | Remove rmap (X, C, A, B)                        | t4
 141  * | Done removing rmap (X, C, A, B)                 |
 142  * | Add rmap (X, E, A, B)                           |
 143  * | Done adding rmap (X, E, A, B)                   |
 144  * | Remove rmap (F, 1, REFC)                        |
 145  * | Done removing rmap (F, 1, REFC)                 |
 146  * +-------------------------------------------------+
 147  * | Free extent (C, B)                              | t5
 148  * | Done freeing extent (C, B)                      |
 149  * | Free extent (D, 1)                              |
 150  * | Done freeing extent (D, 1)                      |
 151  * | Free extent (F, 1)                              |
 152  * | Done freeing extent (F, 1)                      |
 153  * +-------------------------------------------------+
 154  *
 155  * If we should crash before t2 commits, log recovery replays
 156  * the following intent items:
 157  *
 158  * - Intent to reduce refcount for extent (C, B)
 159  * - Intent to remove rmap (X, C, A, B)
 160  * - Intent to free extent (D, 1) (bmbt block)
 161  * - Intent to increase refcount for extent (E, B)
 162  * - Intent to add rmap (X, E, A, B)
 163  *
 164  * In the process of recovering, it should also generate and take care
 165  * of these intent items:
 166  *
 167  * - Intent to free extent (C, B)
 168  * - Intent to free extent (F, 1) (refcountbt block)
 169  * - Intent to remove rmap (F, 1, REFC)
 170  *
 171  * Note that the continuation requested between t2 and t3 is likely to
 172  * reoccur.
 173  */
 174
 175 static const struct xfs_defer_op_type *defer_op_types[] = {
 176         [XFS_DEFER_OPS_TYPE_BMAP]       = &xfs_bmap_update_defer_type,
 177         [XFS_DEFER_OPS_TYPE_REFCOUNT]   = &xfs_refcount_update_defer_type,
 178         [XFS_DEFER_OPS_TYPE_RMAP]       = &xfs_rmap_update_defer_type,
 179         [XFS_DEFER_OPS_TYPE_FREE]       = &xfs_extent_free_defer_type,
 180         [XFS_DEFER_OPS_TYPE_AGFL_FREE]  = &xfs_agfl_free_defer_type,
 181 };
 182
 183 /*
 184  * For each pending item in the intake list, log its intent item and the
 185  * associated extents, then add the entire intake list to the end of
 186  * the pending list.
 187  */
 188 STATIC void
 189 xfs_defer_create_intents(
 190         struct xfs_trans                *tp)
 191 {
 192         struct list_head                *li;
 193         struct xfs_defer_pending        *dfp;
 194         const struct xfs_defer_op_type  *ops;
 195
 196         list_for_each_entry(dfp, &tp->t_dfops, dfp_list) {
 197                 ops = defer_op_types[dfp->dfp_type];
 198                 dfp->dfp_intent = ops->create_intent(tp, dfp->dfp_count);
 199                 trace_xfs_defer_create_intent(tp->t_mountp, dfp);
 200                 list_sort(tp->t_mountp, &dfp->dfp_work, ops->diff_items);
 201                 list_for_each(li, &dfp->dfp_work)
 202                         ops->log_item(tp, dfp->dfp_intent, li);
 203         }
 204 }
 205
 206 /* Abort all the intents that were committed. */
 207 STATIC void
 208 xfs_defer_trans_abort(
 209         struct xfs_trans                *tp,
 210         struct list_head                *dop_pending)
 211 {
 212         struct xfs_defer_pending        *dfp;
 213         const struct xfs_defer_op_type  *ops;
 214
 215         trace_xfs_defer_trans_abort(tp, _RET_IP_);
 216
 217         /* Abort intent items that don't have a done item. */
 218         list_for_each_entry(dfp, dop_pending, dfp_list) {
 219                 ops = defer_op_types[dfp->dfp_type];
 220                 trace_xfs_defer_pending_abort(tp->t_mountp, dfp);
 221                 if (dfp->dfp_intent && !dfp->dfp_done) {
 222                         ops->abort_intent(dfp->dfp_intent);
 223                         dfp->dfp_intent = NULL;
 224                 }
 225         }
 226 }
 227
 228 /* Roll a transaction so we can do some deferred op processing. */
 229 STATIC int
 230 xfs_defer_trans_roll(
 231         struct xfs_trans                **tpp)
 232 {
 233         struct xfs_trans                *tp = *tpp;
 234         struct xfs_buf_log_item         *bli;
 235         struct xfs_inode_log_item       *ili;
 236         struct xfs_log_item             *lip;
 237         struct xfs_buf                  *bplist[XFS_DEFER_OPS_NR_BUFS];
 238         struct xfs_inode                *iplist[XFS_DEFER_OPS_NR_INODES];
 239         int                             bpcount = 0, ipcount = 0;
 240         int                             i;
 241         int                             error;
 242
 243         list_for_each_entry(lip, &tp->t_items, li_trans) {
 244                 switch (lip->li_type) {
 245                 case XFS_LI_BUF:
 246                         bli = container_of(lip, struct xfs_buf_log_item,
 247                                            bli_item);
 248                         if (bli->bli_flags & XFS_BLI_HOLD) {
 249                                 if (bpcount >= XFS_DEFER_OPS_NR_BUFS) {
 250                                         ASSERT(0);
 251                                         return -EFSCORRUPTED;
 252                                 }
 253                                 xfs_trans_dirty_buf(tp, bli->bli_buf);
 254                                 bplist[bpcount++] = bli->bli_buf;
 255                         }
 256                         break;
 257                 case XFS_LI_INODE:
 258                         ili = container_of(lip, struct xfs_inode_log_item,
 259                                            ili_item);
 260                         if (ili->ili_lock_flags == 0) {
 261                                 if (ipcount >= XFS_DEFER_OPS_NR_INODES) {
 262                                         ASSERT(0);
 263                                         return -EFSCORRUPTED;
 264                                 }
 265                                 xfs_trans_log_inode(tp, ili->ili_inode,
 266                                                     XFS_ILOG_CORE);
 267                                 iplist[ipcount++] = ili->ili_inode;
 268                         }
 269                         break;
 270                 default:
 271                         break;
 272                 }
 273         }
 274
 275         trace_xfs_defer_trans_roll(tp, _RET_IP_);
 276
 277         /*
 278          * Roll the transaction.  Rolling always given a new transaction (even
 279          * if committing the old one fails!) to hand back to the caller, so we
 280          * join the held resources to the new transaction so that we always
 281          * return with the held resources joined to @tpp, no matter what
 282          * happened.
 283          */
 284         error = xfs_trans_roll(tpp);
 285         tp = *tpp;
 286
 287         /* Rejoin the joined inodes. */
 288         for (i = 0; i < ipcount; i++)
 289                 xfs_trans_ijoin(tp, iplist[i], 0);
 290
 291         /* Rejoin the buffers and dirty them so the log moves forward. */
 292         for (i = 0; i < bpcount; i++) {
 293                 xfs_trans_bjoin(tp, bplist[i]);
 294                 xfs_trans_bhold(tp, bplist[i]);
 295         }
 296
 297         if (error)
 298                 trace_xfs_defer_trans_roll_error(tp, error);
 299         return error;
 300 }
 301
 302 /*
 303  * Reset an already used dfops after finish.
 304  */
 305 static void
 306 xfs_defer_reset(
 307         struct xfs_trans        *tp)
 308 {
 309         ASSERT(list_empty(&tp->t_dfops));
 310
 311         /*
 312          * Low mode state transfers across transaction rolls to mirror dfops
 313          * lifetime. Clear it now that dfops is reset.
 314          */
 315         tp->t_flags &= ~XFS_TRANS_LOWMODE;
 316 }
 317
 318 /*
 319  * Free up any items left in the list.
 320  */
 321 static void
 322 xfs_defer_cancel_list(
 323         struct xfs_mount                *mp,
 324         struct list_head                *dop_list)
 325 {
 326         struct xfs_defer_pending        *dfp;
 327         struct xfs_defer_pending        *pli;
 328         struct list_head                *pwi;
 329         struct list_head                *n;
 330         const struct xfs_defer_op_type  *ops;
 331
 332         /*
 333          * Free the pending items.  Caller should already have arranged
 334          * for the intent items to be released.
 335          */
 336         list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) {
 337                 ops = defer_op_types[dfp->dfp_type];
 338                 trace_xfs_defer_cancel_list(mp, dfp);
 339                 list_del(&dfp->dfp_list);
 340                 list_for_each_safe(pwi, n, &dfp->dfp_work) {
 341                         list_del(pwi);
 342                         dfp->dfp_count--;
 343                         ops->cancel_item(pwi);
 344                 }
 345                 ASSERT(dfp->dfp_count == 0);
 346                 kmem_free(dfp);
 347         }
 348 }
 349
 350 /*
 351  * Finish all the pending work.  This involves logging intent items for
 352  * any work items that wandered in since the last transaction roll (if
 353  * one has even happened), rolling the transaction, and finishing the
 354  * work items in the first item on the logged-and-pending list.
 355  *
 356  * If an inode is provided, relog it to the new transaction.
 357  */
 358 int
 359 xfs_defer_finish_noroll(
 360         struct xfs_trans                **tp)
 361 {
 362         struct xfs_defer_pending        *dfp;
 363         struct list_head                *li;
 364         struct list_head                *n;
 365         void                            *state;
 366         int                             error = 0;
 367         const struct xfs_defer_op_type  *ops;
 368         LIST_HEAD(dop_pending);
 369
 370         ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
 371
 372         trace_xfs_defer_finish(*tp, _RET_IP_);
 373
 374         /* Until we run out of pending work to finish... */
 375         while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) {
 376                 /* log intents and pull in intake items */
 377                 xfs_defer_create_intents(*tp);
 378                 list_splice_tail_init(&(*tp)->t_dfops, &dop_pending);
 379
 380                 /*
 381                  * Roll the transaction.
 382                  */
 383                 error = xfs_defer_trans_roll(tp);
 384                 if (error)
 385                         goto out;
 386
 387                 /* Log an intent-done item for the first pending item. */
 388                 dfp = list_first_entry(&dop_pending, struct xfs_defer_pending,
 389                                        dfp_list);
 390                 ops = defer_op_types[dfp->dfp_type];
 391                 trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp);
 392                 dfp->dfp_done = ops->create_done(*tp, dfp->dfp_intent,
 393                                 dfp->dfp_count);
 394
 395                 /* Finish the work items. */
 396                 state = NULL;
 397                 list_for_each_safe(li, n, &dfp->dfp_work) {
 398                         list_del(li);
 399                         dfp->dfp_count--;
 400                         error = ops->finish_item(*tp, li, dfp->dfp_done,
 401                                         &state);
 402                         if (error == -EAGAIN) {
 403                                 /*
 404                                  * Caller wants a fresh transaction;
 405                                  * put the work item back on the list
 406                                  * and jump out.
 407                                  */
 408                                 list_add(li, &dfp->dfp_work);
 409                                 dfp->dfp_count++;
 410                                 break;
 411                         } else if (error) {
 412                                 /*
 413                                  * Clean up after ourselves and jump out.
 414                                  * xfs_defer_cancel will take care of freeing
 415                                  * all these lists and stuff.
 416                                  */
 417                                 if (ops->finish_cleanup)
 418                                         ops->finish_cleanup(*tp, state, error);
 419                                 goto out;
 420                         }
 421                 }
 422                 if (error == -EAGAIN) {
 423                         /*
 424                          * Caller wants a fresh transaction, so log a
 425                          * new log intent item to replace the old one
 426                          * and roll the transaction.  See "Requesting
 427                          * a Fresh Transaction while Finishing
 428                          * Deferred Work" above.
 429                          */
 430                         dfp->dfp_intent = ops->create_intent(*tp,
 431                                         dfp->dfp_count);
 432                         dfp->dfp_done = NULL;
 433                         list_for_each(li, &dfp->dfp_work)
 434                                 ops->log_item(*tp, dfp->dfp_intent, li);
 435                 } else {
 436                         /* Done with the dfp, free it. */
 437                         list_del(&dfp->dfp_list);
 438                         kmem_free(dfp);
 439                 }
 440
 441                 if (ops->finish_cleanup)
 442                         ops->finish_cleanup(*tp, state, error);
 443         }
 444
 445 out:
 446         if (error) {
 447                 xfs_defer_trans_abort(*tp, &dop_pending);
 448                 xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE);
 449                 trace_xfs_defer_finish_error(*tp, error);
 450                 xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending);
 451                 xfs_defer_cancel(*tp);
 452                 return error;
 453         }
 454
 455         trace_xfs_defer_finish_done(*tp, _RET_IP_);
 456         return 0;
 457 }
 458
 459 int
 460 xfs_defer_finish(
 461         struct xfs_trans        **tp)
 462 {
 463         int                     error;
 464
 465         /*
 466          * Finish and roll the transaction once more to avoid returning to the
 467          * caller with a dirty transaction.
 468          */
 469         error = xfs_defer_finish_noroll(tp);
 470         if (error)
 471                 return error;
 472         if ((*tp)->t_flags & XFS_TRANS_DIRTY) {
 473                 error = xfs_defer_trans_roll(tp);
 474                 if (error) {
 475                         xfs_force_shutdown((*tp)->t_mountp,
 476                                            SHUTDOWN_CORRUPT_INCORE);
 477                         return error;
 478                 }
 479         }
 480         xfs_defer_reset(*tp);
 481         return 0;
 482 }
 483
 484 void
 485 xfs_defer_cancel(
 486         struct xfs_trans        *tp)
 487 {
 488         struct xfs_mount        *mp = tp->t_mountp;
 489
 490         trace_xfs_defer_cancel(tp, _RET_IP_);
 491         xfs_defer_cancel_list(mp, &tp->t_dfops);
 492 }
 493
 494 /* Add an item for later deferred processing. */
 495 void
 496 xfs_defer_add(
 497         struct xfs_trans                *tp,
 498         enum xfs_defer_ops_type         type,
 499         struct list_head                *li)
 500 {
 501         struct xfs_defer_pending        *dfp = NULL;
 502         const struct xfs_defer_op_type  *ops;
 503
 504         ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
 505         BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX);
 506
 507         /*
 508          * Add the item to a pending item at the end of the intake list.
 509          * If the last pending item has the same type, reuse it.  Else,
 510          * create a new pending item at the end of the intake list.
 511          */
 512         if (!list_empty(&tp->t_dfops)) {
 513                 dfp = list_last_entry(&tp->t_dfops,
 514                                 struct xfs_defer_pending, dfp_list);
 515                 ops = defer_op_types[dfp->dfp_type];
 516                 if (dfp->dfp_type != type ||
 517                     (ops->max_items && dfp->dfp_count >= ops->max_items))
 518                         dfp = NULL;
 519         }
 520         if (!dfp) {
 521                 dfp = kmem_alloc(sizeof(struct xfs_defer_pending),
 522                                 KM_SLEEP | KM_NOFS);
 523                 dfp->dfp_type = type;
 524                 dfp->dfp_intent = NULL;
 525                 dfp->dfp_done = NULL;
 526                 dfp->dfp_count = 0;
 527                 INIT_LIST_HEAD(&dfp->dfp_work);
 528                 list_add_tail(&dfp->dfp_list, &tp->t_dfops);
 529         }
 530
 531         list_add_tail(li, &dfp->dfp_work);
 532         dfp->dfp_count++;
 533 }
 534
 535 /*
 536  * Move deferred ops from one transaction to another and reset the source to
 537  * initial state. This is primarily used to carry state forward across
 538  * transaction rolls with pending dfops.
 539  */
 540 void
 541 xfs_defer_move(
 542         struct xfs_trans        *dtp,
 543         struct xfs_trans        *stp)
 544 {
 545         list_splice_init(&stp->t_dfops, &dtp->t_dfops);
 546
 547         /*
 548          * Low free space mode was historically controlled by a dfops field.
 549          * This meant that low mode state potentially carried across multiple
 550          * transaction rolls. Transfer low mode on a dfops move to preserve
 551          * that behavior.
 552          */
 553         dtp->t_flags |= (stp->t_flags & XFS_TRANS_LOWMODE);
 554
 555         xfs_defer_reset(stp);
 556 }