fs/xfs/xfs_discard.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2010, 2023 Red Hat, Inc.
   4  * All Rights Reserved.
   5  */
   6 #include "xfs.h"
   7 #include "xfs_shared.h"
   8 #include "xfs_format.h"
   9 #include "xfs_log_format.h"
  10 #include "xfs_trans_resv.h"
  11 #include "xfs_trans.h"
  12 #include "xfs_mount.h"
  13 #include "xfs_btree.h"
  14 #include "xfs_alloc_btree.h"
  15 #include "xfs_alloc.h"
  16 #include "xfs_discard.h"
  17 #include "xfs_error.h"
  18 #include "xfs_extent_busy.h"
  19 #include "xfs_trace.h"
  20 #include "xfs_log.h"
  21 #include "xfs_ag.h"
  22 #include "xfs_health.h"
  23 #include "xfs_rtbitmap.h"
  24 #include "xfs_rtgroup.h"
  25
  26 /*
  27  * Notes on an efficient, low latency fstrim algorithm
  28  *
  29  * We need to walk the filesystem free space and issue discards on the free
  30  * space that meet the search criteria (size and location). We cannot issue
  31  * discards on extents that might be in use, or are so recently in use they are
  32  * still marked as busy. To serialise against extent state changes whilst we are
  33  * gathering extents to trim, we must hold the AGF lock to lock out other
  34  * allocations and extent free operations that might change extent state.
  35  *
  36  * However, we cannot just hold the AGF for the entire AG free space walk whilst
  37  * we issue discards on each free space that is found. Storage devices can have
  38  * extremely slow discard implementations (e.g. ceph RBD) and so walking a
  39  * couple of million free extents and issuing synchronous discards on each
  40  * extent can take a *long* time. Whilst we are doing this walk, nothing else
  41  * can access the AGF, and we can stall transactions and hence the log whilst
  42  * modifications wait for the AGF lock to be released. This can lead to hung
  43  * tasks kicking the hung task timer and rebooting the system. This is bad.
  44  *
  45  * Hence we need to take a leaf from the bulkstat playbook. It takes the AGI
  46  * lock, gathers a range of inode cluster buffers that are allocated, drops the
  47  * AGI lock and then reads all the inode cluster buffers and processes them. It
  48  * loops doing this, using a cursor to keep track of where it is up to in the AG
  49  * for each iteration to restart the INOBT lookup from.
  50  *
  51  * We can't do this exactly with free space - once we drop the AGF lock, the
  52  * state of the free extent is out of our control and we cannot run a discard
  53  * safely on it in this situation. Unless, of course, we've marked the free
  54  * extent as busy and undergoing a discard operation whilst we held the AGF
  55  * locked.
  56  *
  57  * This is exactly how online discard works - free extents are marked busy when
  58  * they are freed, and once the extent free has been committed to the journal,
  59  * the busy extent record is marked as "undergoing discard" and the discard is
  60  * then issued on the free extent. Once the discard completes, the busy extent
  61  * record is removed and the extent is able to be allocated again.
  62  *
  63  * In the context of fstrim, if we find a free extent we need to discard, we
  64  * don't have to discard it immediately. All we need to do is record that free
  65  * extent as being busy and under discard, and all the allocation routines will
  66  * now avoid trying to allocate it. Hence if we mark the extent as busy under
  67  * the AGF lock, we can safely discard it without holding the AGF lock because
  68  * nothing will attempt to allocate that free space until the discard completes.
  69  *
  70  * This also allows us to issue discards asynchronously like we do with online
  71  * discard, and so for fast devices fstrim will run much faster as we can have
  72  * multiple discard operations in flight at once, as well as pipeline the free
  73  * extent search so that it overlaps in flight discard IO.
  74  */
  75
/*
 * Number of free space records a single gather pass examines before it stops
 * and records a restart point, so the extents collected so far can be handed
 * off for discard.
 */
#define XFS_DISCARD_MAX_EXAMINE (100)

/* Workqueue that xfs_discard_endio() queues discard completion work on. */
struct workqueue_struct *xfs_discard_wq;
  79
  80 static void
  81 xfs_discard_endio_work(
  82         struct work_struct      *work)
  83 {
  84         struct xfs_busy_extents *extents =
  85                 container_of(work, struct xfs_busy_extents, endio_work);
  86
  87         xfs_extent_busy_clear(&extents->extent_list, false);
  88         kfree(extents->owner);
  89 }
  90
  91 /*
  92  * Queue up the actual completion to a thread to avoid IRQ-safe locking for
  93  * pagb_lock.
  94  */
  95 static void
  96 xfs_discard_endio(
  97         struct bio              *bio)
  98 {
  99         struct xfs_busy_extents *extents = bio->bi_private;
 100
 101         INIT_WORK(&extents->endio_work, xfs_discard_endio_work);
 102         queue_work(xfs_discard_wq, &extents->endio_work);
 103         bio_put(bio);
 104 }
 105
 106 static inline struct block_device *
 107 xfs_group_bdev(
 108         const struct xfs_group  *xg)
 109 {
 110         struct xfs_mount        *mp = xg->xg_mount;
 111
 112         switch (xg->xg_type) {
 113         case XG_TYPE_AG:
 114                 return mp->m_ddev_targp->bt_bdev;
 115         case XG_TYPE_RTG:
 116                 return mp->m_rtdev_targp->bt_bdev;
 117         default:
 118                 ASSERT(0);
 119                 break;
 120         }
 121         return NULL;
 122 }
 123
 124 /*
 125  * Walk the discard list and issue discards on all the busy extents in the
 126  * list. We plug and chain the bios so that we only need a single completion
 127  * call to clear all the busy extents once the discards are complete.
 128  */
 129 int
 130 xfs_discard_extents(
 131         struct xfs_mount        *mp,
 132         struct xfs_busy_extents *extents)
 133 {
 134         struct xfs_extent_busy  *busyp;
 135         struct bio              *bio = NULL;
 136         struct blk_plug         plug;
 137         int                     error = 0;
 138
 139         blk_start_plug(&plug);
 140         list_for_each_entry(busyp, &extents->extent_list, list) {
 141                 trace_xfs_discard_extent(busyp->group, busyp->bno,
 142                                 busyp->length);
 143
 144                 error = __blkdev_issue_discard(xfs_group_bdev(busyp->group),
 145                                 xfs_gbno_to_daddr(busyp->group, busyp->bno),
 146                                 XFS_FSB_TO_BB(mp, busyp->length),
 147                                 GFP_KERNEL, &bio);
 148                 if (error && error != -EOPNOTSUPP) {
 149                         xfs_info(mp,
 150          "discard failed for extent [0x%llx,%u], error %d",
 151                                  (unsigned long long)busyp->bno,
 152                                  busyp->length,
 153                                  error);
 154                         break;
 155                 }
 156         }
 157
 158         if (bio) {
 159                 bio->bi_private = extents;
 160                 bio->bi_end_io = xfs_discard_endio;
 161                 submit_bio(bio);
 162         } else {
 163                 xfs_discard_endio_work(&extents->endio_work);
 164         }
 165         blk_finish_plug(&plug);
 166
 167         return error;
 168 }
 169
/* Cursor and search criteria for one AG's gather/discard iteration. */
struct xfs_trim_cur {
	/*
	 * Lowest AG block of the range to trim.  The gather pass also stores
	 * the start block of the record to resume from here when it stops at
	 * the batch limit.
	 */
	xfs_agblock_t	start;
	/*
	 * Extent length used as the lookup key for by-length walks; set to the
	 * length of the record to resume from at the batch limit, and to zero
	 * when there is nothing left to search.
	 */
	xfs_extlen_t	count;
	/* Highest AG block of the range to trim. */
	xfs_agblock_t	end;
	/* Extents shorter than this are not discarded. */
	xfs_extlen_t	minlen;
	/* Walk the by-block-number btree instead of the by-length btree. */
	bool		by_bno;
};
 177
 178 static int
 179 xfs_trim_gather_extents(
 180         struct xfs_perag        *pag,
 181         struct xfs_trim_cur     *tcur,
 182         struct xfs_busy_extents *extents)
 183 {
 184         struct xfs_mount        *mp = pag_mount(pag);
 185         struct xfs_trans        *tp;
 186         struct xfs_btree_cur    *cur;
 187         struct xfs_buf          *agbp;
 188         int                     error;
 189         int                     i;
 190         int                     batch = XFS_DISCARD_MAX_EXAMINE;
 191
 192         /*
 193          * Force out the log.  This means any transactions that might have freed
 194          * space before we take the AGF buffer lock are now on disk, and the
 195          * volatile disk cache is flushed.
 196          */
 197         xfs_log_force(mp, XFS_LOG_SYNC);
 198
 199         error = xfs_trans_alloc_empty(mp, &tp);
 200         if (error)
 201                 return error;
 202
 203         error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
 204         if (error)
 205                 goto out_trans_cancel;
 206
 207         if (tcur->by_bno) {
 208                 /* sub-AG discard request always starts at tcur->start */
 209                 cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
 210                 error = xfs_alloc_lookup_le(cur, tcur->start, 0, &i);
 211                 if (!error && !i)
 212                         error = xfs_alloc_lookup_ge(cur, tcur->start, 0, &i);
 213         } else if (tcur->start == 0) {
 214                 /* first time through a by-len starts with max length */
 215                 cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
 216                 error = xfs_alloc_lookup_ge(cur, 0, tcur->count, &i);
 217         } else {
 218                 /* nth time through a by-len starts where we left off */
 219                 cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
 220                 error = xfs_alloc_lookup_le(cur, tcur->start, tcur->count, &i);
 221         }
 222         if (error)
 223                 goto out_del_cursor;
 224         if (i == 0) {
 225                 /* nothing of that length left in the AG, we are done */
 226                 tcur->count = 0;
 227                 goto out_del_cursor;
 228         }
 229
 230         /*
 231          * Loop until we are done with all extents that are large
 232          * enough to be worth discarding or we hit batch limits.
 233          */
 234         while (i) {
 235                 xfs_agblock_t   fbno;
 236                 xfs_extlen_t    flen;
 237
 238                 error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
 239                 if (error)
 240                         break;
 241                 if (XFS_IS_CORRUPT(mp, i != 1)) {
 242                         xfs_btree_mark_sick(cur);
 243                         error = -EFSCORRUPTED;
 244                         break;
 245                 }
 246
 247                 if (--batch <= 0) {
 248                         /*
 249                          * Update the cursor to point at this extent so we
 250                          * restart the next batch from this extent.
 251                          */
 252                         tcur->start = fbno;
 253                         tcur->count = flen;
 254                         break;
 255                 }
 256
 257                 /*
 258                  * If the extent is entirely outside of the range we are
 259                  * supposed to skip it.  Do not bother to trim down partially
 260                  * overlapping ranges for now.
 261                  */
 262                 if (fbno + flen < tcur->start) {
 263                         trace_xfs_discard_exclude(pag_group(pag), fbno, flen);
 264                         goto next_extent;
 265                 }
 266                 if (fbno > tcur->end) {
 267                         trace_xfs_discard_exclude(pag_group(pag), fbno, flen);
 268                         if (tcur->by_bno) {
 269                                 tcur->count = 0;
 270                                 break;
 271                         }
 272                         goto next_extent;
 273                 }
 274
 275                 /* Trim the extent returned to the range we want. */
 276                 if (fbno < tcur->start) {
 277                         flen -= tcur->start - fbno;
 278                         fbno = tcur->start;
 279                 }
 280                 if (fbno + flen > tcur->end + 1)
 281                         flen = tcur->end - fbno + 1;
 282
 283                 /* Too small?  Give up. */
 284                 if (flen < tcur->minlen) {
 285                         trace_xfs_discard_toosmall(pag_group(pag), fbno, flen);
 286                         if (tcur->by_bno)
 287                                 goto next_extent;
 288                         tcur->count = 0;
 289                         break;
 290                 }
 291
 292                 /*
 293                  * If any blocks in the range are still busy, skip the
 294                  * discard and try again the next time.
 295                  */
 296                 if (xfs_extent_busy_search(pag_group(pag), fbno, flen)) {
 297                         trace_xfs_discard_busy(pag_group(pag), fbno, flen);
 298                         goto next_extent;
 299                 }
 300
 301                 xfs_extent_busy_insert_discard(pag_group(pag), fbno, flen,
 302                                 &extents->extent_list);
 303 next_extent:
 304                 if (tcur->by_bno)
 305                         error = xfs_btree_increment(cur, 0, &i);
 306                 else
 307                         error = xfs_btree_decrement(cur, 0, &i);
 308                 if (error)
 309                         break;
 310
 311                 /*
 312                  * If there's no more records in the tree, we are done. Set the
 313                  * cursor block count to 0 to indicate to the caller that there
 314                  * is no more extents to search.
 315                  */
 316                 if (i == 0)
 317                         tcur->count = 0;
 318         }
 319
 320         /*
 321          * If there was an error, release all the gathered busy extents because
 322          * we aren't going to issue a discard on them any more.
 323          */
 324         if (error)
 325                 xfs_extent_busy_clear(&extents->extent_list, false);
 326 out_del_cursor:
 327         xfs_btree_del_cursor(cur, error);
 328 out_trans_cancel:
 329         xfs_trans_cancel(tp);
 330         return error;
 331 }
 332
 333 static bool
 334 xfs_trim_should_stop(void)
 335 {
 336         return fatal_signal_pending(current) || freezing(current);
 337 }
 338
 339 /*
 340  * Iterate the free list gathering extents and discarding them. We need a cursor
 341  * for the repeated iteration of gather/discard loop, so use the longest extent
 342  * we found in the last batch as the key to start the next.
 343  */
 344 static int
 345 xfs_trim_perag_extents(
 346         struct xfs_perag        *pag,
 347         xfs_agblock_t           start,
 348         xfs_agblock_t           end,
 349         xfs_extlen_t            minlen)
 350 {
 351         struct xfs_trim_cur     tcur = {
 352                 .start          = start,
 353                 .count          = pag->pagf_longest,
 354                 .end            = end,
 355                 .minlen         = minlen,
 356         };
 357         int                     error = 0;
 358
 359         if (start != 0 || end != pag_group(pag)->xg_block_count)
 360                 tcur.by_bno = true;
 361
 362         do {
 363                 struct xfs_busy_extents *extents;
 364
 365                 extents = kzalloc(sizeof(*extents), GFP_KERNEL);
 366                 if (!extents) {
 367                         error = -ENOMEM;
 368                         break;
 369                 }
 370
 371                 extents->owner = extents;
 372                 INIT_LIST_HEAD(&extents->extent_list);
 373
 374                 error = xfs_trim_gather_extents(pag, &tcur, extents);
 375                 if (error) {
 376                         kfree(extents);
 377                         break;
 378                 }
 379
 380                 /*
 381                  * We hand the extent list to the discard function here so the
 382                  * discarded extents can be removed from the busy extent list.
 383                  * This allows the discards to run asynchronously with gathering
 384                  * the next round of extents to discard.
 385                  *
 386                  * However, we must ensure that we do not reference the extent
 387                  * list  after this function call, as it may have been freed by
 388                  * the time control returns to us.
 389                  */
 390                 error = xfs_discard_extents(pag_mount(pag), extents);
 391                 if (error)
 392                         break;
 393
 394                 if (xfs_trim_should_stop())
 395                         break;
 396
 397         } while (tcur.count != 0);
 398
 399         return error;
 400
 401 }
 402
 403 static int
 404 xfs_trim_datadev_extents(
 405         struct xfs_mount        *mp,
 406         xfs_daddr_t             start,
 407         xfs_daddr_t             end,
 408         xfs_extlen_t            minlen)
 409 {
 410         xfs_agnumber_t          start_agno, end_agno;
 411         xfs_agblock_t           start_agbno, end_agbno;
 412         struct xfs_perag        *pag = NULL;
 413         xfs_daddr_t             ddev_end;
 414         int                     last_error = 0, error;
 415
 416         ddev_end = min_t(xfs_daddr_t, end,
 417                          XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1);
 418
 419         start_agno = xfs_daddr_to_agno(mp, start);
 420         start_agbno = xfs_daddr_to_agbno(mp, start);
 421         end_agno = xfs_daddr_to_agno(mp, ddev_end);
 422         end_agbno = xfs_daddr_to_agbno(mp, ddev_end);
 423
 424         while ((pag = xfs_perag_next_range(mp, pag, start_agno, end_agno))) {
 425                 xfs_agblock_t   agend = pag_group(pag)->xg_block_count;
 426
 427                 if (pag_agno(pag) == end_agno)
 428                         agend = end_agbno;
 429                 error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen);
 430                 if (error)
 431                         last_error = error;
 432
 433                 if (xfs_trim_should_stop()) {
 434                         xfs_perag_rele(pag);
 435                         break;
 436                 }
 437                 start_agbno = 0;
 438         }
 439
 440         return last_error;
 441 }
 442
 443 #ifdef CONFIG_XFS_RT
/* State shared between the rtbitmap walk and the discard pass on the rt device. */
struct xfs_trim_rtdev {
	/* list of rt extents (struct xfs_rtx_busy) gathered for discard */
	struct list_head	extent_list;

	/* minimum length that caller allows us to trim */
	xfs_rtblock_t		minlen_fsb;

	/* restart point for the rtbitmap walk */
	xfs_rtxnum_t		restart_rtx;

	/* stopping point for the current rtbitmap walk */
	xfs_rtxnum_t		stop_rtx;
};
 457
/* One free realtime extent queued for discard. */
struct xfs_rtx_busy {
	/* link in xfs_trim_rtdev.extent_list */
	struct list_head	list;
	/* first rt block of the extent */
	xfs_rtblock_t		bno;
	/* length of the extent in rt blocks */
	xfs_rtblock_t		length;
};
 463
 464 static void
 465 xfs_discard_free_rtdev_extents(
 466         struct xfs_trim_rtdev   *tr)
 467 {
 468         struct xfs_rtx_busy     *busyp, *n;
 469
 470         list_for_each_entry_safe(busyp, n, &tr->extent_list, list) {
 471                 list_del_init(&busyp->list);
 472                 kfree(busyp);
 473         }
 474 }
 475
 476 /*
 477  * Walk the discard list and issue discards on all the busy extents in the
 478  * list. We plug and chain the bios so that we only need a single completion
 479  * call to clear all the busy extents once the discards are complete.
 480  */
 481 static int
 482 xfs_discard_rtdev_extents(
 483         struct xfs_mount        *mp,
 484         struct xfs_trim_rtdev   *tr)
 485 {
 486         struct block_device     *bdev = mp->m_rtdev_targp->bt_bdev;
 487         struct xfs_rtx_busy     *busyp;
 488         struct bio              *bio = NULL;
 489         struct blk_plug         plug;
 490         xfs_rtblock_t           start = NULLRTBLOCK, length = 0;
 491         int                     error = 0;
 492
 493         blk_start_plug(&plug);
 494         list_for_each_entry(busyp, &tr->extent_list, list) {
 495                 if (start == NULLRTBLOCK)
 496                         start = busyp->bno;
 497                 length += busyp->length;
 498
 499                 trace_xfs_discard_rtextent(mp, busyp->bno, busyp->length);
 500
 501                 error = __blkdev_issue_discard(bdev,
 502                                 xfs_rtb_to_daddr(mp, busyp->bno),
 503                                 XFS_FSB_TO_BB(mp, busyp->length),
 504                                 GFP_NOFS, &bio);
 505                 if (error)
 506                         break;
 507         }
 508         xfs_discard_free_rtdev_extents(tr);
 509
 510         if (bio) {
 511                 error = submit_bio_wait(bio);
 512                 if (error == -EOPNOTSUPP)
 513                         error = 0;
 514                 if (error)
 515                         xfs_info(mp,
 516          "discard failed for rtextent [0x%llx,%llu], error %d",
 517                                  (unsigned long long)start,
 518                                  (unsigned long long)length,
 519                                  error);
 520                 bio_put(bio);
 521         }
 522         blk_finish_plug(&plug);
 523
 524         return error;
 525 }
 526
 527 static int
 528 xfs_trim_gather_rtextent(
 529         struct xfs_rtgroup              *rtg,
 530         struct xfs_trans                *tp,
 531         const struct xfs_rtalloc_rec    *rec,
 532         void                            *priv)
 533 {
 534         struct xfs_trim_rtdev           *tr = priv;
 535         struct xfs_rtx_busy             *busyp;
 536         xfs_rtblock_t                   rbno, rlen;
 537
 538         if (rec->ar_startext > tr->stop_rtx) {
 539                 /*
 540                  * If we've scanned a large number of rtbitmap blocks, update
 541                  * the cursor to point at this extent so we restart the next
 542                  * batch from this extent.
 543                  */
 544                 tr->restart_rtx = rec->ar_startext;
 545                 return -ECANCELED;
 546         }
 547
 548         rbno = xfs_rtx_to_rtb(rtg, rec->ar_startext);
 549         rlen = xfs_rtbxlen_to_blen(rtg_mount(rtg), rec->ar_extcount);
 550
 551         /* Ignore too small. */
 552         if (rlen < tr->minlen_fsb) {
 553                 trace_xfs_discard_rttoosmall(rtg_mount(rtg), rbno, rlen);
 554                 return 0;
 555         }
 556
 557         busyp = kzalloc(sizeof(struct xfs_rtx_busy), GFP_KERNEL);
 558         if (!busyp)
 559                 return -ENOMEM;
 560
 561         busyp->bno = rbno;
 562         busyp->length = rlen;
 563         INIT_LIST_HEAD(&busyp->list);
 564         list_add_tail(&busyp->list, &tr->extent_list);
 565
 566         tr->restart_rtx = rec->ar_startext + rec->ar_extcount;
 567         return 0;
 568 }
 569
 570 /* Trim extents on an !rtgroups realtime device */
 571 static int
 572 xfs_trim_rtextents(
 573         struct xfs_rtgroup      *rtg,
 574         xfs_rtxnum_t            low,
 575         xfs_rtxnum_t            high,
 576         xfs_daddr_t             minlen)
 577 {
 578         struct xfs_mount        *mp = rtg_mount(rtg);
 579         struct xfs_trim_rtdev   tr = {
 580                 .minlen_fsb     = XFS_BB_TO_FSB(mp, minlen),
 581                 .extent_list    = LIST_HEAD_INIT(tr.extent_list),
 582         };
 583         struct xfs_trans        *tp;
 584         int                     error;
 585
 586         error = xfs_trans_alloc_empty(mp, &tp);
 587         if (error)
 588                 return error;
 589
 590         /*
 591          * Walk the free ranges between low and high.  The query_range function
 592          * trims the extents returned.
 593          */
 594         do {
 595                 tr.stop_rtx = low + xfs_rtbitmap_rtx_per_rbmblock(mp);
 596                 xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
 597                 error = xfs_rtalloc_query_range(rtg, tp, low, high,
 598                                 xfs_trim_gather_rtextent, &tr);
 599
 600                 if (error == -ECANCELED)
 601                         error = 0;
 602                 if (error) {
 603                         xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
 604                         xfs_discard_free_rtdev_extents(&tr);
 605                         break;
 606                 }
 607
 608                 if (list_empty(&tr.extent_list)) {
 609                         xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
 610                         break;
 611                 }
 612
 613                 error = xfs_discard_rtdev_extents(mp, &tr);
 614                 xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
 615                 if (error)
 616                         break;
 617
 618                 low = tr.restart_rtx;
 619         } while (!xfs_trim_should_stop() && low <= high);
 620
 621         xfs_trans_cancel(tp);
 622         return error;
 623 }
 624
/* State shared between the rtbitmap walk and the discard pass for one rtgroup. */
struct xfs_trim_rtgroup {
	/* batch of busy extents gathered for discard */
	struct xfs_busy_extents	*extents;

	/* minimum length that caller allows us to trim */
	xfs_rtblock_t		minlen_fsb;

	/* restart point for the rtbitmap walk */
	xfs_rtxnum_t		restart_rtx;

	/* number of extents to examine before stopping to issue discard ios */
	int			batch;

	/* number of extents queued for discard */
	int			queued;
};
 641
 642 static int
 643 xfs_trim_gather_rtgroup_extent(
 644         struct xfs_rtgroup              *rtg,
 645         struct xfs_trans                *tp,
 646         const struct xfs_rtalloc_rec    *rec,
 647         void                            *priv)
 648 {
 649         struct xfs_trim_rtgroup         *tr = priv;
 650         xfs_rgblock_t                   rgbno;
 651         xfs_extlen_t                    len;
 652
 653         if (--tr->batch <= 0) {
 654                 /*
 655                  * If we've checked a large number of extents, update the
 656                  * cursor to point at this extent so we restart the next batch
 657                  * from this extent.
 658                  */
 659                 tr->restart_rtx = rec->ar_startext;
 660                 return -ECANCELED;
 661         }
 662
 663         rgbno = xfs_rtx_to_rgbno(rtg, rec->ar_startext);
 664         len = xfs_rtxlen_to_extlen(rtg_mount(rtg), rec->ar_extcount);
 665
 666         /* Ignore too small. */
 667         if (len < tr->minlen_fsb) {
 668                 trace_xfs_discard_toosmall(rtg_group(rtg), rgbno, len);
 669                 return 0;
 670         }
 671
 672         /*
 673          * If any blocks in the range are still busy, skip the discard and try
 674          * again the next time.
 675          */
 676         if (xfs_extent_busy_search(rtg_group(rtg), rgbno, len)) {
 677                 trace_xfs_discard_busy(rtg_group(rtg), rgbno, len);
 678                 return 0;
 679         }
 680
 681         xfs_extent_busy_insert_discard(rtg_group(rtg), rgbno, len,
 682                         &tr->extents->extent_list);
 683
 684         tr->queued++;
 685         tr->restart_rtx = rec->ar_startext + rec->ar_extcount;
 686         return 0;
 687 }
 688
 689 /* Trim extents in this rtgroup using the busy extent machinery. */
 690 static int
 691 xfs_trim_rtgroup_extents(
 692         struct xfs_rtgroup      *rtg,
 693         xfs_rtxnum_t            low,
 694         xfs_rtxnum_t            high,
 695         xfs_daddr_t             minlen)
 696 {
 697         struct xfs_mount        *mp = rtg_mount(rtg);
 698         struct xfs_trim_rtgroup tr = {
 699                 .minlen_fsb     = XFS_BB_TO_FSB(mp, minlen),
 700         };
 701         struct xfs_trans        *tp;
 702         int                     error;
 703
 704         error = xfs_trans_alloc_empty(mp, &tp);
 705         if (error)
 706                 return error;
 707
 708         /*
 709          * Walk the free ranges between low and high.  The query_range function
 710          * trims the extents returned.
 711          */
 712         do {
 713                 tr.extents = kzalloc(sizeof(*tr.extents), GFP_KERNEL);
 714                 if (!tr.extents) {
 715                         error = -ENOMEM;
 716                         break;
 717                 }
 718
 719                 tr.queued = 0;
 720                 tr.batch = XFS_DISCARD_MAX_EXAMINE;
 721                 tr.extents->owner = tr.extents;
 722                 INIT_LIST_HEAD(&tr.extents->extent_list);
 723
 724                 xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
 725                 error = xfs_rtalloc_query_range(rtg, tp, low, high,
 726                                 xfs_trim_gather_rtgroup_extent, &tr);
 727                 xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
 728                 if (error == -ECANCELED)
 729                         error = 0;
 730                 if (error) {
 731                         kfree(tr.extents);
 732                         break;
 733                 }
 734
 735                 if (!tr.queued)
 736                         break;
 737
 738                 /*
 739                  * We hand the extent list to the discard function here so the
 740                  * discarded extents can be removed from the busy extent list.
 741                  * This allows the discards to run asynchronously with
 742                  * gathering the next round of extents to discard.
 743                  *
 744                  * However, we must ensure that we do not reference the extent
 745                  * list  after this function call, as it may have been freed by
 746                  * the time control returns to us.
 747                  */
 748                 error = xfs_discard_extents(rtg_mount(rtg), tr.extents);
 749                 if (error)
 750                         break;
 751
 752                 low = tr.restart_rtx;
 753         } while (!xfs_trim_should_stop() && low <= high);
 754
 755         xfs_trans_cancel(tp);
 756         return error;
 757 }
 758
 759 static int
 760 xfs_trim_rtdev_extents(
 761         struct xfs_mount        *mp,
 762         xfs_daddr_t             start,
 763         xfs_daddr_t             end,
 764         xfs_daddr_t             minlen)
 765 {
 766         xfs_rtblock_t           start_rtbno, end_rtbno;
 767         xfs_rtxnum_t            start_rtx, end_rtx;
 768         xfs_rgnumber_t          start_rgno, end_rgno;
 769         xfs_daddr_t             daddr_offset;
 770         int                     last_error = 0, error;
 771         struct xfs_rtgroup      *rtg = NULL;
 772
 773         /* Shift the start and end downwards to match the rt device. */
 774         daddr_offset = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
 775         if (start > daddr_offset)
 776                 start -= daddr_offset;
 777         else
 778                 start = 0;
 779         start_rtbno = xfs_daddr_to_rtb(mp, start);
 780         start_rtx = xfs_rtb_to_rtx(mp, start_rtbno);
 781         start_rgno = xfs_rtb_to_rgno(mp, start_rtbno);
 782
 783         if (end <= daddr_offset)
 784                 return 0;
 785         else
 786                 end -= daddr_offset;
 787         end_rtbno = xfs_daddr_to_rtb(mp, end);
 788         end_rtx = xfs_rtb_to_rtx(mp, end_rtbno + mp->m_sb.sb_rextsize - 1);
 789         end_rgno = xfs_rtb_to_rgno(mp, end_rtbno);
 790
 791         while ((rtg = xfs_rtgroup_next_range(mp, rtg, start_rgno, end_rgno))) {
 792                 xfs_rtxnum_t    rtg_end = rtg->rtg_extents;
 793
 794                 if (rtg_rgno(rtg) == end_rgno)
 795                         rtg_end = min(rtg_end, end_rtx);
 796
 797                 if (xfs_has_rtgroups(mp))
 798                         error = xfs_trim_rtgroup_extents(rtg, start_rtx,
 799                                         rtg_end, minlen);
 800                 else
 801                         error = xfs_trim_rtextents(rtg, start_rtx, rtg_end,
 802                                         minlen);
 803                 if (error)
 804                         last_error = error;
 805
 806                 if (xfs_trim_should_stop()) {
 807                         xfs_rtgroup_rele(rtg);
 808                         break;
 809                 }
 810                 start_rtx = 0;
 811         }
 812
 813         return last_error;
 814 }
 815 #else
 816 # define xfs_trim_rtdev_extents(...)    (-EOPNOTSUPP)
 817 #endif /* CONFIG_XFS_RT */
 818
 819 /*
 820  * trim a range of the filesystem.
 821  *
 822  * Note: the parameters passed from userspace are byte ranges into the
 823  * filesystem which does not match to the format we use for filesystem block
 824  * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format
 825  * is a linear address range. Hence we need to use DADDR based conversions and
 826  * comparisons for determining the correct offset and regions to trim.
 827  *
 828  * The realtime device is mapped into the FITRIM "address space" immediately
 829  * after the data device.
 830  */
 831 int
 832 xfs_ioc_trim(
 833         struct xfs_mount                *mp,
 834         struct fstrim_range __user      *urange)
 835 {
 836         unsigned int            granularity =
 837                 bdev_discard_granularity(mp->m_ddev_targp->bt_bdev);
 838         struct block_device     *rt_bdev = NULL;
 839         struct fstrim_range     range;
 840         xfs_daddr_t             start, end;
 841         xfs_extlen_t            minlen;
 842         xfs_rfsblock_t          max_blocks;
 843         int                     error, last_error = 0;
 844
 845         if (!capable(CAP_SYS_ADMIN))
 846                 return -EPERM;
 847         if (mp->m_rtdev_targp &&
 848             bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev))
 849                 rt_bdev = mp->m_rtdev_targp->bt_bdev;
 850         if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev)
 851                 return -EOPNOTSUPP;
 852
 853         if (rt_bdev)
 854                 granularity = max(granularity,
 855                                   bdev_discard_granularity(rt_bdev));
 856
 857         /*
 858          * We haven't recovered the log, so we cannot use our bnobt-guided
 859          * storage zapping commands.
 860          */
 861         if (xfs_has_norecovery(mp))
 862                 return -EROFS;
 863
 864         if (copy_from_user(&range, urange, sizeof(range)))
 865                 return -EFAULT;
 866
 867         range.minlen = max_t(u64, granularity, range.minlen);
 868         minlen = XFS_B_TO_FSB(mp, range.minlen);
 869
 870         /*
 871          * Truncating down the len isn't actually quite correct, but using
 872          * BBTOB would mean we trivially get overflows for values
 873          * of ULLONG_MAX or slightly lower.  And ULLONG_MAX is the default
 874          * used by the fstrim application.  In the end it really doesn't
 875          * matter as trimming blocks is an advisory interface.
 876          */
 877         max_blocks = mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks;
 878         if (range.start >= XFS_FSB_TO_B(mp, max_blocks) ||
 879             range.minlen > XFS_FSB_TO_B(mp, mp->m_ag_max_usable) ||
 880             range.len < mp->m_sb.sb_blocksize)
 881                 return -EINVAL;
 882
 883         start = BTOBB(range.start);
 884         end = start + BTOBBT(range.len) - 1;
 885
 886         if (bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev)) {
 887                 error = xfs_trim_datadev_extents(mp, start, end, minlen);
 888                 if (error)
 889                         last_error = error;
 890         }
 891
 892         if (rt_bdev && !xfs_trim_should_stop()) {
 893                 error = xfs_trim_rtdev_extents(mp, start, end, minlen);
 894                 if (error)
 895                         last_error = error;
 896         }
 897
 898         if (last_error)
 899                 return last_error;
 900
 901         range.len = min_t(unsigned long long, range.len,
 902                           XFS_FSB_TO_B(mp, max_blocks) - range.start);
 903         if (copy_to_user(urange, &range, sizeof(range)))
 904                 return -EFAULT;
 905         return 0;
 906 }