drm/tests: Add test for drm_atomic_helper_check_modeset()
[drm/drm-misc.git] / fs / xfs / xfs_discard.c
blobc4bd145f5ec1bfb1b0fda2bf5a78878c9a2b70e9
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2010, 2023 Red Hat, Inc.
4 * All Rights Reserved.
5 */
6 #include "xfs.h"
7 #include "xfs_shared.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_trans.h"
12 #include "xfs_mount.h"
13 #include "xfs_btree.h"
14 #include "xfs_alloc_btree.h"
15 #include "xfs_alloc.h"
16 #include "xfs_discard.h"
17 #include "xfs_error.h"
18 #include "xfs_extent_busy.h"
19 #include "xfs_trace.h"
20 #include "xfs_log.h"
21 #include "xfs_ag.h"
22 #include "xfs_health.h"
23 #include "xfs_rtbitmap.h"
24 #include "xfs_rtgroup.h"
27 * Notes on an efficient, low latency fstrim algorithm
29 * We need to walk the filesystem free space and issue discards on the free
30 * space that meet the search criteria (size and location). We cannot issue
31 * discards on extents that might be in use, or are so recently in use they are
32 * still marked as busy. To serialise against extent state changes whilst we are
33 * gathering extents to trim, we must hold the AGF lock to lock out other
34 * allocations and extent free operations that might change extent state.
36 * However, we cannot just hold the AGF for the entire AG free space walk whilst
37 * we issue discards on each free space that is found. Storage devices can have
38 * extremely slow discard implementations (e.g. ceph RBD) and so walking a
39 * couple of million free extents and issuing synchronous discards on each
40 * extent can take a *long* time. Whilst we are doing this walk, nothing else
41 * can access the AGF, and we can stall transactions and hence the log whilst
42 * modifications wait for the AGF lock to be released. This can lead hung tasks
43 * kicking the hung task timer and rebooting the system. This is bad.
45 * Hence we need to take a leaf from the bulkstat playbook. It takes the AGI
46 * lock, gathers a range of inode cluster buffers that are allocated, drops the
47 * AGI lock and then reads all the inode cluster buffers and processes them. It
48 * loops doing this, using a cursor to keep track of where it is up to in the AG
49 * for each iteration to restart the INOBT lookup from.
51 * We can't do this exactly with free space - once we drop the AGF lock, the
52 * state of the free extent is out of our control and we cannot run a discard
53 * safely on it in this situation. Unless, of course, we've marked the free
54 * extent as busy and undergoing a discard operation whilst we held the AGF
55 * locked.
57 * This is exactly how online discard works - free extents are marked busy when
58 * they are freed, and once the extent free has been committed to the journal,
59 * the busy extent record is marked as "undergoing discard" and the discard is
60 * then issued on the free extent. Once the discard completes, the busy extent
61 * record is removed and the extent is able to be allocated again.
63 * In the context of fstrim, if we find a free extent we need to discard, we
64 * don't have to discard it immediately. All we need to do it record that free
65 * extent as being busy and under discard, and all the allocation routines will
66 * now avoid trying to allocate it. Hence if we mark the extent as busy under
67 * the AGF lock, we can safely discard it without holding the AGF lock because
68 * nothing will attempt to allocate that free space until the discard completes.
70 * This also allows us to issue discards asynchronously like we do with online
71 * discard, and so for fast devices fstrim will run much faster as we can have
72 * multiple discard operations in flight at once, as well as pipeline the free
73 * extent search so that it overlaps in flight discard IO.
76 #define XFS_DISCARD_MAX_EXAMINE (100)
78 struct workqueue_struct *xfs_discard_wq;
80 static void
81 xfs_discard_endio_work(
82 struct work_struct *work)
84 struct xfs_busy_extents *extents =
85 container_of(work, struct xfs_busy_extents, endio_work);
87 xfs_extent_busy_clear(&extents->extent_list, false);
88 kfree(extents->owner);
92 * Queue up the actual completion to a thread to avoid IRQ-safe locking for
93 * pagb_lock.
95 static void
96 xfs_discard_endio(
97 struct bio *bio)
99 struct xfs_busy_extents *extents = bio->bi_private;
101 INIT_WORK(&extents->endio_work, xfs_discard_endio_work);
102 queue_work(xfs_discard_wq, &extents->endio_work);
103 bio_put(bio);
106 static inline struct block_device *
107 xfs_group_bdev(
108 const struct xfs_group *xg)
110 struct xfs_mount *mp = xg->xg_mount;
112 switch (xg->xg_type) {
113 case XG_TYPE_AG:
114 return mp->m_ddev_targp->bt_bdev;
115 case XG_TYPE_RTG:
116 return mp->m_rtdev_targp->bt_bdev;
117 default:
118 ASSERT(0);
119 break;
121 return NULL;
125 * Walk the discard list and issue discards on all the busy extents in the
126 * list. We plug and chain the bios so that we only need a single completion
127 * call to clear all the busy extents once the discards are complete.
130 xfs_discard_extents(
131 struct xfs_mount *mp,
132 struct xfs_busy_extents *extents)
134 struct xfs_extent_busy *busyp;
135 struct bio *bio = NULL;
136 struct blk_plug plug;
137 int error = 0;
139 blk_start_plug(&plug);
140 list_for_each_entry(busyp, &extents->extent_list, list) {
141 trace_xfs_discard_extent(busyp->group, busyp->bno,
142 busyp->length);
144 error = __blkdev_issue_discard(xfs_group_bdev(busyp->group),
145 xfs_gbno_to_daddr(busyp->group, busyp->bno),
146 XFS_FSB_TO_BB(mp, busyp->length),
147 GFP_KERNEL, &bio);
148 if (error && error != -EOPNOTSUPP) {
149 xfs_info(mp,
150 "discard failed for extent [0x%llx,%u], error %d",
151 (unsigned long long)busyp->bno,
152 busyp->length,
153 error);
154 break;
158 if (bio) {
159 bio->bi_private = extents;
160 bio->bi_end_io = xfs_discard_endio;
161 submit_bio(bio);
162 } else {
163 xfs_discard_endio_work(&extents->endio_work);
165 blk_finish_plug(&plug);
167 return error;
170 struct xfs_trim_cur {
171 xfs_agblock_t start;
172 xfs_extlen_t count;
173 xfs_agblock_t end;
174 xfs_extlen_t minlen;
175 bool by_bno;
178 static int
179 xfs_trim_gather_extents(
180 struct xfs_perag *pag,
181 struct xfs_trim_cur *tcur,
182 struct xfs_busy_extents *extents)
184 struct xfs_mount *mp = pag_mount(pag);
185 struct xfs_trans *tp;
186 struct xfs_btree_cur *cur;
187 struct xfs_buf *agbp;
188 int error;
189 int i;
190 int batch = XFS_DISCARD_MAX_EXAMINE;
193 * Force out the log. This means any transactions that might have freed
194 * space before we take the AGF buffer lock are now on disk, and the
195 * volatile disk cache is flushed.
197 xfs_log_force(mp, XFS_LOG_SYNC);
199 error = xfs_trans_alloc_empty(mp, &tp);
200 if (error)
201 return error;
203 error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
204 if (error)
205 goto out_trans_cancel;
207 if (tcur->by_bno) {
208 /* sub-AG discard request always starts at tcur->start */
209 cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
210 error = xfs_alloc_lookup_le(cur, tcur->start, 0, &i);
211 if (!error && !i)
212 error = xfs_alloc_lookup_ge(cur, tcur->start, 0, &i);
213 } else if (tcur->start == 0) {
214 /* first time through a by-len starts with max length */
215 cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
216 error = xfs_alloc_lookup_ge(cur, 0, tcur->count, &i);
217 } else {
218 /* nth time through a by-len starts where we left off */
219 cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
220 error = xfs_alloc_lookup_le(cur, tcur->start, tcur->count, &i);
222 if (error)
223 goto out_del_cursor;
224 if (i == 0) {
225 /* nothing of that length left in the AG, we are done */
226 tcur->count = 0;
227 goto out_del_cursor;
231 * Loop until we are done with all extents that are large
232 * enough to be worth discarding or we hit batch limits.
234 while (i) {
235 xfs_agblock_t fbno;
236 xfs_extlen_t flen;
238 error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
239 if (error)
240 break;
241 if (XFS_IS_CORRUPT(mp, i != 1)) {
242 xfs_btree_mark_sick(cur);
243 error = -EFSCORRUPTED;
244 break;
247 if (--batch <= 0) {
249 * Update the cursor to point at this extent so we
250 * restart the next batch from this extent.
252 tcur->start = fbno;
253 tcur->count = flen;
254 break;
258 * If the extent is entirely outside of the range we are
259 * supposed to skip it. Do not bother to trim down partially
260 * overlapping ranges for now.
262 if (fbno + flen < tcur->start) {
263 trace_xfs_discard_exclude(pag_group(pag), fbno, flen);
264 goto next_extent;
266 if (fbno > tcur->end) {
267 trace_xfs_discard_exclude(pag_group(pag), fbno, flen);
268 if (tcur->by_bno) {
269 tcur->count = 0;
270 break;
272 goto next_extent;
275 /* Trim the extent returned to the range we want. */
276 if (fbno < tcur->start) {
277 flen -= tcur->start - fbno;
278 fbno = tcur->start;
280 if (fbno + flen > tcur->end + 1)
281 flen = tcur->end - fbno + 1;
283 /* Too small? Give up. */
284 if (flen < tcur->minlen) {
285 trace_xfs_discard_toosmall(pag_group(pag), fbno, flen);
286 if (tcur->by_bno)
287 goto next_extent;
288 tcur->count = 0;
289 break;
293 * If any blocks in the range are still busy, skip the
294 * discard and try again the next time.
296 if (xfs_extent_busy_search(pag_group(pag), fbno, flen)) {
297 trace_xfs_discard_busy(pag_group(pag), fbno, flen);
298 goto next_extent;
301 xfs_extent_busy_insert_discard(pag_group(pag), fbno, flen,
302 &extents->extent_list);
303 next_extent:
304 if (tcur->by_bno)
305 error = xfs_btree_increment(cur, 0, &i);
306 else
307 error = xfs_btree_decrement(cur, 0, &i);
308 if (error)
309 break;
312 * If there's no more records in the tree, we are done. Set the
313 * cursor block count to 0 to indicate to the caller that there
314 * is no more extents to search.
316 if (i == 0)
317 tcur->count = 0;
321 * If there was an error, release all the gathered busy extents because
322 * we aren't going to issue a discard on them any more.
324 if (error)
325 xfs_extent_busy_clear(&extents->extent_list, false);
326 out_del_cursor:
327 xfs_btree_del_cursor(cur, error);
328 out_trans_cancel:
329 xfs_trans_cancel(tp);
330 return error;
333 static bool
334 xfs_trim_should_stop(void)
336 return fatal_signal_pending(current) || freezing(current);
340 * Iterate the free list gathering extents and discarding them. We need a cursor
341 * for the repeated iteration of gather/discard loop, so use the longest extent
342 * we found in the last batch as the key to start the next.
344 static int
345 xfs_trim_perag_extents(
346 struct xfs_perag *pag,
347 xfs_agblock_t start,
348 xfs_agblock_t end,
349 xfs_extlen_t minlen)
351 struct xfs_trim_cur tcur = {
352 .start = start,
353 .count = pag->pagf_longest,
354 .end = end,
355 .minlen = minlen,
357 int error = 0;
359 if (start != 0 || end != pag_group(pag)->xg_block_count)
360 tcur.by_bno = true;
362 do {
363 struct xfs_busy_extents *extents;
365 extents = kzalloc(sizeof(*extents), GFP_KERNEL);
366 if (!extents) {
367 error = -ENOMEM;
368 break;
371 extents->owner = extents;
372 INIT_LIST_HEAD(&extents->extent_list);
374 error = xfs_trim_gather_extents(pag, &tcur, extents);
375 if (error) {
376 kfree(extents);
377 break;
381 * We hand the extent list to the discard function here so the
382 * discarded extents can be removed from the busy extent list.
383 * This allows the discards to run asynchronously with gathering
384 * the next round of extents to discard.
386 * However, we must ensure that we do not reference the extent
387 * list after this function call, as it may have been freed by
388 * the time control returns to us.
390 error = xfs_discard_extents(pag_mount(pag), extents);
391 if (error)
392 break;
394 if (xfs_trim_should_stop())
395 break;
397 } while (tcur.count != 0);
399 return error;
403 static int
404 xfs_trim_datadev_extents(
405 struct xfs_mount *mp,
406 xfs_daddr_t start,
407 xfs_daddr_t end,
408 xfs_extlen_t minlen)
410 xfs_agnumber_t start_agno, end_agno;
411 xfs_agblock_t start_agbno, end_agbno;
412 struct xfs_perag *pag = NULL;
413 xfs_daddr_t ddev_end;
414 int last_error = 0, error;
416 ddev_end = min_t(xfs_daddr_t, end,
417 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1);
419 start_agno = xfs_daddr_to_agno(mp, start);
420 start_agbno = xfs_daddr_to_agbno(mp, start);
421 end_agno = xfs_daddr_to_agno(mp, ddev_end);
422 end_agbno = xfs_daddr_to_agbno(mp, ddev_end);
424 while ((pag = xfs_perag_next_range(mp, pag, start_agno, end_agno))) {
425 xfs_agblock_t agend = pag_group(pag)->xg_block_count;
427 if (pag_agno(pag) == end_agno)
428 agend = end_agbno;
429 error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen);
430 if (error)
431 last_error = error;
433 if (xfs_trim_should_stop()) {
434 xfs_perag_rele(pag);
435 break;
437 start_agbno = 0;
440 return last_error;
443 #ifdef CONFIG_XFS_RT
444 struct xfs_trim_rtdev {
445 /* list of rt extents to free */
446 struct list_head extent_list;
448 /* minimum length that caller allows us to trim */
449 xfs_rtblock_t minlen_fsb;
451 /* restart point for the rtbitmap walk */
452 xfs_rtxnum_t restart_rtx;
454 /* stopping point for the current rtbitmap walk */
455 xfs_rtxnum_t stop_rtx;
458 struct xfs_rtx_busy {
459 struct list_head list;
460 xfs_rtblock_t bno;
461 xfs_rtblock_t length;
464 static void
465 xfs_discard_free_rtdev_extents(
466 struct xfs_trim_rtdev *tr)
468 struct xfs_rtx_busy *busyp, *n;
470 list_for_each_entry_safe(busyp, n, &tr->extent_list, list) {
471 list_del_init(&busyp->list);
472 kfree(busyp);
477 * Walk the discard list and issue discards on all the busy extents in the
478 * list. We plug and chain the bios so that we only need a single completion
479 * call to clear all the busy extents once the discards are complete.
481 static int
482 xfs_discard_rtdev_extents(
483 struct xfs_mount *mp,
484 struct xfs_trim_rtdev *tr)
486 struct block_device *bdev = mp->m_rtdev_targp->bt_bdev;
487 struct xfs_rtx_busy *busyp;
488 struct bio *bio = NULL;
489 struct blk_plug plug;
490 xfs_rtblock_t start = NULLRTBLOCK, length = 0;
491 int error = 0;
493 blk_start_plug(&plug);
494 list_for_each_entry(busyp, &tr->extent_list, list) {
495 if (start == NULLRTBLOCK)
496 start = busyp->bno;
497 length += busyp->length;
499 trace_xfs_discard_rtextent(mp, busyp->bno, busyp->length);
501 error = __blkdev_issue_discard(bdev,
502 xfs_rtb_to_daddr(mp, busyp->bno),
503 XFS_FSB_TO_BB(mp, busyp->length),
504 GFP_NOFS, &bio);
505 if (error)
506 break;
508 xfs_discard_free_rtdev_extents(tr);
510 if (bio) {
511 error = submit_bio_wait(bio);
512 if (error == -EOPNOTSUPP)
513 error = 0;
514 if (error)
515 xfs_info(mp,
516 "discard failed for rtextent [0x%llx,%llu], error %d",
517 (unsigned long long)start,
518 (unsigned long long)length,
519 error);
520 bio_put(bio);
522 blk_finish_plug(&plug);
524 return error;
527 static int
528 xfs_trim_gather_rtextent(
529 struct xfs_rtgroup *rtg,
530 struct xfs_trans *tp,
531 const struct xfs_rtalloc_rec *rec,
532 void *priv)
534 struct xfs_trim_rtdev *tr = priv;
535 struct xfs_rtx_busy *busyp;
536 xfs_rtblock_t rbno, rlen;
538 if (rec->ar_startext > tr->stop_rtx) {
540 * If we've scanned a large number of rtbitmap blocks, update
541 * the cursor to point at this extent so we restart the next
542 * batch from this extent.
544 tr->restart_rtx = rec->ar_startext;
545 return -ECANCELED;
548 rbno = xfs_rtx_to_rtb(rtg, rec->ar_startext);
549 rlen = xfs_rtbxlen_to_blen(rtg_mount(rtg), rec->ar_extcount);
551 /* Ignore too small. */
552 if (rlen < tr->minlen_fsb) {
553 trace_xfs_discard_rttoosmall(rtg_mount(rtg), rbno, rlen);
554 return 0;
557 busyp = kzalloc(sizeof(struct xfs_rtx_busy), GFP_KERNEL);
558 if (!busyp)
559 return -ENOMEM;
561 busyp->bno = rbno;
562 busyp->length = rlen;
563 INIT_LIST_HEAD(&busyp->list);
564 list_add_tail(&busyp->list, &tr->extent_list);
566 tr->restart_rtx = rec->ar_startext + rec->ar_extcount;
567 return 0;
570 /* Trim extents on an !rtgroups realtime device */
571 static int
572 xfs_trim_rtextents(
573 struct xfs_rtgroup *rtg,
574 xfs_rtxnum_t low,
575 xfs_rtxnum_t high,
576 xfs_daddr_t minlen)
578 struct xfs_mount *mp = rtg_mount(rtg);
579 struct xfs_trim_rtdev tr = {
580 .minlen_fsb = XFS_BB_TO_FSB(mp, minlen),
581 .extent_list = LIST_HEAD_INIT(tr.extent_list),
583 struct xfs_trans *tp;
584 int error;
586 error = xfs_trans_alloc_empty(mp, &tp);
587 if (error)
588 return error;
591 * Walk the free ranges between low and high. The query_range function
592 * trims the extents returned.
594 do {
595 tr.stop_rtx = low + xfs_rtbitmap_rtx_per_rbmblock(mp);
596 xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
597 error = xfs_rtalloc_query_range(rtg, tp, low, high,
598 xfs_trim_gather_rtextent, &tr);
600 if (error == -ECANCELED)
601 error = 0;
602 if (error) {
603 xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
604 xfs_discard_free_rtdev_extents(&tr);
605 break;
608 if (list_empty(&tr.extent_list)) {
609 xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
610 break;
613 error = xfs_discard_rtdev_extents(mp, &tr);
614 xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
615 if (error)
616 break;
618 low = tr.restart_rtx;
619 } while (!xfs_trim_should_stop() && low <= high);
621 xfs_trans_cancel(tp);
622 return error;
625 struct xfs_trim_rtgroup {
626 /* list of rtgroup extents to free */
627 struct xfs_busy_extents *extents;
629 /* minimum length that caller allows us to trim */
630 xfs_rtblock_t minlen_fsb;
632 /* restart point for the rtbitmap walk */
633 xfs_rtxnum_t restart_rtx;
635 /* number of extents to examine before stopping to issue discard ios */
636 int batch;
638 /* number of extents queued for discard */
639 int queued;
642 static int
643 xfs_trim_gather_rtgroup_extent(
644 struct xfs_rtgroup *rtg,
645 struct xfs_trans *tp,
646 const struct xfs_rtalloc_rec *rec,
647 void *priv)
649 struct xfs_trim_rtgroup *tr = priv;
650 xfs_rgblock_t rgbno;
651 xfs_extlen_t len;
653 if (--tr->batch <= 0) {
655 * If we've checked a large number of extents, update the
656 * cursor to point at this extent so we restart the next batch
657 * from this extent.
659 tr->restart_rtx = rec->ar_startext;
660 return -ECANCELED;
663 rgbno = xfs_rtx_to_rgbno(rtg, rec->ar_startext);
664 len = xfs_rtxlen_to_extlen(rtg_mount(rtg), rec->ar_extcount);
666 /* Ignore too small. */
667 if (len < tr->minlen_fsb) {
668 trace_xfs_discard_toosmall(rtg_group(rtg), rgbno, len);
669 return 0;
673 * If any blocks in the range are still busy, skip the discard and try
674 * again the next time.
676 if (xfs_extent_busy_search(rtg_group(rtg), rgbno, len)) {
677 trace_xfs_discard_busy(rtg_group(rtg), rgbno, len);
678 return 0;
681 xfs_extent_busy_insert_discard(rtg_group(rtg), rgbno, len,
682 &tr->extents->extent_list);
684 tr->queued++;
685 tr->restart_rtx = rec->ar_startext + rec->ar_extcount;
686 return 0;
689 /* Trim extents in this rtgroup using the busy extent machinery. */
690 static int
691 xfs_trim_rtgroup_extents(
692 struct xfs_rtgroup *rtg,
693 xfs_rtxnum_t low,
694 xfs_rtxnum_t high,
695 xfs_daddr_t minlen)
697 struct xfs_mount *mp = rtg_mount(rtg);
698 struct xfs_trim_rtgroup tr = {
699 .minlen_fsb = XFS_BB_TO_FSB(mp, minlen),
701 struct xfs_trans *tp;
702 int error;
704 error = xfs_trans_alloc_empty(mp, &tp);
705 if (error)
706 return error;
709 * Walk the free ranges between low and high. The query_range function
710 * trims the extents returned.
712 do {
713 tr.extents = kzalloc(sizeof(*tr.extents), GFP_KERNEL);
714 if (!tr.extents) {
715 error = -ENOMEM;
716 break;
719 tr.queued = 0;
720 tr.batch = XFS_DISCARD_MAX_EXAMINE;
721 tr.extents->owner = tr.extents;
722 INIT_LIST_HEAD(&tr.extents->extent_list);
724 xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
725 error = xfs_rtalloc_query_range(rtg, tp, low, high,
726 xfs_trim_gather_rtgroup_extent, &tr);
727 xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
728 if (error == -ECANCELED)
729 error = 0;
730 if (error) {
731 kfree(tr.extents);
732 break;
735 if (!tr.queued)
736 break;
739 * We hand the extent list to the discard function here so the
740 * discarded extents can be removed from the busy extent list.
741 * This allows the discards to run asynchronously with
742 * gathering the next round of extents to discard.
744 * However, we must ensure that we do not reference the extent
745 * list after this function call, as it may have been freed by
746 * the time control returns to us.
748 error = xfs_discard_extents(rtg_mount(rtg), tr.extents);
749 if (error)
750 break;
752 low = tr.restart_rtx;
753 } while (!xfs_trim_should_stop() && low <= high);
755 xfs_trans_cancel(tp);
756 return error;
759 static int
760 xfs_trim_rtdev_extents(
761 struct xfs_mount *mp,
762 xfs_daddr_t start,
763 xfs_daddr_t end,
764 xfs_daddr_t minlen)
766 xfs_rtblock_t start_rtbno, end_rtbno;
767 xfs_rtxnum_t start_rtx, end_rtx;
768 xfs_rgnumber_t start_rgno, end_rgno;
769 xfs_daddr_t daddr_offset;
770 int last_error = 0, error;
771 struct xfs_rtgroup *rtg = NULL;
773 /* Shift the start and end downwards to match the rt device. */
774 daddr_offset = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
775 if (start > daddr_offset)
776 start -= daddr_offset;
777 else
778 start = 0;
779 start_rtbno = xfs_daddr_to_rtb(mp, start);
780 start_rtx = xfs_rtb_to_rtx(mp, start_rtbno);
781 start_rgno = xfs_rtb_to_rgno(mp, start_rtbno);
783 if (end <= daddr_offset)
784 return 0;
785 else
786 end -= daddr_offset;
787 end_rtbno = xfs_daddr_to_rtb(mp, end);
788 end_rtx = xfs_rtb_to_rtx(mp, end_rtbno + mp->m_sb.sb_rextsize - 1);
789 end_rgno = xfs_rtb_to_rgno(mp, end_rtbno);
791 while ((rtg = xfs_rtgroup_next_range(mp, rtg, start_rgno, end_rgno))) {
792 xfs_rtxnum_t rtg_end = rtg->rtg_extents;
794 if (rtg_rgno(rtg) == end_rgno)
795 rtg_end = min(rtg_end, end_rtx);
797 if (xfs_has_rtgroups(mp))
798 error = xfs_trim_rtgroup_extents(rtg, start_rtx,
799 rtg_end, minlen);
800 else
801 error = xfs_trim_rtextents(rtg, start_rtx, rtg_end,
802 minlen);
803 if (error)
804 last_error = error;
806 if (xfs_trim_should_stop()) {
807 xfs_rtgroup_rele(rtg);
808 break;
810 start_rtx = 0;
813 return last_error;
815 #else
816 # define xfs_trim_rtdev_extents(...) (-EOPNOTSUPP)
817 #endif /* CONFIG_XFS_RT */
820 * trim a range of the filesystem.
822 * Note: the parameters passed from userspace are byte ranges into the
823 * filesystem which does not match to the format we use for filesystem block
824 * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format
825 * is a linear address range. Hence we need to use DADDR based conversions and
826 * comparisons for determining the correct offset and regions to trim.
828 * The realtime device is mapped into the FITRIM "address space" immediately
829 * after the data device.
832 xfs_ioc_trim(
833 struct xfs_mount *mp,
834 struct fstrim_range __user *urange)
836 unsigned int granularity =
837 bdev_discard_granularity(mp->m_ddev_targp->bt_bdev);
838 struct block_device *rt_bdev = NULL;
839 struct fstrim_range range;
840 xfs_daddr_t start, end;
841 xfs_extlen_t minlen;
842 xfs_rfsblock_t max_blocks;
843 int error, last_error = 0;
845 if (!capable(CAP_SYS_ADMIN))
846 return -EPERM;
847 if (mp->m_rtdev_targp &&
848 bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev))
849 rt_bdev = mp->m_rtdev_targp->bt_bdev;
850 if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev)
851 return -EOPNOTSUPP;
853 if (rt_bdev)
854 granularity = max(granularity,
855 bdev_discard_granularity(rt_bdev));
858 * We haven't recovered the log, so we cannot use our bnobt-guided
859 * storage zapping commands.
861 if (xfs_has_norecovery(mp))
862 return -EROFS;
864 if (copy_from_user(&range, urange, sizeof(range)))
865 return -EFAULT;
867 range.minlen = max_t(u64, granularity, range.minlen);
868 minlen = XFS_B_TO_FSB(mp, range.minlen);
871 * Truncating down the len isn't actually quite correct, but using
872 * BBTOB would mean we trivially get overflows for values
873 * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default
874 * used by the fstrim application. In the end it really doesn't
875 * matter as trimming blocks is an advisory interface.
877 max_blocks = mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks;
878 if (range.start >= XFS_FSB_TO_B(mp, max_blocks) ||
879 range.minlen > XFS_FSB_TO_B(mp, mp->m_ag_max_usable) ||
880 range.len < mp->m_sb.sb_blocksize)
881 return -EINVAL;
883 start = BTOBB(range.start);
884 end = start + BTOBBT(range.len) - 1;
886 if (bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev)) {
887 error = xfs_trim_datadev_extents(mp, start, end, minlen);
888 if (error)
889 last_error = error;
892 if (rt_bdev && !xfs_trim_should_stop()) {
893 error = xfs_trim_rtdev_extents(mp, start, end, minlen);
894 if (error)
895 last_error = error;
898 if (last_error)
899 return last_error;
901 range.len = min_t(unsigned long long, range.len,
902 XFS_FSB_TO_B(mp, max_blocks) - range.start);
903 if (copy_to_user(urange, &range, sizeof(range)))
904 return -EFAULT;
905 return 0;