// SPDX-License-Identifier: GPL-2.0

#include "btrfs_inode.h"
struct btrfs_fiemap_entry {
        u64 offset;
        u64 phys;
        u64 len;
        u32 flags;
};
/*
 * Indicate to the caller of emit_fiemap_extent() that it needs to unlock the
 * file range from the inode's io tree, unlock the subvolume tree search path,
 * flush the fiemap cache, relock the file range and search the subvolume tree
 * again. The value here is something negative that can't be confused with a
 * valid errno value and different from 1 because that's also a return value
 * from fiemap_fill_next_extent() and also it's often used to mean some btree
 * search did not find a key, so make it some distinct negative value.
 */
#define BTRFS_FIEMAP_FLUSH_CACHE        (-(MAX_ERRNO + 1))
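
/*
 * Illustrative sketch of the expected caller pattern (hypothetical code,
 * mirroring what extent_fiemap() does further below):
 *
 *      ret = emit_fiemap_extent(fieinfo, cache, offset, phys, len, flags);
 *      if (ret == BTRFS_FIEMAP_FLUSH_CACHE) {
 *              unlock_extent(&inode->io_tree, range_start, range_end, &cached_state);
 *              btrfs_release_path(path);
 *              ret = flush_fiemap_cache(fieinfo, cache);
 *              ... then relock the file range and search the subvolume tree again
 *      }
 */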
/*
 * The fiemap cache is used to:
 *
 * - Cache the next entry to be emitted to the fiemap buffer, so that we can
 *   merge extents that are contiguous and can be grouped as a single one;
 *
 * - Store extents ready to be written to the fiemap buffer in an intermediary
 *   buffer. This intermediary buffer is to ensure that in case the fiemap
 *   buffer is memory mapped to the fiemap target file, we don't deadlock
 *   during btrfs_page_mkwrite(). This is because during fiemap we are locking
 *   an extent range in order to prevent races with delalloc flushing and
 *   ordered extent completion, which is needed in order to reliably detect
 *   delalloc in holes and prealloc extents. And this can lead to a deadlock
 *   if the fiemap buffer is memory mapped to the file we are running fiemap
 *   against (a silly, useless in practice scenario, but possible) because
 *   btrfs_page_mkwrite() will try to lock the same extent range.
 */
struct fiemap_cache {
        /* An array of ready fiemap entries. */
        struct btrfs_fiemap_entry *entries;
        /* Number of entries in the entries array. */
        int entries_size;
        /* Index of the next entry in the entries array to write to. */
        int entries_pos;
        /*
         * Once the entries array is full, this indicates what's the offset for
         * the next file extent item we must search for in the inode's subvolume
         * tree after unlocking the extent range in the inode's io tree and
         * releasing the search path.
         */
        u64 next_search_offset;
        /*
         * This matches struct fiemap_extent_info::fi_mapped_extents, we use it
         * to count the extents we have emitted ourselves and stop instead of
         * relying on fiemap_fill_next_extent(), because we buffer ready fiemap
         * entries in the @entries array and we want to stop as soon as we hit
         * the maximum amount of extents to map, not just to save time but also
         * to make the logic at extent_fiemap() simpler.
         */
        unsigned int extents_mapped;
        /* Fields for the cached extent (the one not yet submitted to @entries). */
        u64 offset;
        u64 phys;
        u64 len;
        u32 flags;
        bool cached;
};
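
/*
 * Sizing note (see extent_fiemap() below): the @entries array is allocated
 * with PAGE_SIZE / sizeof(struct btrfs_fiemap_entry) slots, so roughly one
 * page worth of ready entries is buffered before the cache has to be flushed
 * to the fiemap buffer with flush_fiemap_cache().
 */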
static int flush_fiemap_cache(struct fiemap_extent_info *fieinfo,
                              struct fiemap_cache *cache)
{
        for (int i = 0; i < cache->entries_pos; i++) {
                struct btrfs_fiemap_entry *entry = &cache->entries[i];
                int ret;

                ret = fiemap_fill_next_extent(fieinfo, entry->offset,
                                              entry->phys, entry->len,
                                              entry->flags);
                /*
                 * Ignore 1 (reached max entries) because we keep track of that
                 * ourselves in emit_fiemap_extent().
                 */
                if (ret < 0)
                        return ret;
        }

        cache->entries_pos = 0;

        return 0;
}
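
/*
 * Usage note: extent_fiemap() calls this only after unlocking the file range
 * and releasing the search path, both when emit_fiemap_extent() returns
 * BTRFS_FIEMAP_FLUSH_CACHE (the @entries array is full) and once more before
 * emitting the last cached extent.
 */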
/*
 * Helper to submit a fiemap extent.
 *
 * Will try to merge the current fiemap extent, specified by @offset, @phys,
 * @len and @flags, with the cached one. Only when merging fails is the cached
 * extent submitted as a fiemap extent.
 *
 * Return value is the same as fiemap_fill_next_extent().
 */
static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
                              struct fiemap_cache *cache,
                              u64 offset, u64 phys, u64 len, u32 flags)
{
        struct btrfs_fiemap_entry *entry;
        u64 cache_end;

        /* Set at the end of extent_fiemap(). */
        ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);

        if (!cache->cached)
                goto assign;
        /*
         * When iterating the extents of the inode, at extent_fiemap(), we may
         * find an extent that starts at an offset behind the end offset of the
         * previous extent we processed. This happens if fiemap is called
         * without FIEMAP_FLAG_SYNC and there are ordered extents completing
         * after we had to unlock the file range, release the search path, emit
         * the fiemap extents stored in the buffer (cache->entries array) and
         * then lock the remainder of the range and re-search the btree.
         *
         * For example we are in leaf X processing its last item, which is the
         * file extent item for file range [512K, 1M[, and after
         * btrfs_next_leaf() releases the path, there's an ordered extent that
         * completes for the file range [768K, 2M[, and that results in trimming
         * the file extent item so that it now corresponds to the file range
         * [512K, 768K[ and a new file extent item is inserted for the file
         * range [768K, 2M[, which may end up as the last item of leaf X or as
         * the first item of the next leaf - in either case btrfs_next_leaf()
         * will leave us with a path pointing to the new extent item, for the
         * file range [768K, 2M[, since that's the first key that follows the
         * last one we processed. So in order not to report overlapping extents
         * to user space, we trim the length of the previously cached extent
         * and emit it.
         *
         * Upon calling btrfs_next_leaf() we may also find an extent with an
         * offset smaller than or equal to cache->offset, and this happens
         * when we had a hole or prealloc extent with several delalloc ranges
         * in it, but after btrfs_next_leaf() released the path, delalloc was
         * flushed and the resulting ordered extents were completed, so we can
         * now have found a file extent item for an offset that is smaller
         * than or equal to what we have in cache->offset. We deal with this
         * as described below.
         */
        cache_end = cache->offset + cache->len;
        if (cache_end > offset) {
                if (offset == cache->offset) {
                        /*
                         * We cached a delalloc range (found in the io tree) for
                         * a hole or prealloc extent and we have now found a
                         * file extent item for the same offset. What we have
                         * now is more recent and up to date, so discard what
                         * we had in the cache and use what we have just found.
                         */
                        goto assign;
                } else if (offset > cache->offset) {
                        /*
                         * The extent range we previously found ends after the
                         * offset of the file extent item we found and that
                         * offset falls somewhere in the middle of that previous
                         * extent range. So adjust the range we previously found
                         * to end at the offset of the file extent item we have
                         * just found, since this extent is more up to date.
                         * Emit that adjusted range and cache the file extent
                         * item we have just found. This corresponds to the case
                         * where a previously found file extent item was split
                         * due to an ordered extent completing.
                         */
                        cache->len = offset - cache->offset;
                        goto emit;
                } else {
                        const u64 range_end = offset + len;

                        /*
                         * The offset of the file extent item we have just found
                         * is behind the cached offset. This means we were
                         * processing a hole or prealloc extent for which we
                         * have found delalloc ranges (in the io tree), so what
                         * we have in the cache is the last delalloc range we
                         * found while the file extent item we found can be
                         * either for a whole delalloc range we previously
                         * emitted or only a part of that range.
                         *
                         * We have two cases here:
                         *
                         * 1) The file extent item's range ends at or behind the
                         *    cached extent's end. In this case just ignore the
                         *    current file extent item because we don't want to
                         *    overlap with previous ranges that may have been
                         *    emitted already;
                         *
                         * 2) The file extent item starts behind the currently
                         *    cached extent but its end offset goes beyond the
                         *    end offset of the cached extent. We don't want to
                         *    overlap with a previous range that may have been
                         *    emitted already, so we emit the currently cached
                         *    extent and then partially store the current file
                         *    extent item's range in the cache, for the subrange
                         *    going from the cached extent's end to the end of
                         *    the file extent item's range.
                         */
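                        /*
                         * Worked example with made-up numbers: the cache holds
                         * a delalloc range [256K, 512K) and the file extent
                         * item we just found covers [0, 384K). Its end (384K)
                         * is behind the cached end (512K), so it is ignored
                         * (case 1). If the item instead covered [0, 768K), we
                         * would emit the cached [256K, 512K) range and then
                         * cache only the trailing subrange [512K, 768K)
                         * (case 2).
                         */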
                        if (range_end <= cache_end)
                                return 0;

                        if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC)))
                                phys += cache_end - offset;

                        offset = cache_end;
                        len = range_end - cache_end;
                        goto emit;
                }
        }

        /*
         * Only merge fiemap extents if:
         *
         * 1) Their logical addresses are continuous;
         *
         * 2) Their physical addresses are continuous, so truly compressed
         *    (physical size smaller than logical size) extents won't get
         *    merged with each other;
         *
         * 3) They share the same flags.
         */
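        /*
         * Example with made-up numbers: a cached extent for file range
         * [0, 128K) at physical offset 1M and a new extent for [128K, 256K)
         * at physical 1M + 128K with identical flags merge into a single
         * cached extent covering [0, 256K), so user space sees one fiemap
         * extent instead of two.
         */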
        if (cache->offset + cache->len == offset &&
            cache->phys + cache->len == phys &&
            cache->flags == flags) {
                cache->len += len;
                return 0;
        }

emit:
        /* Not mergeable, need to submit the cached one. */
        if (cache->entries_pos == cache->entries_size) {
                /*
                 * We will need to search again from the end offset of the last
                 * stored extent, and not from the current offset, because after
                 * unlocking the range and releasing the path, if there's a hole
                 * between that end offset and this current offset, a new extent
                 * may have been inserted due to a new write, so we don't want
                 * to miss it.
                 */
                entry = &cache->entries[cache->entries_size - 1];
                cache->next_search_offset = entry->offset + entry->len;
                cache->cached = false;

                return BTRFS_FIEMAP_FLUSH_CACHE;
        }

        entry = &cache->entries[cache->entries_pos];
        entry->offset = cache->offset;
        entry->phys = cache->phys;
        entry->len = cache->len;
        entry->flags = cache->flags;
        cache->entries_pos++;
        cache->extents_mapped++;

        if (cache->extents_mapped == fieinfo->fi_extents_max) {
                cache->cached = false;
                return 1;
        }

assign:
        cache->cached = true;
        cache->offset = offset;
        cache->phys = phys;
        cache->len = len;
        cache->flags = flags;

        return 0;
}
/*
 * Emit the last fiemap cache.
 *
 * The last fiemap cache may still be cached in the following case:
 *
 * |<- Fiemap range ->|
 * |<------------ First extent ----------->|
 *
 * In this case, the first extent range will be cached but not emitted.
 * So we must emit it before ending extent_fiemap().
 */
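
/*
 * Concrete example with made-up numbers: if the requested fiemap range is
 * [0, 4K) and the file's first extent covers [0, 8K), emit_fiemap_extent()
 * caches that extent but never submits it, because no later extent arrives to
 * push it out of the cache. This helper submits that leftover cached extent.
 */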
static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
                                  struct fiemap_cache *cache)
{
        int ret;

        if (!cache->cached)
                return 0;

        ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
                                      cache->len, cache->flags);
        cache->cached = false;
        if (ret > 0)
                ret = 0;
        return ret;
}
static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path)
{
        struct extent_buffer *clone = path->nodes[0];
        struct btrfs_key key;
        int slot;
        int ret;

        path->slots[0]++;
        if (path->slots[0] < btrfs_header_nritems(path->nodes[0]))
                return 0;

        /*
         * Add a temporary extra ref to an already cloned extent buffer to
         * prevent btrfs_next_leaf() freeing it, we want to reuse it to avoid
         * the cost of allocating a new one.
         */
        ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags));
        atomic_inc(&clone->refs);

        ret = btrfs_next_leaf(inode->root, path);
        if (ret != 0)
                goto out;

        /*
         * Don't bother with cloning if there are no more file extent items for
         * our inode.
         */
        btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
        if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) {
                ret = 1;
                goto out;
        }

        /*
         * Important to preserve the start field, for the optimizations when
         * checking if extents are shared (see extent_fiemap()).
         *
         * We must set ->start before calling copy_extent_buffer_full(). If we
         * are on sub-pagesize blocksize, we use ->start to determine the offset
         * into the folio where our eb exists, and if we update ->start after
         * the fact then any subsequent reads of the eb may read from a
         * different offset in the folio than where we originally copied into.
         */
        clone->start = path->nodes[0]->start;
        /* See the comment at fiemap_search_slot() about why we clone. */
        copy_extent_buffer_full(clone, path->nodes[0]);

        slot = path->slots[0];
        btrfs_release_path(path);
        path->nodes[0] = clone;
        path->slots[0] = slot;
out:
        if (ret)
                free_extent_buffer(clone);

        return ret;
}
/*
 * Search for the first file extent item that starts at a given file offset or
 * the one that starts immediately before that offset.
 * Returns: 0 on success, < 0 on error, 1 if not found.
 */
static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path,
                              u64 file_offset)
{
        const u64 ino = btrfs_ino(inode);
        struct btrfs_root *root = inode->root;
        struct extent_buffer *clone;
        struct btrfs_key key;
        int slot;
        int ret;

        key.objectid = ino;
        key.type = BTRFS_EXTENT_DATA_KEY;
        key.offset = file_offset;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                return ret;

        if (ret > 0 && path->slots[0] > 0) {
                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
                if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
                        path->slots[0]--;
        }

        if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
                ret = btrfs_next_leaf(root, path);
                if (ret != 0)
                        return ret;

                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
                if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
                        return 1;
        }

        /*
         * We clone the leaf and use it during fiemap. This is because while
         * using the leaf we do expensive things like checking if an extent is
         * shared, which can take a long time. In order to prevent blocking
         * other tasks for too long, we use a clone of the leaf. We have locked
         * the file range in the inode's io tree, so we know none of our file
         * extent items can change. This way we avoid blocking other tasks that
         * want to insert items for other inodes in the same leaf or b+tree
         * rebalance operations (triggered for example when someone is trying
         * to push items into this leaf when trying to insert an item in a
         * neighbouring leaf).
         *
         * We also need the private clone because holding a read lock on an
         * extent buffer of the subvolume's b+tree will make lockdep unhappy
         * when we check if extents are shared, as backref walking may need to
         * lock the same leaf we are processing.
         */
        clone = btrfs_clone_extent_buffer(path->nodes[0]);
        if (!clone)
                return -ENOMEM;

        slot = path->slots[0];
        btrfs_release_path(path);
        path->nodes[0] = clone;
        path->slots[0] = slot;

        return 0;
}
/*
 * Process a range which is a hole or a prealloc extent in the inode's subvolume
 * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc
 * extent. The end offset (@end) is inclusive.
 */
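
/*
 * Note (summary of the logic below): for a prealloc extent (@disk_bytenr != 0)
 * this reports, in file offset order, each delalloc subrange found in the io
 * tree (flagged as DELALLOC | UNKNOWN) interleaved with the remaining prealloc
 * subranges (flagged as UNWRITTEN). For example, a prealloc extent with a
 * single delalloc range in its middle is reported as three fiemap extents:
 * prealloc, delalloc, prealloc.
 */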
static int fiemap_process_hole(struct btrfs_inode *inode,
                               struct fiemap_extent_info *fieinfo,
                               struct fiemap_cache *cache,
                               struct extent_state **delalloc_cached_state,
                               struct btrfs_backref_share_check_ctx *backref_ctx,
                               u64 disk_bytenr, u64 extent_offset,
                               u64 extent_gen, u64 start, u64 end)
{
        const u64 i_size = i_size_read(&inode->vfs_inode);
        u64 cur_offset = start;
        u64 last_delalloc_end = 0;
        u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN;
        bool checked_extent_shared = false;
        int ret;

        /*
         * There can be no delalloc past i_size, so don't waste time looking
         * for it beyond i_size.
         */
        while (cur_offset < end && cur_offset < i_size) {
                u64 delalloc_start;
                u64 delalloc_end;
                u64 prealloc_start;
                u64 prealloc_len = 0;
                bool delalloc;

                delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
                                                        delalloc_cached_state,
                                                        &delalloc_start,
                                                        &delalloc_end);
                if (!delalloc)
                        break;

                /*
                 * If this is a prealloc extent we have to report every section
                 * of it that has no delalloc.
                 */
                if (disk_bytenr != 0) {
                        if (last_delalloc_end == 0) {
                                prealloc_start = start;
                                prealloc_len = delalloc_start - start;
                        } else {
                                prealloc_start = last_delalloc_end + 1;
                                prealloc_len = delalloc_start - prealloc_start;
                        }
                }

                if (prealloc_len > 0) {
                        if (!checked_extent_shared && fieinfo->fi_extents_max) {
                                ret = btrfs_is_data_extent_shared(inode,
                                                                  disk_bytenr,
                                                                  extent_gen,
                                                                  backref_ctx);
                                if (ret < 0)
                                        return ret;
                                else if (ret > 0)
                                        prealloc_flags |= FIEMAP_EXTENT_SHARED;

                                checked_extent_shared = true;
                        }
                        ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
                                                 disk_bytenr + extent_offset,
                                                 prealloc_len, prealloc_flags);
                        if (ret)
                                return ret;
                        extent_offset += prealloc_len;
                }

                ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0,
                                         delalloc_end + 1 - delalloc_start,
                                         FIEMAP_EXTENT_DELALLOC |
                                         FIEMAP_EXTENT_UNKNOWN);
                if (ret)
                        return ret;

                last_delalloc_end = delalloc_end;
                cur_offset = delalloc_end + 1;
                extent_offset += cur_offset - delalloc_start;
        }

        /*
         * Either we found no delalloc for the whole prealloc extent or we have
         * a prealloc extent that spans i_size or starts at or after i_size.
         */
        if (disk_bytenr != 0 && last_delalloc_end < end) {
                u64 prealloc_start;
                u64 prealloc_len;

                if (last_delalloc_end == 0) {
                        prealloc_start = start;
                        prealloc_len = end + 1 - start;
                } else {
                        prealloc_start = last_delalloc_end + 1;
                        prealloc_len = end + 1 - prealloc_start;
                }

                if (!checked_extent_shared && fieinfo->fi_extents_max) {
                        ret = btrfs_is_data_extent_shared(inode, disk_bytenr,
                                                          extent_gen,
                                                          backref_ctx);
                        if (ret < 0)
                                return ret;
                        else if (ret > 0)
                                prealloc_flags |= FIEMAP_EXTENT_SHARED;
                }
                ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
                                         disk_bytenr + extent_offset,
                                         prealloc_len, prealloc_flags);
                if (ret)
                        return ret;
        }

        return 0;
}
static int fiemap_find_last_extent_offset(struct btrfs_inode *inode,
                                          struct btrfs_path *path,
                                          u64 *last_extent_end_ret)
{
        const u64 ino = btrfs_ino(inode);
        struct btrfs_root *root = inode->root;
        struct extent_buffer *leaf;
        struct btrfs_file_extent_item *ei;
        struct btrfs_key key;
        u64 disk_bytenr;
        int ret;

        /*
         * Lookup the last file extent. We're not using i_size here because
         * there might be preallocation past i_size.
         */
        ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0);
        /* There can't be a file extent item at offset (u64)-1. */
        ASSERT(ret != 0);
        if (ret < 0)
                return ret;

        /*
         * For a non-existing key, btrfs_search_slot() always leaves us at a
         * slot > 0, except if the btree is empty, which is impossible because
         * at least it has the inode item for this inode and all the items for
         * the root inode 256.
         */
        ASSERT(path->slots[0] > 0);
        path->slots[0]--;
        leaf = path->nodes[0];
        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
        if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
                /* No file extent items in the subvolume tree. */
                *last_extent_end_ret = 0;
                return 0;
        }

        /*
         * For an inline extent, the disk_bytenr is where inline data starts at,
         * so first check if we have an inline extent item before checking if we
         * have an implicit hole (disk_bytenr == 0).
         */
        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
        if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
                *last_extent_end_ret = btrfs_file_extent_end(path);
                return 0;
        }

        /*
         * Find the last file extent item that is not a hole (when NO_HOLES is
         * not enabled). This should take at most 2 iterations in the worst
         * case: we have one hole file extent item at slot 0 of a leaf and
         * another hole file extent item as the last item in the previous leaf.
         * This is because we merge file extent items that represent holes.
         */
        disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
        while (disk_bytenr == 0) {
                ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
                if (ret < 0) {
                        return ret;
                } else if (ret > 0) {
                        /* No file extent items that are not holes. */
                        *last_extent_end_ret = 0;
                        return 0;
                }
                leaf = path->nodes[0];
                ei = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_file_extent_item);
                disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
        }

        *last_extent_end_ret = btrfs_file_extent_end(path);
        return 0;
}
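
/*
 * Note (summary of how the result is used): extent_fiemap() compares the end
 * offset of its currently cached extent against the value returned here to
 * decide whether that cached extent is the last one in the file, and therefore
 * whether to set FIEMAP_EXTENT_LAST on it before emitting it.
 */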
static int extent_fiemap(struct btrfs_inode *inode,
                         struct fiemap_extent_info *fieinfo,
                         u64 start, u64 len)
{
        const u64 ino = btrfs_ino(inode);
        struct extent_state *cached_state = NULL;
        struct extent_state *delalloc_cached_state = NULL;
        struct btrfs_path *path;
        struct fiemap_cache cache = { 0 };
        struct btrfs_backref_share_check_ctx *backref_ctx;
        u64 last_extent_end = 0;
        u64 prev_extent_end;
        u64 range_start;
        u64 range_end;
        const u64 sectorsize = inode->root->fs_info->sectorsize;
        bool stopped = false;
        int ret;

        cache.entries_size = PAGE_SIZE / sizeof(struct btrfs_fiemap_entry);
        cache.entries = kmalloc_array(cache.entries_size,
                                      sizeof(struct btrfs_fiemap_entry),
                                      GFP_KERNEL);
        backref_ctx = btrfs_alloc_backref_share_check_ctx();
        path = btrfs_alloc_path();
        if (!cache.entries || !backref_ctx || !path) {
                ret = -ENOMEM;
                goto out;
        }
restart:
        range_start = round_down(start, sectorsize);
        range_end = round_up(start + len, sectorsize);
        prev_extent_end = range_start;

        lock_extent(&inode->io_tree, range_start, range_end, &cached_state);

        ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
        if (ret < 0)
                goto out_unlock;
        btrfs_release_path(path);

        path->reada = READA_FORWARD;
        ret = fiemap_search_slot(inode, path, range_start);
        if (ret < 0) {
                goto out_unlock;
        } else if (ret > 0) {
                /*
                 * No file extent item found, but we may have delalloc between
                 * the current offset and i_size. So check for that.
                 */
                ret = 0;
                goto check_eof_delalloc;
        }
        while (prev_extent_end < range_end) {
                struct extent_buffer *leaf = path->nodes[0];
                struct btrfs_file_extent_item *ei;
                struct btrfs_key key;
                u64 extent_end;
                u64 extent_len;
                u64 extent_offset = 0;
                u64 extent_gen;
                u64 disk_bytenr = 0;
                u64 flags = 0;
                int extent_type;
                u8 compression;

                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
                        break;

                extent_end = btrfs_file_extent_end(path);

                /*
                 * The first iteration can leave us at an extent item that ends
                 * before our range's start. Move to the next item.
                 */
                if (extent_end <= range_start)
                        goto next_item;

                backref_ctx->curr_leaf_bytenr = leaf->start;

                /* We have an implicit hole (NO_HOLES feature enabled). */
                if (prev_extent_end < key.offset) {
                        const u64 hole_end = min(key.offset, range_end) - 1;

                        ret = fiemap_process_hole(inode, fieinfo, &cache,
                                                  &delalloc_cached_state,
                                                  backref_ctx, 0, 0, 0,
                                                  prev_extent_end, hole_end);
                        if (ret < 0) {
                                goto out_unlock;
                        } else if (ret > 0) {
                                /* fiemap_fill_next_extent() told us to stop. */
                                stopped = true;
                                break;
                        }

                        /* We've reached the end of the fiemap range, stop. */
                        if (key.offset >= range_end) {
                                stopped = true;
                                break;
                        }
                }
                extent_len = extent_end - key.offset;
                ei = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_file_extent_item);
                compression = btrfs_file_extent_compression(leaf, ei);
                extent_type = btrfs_file_extent_type(leaf, ei);
                extent_gen = btrfs_file_extent_generation(leaf, ei);

                if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
                        disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
                        if (compression == BTRFS_COMPRESS_NONE)
                                extent_offset = btrfs_file_extent_offset(leaf, ei);
                }

                if (compression != BTRFS_COMPRESS_NONE)
                        flags |= FIEMAP_EXTENT_ENCODED;

                if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                        flags |= FIEMAP_EXTENT_DATA_INLINE;
                        flags |= FIEMAP_EXTENT_NOT_ALIGNED;
                        ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0,
                                                 extent_len, flags);
                } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
                        ret = fiemap_process_hole(inode, fieinfo, &cache,
                                                  &delalloc_cached_state,
                                                  backref_ctx,
                                                  disk_bytenr, extent_offset,
                                                  extent_gen, key.offset,
                                                  extent_end - 1);
                } else if (disk_bytenr == 0) {
                        /* We have an explicit hole. */
                        ret = fiemap_process_hole(inode, fieinfo, &cache,
                                                  &delalloc_cached_state,
                                                  backref_ctx, 0, 0, 0,
                                                  key.offset, extent_end - 1);
                } else {
                        /* We have a regular extent. */
                        if (fieinfo->fi_extents_max) {
                                ret = btrfs_is_data_extent_shared(inode,
                                                                  disk_bytenr,
                                                                  extent_gen,
                                                                  backref_ctx);
                                if (ret < 0)
                                        goto out_unlock;
                                else if (ret > 0)
                                        flags |= FIEMAP_EXTENT_SHARED;
                        }

                        ret = emit_fiemap_extent(fieinfo, &cache, key.offset,
                                                 disk_bytenr + extent_offset,
                                                 extent_len, flags);
                }

                if (ret < 0) {
                        goto out_unlock;
                } else if (ret > 0) {
                        /* emit_fiemap_extent() told us to stop. */
                        stopped = true;
                        break;
                }

                prev_extent_end = extent_end;
next_item:
                if (fatal_signal_pending(current)) {
                        ret = -EINTR;
                        goto out_unlock;
                }

                ret = fiemap_next_leaf_item(inode, path);
                if (ret < 0) {
                        goto out_unlock;
                } else if (ret > 0) {
                        /* No more file extent items for this inode. */
                        break;
                }
        }

check_eof_delalloc:
        if (!stopped && prev_extent_end < range_end) {
                ret = fiemap_process_hole(inode, fieinfo, &cache,
                                          &delalloc_cached_state, backref_ctx,
                                          0, 0, 0, prev_extent_end, range_end - 1);
                if (ret < 0)
                        goto out_unlock;
                prev_extent_end = range_end;
        }

        if (cache.cached && cache.offset + cache.len >= last_extent_end) {
                const u64 i_size = i_size_read(&inode->vfs_inode);

                if (prev_extent_end < i_size) {
                        u64 delalloc_start;
                        u64 delalloc_end;
                        bool delalloc;

                        delalloc = btrfs_find_delalloc_in_range(inode,
                                                                prev_extent_end,
                                                                i_size - 1,
                                                                &delalloc_cached_state,
                                                                &delalloc_start,
                                                                &delalloc_end);
                        if (!delalloc)
                                cache.flags |= FIEMAP_EXTENT_LAST;
                } else {
                        cache.flags |= FIEMAP_EXTENT_LAST;
                }
        }
out_unlock:
        unlock_extent(&inode->io_tree, range_start, range_end, &cached_state);

        if (ret == BTRFS_FIEMAP_FLUSH_CACHE) {
                btrfs_release_path(path);
                ret = flush_fiemap_cache(fieinfo, &cache);
                if (ret)
                        goto out;
                len -= cache.next_search_offset - start;
                start = cache.next_search_offset;
                goto restart;
        } else if (ret < 0) {
                goto out;
        }

        /*
         * Must free the path before emitting to the fiemap buffer because we
         * may have a non-cloned leaf and if the fiemap buffer is memory mapped
         * to a file, a write into it (through btrfs_page_mkwrite()) may trigger
         * waiting for an ordered extent that in order to complete needs to
         * modify that leaf, therefore leading to a deadlock.
         */
        btrfs_free_path(path);
        path = NULL;

        ret = flush_fiemap_cache(fieinfo, &cache);
        if (ret)
                goto out;

        ret = emit_last_fiemap_cache(fieinfo, &cache);
out:
        free_extent_state(delalloc_cached_state);
        kfree(cache.entries);
        btrfs_free_backref_share_ctx(backref_ctx);
        btrfs_free_path(path);
        return ret;
}
int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                 u64 start, u64 len)
{
        struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
        int ret;

        ret = fiemap_prep(inode, fieinfo, start, &len, 0);
        if (ret)
                return ret;

        /*
         * fiemap_prep() called filemap_write_and_wait() for the whole possible
         * file range (0 to LLONG_MAX), but that is not enough if we have
         * compression enabled. The first filemap_fdatawrite_range() only kicks
         * off the compression of data (in an async thread) and will return
         * before the compression is done and writeback is started. A second
         * filemap_fdatawrite_range() is needed to wait for the compression to
         * complete and writeback to start. We also need to wait for ordered
         * extents to complete, because our fiemap implementation uses mainly
         * file extent items to list the extents, searching for extent maps
         * only for file ranges with holes or prealloc extents to figure out
         * if we have delalloc in those ranges.
         */
        if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
                ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX);
                if (ret)
                        return ret;
        }

        btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED);

        /*
         * We did an initial flush to avoid holding the inode's lock while
         * triggering writeback and waiting for the completion of IO and ordered
         * extents. Now after we locked the inode we do it again, because it's
         * possible a new write may have happened in between those two steps.
         */
        if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
                ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX);
                if (ret) {
                        btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
                        return ret;
                }
        }

        ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
        btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);

        return ret;
}