fs/btrfs/block-group.c

   1 // SPDX-License-Identifier: GPL-2.0
   2
   3 #include "misc.h"
   4 #include "ctree.h"
   5 #include "block-group.h"
   6 #include "space-info.h"
   7 #include "disk-io.h"
   8 #include "free-space-cache.h"
   9 #include "free-space-tree.h"
  10 #include "disk-io.h"
  11 #include "volumes.h"
  12 #include "transaction.h"
  13 #include "ref-verify.h"
  14 #include "sysfs.h"
  15 #include "tree-log.h"
  16 #include "delalloc-space.h"
  17 #include "discard.h"
  18 #include "raid56.h"
  19
  20 /*
  21  * Return target flags in extended format or 0 if restripe for this chunk_type
  22  * is not in progress
  23  *
  24  * Should be called with balance_lock held
  25  */
  26 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
  27 {
  28         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
  29         u64 target = 0;
  30
  31         if (!bctl)
  32                 return 0;
  33
  34         if (flags & BTRFS_BLOCK_GROUP_DATA &&
  35             bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
  36                 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
  37         } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
  38                    bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
  39                 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
  40         } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
  41                    bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
  42                 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
  43         }
  44
  45         return target;
  46 }
  47
  48 /*
  49  * @flags: available profiles in extended format (see ctree.h)
  50  *
  51  * Return reduced profile in chunk format.  If profile changing is in progress
  52  * (either running or paused) picks the target profile (if it's already
  53  * available), otherwise falls back to plain reducing.
  54  */
  55 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
  56 {
  57         u64 num_devices = fs_info->fs_devices->rw_devices;
  58         u64 target;
  59         u64 raid_type;
  60         u64 allowed = 0;
  61
  62         /*
  63          * See if restripe for this chunk_type is in progress, if so try to
  64          * reduce to the target profile
  65          */
  66         spin_lock(&fs_info->balance_lock);
  67         target = get_restripe_target(fs_info, flags);
  68         if (target) {
  69                 /* Pick target profile only if it's already available */
  70                 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
  71                         spin_unlock(&fs_info->balance_lock);
  72                         return extended_to_chunk(target);
  73                 }
  74         }
  75         spin_unlock(&fs_info->balance_lock);
  76
  77         /* First, mask out the RAID levels which aren't possible */
  78         for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
  79                 if (num_devices >= btrfs_raid_array[raid_type].devs_min)
  80                         allowed |= btrfs_raid_array[raid_type].bg_flag;
  81         }
  82         allowed &= flags;
  83
  84         if (allowed & BTRFS_BLOCK_GROUP_RAID6)
  85                 allowed = BTRFS_BLOCK_GROUP_RAID6;
  86         else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
  87                 allowed = BTRFS_BLOCK_GROUP_RAID5;
  88         else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
  89                 allowed = BTRFS_BLOCK_GROUP_RAID10;
  90         else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
  91                 allowed = BTRFS_BLOCK_GROUP_RAID1;
  92         else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
  93                 allowed = BTRFS_BLOCK_GROUP_RAID0;
  94
  95         flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
  96
  97         return extended_to_chunk(flags | allowed);
  98 }
  99
 100 u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
 101 {
 102         unsigned seq;
 103         u64 flags;
 104
 105         do {
 106                 flags = orig_flags;
 107                 seq = read_seqbegin(&fs_info->profiles_lock);
 108
 109                 if (flags & BTRFS_BLOCK_GROUP_DATA)
 110                         flags |= fs_info->avail_data_alloc_bits;
 111                 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
 112                         flags |= fs_info->avail_system_alloc_bits;
 113                 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
 114                         flags |= fs_info->avail_metadata_alloc_bits;
 115         } while (read_seqretry(&fs_info->profiles_lock, seq));
 116
 117         return btrfs_reduce_alloc_profile(fs_info, flags);
 118 }
 119
 120 void btrfs_get_block_group(struct btrfs_block_group *cache)
 121 {
 122         atomic_inc(&cache->count);
 123 }
 124
 125 void btrfs_put_block_group(struct btrfs_block_group *cache)
 126 {
 127         if (atomic_dec_and_test(&cache->count)) {
 128                 WARN_ON(cache->pinned > 0);
 129                 WARN_ON(cache->reserved > 0);
 130
 131                 /*
 132                  * A block_group shouldn't be on the discard_list anymore.
 133                  * Remove the block_group from the discard_list to prevent us
 134                  * from causing a panic due to NULL pointer dereference.
 135                  */
 136                 if (WARN_ON(!list_empty(&cache->discard_list)))
 137                         btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
 138                                                   cache);
 139
 140                 /*
 141                  * If not empty, someone is still holding mutex of
 142                  * full_stripe_lock, which can only be released by caller.
 143                  * And it will definitely cause use-after-free when caller
 144                  * tries to release full stripe lock.
 145                  *
 146                  * No better way to resolve, but only to warn.
 147                  */
 148                 WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
 149                 kfree(cache->free_space_ctl);
 150                 kfree(cache);
 151         }
 152 }
 153
 154 /*
 155  * This adds the block group to the fs_info rb tree for the block group cache
 156  */
 157 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
 158                                        struct btrfs_block_group *block_group)
 159 {
 160         struct rb_node **p;
 161         struct rb_node *parent = NULL;
 162         struct btrfs_block_group *cache;
 163
 164         spin_lock(&info->block_group_cache_lock);
 165         p = &info->block_group_cache_tree.rb_node;
 166
 167         while (*p) {
 168                 parent = *p;
 169                 cache = rb_entry(parent, struct btrfs_block_group, cache_node);
 170                 if (block_group->start < cache->start) {
 171                         p = &(*p)->rb_left;
 172                 } else if (block_group->start > cache->start) {
 173                         p = &(*p)->rb_right;
 174                 } else {
 175                         spin_unlock(&info->block_group_cache_lock);
 176                         return -EEXIST;
 177                 }
 178         }
 179
 180         rb_link_node(&block_group->cache_node, parent, p);
 181         rb_insert_color(&block_group->cache_node,
 182                         &info->block_group_cache_tree);
 183
 184         if (info->first_logical_byte > block_group->start)
 185                 info->first_logical_byte = block_group->start;
 186
 187         spin_unlock(&info->block_group_cache_lock);
 188
 189         return 0;
 190 }
 191
 192 /*
 193  * This will return the block group at or after bytenr if contains is 0, else
 194  * it will return the block group that contains the bytenr
 195  */
 196 static struct btrfs_block_group *block_group_cache_tree_search(
 197                 struct btrfs_fs_info *info, u64 bytenr, int contains)
 198 {
 199         struct btrfs_block_group *cache, *ret = NULL;
 200         struct rb_node *n;
 201         u64 end, start;
 202
 203         spin_lock(&info->block_group_cache_lock);
 204         n = info->block_group_cache_tree.rb_node;
 205
 206         while (n) {
 207                 cache = rb_entry(n, struct btrfs_block_group, cache_node);
 208                 end = cache->start + cache->length - 1;
 209                 start = cache->start;
 210
 211                 if (bytenr < start) {
 212                         if (!contains && (!ret || start < ret->start))
 213                                 ret = cache;
 214                         n = n->rb_left;
 215                 } else if (bytenr > start) {
 216                         if (contains && bytenr <= end) {
 217                                 ret = cache;
 218                                 break;
 219                         }
 220                         n = n->rb_right;
 221                 } else {
 222                         ret = cache;
 223                         break;
 224                 }
 225         }
 226         if (ret) {
 227                 btrfs_get_block_group(ret);
 228                 if (bytenr == 0 && info->first_logical_byte > ret->start)
 229                         info->first_logical_byte = ret->start;
 230         }
 231         spin_unlock(&info->block_group_cache_lock);
 232
 233         return ret;
 234 }
 235
 236 /*
 237  * Return the block group that starts at or after bytenr
 238  */
 239 struct btrfs_block_group *btrfs_lookup_first_block_group(
 240                 struct btrfs_fs_info *info, u64 bytenr)
 241 {
 242         return block_group_cache_tree_search(info, bytenr, 0);
 243 }
 244
 245 /*
 246  * Return the block group that contains the given bytenr
 247  */
 248 struct btrfs_block_group *btrfs_lookup_block_group(
 249                 struct btrfs_fs_info *info, u64 bytenr)
 250 {
 251         return block_group_cache_tree_search(info, bytenr, 1);
 252 }
 253
 254 struct btrfs_block_group *btrfs_next_block_group(
 255                 struct btrfs_block_group *cache)
 256 {
 257         struct btrfs_fs_info *fs_info = cache->fs_info;
 258         struct rb_node *node;
 259
 260         spin_lock(&fs_info->block_group_cache_lock);
 261
 262         /* If our block group was removed, we need a full search. */
 263         if (RB_EMPTY_NODE(&cache->cache_node)) {
 264                 const u64 next_bytenr = cache->start + cache->length;
 265
 266                 spin_unlock(&fs_info->block_group_cache_lock);
 267                 btrfs_put_block_group(cache);
 268                 cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache;
 269         }
 270         node = rb_next(&cache->cache_node);
 271         btrfs_put_block_group(cache);
 272         if (node) {
 273                 cache = rb_entry(node, struct btrfs_block_group, cache_node);
 274                 btrfs_get_block_group(cache);
 275         } else
 276                 cache = NULL;
 277         spin_unlock(&fs_info->block_group_cache_lock);
 278         return cache;
 279 }
 280
 281 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
 282 {
 283         struct btrfs_block_group *bg;
 284         bool ret = true;
 285
 286         bg = btrfs_lookup_block_group(fs_info, bytenr);
 287         if (!bg)
 288                 return false;
 289
 290         spin_lock(&bg->lock);
 291         if (bg->ro)
 292                 ret = false;
 293         else
 294                 atomic_inc(&bg->nocow_writers);
 295         spin_unlock(&bg->lock);
 296
 297         /* No put on block group, done by btrfs_dec_nocow_writers */
 298         if (!ret)
 299                 btrfs_put_block_group(bg);
 300
 301         return ret;
 302 }
 303
 304 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
 305 {
 306         struct btrfs_block_group *bg;
 307
 308         bg = btrfs_lookup_block_group(fs_info, bytenr);
 309         ASSERT(bg);
 310         if (atomic_dec_and_test(&bg->nocow_writers))
 311                 wake_up_var(&bg->nocow_writers);
 312         /*
 313          * Once for our lookup and once for the lookup done by a previous call
 314          * to btrfs_inc_nocow_writers()
 315          */
 316         btrfs_put_block_group(bg);
 317         btrfs_put_block_group(bg);
 318 }
 319
 320 void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
 321 {
 322         wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
 323 }
 324
 325 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
 326                                         const u64 start)
 327 {
 328         struct btrfs_block_group *bg;
 329
 330         bg = btrfs_lookup_block_group(fs_info, start);
 331         ASSERT(bg);
 332         if (atomic_dec_and_test(&bg->reservations))
 333                 wake_up_var(&bg->reservations);
 334         btrfs_put_block_group(bg);
 335 }
 336
 337 void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
 338 {
 339         struct btrfs_space_info *space_info = bg->space_info;
 340
 341         ASSERT(bg->ro);
 342
 343         if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
 344                 return;
 345
 346         /*
 347          * Our block group is read only but before we set it to read only,
 348          * some task might have had allocated an extent from it already, but it
 349          * has not yet created a respective ordered extent (and added it to a
 350          * root's list of ordered extents).
 351          * Therefore wait for any task currently allocating extents, since the
 352          * block group's reservations counter is incremented while a read lock
 353          * on the groups' semaphore is held and decremented after releasing
 354          * the read access on that semaphore and creating the ordered extent.
 355          */
 356         down_write(&space_info->groups_sem);
 357         up_write(&space_info->groups_sem);
 358
 359         wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
 360 }
 361
 362 struct btrfs_caching_control *btrfs_get_caching_control(
 363                 struct btrfs_block_group *cache)
 364 {
 365         struct btrfs_caching_control *ctl;
 366
 367         spin_lock(&cache->lock);
 368         if (!cache->caching_ctl) {
 369                 spin_unlock(&cache->lock);
 370                 return NULL;
 371         }
 372
 373         ctl = cache->caching_ctl;
 374         refcount_inc(&ctl->count);
 375         spin_unlock(&cache->lock);
 376         return ctl;
 377 }
 378
 379 void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
 380 {
 381         if (refcount_dec_and_test(&ctl->count))
 382                 kfree(ctl);
 383 }
 384
 385 /*
 386  * When we wait for progress in the block group caching, its because our
 387  * allocation attempt failed at least once.  So, we must sleep and let some
 388  * progress happen before we try again.
 389  *
 390  * This function will sleep at least once waiting for new free space to show
 391  * up, and then it will check the block group free space numbers for our min
 392  * num_bytes.  Another option is to have it go ahead and look in the rbtree for
 393  * a free extent of a given size, but this is a good start.
 394  *
 395  * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
 396  * any of the information in this block group.
 397  */
 398 void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
 399                                            u64 num_bytes)
 400 {
 401         struct btrfs_caching_control *caching_ctl;
 402
 403         caching_ctl = btrfs_get_caching_control(cache);
 404         if (!caching_ctl)
 405                 return;
 406
 407         wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
 408                    (cache->free_space_ctl->free_space >= num_bytes));
 409
 410         btrfs_put_caching_control(caching_ctl);
 411 }
 412
 413 int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
 414 {
 415         struct btrfs_caching_control *caching_ctl;
 416         int ret = 0;
 417
 418         caching_ctl = btrfs_get_caching_control(cache);
 419         if (!caching_ctl)
 420                 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
 421
 422         wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
 423         if (cache->cached == BTRFS_CACHE_ERROR)
 424                 ret = -EIO;
 425         btrfs_put_caching_control(caching_ctl);
 426         return ret;
 427 }
 428
 429 #ifdef CONFIG_BTRFS_DEBUG
 430 static void fragment_free_space(struct btrfs_block_group *block_group)
 431 {
 432         struct btrfs_fs_info *fs_info = block_group->fs_info;
 433         u64 start = block_group->start;
 434         u64 len = block_group->length;
 435         u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
 436                 fs_info->nodesize : fs_info->sectorsize;
 437         u64 step = chunk << 1;
 438
 439         while (len > chunk) {
 440                 btrfs_remove_free_space(block_group, start, chunk);
 441                 start += step;
 442                 if (len < step)
 443                         len = 0;
 444                 else
 445                         len -= step;
 446         }
 447 }
 448 #endif
 449
 450 /*
 451  * This is only called by btrfs_cache_block_group, since we could have freed
 452  * extents we need to check the pinned_extents for any extents that can't be
 453  * used yet since their free space will be released as soon as the transaction
 454  * commits.
 455  */
 456 u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end)
 457 {
 458         struct btrfs_fs_info *info = block_group->fs_info;
 459         u64 extent_start, extent_end, size, total_added = 0;
 460         int ret;
 461
 462         while (start < end) {
 463                 ret = find_first_extent_bit(&info->excluded_extents, start,
 464                                             &extent_start, &extent_end,
 465                                             EXTENT_DIRTY | EXTENT_UPTODATE,
 466                                             NULL);
 467                 if (ret)
 468                         break;
 469
 470                 if (extent_start <= start) {
 471                         start = extent_end + 1;
 472                 } else if (extent_start > start && extent_start < end) {
 473                         size = extent_start - start;
 474                         total_added += size;
 475                         ret = btrfs_add_free_space_async_trimmed(block_group,
 476                                                                  start, size);
 477                         BUG_ON(ret); /* -ENOMEM or logic error */
 478                         start = extent_end + 1;
 479                 } else {
 480                         break;
 481                 }
 482         }
 483
 484         if (start < end) {
 485                 size = end - start;
 486                 total_added += size;
 487                 ret = btrfs_add_free_space_async_trimmed(block_group, start,
 488                                                          size);
 489                 BUG_ON(ret); /* -ENOMEM or logic error */
 490         }
 491
 492         return total_added;
 493 }
 494
 495 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
 496 {
 497         struct btrfs_block_group *block_group = caching_ctl->block_group;
 498         struct btrfs_fs_info *fs_info = block_group->fs_info;
 499         struct btrfs_root *extent_root = fs_info->extent_root;
 500         struct btrfs_path *path;
 501         struct extent_buffer *leaf;
 502         struct btrfs_key key;
 503         u64 total_found = 0;
 504         u64 last = 0;
 505         u32 nritems;
 506         int ret;
 507         bool wakeup = true;
 508
 509         path = btrfs_alloc_path();
 510         if (!path)
 511                 return -ENOMEM;
 512
 513         last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
 514
 515 #ifdef CONFIG_BTRFS_DEBUG
 516         /*
 517          * If we're fragmenting we don't want to make anybody think we can
 518          * allocate from this block group until we've had a chance to fragment
 519          * the free space.
 520          */
 521         if (btrfs_should_fragment_free_space(block_group))
 522                 wakeup = false;
 523 #endif
 524         /*
 525          * We don't want to deadlock with somebody trying to allocate a new
 526          * extent for the extent root while also trying to search the extent
 527          * root to add free space.  So we skip locking and search the commit
 528          * root, since its read-only
 529          */
 530         path->skip_locking = 1;
 531         path->search_commit_root = 1;
 532         path->reada = READA_FORWARD;
 533
 534         key.objectid = last;
 535         key.offset = 0;
 536         key.type = BTRFS_EXTENT_ITEM_KEY;
 537
 538 next:
 539         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 540         if (ret < 0)
 541                 goto out;
 542
 543         leaf = path->nodes[0];
 544         nritems = btrfs_header_nritems(leaf);
 545
 546         while (1) {
 547                 if (btrfs_fs_closing(fs_info) > 1) {
 548                         last = (u64)-1;
 549                         break;
 550                 }
 551
 552                 if (path->slots[0] < nritems) {
 553                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 554                 } else {
 555                         ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
 556                         if (ret)
 557                                 break;
 558
 559                         if (need_resched() ||
 560                             rwsem_is_contended(&fs_info->commit_root_sem)) {
 561                                 if (wakeup)
 562                                         caching_ctl->progress = last;
 563                                 btrfs_release_path(path);
 564                                 up_read(&fs_info->commit_root_sem);
 565                                 mutex_unlock(&caching_ctl->mutex);
 566                                 cond_resched();
 567                                 mutex_lock(&caching_ctl->mutex);
 568                                 down_read(&fs_info->commit_root_sem);
 569                                 goto next;
 570                         }
 571
 572                         ret = btrfs_next_leaf(extent_root, path);
 573                         if (ret < 0)
 574                                 goto out;
 575                         if (ret)
 576                                 break;
 577                         leaf = path->nodes[0];
 578                         nritems = btrfs_header_nritems(leaf);
 579                         continue;
 580                 }
 581
 582                 if (key.objectid < last) {
 583                         key.objectid = last;
 584                         key.offset = 0;
 585                         key.type = BTRFS_EXTENT_ITEM_KEY;
 586
 587                         if (wakeup)
 588                                 caching_ctl->progress = last;
 589                         btrfs_release_path(path);
 590                         goto next;
 591                 }
 592
 593                 if (key.objectid < block_group->start) {
 594                         path->slots[0]++;
 595                         continue;
 596                 }
 597
 598                 if (key.objectid >= block_group->start + block_group->length)
 599                         break;
 600
 601                 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
 602                     key.type == BTRFS_METADATA_ITEM_KEY) {
 603                         total_found += add_new_free_space(block_group, last,
 604                                                           key.objectid);
 605                         if (key.type == BTRFS_METADATA_ITEM_KEY)
 606                                 last = key.objectid +
 607                                         fs_info->nodesize;
 608                         else
 609                                 last = key.objectid + key.offset;
 610
 611                         if (total_found > CACHING_CTL_WAKE_UP) {
 612                                 total_found = 0;
 613                                 if (wakeup)
 614                                         wake_up(&caching_ctl->wait);
 615                         }
 616                 }
 617                 path->slots[0]++;
 618         }
 619         ret = 0;
 620
 621         total_found += add_new_free_space(block_group, last,
 622                                 block_group->start + block_group->length);
 623         caching_ctl->progress = (u64)-1;
 624
 625 out:
 626         btrfs_free_path(path);
 627         return ret;
 628 }
 629
 630 static noinline void caching_thread(struct btrfs_work *work)
 631 {
 632         struct btrfs_block_group *block_group;
 633         struct btrfs_fs_info *fs_info;
 634         struct btrfs_caching_control *caching_ctl;
 635         int ret;
 636
 637         caching_ctl = container_of(work, struct btrfs_caching_control, work);
 638         block_group = caching_ctl->block_group;
 639         fs_info = block_group->fs_info;
 640
 641         mutex_lock(&caching_ctl->mutex);
 642         down_read(&fs_info->commit_root_sem);
 643
 644         if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
 645                 ret = load_free_space_tree(caching_ctl);
 646         else
 647                 ret = load_extent_tree_free(caching_ctl);
 648
 649         spin_lock(&block_group->lock);
 650         block_group->caching_ctl = NULL;
 651         block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
 652         spin_unlock(&block_group->lock);
 653
 654 #ifdef CONFIG_BTRFS_DEBUG
 655         if (btrfs_should_fragment_free_space(block_group)) {
 656                 u64 bytes_used;
 657
 658                 spin_lock(&block_group->space_info->lock);
 659                 spin_lock(&block_group->lock);
 660                 bytes_used = block_group->length - block_group->used;
 661                 block_group->space_info->bytes_used += bytes_used >> 1;
 662                 spin_unlock(&block_group->lock);
 663                 spin_unlock(&block_group->space_info->lock);
 664                 fragment_free_space(block_group);
 665         }
 666 #endif
 667
 668         caching_ctl->progress = (u64)-1;
 669
 670         up_read(&fs_info->commit_root_sem);
 671         btrfs_free_excluded_extents(block_group);
 672         mutex_unlock(&caching_ctl->mutex);
 673
 674         wake_up(&caching_ctl->wait);
 675
 676         btrfs_put_caching_control(caching_ctl);
 677         btrfs_put_block_group(block_group);
 678 }
 679
 680 int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only)
 681 {
 682         DEFINE_WAIT(wait);
 683         struct btrfs_fs_info *fs_info = cache->fs_info;
 684         struct btrfs_caching_control *caching_ctl;
 685         int ret = 0;
 686
 687         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
 688         if (!caching_ctl)
 689                 return -ENOMEM;
 690
 691         INIT_LIST_HEAD(&caching_ctl->list);
 692         mutex_init(&caching_ctl->mutex);
 693         init_waitqueue_head(&caching_ctl->wait);
 694         caching_ctl->block_group = cache;
 695         caching_ctl->progress = cache->start;
 696         refcount_set(&caching_ctl->count, 1);
 697         btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
 698
 699         spin_lock(&cache->lock);
 700         /*
 701          * This should be a rare occasion, but this could happen I think in the
 702          * case where one thread starts to load the space cache info, and then
 703          * some other thread starts a transaction commit which tries to do an
 704          * allocation while the other thread is still loading the space cache
 705          * info.  The previous loop should have kept us from choosing this block
 706          * group, but if we've moved to the state where we will wait on caching
 707          * block groups we need to first check if we're doing a fast load here,
 708          * so we can wait for it to finish, otherwise we could end up allocating
 709          * from a block group who's cache gets evicted for one reason or
 710          * another.
 711          */
 712         while (cache->cached == BTRFS_CACHE_FAST) {
 713                 struct btrfs_caching_control *ctl;
 714
 715                 ctl = cache->caching_ctl;
 716                 refcount_inc(&ctl->count);
 717                 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
 718                 spin_unlock(&cache->lock);
 719
 720                 schedule();
 721
 722                 finish_wait(&ctl->wait, &wait);
 723                 btrfs_put_caching_control(ctl);
 724                 spin_lock(&cache->lock);
 725         }
 726
 727         if (cache->cached != BTRFS_CACHE_NO) {
 728                 spin_unlock(&cache->lock);
 729                 kfree(caching_ctl);
 730                 return 0;
 731         }
 732         WARN_ON(cache->caching_ctl);
 733         cache->caching_ctl = caching_ctl;
 734         cache->cached = BTRFS_CACHE_FAST;
 735         spin_unlock(&cache->lock);
 736
 737         if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
 738                 mutex_lock(&caching_ctl->mutex);
 739                 ret = load_free_space_cache(cache);
 740
 741                 spin_lock(&cache->lock);
 742                 if (ret == 1) {
 743                         cache->caching_ctl = NULL;
 744                         cache->cached = BTRFS_CACHE_FINISHED;
 745                         cache->last_byte_to_unpin = (u64)-1;
 746                         caching_ctl->progress = (u64)-1;
 747                 } else {
 748                         if (load_cache_only) {
 749                                 cache->caching_ctl = NULL;
 750                                 cache->cached = BTRFS_CACHE_NO;
 751                         } else {
 752                                 cache->cached = BTRFS_CACHE_STARTED;
 753                                 cache->has_caching_ctl = 1;
 754                         }
 755                 }
 756                 spin_unlock(&cache->lock);
 757 #ifdef CONFIG_BTRFS_DEBUG
 758                 if (ret == 1 &&
 759                     btrfs_should_fragment_free_space(cache)) {
 760                         u64 bytes_used;
 761
 762                         spin_lock(&cache->space_info->lock);
 763                         spin_lock(&cache->lock);
 764                         bytes_used = cache->length - cache->used;
 765                         cache->space_info->bytes_used += bytes_used >> 1;
 766                         spin_unlock(&cache->lock);
 767                         spin_unlock(&cache->space_info->lock);
 768                         fragment_free_space(cache);
 769                 }
 770 #endif
 771                 mutex_unlock(&caching_ctl->mutex);
 772
 773                 wake_up(&caching_ctl->wait);
 774                 if (ret == 1) {
 775                         btrfs_put_caching_control(caching_ctl);
 776                         btrfs_free_excluded_extents(cache);
 777                         return 0;
 778                 }
 779         } else {
 780                 /*
 781                  * We're either using the free space tree or no caching at all.
 782                  * Set cached to the appropriate value and wakeup any waiters.
 783                  */
 784                 spin_lock(&cache->lock);
 785                 if (load_cache_only) {
 786                         cache->caching_ctl = NULL;
 787                         cache->cached = BTRFS_CACHE_NO;
 788                 } else {
 789                         cache->cached = BTRFS_CACHE_STARTED;
 790                         cache->has_caching_ctl = 1;
 791                 }
 792                 spin_unlock(&cache->lock);
 793                 wake_up(&caching_ctl->wait);
 794         }
 795
 796         if (load_cache_only) {
 797                 btrfs_put_caching_control(caching_ctl);
 798                 return 0;
 799         }
 800
 801         down_write(&fs_info->commit_root_sem);
 802         refcount_inc(&caching_ctl->count);
 803         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
 804         up_write(&fs_info->commit_root_sem);
 805
 806         btrfs_get_block_group(cache);
 807
 808         btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
 809
 810         return ret;
 811 }
 812
 813 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 814 {
 815         u64 extra_flags = chunk_to_extended(flags) &
 816                                 BTRFS_EXTENDED_PROFILE_MASK;
 817
 818         write_seqlock(&fs_info->profiles_lock);
 819         if (flags & BTRFS_BLOCK_GROUP_DATA)
 820                 fs_info->avail_data_alloc_bits &= ~extra_flags;
 821         if (flags & BTRFS_BLOCK_GROUP_METADATA)
 822                 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
 823         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
 824                 fs_info->avail_system_alloc_bits &= ~extra_flags;
 825         write_sequnlock(&fs_info->profiles_lock);
 826 }
 827
 828 /*
 829  * Clear incompat bits for the following feature(s):
 830  *
 831  * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
 832  *            in the whole filesystem
 833  *
 834  * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
 835  */
 836 static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
 837 {
 838         bool found_raid56 = false;
 839         bool found_raid1c34 = false;
 840
 841         if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) ||
 842             (flags & BTRFS_BLOCK_GROUP_RAID1C3) ||
 843             (flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
 844                 struct list_head *head = &fs_info->space_info;
 845                 struct btrfs_space_info *sinfo;
 846
 847                 list_for_each_entry_rcu(sinfo, head, list) {
 848                         down_read(&sinfo->groups_sem);
 849                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
 850                                 found_raid56 = true;
 851                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
 852                                 found_raid56 = true;
 853                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]))
 854                                 found_raid1c34 = true;
 855                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
 856                                 found_raid1c34 = true;
 857                         up_read(&sinfo->groups_sem);
 858                 }
 859                 if (!found_raid56)
 860                         btrfs_clear_fs_incompat(fs_info, RAID56);
 861                 if (!found_raid1c34)
 862                         btrfs_clear_fs_incompat(fs_info, RAID1C34);
 863         }
 864 }
 865
 866 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 867                              u64 group_start, struct extent_map *em)
 868 {
 869         struct btrfs_fs_info *fs_info = trans->fs_info;
 870         struct btrfs_root *root = fs_info->extent_root;
 871         struct btrfs_path *path;
 872         struct btrfs_block_group *block_group;
 873         struct btrfs_free_cluster *cluster;
 874         struct btrfs_root *tree_root = fs_info->tree_root;
 875         struct btrfs_key key;
 876         struct inode *inode;
 877         struct kobject *kobj = NULL;
 878         int ret;
 879         int index;
 880         int factor;
 881         struct btrfs_caching_control *caching_ctl = NULL;
 882         bool remove_em;
 883         bool remove_rsv = false;
 884
 885         block_group = btrfs_lookup_block_group(fs_info, group_start);
 886         BUG_ON(!block_group);
 887         BUG_ON(!block_group->ro);
 888
 889         trace_btrfs_remove_block_group(block_group);
 890         /*
 891          * Free the reserved super bytes from this block group before
 892          * remove it.
 893          */
 894         btrfs_free_excluded_extents(block_group);
 895         btrfs_free_ref_tree_range(fs_info, block_group->start,
 896                                   block_group->length);
 897
 898         index = btrfs_bg_flags_to_raid_index(block_group->flags);
 899         factor = btrfs_bg_type_to_factor(block_group->flags);
 900
 901         /* make sure this block group isn't part of an allocation cluster */
 902         cluster = &fs_info->data_alloc_cluster;
 903         spin_lock(&cluster->refill_lock);
 904         btrfs_return_cluster_to_free_space(block_group, cluster);
 905         spin_unlock(&cluster->refill_lock);
 906
 907         /*
 908          * make sure this block group isn't part of a metadata
 909          * allocation cluster
 910          */
 911         cluster = &fs_info->meta_alloc_cluster;
 912         spin_lock(&cluster->refill_lock);
 913         btrfs_return_cluster_to_free_space(block_group, cluster);
 914         spin_unlock(&cluster->refill_lock);
 915
 916         path = btrfs_alloc_path();
 917         if (!path) {
 918                 ret = -ENOMEM;
 919                 goto out;
 920         }
 921
 922         /*
 923          * get the inode first so any iput calls done for the io_list
 924          * aren't the final iput (no unlinks allowed now)
 925          */
 926         inode = lookup_free_space_inode(block_group, path);
 927
 928         mutex_lock(&trans->transaction->cache_write_mutex);
 929         /*
 930          * Make sure our free space cache IO is done before removing the
 931          * free space inode
 932          */
 933         spin_lock(&trans->transaction->dirty_bgs_lock);
 934         if (!list_empty(&block_group->io_list)) {
 935                 list_del_init(&block_group->io_list);
 936
 937                 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
 938
 939                 spin_unlock(&trans->transaction->dirty_bgs_lock);
 940                 btrfs_wait_cache_io(trans, block_group, path);
 941                 btrfs_put_block_group(block_group);
 942                 spin_lock(&trans->transaction->dirty_bgs_lock);
 943         }
 944
 945         if (!list_empty(&block_group->dirty_list)) {
 946                 list_del_init(&block_group->dirty_list);
 947                 remove_rsv = true;
 948                 btrfs_put_block_group(block_group);
 949         }
 950         spin_unlock(&trans->transaction->dirty_bgs_lock);
 951         mutex_unlock(&trans->transaction->cache_write_mutex);
 952
 953         if (!IS_ERR(inode)) {
 954                 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
 955                 if (ret) {
 956                         btrfs_add_delayed_iput(inode);
 957                         goto out;
 958                 }
 959                 clear_nlink(inode);
 960                 /* One for the block groups ref */
 961                 spin_lock(&block_group->lock);
 962                 if (block_group->iref) {
 963                         block_group->iref = 0;
 964                         block_group->inode = NULL;
 965                         spin_unlock(&block_group->lock);
 966                         iput(inode);
 967                 } else {
 968                         spin_unlock(&block_group->lock);
 969                 }
 970                 /* One for our lookup ref */
 971                 btrfs_add_delayed_iput(inode);
 972         }
 973
 974         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
 975         key.type = 0;
 976         key.offset = block_group->start;
 977
 978         ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
 979         if (ret < 0)
 980                 goto out;
 981         if (ret > 0)
 982                 btrfs_release_path(path);
 983         if (ret == 0) {
 984                 ret = btrfs_del_item(trans, tree_root, path);
 985                 if (ret)
 986                         goto out;
 987                 btrfs_release_path(path);
 988         }
 989
 990         spin_lock(&fs_info->block_group_cache_lock);
 991         rb_erase(&block_group->cache_node,
 992                  &fs_info->block_group_cache_tree);
 993         RB_CLEAR_NODE(&block_group->cache_node);
 994
 995         /* Once for the block groups rbtree */
 996         btrfs_put_block_group(block_group);
 997
 998         if (fs_info->first_logical_byte == block_group->start)
 999                 fs_info->first_logical_byte = (u64)-1;
1000         spin_unlock(&fs_info->block_group_cache_lock);
1001
1002         down_write(&block_group->space_info->groups_sem);
1003         /*
1004          * we must use list_del_init so people can check to see if they
1005          * are still on the list after taking the semaphore
1006          */
1007         list_del_init(&block_group->list);
1008         if (list_empty(&block_group->space_info->block_groups[index])) {
1009                 kobj = block_group->space_info->block_group_kobjs[index];
1010                 block_group->space_info->block_group_kobjs[index] = NULL;
1011                 clear_avail_alloc_bits(fs_info, block_group->flags);
1012         }
1013         up_write(&block_group->space_info->groups_sem);
1014         clear_incompat_bg_bits(fs_info, block_group->flags);
1015         if (kobj) {
1016                 kobject_del(kobj);
1017                 kobject_put(kobj);
1018         }
1019
1020         if (block_group->has_caching_ctl)
1021                 caching_ctl = btrfs_get_caching_control(block_group);
1022         if (block_group->cached == BTRFS_CACHE_STARTED)
1023                 btrfs_wait_block_group_cache_done(block_group);
1024         if (block_group->has_caching_ctl) {
1025                 down_write(&fs_info->commit_root_sem);
1026                 if (!caching_ctl) {
1027                         struct btrfs_caching_control *ctl;
1028
1029                         list_for_each_entry(ctl,
1030                                     &fs_info->caching_block_groups, list)
1031                                 if (ctl->block_group == block_group) {
1032                                         caching_ctl = ctl;
1033                                         refcount_inc(&caching_ctl->count);
1034                                         break;
1035                                 }
1036                 }
1037                 if (caching_ctl)
1038                         list_del_init(&caching_ctl->list);
1039                 up_write(&fs_info->commit_root_sem);
1040                 if (caching_ctl) {
1041                         /* Once for the caching bgs list and once for us. */
1042                         btrfs_put_caching_control(caching_ctl);
1043                         btrfs_put_caching_control(caching_ctl);
1044                 }
1045         }
1046
1047         spin_lock(&trans->transaction->dirty_bgs_lock);
1048         WARN_ON(!list_empty(&block_group->dirty_list));
1049         WARN_ON(!list_empty(&block_group->io_list));
1050         spin_unlock(&trans->transaction->dirty_bgs_lock);
1051
1052         btrfs_remove_free_space_cache(block_group);
1053
1054         spin_lock(&block_group->space_info->lock);
1055         list_del_init(&block_group->ro_list);
1056
1057         if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
1058                 WARN_ON(block_group->space_info->total_bytes
1059                         < block_group->length);
1060                 WARN_ON(block_group->space_info->bytes_readonly
1061                         < block_group->length);
1062                 WARN_ON(block_group->space_info->disk_total
1063                         < block_group->length * factor);
1064         }
1065         block_group->space_info->total_bytes -= block_group->length;
1066         block_group->space_info->bytes_readonly -= block_group->length;
1067         block_group->space_info->disk_total -= block_group->length * factor;
1068
1069         spin_unlock(&block_group->space_info->lock);
1070
1071         key.objectid = block_group->start;
1072         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
1073         key.offset = block_group->length;
1074
1075         mutex_lock(&fs_info->chunk_mutex);
1076         spin_lock(&block_group->lock);
1077         block_group->removed = 1;
1078         /*
1079          * At this point trimming can't start on this block group, because we
1080          * removed the block group from the tree fs_info->block_group_cache_tree
1081          * so no one can't find it anymore and even if someone already got this
1082          * block group before we removed it from the rbtree, they have already
1083          * incremented block_group->trimming - if they didn't, they won't find
1084          * any free space entries because we already removed them all when we
1085          * called btrfs_remove_free_space_cache().
1086          *
1087          * And we must not remove the extent map from the fs_info->mapping_tree
1088          * to prevent the same logical address range and physical device space
1089          * ranges from being reused for a new block group. This is because our
1090          * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
1091          * completely transactionless, so while it is trimming a range the
1092          * currently running transaction might finish and a new one start,
1093          * allowing for new block groups to be created that can reuse the same
1094          * physical device locations unless we take this special care.
1095          *
1096          * There may also be an implicit trim operation if the file system
1097          * is mounted with -odiscard. The same protections must remain
1098          * in place until the extents have been discarded completely when
1099          * the transaction commit has completed.
1100          */
1101         remove_em = (atomic_read(&block_group->trimming) == 0);
1102         spin_unlock(&block_group->lock);
1103
1104         mutex_unlock(&fs_info->chunk_mutex);
1105
1106         ret = remove_block_group_free_space(trans, block_group);
1107         if (ret)
1108                 goto out;
1109
1110         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1111         if (ret > 0)
1112                 ret = -EIO;
1113         if (ret < 0)
1114                 goto out;
1115
1116         ret = btrfs_del_item(trans, root, path);
1117         if (ret)
1118                 goto out;
1119
1120         if (remove_em) {
1121                 struct extent_map_tree *em_tree;
1122
1123                 em_tree = &fs_info->mapping_tree;
1124                 write_lock(&em_tree->lock);
1125                 remove_extent_mapping(em_tree, em);
1126                 write_unlock(&em_tree->lock);
1127                 /* once for the tree */
1128                 free_extent_map(em);
1129         }
1130
1131 out:
1132         /* Once for the lookup reference */
1133         btrfs_put_block_group(block_group);
1134         if (remove_rsv)
1135                 btrfs_delayed_refs_rsv_release(fs_info, 1);
1136         btrfs_free_path(path);
1137         return ret;
1138 }
1139
1140 struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
1141                 struct btrfs_fs_info *fs_info, const u64 chunk_offset)
1142 {
1143         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
1144         struct extent_map *em;
1145         struct map_lookup *map;
1146         unsigned int num_items;
1147
1148         read_lock(&em_tree->lock);
1149         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1150         read_unlock(&em_tree->lock);
1151         ASSERT(em && em->start == chunk_offset);
1152
1153         /*
1154          * We need to reserve 3 + N units from the metadata space info in order
1155          * to remove a block group (done at btrfs_remove_chunk() and at
1156          * btrfs_remove_block_group()), which are used for:
1157          *
1158          * 1 unit for adding the free space inode's orphan (located in the tree
1159          * of tree roots).
1160          * 1 unit for deleting the block group item (located in the extent
1161          * tree).
1162          * 1 unit for deleting the free space item (located in tree of tree
1163          * roots).
1164          * N units for deleting N device extent items corresponding to each
1165          * stripe (located in the device tree).
1166          *
1167          * In order to remove a block group we also need to reserve units in the
1168          * system space info in order to update the chunk tree (update one or
1169          * more device items and remove one chunk item), but this is done at
1170          * btrfs_remove_chunk() through a call to check_system_chunk().
1171          */
1172         map = em->map_lookup;
1173         num_items = 3 + map->num_stripes;
1174         free_extent_map(em);
1175
1176         return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
1177                                                            num_items);
1178 }
1179
1180 /*
1181  * Mark block group @cache read-only, so later write won't happen to block
1182  * group @cache.
1183  *
1184  * If @force is not set, this function will only mark the block group readonly
1185  * if we have enough free space (1M) in other metadata/system block groups.
1186  * If @force is not set, this function will mark the block group readonly
1187  * without checking free space.
1188  *
1189  * NOTE: This function doesn't care if other block groups can contain all the
1190  * data in this block group. That check should be done by relocation routine,
1191  * not this function.
1192  */
1193 static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
1194 {
1195         struct btrfs_space_info *sinfo = cache->space_info;
1196         u64 num_bytes;
1197         int ret = -ENOSPC;
1198
1199         spin_lock(&sinfo->lock);
1200         spin_lock(&cache->lock);
1201
1202         if (cache->ro) {
1203                 cache->ro++;
1204                 ret = 0;
1205                 goto out;
1206         }
1207
1208         num_bytes = cache->length - cache->reserved - cache->pinned -
1209                     cache->bytes_super - cache->used;
1210
1211         /*
1212          * Data never overcommits, even in mixed mode, so do just the straight
1213          * check of left over space in how much we have allocated.
1214          */
1215         if (force) {
1216                 ret = 0;
1217         } else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
1218                 u64 sinfo_used = btrfs_space_info_used(sinfo, true);
1219
1220                 /*
1221                  * Here we make sure if we mark this bg RO, we still have enough
1222                  * free space as buffer.
1223                  */
1224                 if (sinfo_used + num_bytes <= sinfo->total_bytes)
1225                         ret = 0;
1226         } else {
1227                 /*
1228                  * We overcommit metadata, so we need to do the
1229                  * btrfs_can_overcommit check here, and we need to pass in
1230                  * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
1231                  * leeway to allow us to mark this block group as read only.
1232                  */
1233                 if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
1234                                          BTRFS_RESERVE_NO_FLUSH))
1235                         ret = 0;
1236         }
1237
1238         if (!ret) {
1239                 sinfo->bytes_readonly += num_bytes;
1240                 cache->ro++;
1241                 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
1242         }
1243 out:
1244         spin_unlock(&cache->lock);
1245         spin_unlock(&sinfo->lock);
1246         if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
1247                 btrfs_info(cache->fs_info,
1248                         "unable to make block group %llu ro", cache->start);
1249                 btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
1250         }
1251         return ret;
1252 }
1253
1254 static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
1255                                  struct btrfs_block_group *bg)
1256 {
1257         struct btrfs_fs_info *fs_info = bg->fs_info;
1258         struct btrfs_transaction *prev_trans = NULL;
1259         const u64 start = bg->start;
1260         const u64 end = start + bg->length - 1;
1261         int ret;
1262
1263         spin_lock(&fs_info->trans_lock);
1264         if (trans->transaction->list.prev != &fs_info->trans_list) {
1265                 prev_trans = list_last_entry(&trans->transaction->list,
1266                                              struct btrfs_transaction, list);
1267                 refcount_inc(&prev_trans->use_count);
1268         }
1269         spin_unlock(&fs_info->trans_lock);
1270
1271         /*
1272          * Hold the unused_bg_unpin_mutex lock to avoid racing with
1273          * btrfs_finish_extent_commit(). If we are at transaction N, another
1274          * task might be running finish_extent_commit() for the previous
1275          * transaction N - 1, and have seen a range belonging to the block
1276          * group in pinned_extents before we were able to clear the whole block
1277          * group range from pinned_extents. This means that task can lookup for
1278          * the block group after we unpinned it from pinned_extents and removed
1279          * it, leading to a BUG_ON() at unpin_extent_range().
1280          */
1281         mutex_lock(&fs_info->unused_bg_unpin_mutex);
1282         if (prev_trans) {
1283                 ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
1284                                         EXTENT_DIRTY);
1285                 if (ret)
1286                         goto err;
1287         }
1288
1289         ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
1290                                 EXTENT_DIRTY);
1291         if (ret)
1292                 goto err;
1293         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
1294         if (prev_trans)
1295                 btrfs_put_transaction(prev_trans);
1296
1297         return true;
1298
1299 err:
1300         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
1301         if (prev_trans)
1302                 btrfs_put_transaction(prev_trans);
1303         btrfs_dec_block_group_ro(bg);
1304         return false;
1305 }
1306
1307 /*
1308  * Process the unused_bgs list and remove any that don't have any allocated
1309  * space inside of them.
1310  */
1311 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
1312 {
1313         struct btrfs_block_group *block_group;
1314         struct btrfs_space_info *space_info;
1315         struct btrfs_trans_handle *trans;
1316         const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
1317         int ret = 0;
1318
1319         if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1320                 return;
1321
1322         spin_lock(&fs_info->unused_bgs_lock);
1323         while (!list_empty(&fs_info->unused_bgs)) {
1324                 int trimming;
1325
1326                 block_group = list_first_entry(&fs_info->unused_bgs,
1327                                                struct btrfs_block_group,
1328                                                bg_list);
1329                 list_del_init(&block_group->bg_list);
1330
1331                 space_info = block_group->space_info;
1332
1333                 if (ret || btrfs_mixed_space_info(space_info)) {
1334                         btrfs_put_block_group(block_group);
1335                         continue;
1336                 }
1337                 spin_unlock(&fs_info->unused_bgs_lock);
1338
1339                 btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
1340
1341                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
1342
1343                 /* Don't want to race with allocators so take the groups_sem */
1344                 down_write(&space_info->groups_sem);
1345
1346                 /*
1347                  * Async discard moves the final block group discard to be prior
1348                  * to the unused_bgs code path.  Therefore, if it's not fully
1349                  * trimmed, punt it back to the async discard lists.
1350                  */
1351                 if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
1352                     !btrfs_is_free_space_trimmed(block_group)) {
1353                         trace_btrfs_skip_unused_block_group(block_group);
1354                         up_write(&space_info->groups_sem);
1355                         /* Requeue if we failed because of async discard */
1356                         btrfs_discard_queue_work(&fs_info->discard_ctl,
1357                                                  block_group);
1358                         goto next;
1359                 }
1360
1361                 spin_lock(&block_group->lock);
1362                 if (block_group->reserved || block_group->pinned ||
1363                     block_group->used || block_group->ro ||
1364                     list_is_singular(&block_group->list)) {
1365                         /*
1366                          * We want to bail if we made new allocations or have
1367                          * outstanding allocations in this block group.  We do
1368                          * the ro check in case balance is currently acting on
1369                          * this block group.
1370                          */
1371                         trace_btrfs_skip_unused_block_group(block_group);
1372                         spin_unlock(&block_group->lock);
1373                         up_write(&space_info->groups_sem);
1374                         goto next;
1375                 }
1376                 spin_unlock(&block_group->lock);
1377
1378                 /* We don't want to force the issue, only flip if it's ok. */
1379                 ret = inc_block_group_ro(block_group, 0);
1380                 up_write(&space_info->groups_sem);
1381                 if (ret < 0) {
1382                         ret = 0;
1383                         goto next;
1384                 }
1385
1386                 /*
1387                  * Want to do this before we do anything else so we can recover
1388                  * properly if we fail to join the transaction.
1389                  */
1390                 trans = btrfs_start_trans_remove_block_group(fs_info,
1391                                                      block_group->start);
1392                 if (IS_ERR(trans)) {
1393                         btrfs_dec_block_group_ro(block_group);
1394                         ret = PTR_ERR(trans);
1395                         goto next;
1396                 }
1397
1398                 /*
1399                  * We could have pending pinned extents for this block group,
1400                  * just delete them, we don't care about them anymore.
1401                  */
1402                 if (!clean_pinned_extents(trans, block_group))
1403                         goto end_trans;
1404
1405                 /*
1406                  * At this point, the block_group is read only and should fail
1407                  * new allocations.  However, btrfs_finish_extent_commit() can
1408                  * cause this block_group to be placed back on the discard
1409                  * lists because now the block_group isn't fully discarded.
1410                  * Bail here and try again later after discarding everything.
1411                  */
1412                 spin_lock(&fs_info->discard_ctl.lock);
1413                 if (!list_empty(&block_group->discard_list)) {
1414                         spin_unlock(&fs_info->discard_ctl.lock);
1415                         btrfs_dec_block_group_ro(block_group);
1416                         btrfs_discard_queue_work(&fs_info->discard_ctl,
1417                                                  block_group);
1418                         goto end_trans;
1419                 }
1420                 spin_unlock(&fs_info->discard_ctl.lock);
1421
1422                 /* Reset pinned so btrfs_put_block_group doesn't complain */
1423                 spin_lock(&space_info->lock);
1424                 spin_lock(&block_group->lock);
1425
1426                 btrfs_space_info_update_bytes_pinned(fs_info, space_info,
1427                                                      -block_group->pinned);
1428                 space_info->bytes_readonly += block_group->pinned;
1429                 percpu_counter_add_batch(&space_info->total_bytes_pinned,
1430                                    -block_group->pinned,
1431                                    BTRFS_TOTAL_BYTES_PINNED_BATCH);
1432                 block_group->pinned = 0;
1433
1434                 spin_unlock(&block_group->lock);
1435                 spin_unlock(&space_info->lock);
1436
1437                 /*
1438                  * The normal path here is an unused block group is passed here,
1439                  * then trimming is handled in the transaction commit path.
1440                  * Async discard interposes before this to do the trimming
1441                  * before coming down the unused block group path as trimming
1442                  * will no longer be done later in the transaction commit path.
1443                  */
1444                 if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
1445                         goto flip_async;
1446
1447                 /* DISCARD can flip during remount */
1448                 trimming = btrfs_test_opt(fs_info, DISCARD_SYNC);
1449
1450                 /* Implicit trim during transaction commit. */
1451                 if (trimming)
1452                         btrfs_get_block_group_trimming(block_group);
1453
1454                 /*
1455                  * Btrfs_remove_chunk will abort the transaction if things go
1456                  * horribly wrong.
1457                  */
1458                 ret = btrfs_remove_chunk(trans, block_group->start);
1459
1460                 if (ret) {
1461                         if (trimming)
1462                                 btrfs_put_block_group_trimming(block_group);
1463                         goto end_trans;
1464                 }
1465
1466                 /*
1467                  * If we're not mounted with -odiscard, we can just forget
1468                  * about this block group. Otherwise we'll need to wait
1469                  * until transaction commit to do the actual discard.
1470                  */
1471                 if (trimming) {
1472                         spin_lock(&fs_info->unused_bgs_lock);
1473                         /*
1474                          * A concurrent scrub might have added us to the list
1475                          * fs_info->unused_bgs, so use a list_move operation
1476                          * to add the block group to the deleted_bgs list.
1477                          */
1478                         list_move(&block_group->bg_list,
1479                                   &trans->transaction->deleted_bgs);
1480                         spin_unlock(&fs_info->unused_bgs_lock);
1481                         btrfs_get_block_group(block_group);
1482                 }
1483 end_trans:
1484                 btrfs_end_transaction(trans);
1485 next:
1486                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
1487                 btrfs_put_block_group(block_group);
1488                 spin_lock(&fs_info->unused_bgs_lock);
1489         }
1490         spin_unlock(&fs_info->unused_bgs_lock);
1491         return;
1492
1493 flip_async:
1494         btrfs_end_transaction(trans);
1495         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
1496         btrfs_put_block_group(block_group);
1497         btrfs_discard_punt_unused_bgs_list(fs_info);
1498 }
1499
1500 void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
1501 {
1502         struct btrfs_fs_info *fs_info = bg->fs_info;
1503
1504         spin_lock(&fs_info->unused_bgs_lock);
1505         if (list_empty(&bg->bg_list)) {
1506                 btrfs_get_block_group(bg);
1507                 trace_btrfs_add_unused_block_group(bg);
1508                 list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
1509         }
1510         spin_unlock(&fs_info->unused_bgs_lock);
1511 }
1512
1513 static int find_first_block_group(struct btrfs_fs_info *fs_info,
1514                                   struct btrfs_path *path,
1515                                   struct btrfs_key *key)
1516 {
1517         struct btrfs_root *root = fs_info->extent_root;
1518         int ret = 0;
1519         struct btrfs_key found_key;
1520         struct extent_buffer *leaf;
1521         struct btrfs_block_group_item bg;
1522         u64 flags;
1523         int slot;
1524
1525         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
1526         if (ret < 0)
1527                 goto out;
1528
1529         while (1) {
1530                 slot = path->slots[0];
1531                 leaf = path->nodes[0];
1532                 if (slot >= btrfs_header_nritems(leaf)) {
1533                         ret = btrfs_next_leaf(root, path);
1534                         if (ret == 0)
1535                                 continue;
1536                         if (ret < 0)
1537                                 goto out;
1538                         break;
1539                 }
1540                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
1541
1542                 if (found_key.objectid >= key->objectid &&
1543                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
1544                         struct extent_map_tree *em_tree;
1545                         struct extent_map *em;
1546
1547                         em_tree = &root->fs_info->mapping_tree;
1548                         read_lock(&em_tree->lock);
1549                         em = lookup_extent_mapping(em_tree, found_key.objectid,
1550                                                    found_key.offset);
1551                         read_unlock(&em_tree->lock);
1552                         if (!em) {
1553                                 btrfs_err(fs_info,
1554                         "logical %llu len %llu found bg but no related chunk",
1555                                           found_key.objectid, found_key.offset);
1556                                 ret = -ENOENT;
1557                         } else if (em->start != found_key.objectid ||
1558                                    em->len != found_key.offset) {
1559                                 btrfs_err(fs_info,
1560                 "block group %llu len %llu mismatch with chunk %llu len %llu",
1561                                           found_key.objectid, found_key.offset,
1562                                           em->start, em->len);
1563                                 ret = -EUCLEAN;
1564                         } else {
1565                                 read_extent_buffer(leaf, &bg,
1566                                         btrfs_item_ptr_offset(leaf, slot),
1567                                         sizeof(bg));
1568                                 flags = btrfs_stack_block_group_flags(&bg) &
1569                                         BTRFS_BLOCK_GROUP_TYPE_MASK;
1570
1571                                 if (flags != (em->map_lookup->type &
1572                                               BTRFS_BLOCK_GROUP_TYPE_MASK)) {
1573                                         btrfs_err(fs_info,
1574 "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
1575                                                 found_key.objectid,
1576                                                 found_key.offset, flags,
1577                                                 (BTRFS_BLOCK_GROUP_TYPE_MASK &
1578                                                  em->map_lookup->type));
1579                                         ret = -EUCLEAN;
1580                                 } else {
1581                                         ret = 0;
1582                                 }
1583                         }
1584                         free_extent_map(em);
1585                         goto out;
1586                 }
1587                 path->slots[0]++;
1588         }
1589 out:
1590         return ret;
1591 }
1592
1593 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1594 {
1595         u64 extra_flags = chunk_to_extended(flags) &
1596                                 BTRFS_EXTENDED_PROFILE_MASK;
1597
1598         write_seqlock(&fs_info->profiles_lock);
1599         if (flags & BTRFS_BLOCK_GROUP_DATA)
1600                 fs_info->avail_data_alloc_bits |= extra_flags;
1601         if (flags & BTRFS_BLOCK_GROUP_METADATA)
1602                 fs_info->avail_metadata_alloc_bits |= extra_flags;
1603         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1604                 fs_info->avail_system_alloc_bits |= extra_flags;
1605         write_sequnlock(&fs_info->profiles_lock);
1606 }
1607
1608 /**
1609  * btrfs_rmap_block - Map a physical disk address to a list of logical addresses
1610  * @chunk_start:   logical address of block group
1611  * @physical:      physical address to map to logical addresses
1612  * @logical:       return array of logical addresses which map to @physical
1613  * @naddrs:        length of @logical
1614  * @stripe_len:    size of IO stripe for the given block group
1615  *
1616  * Maps a particular @physical disk address to a list of @logical addresses.
1617  * Used primarily to exclude those portions of a block group that contain super
1618  * block copies.
1619  */
1620 EXPORT_FOR_TESTS
1621 int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
1622                      u64 physical, u64 **logical, int *naddrs, int *stripe_len)
1623 {
1624         struct extent_map *em;
1625         struct map_lookup *map;
1626         u64 *buf;
1627         u64 bytenr;
1628         u64 data_stripe_length;
1629         u64 io_stripe_size;
1630         int i, nr = 0;
1631         int ret = 0;
1632
1633         em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
1634         if (IS_ERR(em))
1635                 return -EIO;
1636
1637         map = em->map_lookup;
1638         data_stripe_length = em->len;
1639         io_stripe_size = map->stripe_len;
1640
1641         if (map->type & BTRFS_BLOCK_GROUP_RAID10)
1642                 data_stripe_length = div_u64(data_stripe_length,
1643                                              map->num_stripes / map->sub_stripes);
1644         else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
1645                 data_stripe_length = div_u64(data_stripe_length, map->num_stripes);
1646         else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1647                 data_stripe_length = div_u64(data_stripe_length,
1648                                              nr_data_stripes(map));
1649                 io_stripe_size = map->stripe_len * nr_data_stripes(map);
1650         }
1651
1652         buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
1653         if (!buf) {
1654                 ret = -ENOMEM;
1655                 goto out;
1656         }
1657
1658         for (i = 0; i < map->num_stripes; i++) {
1659                 bool already_inserted = false;
1660                 u64 stripe_nr;
1661                 int j;
1662
1663                 if (!in_range(physical, map->stripes[i].physical,
1664                               data_stripe_length))
1665                         continue;
1666
1667                 stripe_nr = physical - map->stripes[i].physical;
1668                 stripe_nr = div64_u64(stripe_nr, map->stripe_len);
1669
1670                 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
1671                         stripe_nr = stripe_nr * map->num_stripes + i;
1672                         stripe_nr = div_u64(stripe_nr, map->sub_stripes);
1673                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
1674                         stripe_nr = stripe_nr * map->num_stripes + i;
1675                 }
1676                 /*
1677                  * The remaining case would be for RAID56, multiply by
1678                  * nr_data_stripes().  Alternatively, just use rmap_len below
1679                  * instead of map->stripe_len
1680                  */
1681
1682                 bytenr = chunk_start + stripe_nr * io_stripe_size;
1683
1684                 /* Ensure we don't add duplicate addresses */
1685                 for (j = 0; j < nr; j++) {
1686                         if (buf[j] == bytenr) {
1687                                 already_inserted = true;
1688                                 break;
1689                         }
1690                 }
1691
1692                 if (!already_inserted)
1693                         buf[nr++] = bytenr;
1694         }
1695
1696         *logical = buf;
1697         *naddrs = nr;
1698         *stripe_len = io_stripe_size;
1699 out:
1700         free_extent_map(em);
1701         return ret;
1702 }
1703
1704 static int exclude_super_stripes(struct btrfs_block_group *cache)
1705 {
1706         struct btrfs_fs_info *fs_info = cache->fs_info;
1707         u64 bytenr;
1708         u64 *logical;
1709         int stripe_len;
1710         int i, nr, ret;
1711
1712         if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
1713                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
1714                 cache->bytes_super += stripe_len;
1715                 ret = btrfs_add_excluded_extent(fs_info, cache->start,
1716                                                 stripe_len);
1717                 if (ret)
1718                         return ret;
1719         }
1720
1721         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1722                 bytenr = btrfs_sb_offset(i);
1723                 ret = btrfs_rmap_block(fs_info, cache->start,
1724                                        bytenr, &logical, &nr, &stripe_len);
1725                 if (ret)
1726                         return ret;
1727
1728                 while (nr--) {
1729                         u64 start, len;
1730
1731                         if (logical[nr] > cache->start + cache->length)
1732                                 continue;
1733
1734                         if (logical[nr] + stripe_len <= cache->start)
1735                                 continue;
1736
1737                         start = logical[nr];
1738                         if (start < cache->start) {
1739                                 start = cache->start;
1740                                 len = (logical[nr] + stripe_len) - start;
1741                         } else {
1742                                 len = min_t(u64, stripe_len,
1743                                             cache->start + cache->length - start);
1744                         }
1745
1746                         cache->bytes_super += len;
1747                         ret = btrfs_add_excluded_extent(fs_info, start, len);
1748                         if (ret) {
1749                                 kfree(logical);
1750                                 return ret;
1751                         }
1752                 }
1753
1754                 kfree(logical);
1755         }
1756         return 0;
1757 }
1758
1759 static void link_block_group(struct btrfs_block_group *cache)
1760 {
1761         struct btrfs_space_info *space_info = cache->space_info;
1762         int index = btrfs_bg_flags_to_raid_index(cache->flags);
1763         bool first = false;
1764
1765         down_write(&space_info->groups_sem);
1766         if (list_empty(&space_info->block_groups[index]))
1767                 first = true;
1768         list_add_tail(&cache->list, &space_info->block_groups[index]);
1769         up_write(&space_info->groups_sem);
1770
1771         if (first)
1772                 btrfs_sysfs_add_block_group_type(cache);
1773 }
1774
1775 static struct btrfs_block_group *btrfs_create_block_group_cache(
1776                 struct btrfs_fs_info *fs_info, u64 start, u64 size)
1777 {
1778         struct btrfs_block_group *cache;
1779
1780         cache = kzalloc(sizeof(*cache), GFP_NOFS);
1781         if (!cache)
1782                 return NULL;
1783
1784         cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
1785                                         GFP_NOFS);
1786         if (!cache->free_space_ctl) {
1787                 kfree(cache);
1788                 return NULL;
1789         }
1790
1791         cache->start = start;
1792         cache->length = size;
1793
1794         cache->fs_info = fs_info;
1795         cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
1796         set_free_space_tree_thresholds(cache);
1797
1798         cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
1799
1800         atomic_set(&cache->count, 1);
1801         spin_lock_init(&cache->lock);
1802         init_rwsem(&cache->data_rwsem);
1803         INIT_LIST_HEAD(&cache->list);
1804         INIT_LIST_HEAD(&cache->cluster_list);
1805         INIT_LIST_HEAD(&cache->bg_list);
1806         INIT_LIST_HEAD(&cache->ro_list);
1807         INIT_LIST_HEAD(&cache->discard_list);
1808         INIT_LIST_HEAD(&cache->dirty_list);
1809         INIT_LIST_HEAD(&cache->io_list);
1810         btrfs_init_free_space_ctl(cache);
1811         atomic_set(&cache->trimming, 0);
1812         mutex_init(&cache->free_space_lock);
1813         btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
1814
1815         return cache;
1816 }
1817
1818 /*
1819  * Iterate all chunks and verify that each of them has the corresponding block
1820  * group
1821  */
1822 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
1823 {
1824         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
1825         struct extent_map *em;
1826         struct btrfs_block_group *bg;
1827         u64 start = 0;
1828         int ret = 0;
1829
1830         while (1) {
1831                 read_lock(&map_tree->lock);
1832                 /*
1833                  * lookup_extent_mapping will return the first extent map
1834                  * intersecting the range, so setting @len to 1 is enough to
1835                  * get the first chunk.
1836                  */
1837                 em = lookup_extent_mapping(map_tree, start, 1);
1838                 read_unlock(&map_tree->lock);
1839                 if (!em)
1840                         break;
1841
1842                 bg = btrfs_lookup_block_group(fs_info, em->start);
1843                 if (!bg) {
1844                         btrfs_err(fs_info,
1845         "chunk start=%llu len=%llu doesn't have corresponding block group",
1846                                      em->start, em->len);
1847                         ret = -EUCLEAN;
1848                         free_extent_map(em);
1849                         break;
1850                 }
1851                 if (bg->start != em->start || bg->length != em->len ||
1852                     (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
1853                     (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
1854                         btrfs_err(fs_info,
1855 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
1856                                 em->start, em->len,
1857                                 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
1858                                 bg->start, bg->length,
1859                                 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
1860                         ret = -EUCLEAN;
1861                         free_extent_map(em);
1862                         btrfs_put_block_group(bg);
1863                         break;
1864                 }
1865                 start = em->start + em->len;
1866                 free_extent_map(em);
1867                 btrfs_put_block_group(bg);
1868         }
1869         return ret;
1870 }
1871
1872 static int read_one_block_group(struct btrfs_fs_info *info,
1873                                 struct btrfs_path *path,
1874                                 const struct btrfs_key *key,
1875                                 int need_clear)
1876 {
1877         struct extent_buffer *leaf = path->nodes[0];
1878         struct btrfs_block_group *cache;
1879         struct btrfs_space_info *space_info;
1880         struct btrfs_block_group_item bgi;
1881         const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
1882         int slot = path->slots[0];
1883         int ret;
1884
1885         ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY);
1886
1887         cache = btrfs_create_block_group_cache(info, key->objectid, key->offset);
1888         if (!cache)
1889                 return -ENOMEM;
1890
1891         if (need_clear) {
1892                 /*
1893                  * When we mount with old space cache, we need to
1894                  * set BTRFS_DC_CLEAR and set dirty flag.
1895                  *
1896                  * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
1897                  *    truncate the old free space cache inode and
1898                  *    setup a new one.
1899                  * b) Setting 'dirty flag' makes sure that we flush
1900                  *    the new space cache info onto disk.
1901                  */
1902                 if (btrfs_test_opt(info, SPACE_CACHE))
1903                         cache->disk_cache_state = BTRFS_DC_CLEAR;
1904         }
1905         read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
1906                            sizeof(bgi));
1907         cache->used = btrfs_stack_block_group_used(&bgi);
1908         cache->flags = btrfs_stack_block_group_flags(&bgi);
1909         if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
1910             (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
1911                         btrfs_err(info,
1912 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
1913                                   cache->start);
1914                         ret = -EINVAL;
1915                         goto error;
1916         }
1917
1918         /*
1919          * We need to exclude the super stripes now so that the space info has
1920          * super bytes accounted for, otherwise we'll think we have more space
1921          * than we actually do.
1922          */
1923         ret = exclude_super_stripes(cache);
1924         if (ret) {
1925                 /* We may have excluded something, so call this just in case. */
1926                 btrfs_free_excluded_extents(cache);
1927                 goto error;
1928         }
1929
1930         /*
1931          * Check for two cases, either we are full, and therefore don't need
1932          * to bother with the caching work since we won't find any space, or we
1933          * are empty, and we can just add all the space in and be done with it.
1934          * This saves us _a_lot_ of time, particularly in the full case.
1935          */
1936         if (key->offset == cache->used) {
1937                 cache->last_byte_to_unpin = (u64)-1;
1938                 cache->cached = BTRFS_CACHE_FINISHED;
1939                 btrfs_free_excluded_extents(cache);
1940         } else if (cache->used == 0) {
1941                 cache->last_byte_to_unpin = (u64)-1;
1942                 cache->cached = BTRFS_CACHE_FINISHED;
1943                 add_new_free_space(cache, key->objectid,
1944                                    key->objectid + key->offset);
1945                 btrfs_free_excluded_extents(cache);
1946         }
1947
1948         ret = btrfs_add_block_group_cache(info, cache);
1949         if (ret) {
1950                 btrfs_remove_free_space_cache(cache);
1951                 goto error;
1952         }
1953         trace_btrfs_add_block_group(info, cache, 0);
1954         btrfs_update_space_info(info, cache->flags, key->offset,
1955                                 cache->used, cache->bytes_super, &space_info);
1956
1957         cache->space_info = space_info;
1958
1959         link_block_group(cache);
1960
1961         set_avail_alloc_bits(info, cache->flags);
1962         if (btrfs_chunk_readonly(info, cache->start)) {
1963                 inc_block_group_ro(cache, 1);
1964         } else if (cache->used == 0) {
1965                 ASSERT(list_empty(&cache->bg_list));
1966                 if (btrfs_test_opt(info, DISCARD_ASYNC))
1967                         btrfs_discard_queue_work(&info->discard_ctl, cache);
1968                 else
1969                         btrfs_mark_bg_unused(cache);
1970         }
1971         return 0;
1972 error:
1973         btrfs_put_block_group(cache);
1974         return ret;
1975 }
1976
1977 int btrfs_read_block_groups(struct btrfs_fs_info *info)
1978 {
1979         struct btrfs_path *path;
1980         int ret;
1981         struct btrfs_block_group *cache;
1982         struct btrfs_space_info *space_info;
1983         struct btrfs_key key;
1984         int need_clear = 0;
1985         u64 cache_gen;
1986
1987         key.objectid = 0;
1988         key.offset = 0;
1989         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
1990         path = btrfs_alloc_path();
1991         if (!path)
1992                 return -ENOMEM;
1993         path->reada = READA_FORWARD;
1994
1995         cache_gen = btrfs_super_cache_generation(info->super_copy);
1996         if (btrfs_test_opt(info, SPACE_CACHE) &&
1997             btrfs_super_generation(info->super_copy) != cache_gen)
1998                 need_clear = 1;
1999         if (btrfs_test_opt(info, CLEAR_CACHE))
2000                 need_clear = 1;
2001
2002         while (1) {
2003                 ret = find_first_block_group(info, path, &key);
2004                 if (ret > 0)
2005                         break;
2006                 if (ret != 0)
2007                         goto error;
2008
2009                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2010                 ret = read_one_block_group(info, path, &key, need_clear);
2011                 if (ret < 0)
2012                         goto error;
2013                 key.objectid += key.offset;
2014                 key.offset = 0;
2015                 btrfs_release_path(path);
2016         }
2017
2018         rcu_read_lock();
2019         list_for_each_entry_rcu(space_info, &info->space_info, list) {
2020                 if (!(btrfs_get_alloc_profile(info, space_info->flags) &
2021                       (BTRFS_BLOCK_GROUP_RAID10 |
2022                        BTRFS_BLOCK_GROUP_RAID1_MASK |
2023                        BTRFS_BLOCK_GROUP_RAID56_MASK |
2024                        BTRFS_BLOCK_GROUP_DUP)))
2025                         continue;
2026                 /*
2027                  * Avoid allocating from un-mirrored block group if there are
2028                  * mirrored block groups.
2029                  */
2030                 list_for_each_entry(cache,
2031                                 &space_info->block_groups[BTRFS_RAID_RAID0],
2032                                 list)
2033                         inc_block_group_ro(cache, 1);
2034                 list_for_each_entry(cache,
2035                                 &space_info->block_groups[BTRFS_RAID_SINGLE],
2036                                 list)
2037                         inc_block_group_ro(cache, 1);
2038         }
2039         rcu_read_unlock();
2040
2041         btrfs_init_global_block_rsv(info);
2042         ret = check_chunk_block_group_mappings(info);
2043 error:
2044         btrfs_free_path(path);
2045         return ret;
2046 }
2047
2048 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
2049 {
2050         struct btrfs_fs_info *fs_info = trans->fs_info;
2051         struct btrfs_block_group *block_group;
2052         struct btrfs_root *extent_root = fs_info->extent_root;
2053         struct btrfs_block_group_item item;
2054         struct btrfs_key key;
2055         int ret = 0;
2056
2057         if (!trans->can_flush_pending_bgs)
2058                 return;
2059
2060         while (!list_empty(&trans->new_bgs)) {
2061                 block_group = list_first_entry(&trans->new_bgs,
2062                                                struct btrfs_block_group,
2063                                                bg_list);
2064                 if (ret)
2065                         goto next;
2066
2067                 spin_lock(&block_group->lock);
2068                 btrfs_set_stack_block_group_used(&item, block_group->used);
2069                 btrfs_set_stack_block_group_chunk_objectid(&item,
2070                                 BTRFS_FIRST_CHUNK_TREE_OBJECTID);
2071                 btrfs_set_stack_block_group_flags(&item, block_group->flags);
2072                 key.objectid = block_group->start;
2073                 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2074                 key.offset = block_group->length;
2075                 spin_unlock(&block_group->lock);
2076
2077                 ret = btrfs_insert_item(trans, extent_root, &key, &item,
2078                                         sizeof(item));
2079                 if (ret)
2080                         btrfs_abort_transaction(trans, ret);
2081                 ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset);
2082                 if (ret)
2083                         btrfs_abort_transaction(trans, ret);
2084                 add_block_group_free_space(trans, block_group);
2085                 /* Already aborted the transaction if it failed. */
2086 next:
2087                 btrfs_delayed_refs_rsv_release(fs_info, 1);
2088                 list_del_init(&block_group->bg_list);
2089         }
2090         btrfs_trans_release_chunk_metadata(trans);
2091 }
2092
2093 int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
2094                            u64 type, u64 chunk_offset, u64 size)
2095 {
2096         struct btrfs_fs_info *fs_info = trans->fs_info;
2097         struct btrfs_block_group *cache;
2098         int ret;
2099
2100         btrfs_set_log_full_commit(trans);
2101
2102         cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
2103         if (!cache)
2104                 return -ENOMEM;
2105
2106         cache->used = bytes_used;
2107         cache->flags = type;
2108         cache->last_byte_to_unpin = (u64)-1;
2109         cache->cached = BTRFS_CACHE_FINISHED;
2110         cache->needs_free_space = 1;
2111         ret = exclude_super_stripes(cache);
2112         if (ret) {
2113                 /* We may have excluded something, so call this just in case */
2114                 btrfs_free_excluded_extents(cache);
2115                 btrfs_put_block_group(cache);
2116                 return ret;
2117         }
2118
2119         add_new_free_space(cache, chunk_offset, chunk_offset + size);
2120
2121         btrfs_free_excluded_extents(cache);
2122
2123 #ifdef CONFIG_BTRFS_DEBUG
2124         if (btrfs_should_fragment_free_space(cache)) {
2125                 u64 new_bytes_used = size - bytes_used;
2126
2127                 bytes_used += new_bytes_used >> 1;
2128                 fragment_free_space(cache);
2129         }
2130 #endif
2131         /*
2132          * Ensure the corresponding space_info object is created and
2133          * assigned to our block group. We want our bg to be added to the rbtree
2134          * with its ->space_info set.
2135          */
2136         cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
2137         ASSERT(cache->space_info);
2138
2139         ret = btrfs_add_block_group_cache(fs_info, cache);
2140         if (ret) {
2141                 btrfs_remove_free_space_cache(cache);
2142                 btrfs_put_block_group(cache);
2143                 return ret;
2144         }
2145
2146         /*
2147          * Now that our block group has its ->space_info set and is inserted in
2148          * the rbtree, update the space info's counters.
2149          */
2150         trace_btrfs_add_block_group(fs_info, cache, 1);
2151         btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
2152                                 cache->bytes_super, &cache->space_info);
2153         btrfs_update_global_block_rsv(fs_info);
2154
2155         link_block_group(cache);
2156
2157         list_add_tail(&cache->bg_list, &trans->new_bgs);
2158         trans->delayed_ref_updates++;
2159         btrfs_update_delayed_refs_rsv(trans);
2160
2161         set_avail_alloc_bits(fs_info, type);
2162         return 0;
2163 }
2164
2165 static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
2166 {
2167         u64 num_devices;
2168         u64 stripped;
2169
2170         /*
2171          * if restripe for this chunk_type is on pick target profile and
2172          * return, otherwise do the usual balance
2173          */
2174         stripped = get_restripe_target(fs_info, flags);
2175         if (stripped)
2176                 return extended_to_chunk(stripped);
2177
2178         num_devices = fs_info->fs_devices->rw_devices;
2179
2180         stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK |
2181                 BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10;
2182
2183         if (num_devices == 1) {
2184                 stripped |= BTRFS_BLOCK_GROUP_DUP;
2185                 stripped = flags & ~stripped;
2186
2187                 /* turn raid0 into single device chunks */
2188                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
2189                         return stripped;
2190
2191                 /* turn mirroring into duplication */
2192                 if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK |
2193                              BTRFS_BLOCK_GROUP_RAID10))
2194                         return stripped | BTRFS_BLOCK_GROUP_DUP;
2195         } else {
2196                 /* they already had raid on here, just return */
2197                 if (flags & stripped)
2198                         return flags;
2199
2200                 stripped |= BTRFS_BLOCK_GROUP_DUP;
2201                 stripped = flags & ~stripped;
2202
2203                 /* switch duplicated blocks with raid1 */
2204                 if (flags & BTRFS_BLOCK_GROUP_DUP)
2205                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
2206
2207                 /* this is drive concat, leave it alone */
2208         }
2209
2210         return flags;
2211 }
2212
2213 /*
2214  * Mark one block group RO, can be called several times for the same block
2215  * group.
2216  *
2217  * @cache:              the destination block group
2218  * @do_chunk_alloc:     whether need to do chunk pre-allocation, this is to
2219  *                      ensure we still have some free space after marking this
2220  *                      block group RO.
2221  */
2222 int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
2223                              bool do_chunk_alloc)
2224 {
2225         struct btrfs_fs_info *fs_info = cache->fs_info;
2226         struct btrfs_trans_handle *trans;
2227         u64 alloc_flags;
2228         int ret;
2229
2230 again:
2231         trans = btrfs_join_transaction(fs_info->extent_root);
2232         if (IS_ERR(trans))
2233                 return PTR_ERR(trans);
2234
2235         /*
2236          * we're not allowed to set block groups readonly after the dirty
2237          * block groups cache has started writing.  If it already started,
2238          * back off and let this transaction commit
2239          */
2240         mutex_lock(&fs_info->ro_block_group_mutex);
2241         if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
2242                 u64 transid = trans->transid;
2243
2244                 mutex_unlock(&fs_info->ro_block_group_mutex);
2245                 btrfs_end_transaction(trans);
2246
2247                 ret = btrfs_wait_for_commit(fs_info, transid);
2248                 if (ret)
2249                         return ret;
2250                 goto again;
2251         }
2252
2253         if (do_chunk_alloc) {
2254                 /*
2255                  * If we are changing raid levels, try to allocate a
2256                  * corresponding block group with the new raid level.
2257                  */
2258                 alloc_flags = update_block_group_flags(fs_info, cache->flags);
2259                 if (alloc_flags != cache->flags) {
2260                         ret = btrfs_chunk_alloc(trans, alloc_flags,
2261                                                 CHUNK_ALLOC_FORCE);
2262                         /*
2263                          * ENOSPC is allowed here, we may have enough space
2264                          * already allocated at the new raid level to carry on
2265                          */
2266                         if (ret == -ENOSPC)
2267                                 ret = 0;
2268                         if (ret < 0)
2269                                 goto out;
2270                 }
2271         }
2272
2273         ret = inc_block_group_ro(cache, 0);
2274         if (!do_chunk_alloc)
2275                 goto unlock_out;
2276         if (!ret)
2277                 goto out;
2278         alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
2279         ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
2280         if (ret < 0)
2281                 goto out;
2282         ret = inc_block_group_ro(cache, 0);
2283 out:
2284         if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
2285                 alloc_flags = update_block_group_flags(fs_info, cache->flags);
2286                 mutex_lock(&fs_info->chunk_mutex);
2287                 check_system_chunk(trans, alloc_flags);
2288                 mutex_unlock(&fs_info->chunk_mutex);
2289         }
2290 unlock_out:
2291         mutex_unlock(&fs_info->ro_block_group_mutex);
2292
2293         btrfs_end_transaction(trans);
2294         return ret;
2295 }
2296
2297 void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
2298 {
2299         struct btrfs_space_info *sinfo = cache->space_info;
2300         u64 num_bytes;
2301
2302         BUG_ON(!cache->ro);
2303
2304         spin_lock(&sinfo->lock);
2305         spin_lock(&cache->lock);
2306         if (!--cache->ro) {
2307                 num_bytes = cache->length - cache->reserved -
2308                             cache->pinned - cache->bytes_super - cache->used;
2309                 sinfo->bytes_readonly -= num_bytes;
2310                 list_del_init(&cache->ro_list);
2311         }
2312         spin_unlock(&cache->lock);
2313         spin_unlock(&sinfo->lock);
2314 }
2315
2316 static int write_one_cache_group(struct btrfs_trans_handle *trans,
2317                                  struct btrfs_path *path,
2318                                  struct btrfs_block_group *cache)
2319 {
2320         struct btrfs_fs_info *fs_info = trans->fs_info;
2321         int ret;
2322         struct btrfs_root *extent_root = fs_info->extent_root;
2323         unsigned long bi;
2324         struct extent_buffer *leaf;
2325         struct btrfs_block_group_item bgi;
2326         struct btrfs_key key;
2327
2328         key.objectid = cache->start;
2329         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2330         key.offset = cache->length;
2331
2332         ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
2333         if (ret) {
2334                 if (ret > 0)
2335                         ret = -ENOENT;
2336                 goto fail;
2337         }
2338
2339         leaf = path->nodes[0];
2340         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2341         btrfs_set_stack_block_group_used(&bgi, cache->used);
2342         btrfs_set_stack_block_group_chunk_objectid(&bgi,
2343                         BTRFS_FIRST_CHUNK_TREE_OBJECTID);
2344         btrfs_set_stack_block_group_flags(&bgi, cache->flags);
2345         write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
2346         btrfs_mark_buffer_dirty(leaf);
2347 fail:
2348         btrfs_release_path(path);
2349         return ret;
2350
2351 }
2352
2353 static int cache_save_setup(struct btrfs_block_group *block_group,
2354                             struct btrfs_trans_handle *trans,
2355                             struct btrfs_path *path)
2356 {
2357         struct btrfs_fs_info *fs_info = block_group->fs_info;
2358         struct btrfs_root *root = fs_info->tree_root;
2359         struct inode *inode = NULL;
2360         struct extent_changeset *data_reserved = NULL;
2361         u64 alloc_hint = 0;
2362         int dcs = BTRFS_DC_ERROR;
2363         u64 num_pages = 0;
2364         int retries = 0;
2365         int ret = 0;
2366
2367         /*
2368          * If this block group is smaller than 100 megs don't bother caching the
2369          * block group.
2370          */
2371         if (block_group->length < (100 * SZ_1M)) {
2372                 spin_lock(&block_group->lock);
2373                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2374                 spin_unlock(&block_group->lock);
2375                 return 0;
2376         }
2377
2378         if (TRANS_ABORTED(trans))
2379                 return 0;
2380 again:
2381         inode = lookup_free_space_inode(block_group, path);
2382         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2383                 ret = PTR_ERR(inode);
2384                 btrfs_release_path(path);
2385                 goto out;
2386         }
2387
2388         if (IS_ERR(inode)) {
2389                 BUG_ON(retries);
2390                 retries++;
2391
2392                 if (block_group->ro)
2393                         goto out_free;
2394
2395                 ret = create_free_space_inode(trans, block_group, path);
2396                 if (ret)
2397                         goto out_free;
2398                 goto again;
2399         }
2400
2401         /*
2402          * We want to set the generation to 0, that way if anything goes wrong
2403          * from here on out we know not to trust this cache when we load up next
2404          * time.
2405          */
2406         BTRFS_I(inode)->generation = 0;
2407         ret = btrfs_update_inode(trans, root, inode);
2408         if (ret) {
2409                 /*
2410                  * So theoretically we could recover from this, simply set the
2411                  * super cache generation to 0 so we know to invalidate the
2412                  * cache, but then we'd have to keep track of the block groups
2413                  * that fail this way so we know we _have_ to reset this cache
2414                  * before the next commit or risk reading stale cache.  So to
2415                  * limit our exposure to horrible edge cases lets just abort the
2416                  * transaction, this only happens in really bad situations
2417                  * anyway.
2418                  */
2419                 btrfs_abort_transaction(trans, ret);
2420                 goto out_put;
2421         }
2422         WARN_ON(ret);
2423
2424         /* We've already setup this transaction, go ahead and exit */
2425         if (block_group->cache_generation == trans->transid &&
2426             i_size_read(inode)) {
2427                 dcs = BTRFS_DC_SETUP;
2428                 goto out_put;
2429         }
2430
2431         if (i_size_read(inode) > 0) {
2432                 ret = btrfs_check_trunc_cache_free_space(fs_info,
2433                                         &fs_info->global_block_rsv);
2434                 if (ret)
2435                         goto out_put;
2436
2437                 ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
2438                 if (ret)
2439                         goto out_put;
2440         }
2441
2442         spin_lock(&block_group->lock);
2443         if (block_group->cached != BTRFS_CACHE_FINISHED ||
2444             !btrfs_test_opt(fs_info, SPACE_CACHE)) {
2445                 /*
2446                  * don't bother trying to write stuff out _if_
2447                  * a) we're not cached,
2448                  * b) we're with nospace_cache mount option,
2449                  * c) we're with v2 space_cache (FREE_SPACE_TREE).
2450                  */
2451                 dcs = BTRFS_DC_WRITTEN;
2452                 spin_unlock(&block_group->lock);
2453                 goto out_put;
2454         }
2455         spin_unlock(&block_group->lock);
2456
2457         /*
2458          * We hit an ENOSPC when setting up the cache in this transaction, just
2459          * skip doing the setup, we've already cleared the cache so we're safe.
2460          */
2461         if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
2462                 ret = -ENOSPC;
2463                 goto out_put;
2464         }
2465
2466         /*
2467          * Try to preallocate enough space based on how big the block group is.
2468          * Keep in mind this has to include any pinned space which could end up
2469          * taking up quite a bit since it's not folded into the other space
2470          * cache.
2471          */
2472         num_pages = div_u64(block_group->length, SZ_256M);
2473         if (!num_pages)
2474                 num_pages = 1;
2475
2476         num_pages *= 16;
2477         num_pages *= PAGE_SIZE;
2478
2479         ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
2480         if (ret)
2481                 goto out_put;
2482
2483         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
2484                                               num_pages, num_pages,
2485                                               &alloc_hint);
2486         /*
2487          * Our cache requires contiguous chunks so that we don't modify a bunch
2488          * of metadata or split extents when writing the cache out, which means
2489          * we can enospc if we are heavily fragmented in addition to just normal
2490          * out of space conditions.  So if we hit this just skip setting up any
2491          * other block groups for this transaction, maybe we'll unpin enough
2492          * space the next time around.
2493          */
2494         if (!ret)
2495                 dcs = BTRFS_DC_SETUP;
2496         else if (ret == -ENOSPC)
2497                 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
2498
2499 out_put:
2500         iput(inode);
2501 out_free:
2502         btrfs_release_path(path);
2503 out:
2504         spin_lock(&block_group->lock);
2505         if (!ret && dcs == BTRFS_DC_SETUP)
2506                 block_group->cache_generation = trans->transid;
2507         block_group->disk_cache_state = dcs;
2508         spin_unlock(&block_group->lock);
2509
2510         extent_changeset_free(data_reserved);
2511         return ret;
2512 }
2513
2514 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
2515 {
2516         struct btrfs_fs_info *fs_info = trans->fs_info;
2517         struct btrfs_block_group *cache, *tmp;
2518         struct btrfs_transaction *cur_trans = trans->transaction;
2519         struct btrfs_path *path;
2520
2521         if (list_empty(&cur_trans->dirty_bgs) ||
2522             !btrfs_test_opt(fs_info, SPACE_CACHE))
2523                 return 0;
2524
2525         path = btrfs_alloc_path();
2526         if (!path)
2527                 return -ENOMEM;
2528
2529         /* Could add new block groups, use _safe just in case */
2530         list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
2531                                  dirty_list) {
2532                 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
2533                         cache_save_setup(cache, trans, path);
2534         }
2535
2536         btrfs_free_path(path);
2537         return 0;
2538 }
2539
2540 /*
2541  * Transaction commit does final block group cache writeback during a critical
2542  * section where nothing is allowed to change the FS.  This is required in
2543  * order for the cache to actually match the block group, but can introduce a
2544  * lot of latency into the commit.
2545  *
2546  * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
2547  * There's a chance we'll have to redo some of it if the block group changes
2548  * again during the commit, but it greatly reduces the commit latency by
2549  * getting rid of the easy block groups while we're still allowing others to
2550  * join the commit.
2551  */
2552 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
2553 {
2554         struct btrfs_fs_info *fs_info = trans->fs_info;
2555         struct btrfs_block_group *cache;
2556         struct btrfs_transaction *cur_trans = trans->transaction;
2557         int ret = 0;
2558         int should_put;
2559         struct btrfs_path *path = NULL;
2560         LIST_HEAD(dirty);
2561         struct list_head *io = &cur_trans->io_bgs;
2562         int num_started = 0;
2563         int loops = 0;
2564
2565         spin_lock(&cur_trans->dirty_bgs_lock);
2566         if (list_empty(&cur_trans->dirty_bgs)) {
2567                 spin_unlock(&cur_trans->dirty_bgs_lock);
2568                 return 0;
2569         }
2570         list_splice_init(&cur_trans->dirty_bgs, &dirty);
2571         spin_unlock(&cur_trans->dirty_bgs_lock);
2572
2573 again:
2574         /* Make sure all the block groups on our dirty list actually exist */
2575         btrfs_create_pending_block_groups(trans);
2576
2577         if (!path) {
2578                 path = btrfs_alloc_path();
2579                 if (!path)
2580                         return -ENOMEM;
2581         }
2582
2583         /*
2584          * cache_write_mutex is here only to save us from balance or automatic
2585          * removal of empty block groups deleting this block group while we are
2586          * writing out the cache
2587          */
2588         mutex_lock(&trans->transaction->cache_write_mutex);
2589         while (!list_empty(&dirty)) {
2590                 bool drop_reserve = true;
2591
2592                 cache = list_first_entry(&dirty, struct btrfs_block_group,
2593                                          dirty_list);
2594                 /*
2595                  * This can happen if something re-dirties a block group that
2596                  * is already under IO.  Just wait for it to finish and then do
2597                  * it all again
2598                  */
2599                 if (!list_empty(&cache->io_list)) {
2600                         list_del_init(&cache->io_list);
2601                         btrfs_wait_cache_io(trans, cache, path);
2602                         btrfs_put_block_group(cache);
2603                 }
2604
2605
2606                 /*
2607                  * btrfs_wait_cache_io uses the cache->dirty_list to decide if
2608                  * it should update the cache_state.  Don't delete until after
2609                  * we wait.
2610                  *
2611                  * Since we're not running in the commit critical section
2612                  * we need the dirty_bgs_lock to protect from update_block_group
2613                  */
2614                 spin_lock(&cur_trans->dirty_bgs_lock);
2615                 list_del_init(&cache->dirty_list);
2616                 spin_unlock(&cur_trans->dirty_bgs_lock);
2617
2618                 should_put = 1;
2619
2620                 cache_save_setup(cache, trans, path);
2621
2622                 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
2623                         cache->io_ctl.inode = NULL;
2624                         ret = btrfs_write_out_cache(trans, cache, path);
2625                         if (ret == 0 && cache->io_ctl.inode) {
2626                                 num_started++;
2627                                 should_put = 0;
2628
2629                                 /*
2630                                  * The cache_write_mutex is protecting the
2631                                  * io_list, also refer to the definition of
2632                                  * btrfs_transaction::io_bgs for more details
2633                                  */
2634                                 list_add_tail(&cache->io_list, io);
2635                         } else {
2636                                 /*
2637                                  * If we failed to write the cache, the
2638                                  * generation will be bad and life goes on
2639                                  */
2640                                 ret = 0;
2641                         }
2642                 }
2643                 if (!ret) {
2644                         ret = write_one_cache_group(trans, path, cache);
2645                         /*
2646                          * Our block group might still be attached to the list
2647                          * of new block groups in the transaction handle of some
2648                          * other task (struct btrfs_trans_handle->new_bgs). This
2649                          * means its block group item isn't yet in the extent
2650                          * tree. If this happens ignore the error, as we will
2651                          * try again later in the critical section of the
2652                          * transaction commit.
2653                          */
2654                         if (ret == -ENOENT) {
2655                                 ret = 0;
2656                                 spin_lock(&cur_trans->dirty_bgs_lock);
2657                                 if (list_empty(&cache->dirty_list)) {
2658                                         list_add_tail(&cache->dirty_list,
2659                                                       &cur_trans->dirty_bgs);
2660                                         btrfs_get_block_group(cache);
2661                                         drop_reserve = false;
2662                                 }
2663                                 spin_unlock(&cur_trans->dirty_bgs_lock);
2664                         } else if (ret) {
2665                                 btrfs_abort_transaction(trans, ret);
2666                         }
2667                 }
2668
2669                 /* If it's not on the io list, we need to put the block group */
2670                 if (should_put)
2671                         btrfs_put_block_group(cache);
2672                 if (drop_reserve)
2673                         btrfs_delayed_refs_rsv_release(fs_info, 1);
2674
2675                 if (ret)
2676                         break;
2677
2678                 /*
2679                  * Avoid blocking other tasks for too long. It might even save
2680                  * us from writing caches for block groups that are going to be
2681                  * removed.
2682                  */
2683                 mutex_unlock(&trans->transaction->cache_write_mutex);
2684                 mutex_lock(&trans->transaction->cache_write_mutex);
2685         }
2686         mutex_unlock(&trans->transaction->cache_write_mutex);
2687
2688         /*
2689          * Go through delayed refs for all the stuff we've just kicked off
2690          * and then loop back (just once)
2691          */
2692         ret = btrfs_run_delayed_refs(trans, 0);
2693         if (!ret && loops == 0) {
2694                 loops++;
2695                 spin_lock(&cur_trans->dirty_bgs_lock);
2696                 list_splice_init(&cur_trans->dirty_bgs, &dirty);
2697                 /*
2698                  * dirty_bgs_lock protects us from concurrent block group
2699                  * deletes too (not just cache_write_mutex).
2700                  */
2701                 if (!list_empty(&dirty)) {
2702                         spin_unlock(&cur_trans->dirty_bgs_lock);
2703                         goto again;
2704                 }
2705                 spin_unlock(&cur_trans->dirty_bgs_lock);
2706         } else if (ret < 0) {
2707                 btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
2708         }
2709
2710         btrfs_free_path(path);
2711         return ret;
2712 }
2713
2714 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
2715 {
2716         struct btrfs_fs_info *fs_info = trans->fs_info;
2717         struct btrfs_block_group *cache;
2718         struct btrfs_transaction *cur_trans = trans->transaction;
2719         int ret = 0;
2720         int should_put;
2721         struct btrfs_path *path;
2722         struct list_head *io = &cur_trans->io_bgs;
2723         int num_started = 0;
2724
2725         path = btrfs_alloc_path();
2726         if (!path)
2727                 return -ENOMEM;
2728
2729         /*
2730          * Even though we are in the critical section of the transaction commit,
2731          * we can still have concurrent tasks adding elements to this
2732          * transaction's list of dirty block groups. These tasks correspond to
2733          * endio free space workers started when writeback finishes for a
2734          * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
2735          * allocate new block groups as a result of COWing nodes of the root
2736          * tree when updating the free space inode. The writeback for the space
2737          * caches is triggered by an earlier call to
2738          * btrfs_start_dirty_block_groups() and iterations of the following
2739          * loop.
2740          * Also we want to do the cache_save_setup first and then run the
2741          * delayed refs to make sure we have the best chance at doing this all
2742          * in one shot.
2743          */
2744         spin_lock(&cur_trans->dirty_bgs_lock);
2745         while (!list_empty(&cur_trans->dirty_bgs)) {
2746                 cache = list_first_entry(&cur_trans->dirty_bgs,
2747                                          struct btrfs_block_group,
2748                                          dirty_list);
2749
2750                 /*
2751                  * This can happen if cache_save_setup re-dirties a block group
2752                  * that is already under IO.  Just wait for it to finish and
2753                  * then do it all again
2754                  */
2755                 if (!list_empty(&cache->io_list)) {
2756                         spin_unlock(&cur_trans->dirty_bgs_lock);
2757                         list_del_init(&cache->io_list);
2758                         btrfs_wait_cache_io(trans, cache, path);
2759                         btrfs_put_block_group(cache);
2760                         spin_lock(&cur_trans->dirty_bgs_lock);
2761                 }
2762
2763                 /*
2764                  * Don't remove from the dirty list until after we've waited on
2765                  * any pending IO
2766                  */
2767                 list_del_init(&cache->dirty_list);
2768                 spin_unlock(&cur_trans->dirty_bgs_lock);
2769                 should_put = 1;
2770
2771                 cache_save_setup(cache, trans, path);
2772
2773                 if (!ret)
2774                         ret = btrfs_run_delayed_refs(trans,
2775                                                      (unsigned long) -1);
2776
2777                 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
2778                         cache->io_ctl.inode = NULL;
2779                         ret = btrfs_write_out_cache(trans, cache, path);
2780                         if (ret == 0 && cache->io_ctl.inode) {
2781                                 num_started++;
2782                                 should_put = 0;
2783                                 list_add_tail(&cache->io_list, io);
2784                         } else {
2785                                 /*
2786                                  * If we failed to write the cache, the
2787                                  * generation will be bad and life goes on
2788                                  */
2789                                 ret = 0;
2790                         }
2791                 }
2792                 if (!ret) {
2793                         ret = write_one_cache_group(trans, path, cache);
2794                         /*
2795                          * One of the free space endio workers might have
2796                          * created a new block group while updating a free space
2797                          * cache's inode (at inode.c:btrfs_finish_ordered_io())
2798                          * and hasn't released its transaction handle yet, in
2799                          * which case the new block group is still attached to
2800                          * its transaction handle and its creation has not
2801                          * finished yet (no block group item in the extent tree
2802                          * yet, etc). If this is the case, wait for all free
2803                          * space endio workers to finish and retry. This is a
2804                          * a very rare case so no need for a more efficient and
2805                          * complex approach.
2806                          */
2807                         if (ret == -ENOENT) {
2808                                 wait_event(cur_trans->writer_wait,
2809                                    atomic_read(&cur_trans->num_writers) == 1);
2810                                 ret = write_one_cache_group(trans, path, cache);
2811                         }
2812                         if (ret)
2813                                 btrfs_abort_transaction(trans, ret);
2814                 }
2815
2816                 /* If its not on the io list, we need to put the block group */
2817                 if (should_put)
2818                         btrfs_put_block_group(cache);
2819                 btrfs_delayed_refs_rsv_release(fs_info, 1);
2820                 spin_lock(&cur_trans->dirty_bgs_lock);
2821         }
2822         spin_unlock(&cur_trans->dirty_bgs_lock);
2823
2824         /*
2825          * Refer to the definition of io_bgs member for details why it's safe
2826          * to use it without any locking
2827          */
2828         while (!list_empty(io)) {
2829                 cache = list_first_entry(io, struct btrfs_block_group,
2830                                          io_list);
2831                 list_del_init(&cache->io_list);
2832                 btrfs_wait_cache_io(trans, cache, path);
2833                 btrfs_put_block_group(cache);
2834         }
2835
2836         btrfs_free_path(path);
2837         return ret;
2838 }
2839
2840 int btrfs_update_block_group(struct btrfs_trans_handle *trans,
2841                              u64 bytenr, u64 num_bytes, int alloc)
2842 {
2843         struct btrfs_fs_info *info = trans->fs_info;
2844         struct btrfs_block_group *cache = NULL;
2845         u64 total = num_bytes;
2846         u64 old_val;
2847         u64 byte_in_group;
2848         int factor;
2849         int ret = 0;
2850
2851         /* Block accounting for super block */
2852         spin_lock(&info->delalloc_root_lock);
2853         old_val = btrfs_super_bytes_used(info->super_copy);
2854         if (alloc)
2855                 old_val += num_bytes;
2856         else
2857                 old_val -= num_bytes;
2858         btrfs_set_super_bytes_used(info->super_copy, old_val);
2859         spin_unlock(&info->delalloc_root_lock);
2860
2861         while (total) {
2862                 cache = btrfs_lookup_block_group(info, bytenr);
2863                 if (!cache) {
2864                         ret = -ENOENT;
2865                         break;
2866                 }
2867                 factor = btrfs_bg_type_to_factor(cache->flags);
2868
2869                 /*
2870                  * If this block group has free space cache written out, we
2871                  * need to make sure to load it if we are removing space.  This
2872                  * is because we need the unpinning stage to actually add the
2873                  * space back to the block group, otherwise we will leak space.
2874                  */
2875                 if (!alloc && !btrfs_block_group_done(cache))
2876                         btrfs_cache_block_group(cache, 1);
2877
2878                 byte_in_group = bytenr - cache->start;
2879                 WARN_ON(byte_in_group > cache->length);
2880
2881                 spin_lock(&cache->space_info->lock);
2882                 spin_lock(&cache->lock);
2883
2884                 if (btrfs_test_opt(info, SPACE_CACHE) &&
2885                     cache->disk_cache_state < BTRFS_DC_CLEAR)
2886                         cache->disk_cache_state = BTRFS_DC_CLEAR;
2887
2888                 old_val = cache->used;
2889                 num_bytes = min(total, cache->length - byte_in_group);
2890                 if (alloc) {
2891                         old_val += num_bytes;
2892                         cache->used = old_val;
2893                         cache->reserved -= num_bytes;
2894                         cache->space_info->bytes_reserved -= num_bytes;
2895                         cache->space_info->bytes_used += num_bytes;
2896                         cache->space_info->disk_used += num_bytes * factor;
2897                         spin_unlock(&cache->lock);
2898                         spin_unlock(&cache->space_info->lock);
2899                 } else {
2900                         old_val -= num_bytes;
2901                         cache->used = old_val;
2902                         cache->pinned += num_bytes;
2903                         btrfs_space_info_update_bytes_pinned(info,
2904                                         cache->space_info, num_bytes);
2905                         cache->space_info->bytes_used -= num_bytes;
2906                         cache->space_info->disk_used -= num_bytes * factor;
2907                         spin_unlock(&cache->lock);
2908                         spin_unlock(&cache->space_info->lock);
2909
2910                         percpu_counter_add_batch(
2911                                         &cache->space_info->total_bytes_pinned,
2912                                         num_bytes,
2913                                         BTRFS_TOTAL_BYTES_PINNED_BATCH);
2914                         set_extent_dirty(&trans->transaction->pinned_extents,
2915                                          bytenr, bytenr + num_bytes - 1,
2916                                          GFP_NOFS | __GFP_NOFAIL);
2917                 }
2918
2919                 spin_lock(&trans->transaction->dirty_bgs_lock);
2920                 if (list_empty(&cache->dirty_list)) {
2921                         list_add_tail(&cache->dirty_list,
2922                                       &trans->transaction->dirty_bgs);
2923                         trans->delayed_ref_updates++;
2924                         btrfs_get_block_group(cache);
2925                 }
2926                 spin_unlock(&trans->transaction->dirty_bgs_lock);
2927
2928                 /*
2929                  * No longer have used bytes in this block group, queue it for
2930                  * deletion. We do this after adding the block group to the
2931                  * dirty list to avoid races between cleaner kthread and space
2932                  * cache writeout.
2933                  */
2934                 if (!alloc && old_val == 0) {
2935                         if (!btrfs_test_opt(info, DISCARD_ASYNC))
2936                                 btrfs_mark_bg_unused(cache);
2937                 }
2938
2939                 btrfs_put_block_group(cache);
2940                 total -= num_bytes;
2941                 bytenr += num_bytes;
2942         }
2943
2944         /* Modified block groups are accounted for in the delayed_refs_rsv. */
2945         btrfs_update_delayed_refs_rsv(trans);
2946         return ret;
2947 }
2948
2949 /**
2950  * btrfs_add_reserved_bytes - update the block_group and space info counters
2951  * @cache:      The cache we are manipulating
2952  * @ram_bytes:  The number of bytes of file content, and will be same to
2953  *              @num_bytes except for the compress path.
2954  * @num_bytes:  The number of bytes in question
2955  * @delalloc:   The blocks are allocated for the delalloc write
2956  *
2957  * This is called by the allocator when it reserves space. If this is a
2958  * reservation and the block group has become read only we cannot make the
2959  * reservation and return -EAGAIN, otherwise this function always succeeds.
2960  */
2961 int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
2962                              u64 ram_bytes, u64 num_bytes, int delalloc)
2963 {
2964         struct btrfs_space_info *space_info = cache->space_info;
2965         int ret = 0;
2966
2967         spin_lock(&space_info->lock);
2968         spin_lock(&cache->lock);
2969         if (cache->ro) {
2970                 ret = -EAGAIN;
2971         } else {
2972                 cache->reserved += num_bytes;
2973                 space_info->bytes_reserved += num_bytes;
2974                 trace_btrfs_space_reservation(cache->fs_info, "space_info",
2975                                               space_info->flags, num_bytes, 1);
2976                 btrfs_space_info_update_bytes_may_use(cache->fs_info,
2977                                                       space_info, -ram_bytes);
2978                 if (delalloc)
2979                         cache->delalloc_bytes += num_bytes;
2980         }
2981         spin_unlock(&cache->lock);
2982         spin_unlock(&space_info->lock);
2983         return ret;
2984 }
2985
2986 /**
2987  * btrfs_free_reserved_bytes - update the block_group and space info counters
2988  * @cache:      The cache we are manipulating
2989  * @num_bytes:  The number of bytes in question
2990  * @delalloc:   The blocks are allocated for the delalloc write
2991  *
2992  * This is called by somebody who is freeing space that was never actually used
2993  * on disk.  For example if you reserve some space for a new leaf in transaction
2994  * A and before transaction A commits you free that leaf, you call this with
2995  * reserve set to 0 in order to clear the reservation.
2996  */
2997 void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
2998                                u64 num_bytes, int delalloc)
2999 {
3000         struct btrfs_space_info *space_info = cache->space_info;
3001
3002         spin_lock(&space_info->lock);
3003         spin_lock(&cache->lock);
3004         if (cache->ro)
3005                 space_info->bytes_readonly += num_bytes;
3006         cache->reserved -= num_bytes;
3007         space_info->bytes_reserved -= num_bytes;
3008         space_info->max_extent_size = 0;
3009
3010         if (delalloc)
3011                 cache->delalloc_bytes -= num_bytes;
3012         spin_unlock(&cache->lock);
3013         spin_unlock(&space_info->lock);
3014 }
3015
3016 static void force_metadata_allocation(struct btrfs_fs_info *info)
3017 {
3018         struct list_head *head = &info->space_info;
3019         struct btrfs_space_info *found;
3020
3021         rcu_read_lock();
3022         list_for_each_entry_rcu(found, head, list) {
3023                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3024                         found->force_alloc = CHUNK_ALLOC_FORCE;
3025         }
3026         rcu_read_unlock();
3027 }
3028
3029 static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
3030                               struct btrfs_space_info *sinfo, int force)
3031 {
3032         u64 bytes_used = btrfs_space_info_used(sinfo, false);
3033         u64 thresh;
3034
3035         if (force == CHUNK_ALLOC_FORCE)
3036                 return 1;
3037
3038         /*
3039          * in limited mode, we want to have some free space up to
3040          * about 1% of the FS size.
3041          */
3042         if (force == CHUNK_ALLOC_LIMITED) {
3043                 thresh = btrfs_super_total_bytes(fs_info->super_copy);
3044                 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
3045
3046                 if (sinfo->total_bytes - bytes_used < thresh)
3047                         return 1;
3048         }
3049
3050         if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
3051                 return 0;
3052         return 1;
3053 }
3054
3055 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
3056 {
3057         u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
3058
3059         return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
3060 }
3061
3062 /*
3063  * If force is CHUNK_ALLOC_FORCE:
3064  *    - return 1 if it successfully allocates a chunk,
3065  *    - return errors including -ENOSPC otherwise.
3066  * If force is NOT CHUNK_ALLOC_FORCE:
3067  *    - return 0 if it doesn't need to allocate a new chunk,
3068  *    - return 1 if it successfully allocates a chunk,
3069  *    - return errors including -ENOSPC otherwise.
3070  */
3071 int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
3072                       enum btrfs_chunk_alloc_enum force)
3073 {
3074         struct btrfs_fs_info *fs_info = trans->fs_info;
3075         struct btrfs_space_info *space_info;
3076         bool wait_for_alloc = false;
3077         bool should_alloc = false;
3078         int ret = 0;
3079
3080         /* Don't re-enter if we're already allocating a chunk */
3081         if (trans->allocating_chunk)
3082                 return -ENOSPC;
3083
3084         space_info = btrfs_find_space_info(fs_info, flags);
3085         ASSERT(space_info);
3086
3087         do {
3088                 spin_lock(&space_info->lock);
3089                 if (force < space_info->force_alloc)
3090                         force = space_info->force_alloc;
3091                 should_alloc = should_alloc_chunk(fs_info, space_info, force);
3092                 if (space_info->full) {
3093                         /* No more free physical space */
3094                         if (should_alloc)
3095                                 ret = -ENOSPC;
3096                         else
3097                                 ret = 0;
3098                         spin_unlock(&space_info->lock);
3099                         return ret;
3100                 } else if (!should_alloc) {
3101                         spin_unlock(&space_info->lock);
3102                         return 0;
3103                 } else if (space_info->chunk_alloc) {
3104                         /*
3105                          * Someone is already allocating, so we need to block
3106                          * until this someone is finished and then loop to
3107                          * recheck if we should continue with our allocation
3108                          * attempt.
3109                          */
3110                         wait_for_alloc = true;
3111                         spin_unlock(&space_info->lock);
3112                         mutex_lock(&fs_info->chunk_mutex);
3113                         mutex_unlock(&fs_info->chunk_mutex);
3114                 } else {
3115                         /* Proceed with allocation */
3116                         space_info->chunk_alloc = 1;
3117                         wait_for_alloc = false;
3118                         spin_unlock(&space_info->lock);
3119                 }
3120
3121                 cond_resched();
3122         } while (wait_for_alloc);
3123
3124         mutex_lock(&fs_info->chunk_mutex);
3125         trans->allocating_chunk = true;
3126
3127         /*
3128          * If we have mixed data/metadata chunks we want to make sure we keep
3129          * allocating mixed chunks instead of individual chunks.
3130          */
3131         if (btrfs_mixed_space_info(space_info))
3132                 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3133
3134         /*
3135          * if we're doing a data chunk, go ahead and make sure that
3136          * we keep a reasonable number of metadata chunks allocated in the
3137          * FS as well.
3138          */
3139         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3140                 fs_info->data_chunk_allocations++;
3141                 if (!(fs_info->data_chunk_allocations %
3142                       fs_info->metadata_ratio))
3143                         force_metadata_allocation(fs_info);
3144         }
3145
3146         /*
3147          * Check if we have enough space in SYSTEM chunk because we may need
3148          * to update devices.
3149          */
3150         check_system_chunk(trans, flags);
3151
3152         ret = btrfs_alloc_chunk(trans, flags);
3153         trans->allocating_chunk = false;
3154
3155         spin_lock(&space_info->lock);
3156         if (ret < 0) {
3157                 if (ret == -ENOSPC)
3158                         space_info->full = 1;
3159                 else
3160                         goto out;
3161         } else {
3162                 ret = 1;
3163                 space_info->max_extent_size = 0;
3164         }
3165
3166         space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3167 out:
3168         space_info->chunk_alloc = 0;
3169         spin_unlock(&space_info->lock);
3170         mutex_unlock(&fs_info->chunk_mutex);
3171         /*
3172          * When we allocate a new chunk we reserve space in the chunk block
3173          * reserve to make sure we can COW nodes/leafs in the chunk tree or
3174          * add new nodes/leafs to it if we end up needing to do it when
3175          * inserting the chunk item and updating device items as part of the
3176          * second phase of chunk allocation, performed by
3177          * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
3178          * large number of new block groups to create in our transaction
3179          * handle's new_bgs list to avoid exhausting the chunk block reserve
3180          * in extreme cases - like having a single transaction create many new
3181          * block groups when starting to write out the free space caches of all
3182          * the block groups that were made dirty during the lifetime of the
3183          * transaction.
3184          */
3185         if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
3186                 btrfs_create_pending_block_groups(trans);
3187
3188         return ret;
3189 }
3190
3191 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
3192 {
3193         u64 num_dev;
3194
3195         num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
3196         if (!num_dev)
3197                 num_dev = fs_info->fs_devices->rw_devices;
3198
3199         return num_dev;
3200 }
3201
3202 /*
3203  * Reserve space in the system space for allocating or removing a chunk
3204  */
3205 void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
3206 {
3207         struct btrfs_fs_info *fs_info = trans->fs_info;
3208         struct btrfs_space_info *info;
3209         u64 left;
3210         u64 thresh;
3211         int ret = 0;
3212         u64 num_devs;
3213
3214         /*
3215          * Needed because we can end up allocating a system chunk and for an
3216          * atomic and race free space reservation in the chunk block reserve.
3217          */
3218         lockdep_assert_held(&fs_info->chunk_mutex);
3219
3220         info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3221         spin_lock(&info->lock);
3222         left = info->total_bytes - btrfs_space_info_used(info, true);
3223         spin_unlock(&info->lock);
3224
3225         num_devs = get_profile_num_devs(fs_info, type);
3226
3227         /* num_devs device items to update and 1 chunk item to add or remove */
3228         thresh = btrfs_calc_metadata_size(fs_info, num_devs) +
3229                 btrfs_calc_insert_metadata_size(fs_info, 1);
3230
3231         if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
3232                 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
3233                            left, thresh, type);
3234                 btrfs_dump_space_info(fs_info, info, 0, 0);
3235         }
3236
3237         if (left < thresh) {
3238                 u64 flags = btrfs_system_alloc_profile(fs_info);
3239
3240                 /*
3241                  * Ignore failure to create system chunk. We might end up not
3242                  * needing it, as we might not need to COW all nodes/leafs from
3243                  * the paths we visit in the chunk tree (they were already COWed
3244                  * or created in the current transaction for example).
3245                  */
3246                 ret = btrfs_alloc_chunk(trans, flags);
3247         }
3248
3249         if (!ret) {
3250                 ret = btrfs_block_rsv_add(fs_info->chunk_root,
3251                                           &fs_info->chunk_block_rsv,
3252                                           thresh, BTRFS_RESERVE_NO_FLUSH);
3253                 if (!ret)
3254                         trans->chunk_bytes_reserved += thresh;
3255         }
3256 }
3257
3258 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
3259 {
3260         struct btrfs_block_group *block_group;
3261         u64 last = 0;
3262
3263         while (1) {
3264                 struct inode *inode;
3265
3266                 block_group = btrfs_lookup_first_block_group(info, last);
3267                 while (block_group) {
3268                         btrfs_wait_block_group_cache_done(block_group);
3269                         spin_lock(&block_group->lock);
3270                         if (block_group->iref)
3271                                 break;
3272                         spin_unlock(&block_group->lock);
3273                         block_group = btrfs_next_block_group(block_group);
3274                 }
3275                 if (!block_group) {
3276                         if (last == 0)
3277                                 break;
3278                         last = 0;
3279                         continue;
3280                 }
3281
3282                 inode = block_group->inode;
3283                 block_group->iref = 0;
3284                 block_group->inode = NULL;
3285                 spin_unlock(&block_group->lock);
3286                 ASSERT(block_group->io_ctl.inode == NULL);
3287                 iput(inode);
3288                 last = block_group->start + block_group->length;
3289                 btrfs_put_block_group(block_group);
3290         }
3291 }
3292
3293 /*
3294  * Must be called only after stopping all workers, since we could have block
3295  * group caching kthreads running, and therefore they could race with us if we
3296  * freed the block groups before stopping them.
3297  */
3298 int btrfs_free_block_groups(struct btrfs_fs_info *info)
3299 {
3300         struct btrfs_block_group *block_group;
3301         struct btrfs_space_info *space_info;
3302         struct btrfs_caching_control *caching_ctl;
3303         struct rb_node *n;
3304
3305         down_write(&info->commit_root_sem);
3306         while (!list_empty(&info->caching_block_groups)) {
3307                 caching_ctl = list_entry(info->caching_block_groups.next,
3308                                          struct btrfs_caching_control, list);
3309                 list_del(&caching_ctl->list);
3310                 btrfs_put_caching_control(caching_ctl);
3311         }
3312         up_write(&info->commit_root_sem);
3313
3314         spin_lock(&info->unused_bgs_lock);
3315         while (!list_empty(&info->unused_bgs)) {
3316                 block_group = list_first_entry(&info->unused_bgs,
3317                                                struct btrfs_block_group,
3318                                                bg_list);
3319                 list_del_init(&block_group->bg_list);
3320                 btrfs_put_block_group(block_group);
3321         }
3322         spin_unlock(&info->unused_bgs_lock);
3323
3324         spin_lock(&info->block_group_cache_lock);
3325         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
3326                 block_group = rb_entry(n, struct btrfs_block_group,
3327                                        cache_node);
3328                 rb_erase(&block_group->cache_node,
3329                          &info->block_group_cache_tree);
3330                 RB_CLEAR_NODE(&block_group->cache_node);
3331                 spin_unlock(&info->block_group_cache_lock);
3332
3333                 down_write(&block_group->space_info->groups_sem);
3334                 list_del(&block_group->list);
3335                 up_write(&block_group->space_info->groups_sem);
3336
3337                 /*
3338                  * We haven't cached this block group, which means we could
3339                  * possibly have excluded extents on this block group.
3340                  */
3341                 if (block_group->cached == BTRFS_CACHE_NO ||
3342                     block_group->cached == BTRFS_CACHE_ERROR)
3343                         btrfs_free_excluded_extents(block_group);
3344
3345                 btrfs_remove_free_space_cache(block_group);
3346                 ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
3347                 ASSERT(list_empty(&block_group->dirty_list));
3348                 ASSERT(list_empty(&block_group->io_list));
3349                 ASSERT(list_empty(&block_group->bg_list));
3350                 ASSERT(atomic_read(&block_group->count) == 1);
3351                 btrfs_put_block_group(block_group);
3352
3353                 spin_lock(&info->block_group_cache_lock);
3354         }
3355         spin_unlock(&info->block_group_cache_lock);
3356
3357         /*
3358          * Now that all the block groups are freed, go through and free all the
3359          * space_info structs.  This is only called during the final stages of
3360          * unmount, and so we know nobody is using them.  We call
3361          * synchronize_rcu() once before we start, just to be on the safe side.
3362          */
3363         synchronize_rcu();
3364
3365         btrfs_release_global_block_rsv(info);
3366
3367         while (!list_empty(&info->space_info)) {
3368                 space_info = list_entry(info->space_info.next,
3369                                         struct btrfs_space_info,
3370                                         list);
3371
3372                 /*
3373                  * Do not hide this behind enospc_debug, this is actually
3374                  * important and indicates a real bug if this happens.
3375                  */
3376                 if (WARN_ON(space_info->bytes_pinned > 0 ||
3377                             space_info->bytes_reserved > 0 ||
3378                             space_info->bytes_may_use > 0))
3379                         btrfs_dump_space_info(info, space_info, 0, 0);
3380                 WARN_ON(space_info->reclaim_size > 0);
3381                 list_del(&space_info->list);
3382                 btrfs_sysfs_remove_space_info(space_info);
3383         }
3384         return 0;
3385 }