fs/btrfs/volumes.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2007 Oracle.  All rights reserved.
   4  */
   5
   6 #include <linux/sched.h>
   7 #include <linux/sched/mm.h>
   8 #include <linux/slab.h>
   9 #include <linux/ratelimit.h>
  10 #include <linux/kthread.h>
  11 #include <linux/semaphore.h>
  12 #include <linux/uuid.h>
  13 #include <linux/list_sort.h>
  14 #include <linux/namei.h>
  15 #include "misc.h"
  16 #include "ctree.h"
  17 #include "disk-io.h"
  18 #include "transaction.h"
  19 #include "volumes.h"
  20 #include "raid56.h"
  21 #include "rcu-string.h"
  22 #include "dev-replace.h"
  23 #include "sysfs.h"
  24 #include "tree-checker.h"
  25 #include "space-info.h"
  26 #include "block-group.h"
  27 #include "discard.h"
  28 #include "zoned.h"
  29 #include "fs.h"
  30 #include "accessors.h"
  31 #include "uuid-tree.h"
  32 #include "ioctl.h"
  33 #include "relocation.h"
  34 #include "scrub.h"
  35 #include "super.h"
  36 #include "raid-stripe-tree.h"
  37
  38 #define BTRFS_BLOCK_GROUP_STRIPE_MASK   (BTRFS_BLOCK_GROUP_RAID0 | \
  39                                          BTRFS_BLOCK_GROUP_RAID10 | \
  40                                          BTRFS_BLOCK_GROUP_RAID56_MASK)
  41
/*
 * Stripe and mirror values grouped for a single I/O (going by the struct
 * name).
 *
 * NOTE(review): no user of this struct is visible in this part of the file;
 * the per-field notes below are inferred from the field names only and must
 * be confirmed against the code that fills the struct in.
 */
  42 struct btrfs_io_geometry {
  43         u32 stripe_index;                /* presumably index of the stripe being accessed - confirm */
  44         u32 stripe_nr;                   /* presumably stripe number inside the chunk - confirm */
  45         int mirror_num;                  /* presumably which mirror/copy is targeted - confirm */
  46         int num_stripes;                 /* presumably number of stripes involved - confirm */
  47         u64 stripe_offset;               /* presumably offset inside the stripe - confirm */
  48         u64 raid56_full_stripe_start;    /* presumably start of the RAID5/6 full stripe - confirm */
  49         int max_errors;                  /* presumably number of tolerated errors - confirm */
  50         enum btrfs_map_op op;            /* kind of map operation (enum btrfs_map_op) */
  51 };
  52
  53 const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
  54         [BTRFS_RAID_RAID10] = {
  55                 .sub_stripes    = 2,
  56                 .dev_stripes    = 1,
  57                 .devs_max       = 0,    /* 0 == as many as possible */
  58                 .devs_min       = 2,
  59                 .tolerated_failures = 1,
  60                 .devs_increment = 2,
  61                 .ncopies        = 2,
  62                 .nparity        = 0,
  63                 .raid_name      = "raid10",
  64                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID10,
  65                 .mindev_error   = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
  66         },
  67         [BTRFS_RAID_RAID1] = {
  68                 .sub_stripes    = 1,
  69                 .dev_stripes    = 1,
  70                 .devs_max       = 2,
  71                 .devs_min       = 2,
  72                 .tolerated_failures = 1,
  73                 .devs_increment = 2,
  74                 .ncopies        = 2,
  75                 .nparity        = 0,
  76                 .raid_name      = "raid1",
  77                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID1,
  78                 .mindev_error   = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
  79         },
  80         [BTRFS_RAID_RAID1C3] = {
  81                 .sub_stripes    = 1,
  82                 .dev_stripes    = 1,
  83                 .devs_max       = 3,
  84                 .devs_min       = 3,
  85                 .tolerated_failures = 2,
  86                 .devs_increment = 3,
  87                 .ncopies        = 3,
  88                 .nparity        = 0,
  89                 .raid_name      = "raid1c3",
  90                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID1C3,
  91                 .mindev_error   = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
  92         },
  93         [BTRFS_RAID_RAID1C4] = {
  94                 .sub_stripes    = 1,
  95                 .dev_stripes    = 1,
  96                 .devs_max       = 4,
  97                 .devs_min       = 4,
  98                 .tolerated_failures = 3,
  99                 .devs_increment = 4,
 100                 .ncopies        = 4,
 101                 .nparity        = 0,
 102                 .raid_name      = "raid1c4",
 103                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID1C4,
 104                 .mindev_error   = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
 105         },
 106         [BTRFS_RAID_DUP] = {
 107                 .sub_stripes    = 1,
 108                 .dev_stripes    = 2,
 109                 .devs_max       = 1,
 110                 .devs_min       = 1,
 111                 .tolerated_failures = 0,
 112                 .devs_increment = 1,
 113                 .ncopies        = 2,
 114                 .nparity        = 0,
 115                 .raid_name      = "dup",
 116                 .bg_flag        = BTRFS_BLOCK_GROUP_DUP,
 117                 .mindev_error   = 0,
 118         },
 119         [BTRFS_RAID_RAID0] = {
 120                 .sub_stripes    = 1,
 121                 .dev_stripes    = 1,
 122                 .devs_max       = 0,
 123                 .devs_min       = 1,
 124                 .tolerated_failures = 0,
 125                 .devs_increment = 1,
 126                 .ncopies        = 1,
 127                 .nparity        = 0,
 128                 .raid_name      = "raid0",
 129                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID0,
 130                 .mindev_error   = 0,
 131         },
 132         [BTRFS_RAID_SINGLE] = {
 133                 .sub_stripes    = 1,
 134                 .dev_stripes    = 1,
 135                 .devs_max       = 1,
 136                 .devs_min       = 1,
 137                 .tolerated_failures = 0,
 138                 .devs_increment = 1,
 139                 .ncopies        = 1,
 140                 .nparity        = 0,
 141                 .raid_name      = "single",
 142                 .bg_flag        = 0,
 143                 .mindev_error   = 0,
 144         },
 145         [BTRFS_RAID_RAID5] = {
 146                 .sub_stripes    = 1,
 147                 .dev_stripes    = 1,
 148                 .devs_max       = 0,
 149                 .devs_min       = 2,
 150                 .tolerated_failures = 1,
 151                 .devs_increment = 1,
 152                 .ncopies        = 1,
 153                 .nparity        = 1,
 154                 .raid_name      = "raid5",
 155                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID5,
 156                 .mindev_error   = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
 157         },
 158         [BTRFS_RAID_RAID6] = {
 159                 .sub_stripes    = 1,
 160                 .dev_stripes    = 1,
 161                 .devs_max       = 0,
 162                 .devs_min       = 3,
 163                 .tolerated_failures = 2,
 164                 .devs_increment = 1,
 165                 .ncopies        = 1,
 166                 .nparity        = 2,
 167                 .raid_name      = "raid6",
 168                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID6,
 169                 .mindev_error   = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
 170         },
 171 };
 172
 173 /*
 174  * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
 175  * can be used as index to access btrfs_raid_array[].
 176  */
 177 enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
 178 {
 179         const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK);
 180
 181         if (!profile)
 182                 return BTRFS_RAID_SINGLE;
 183
 184         return BTRFS_BG_FLAG_TO_INDEX(profile);
 185 }
 186
 187 const char *btrfs_bg_type_to_raid_name(u64 flags)
 188 {
 189         const int index = btrfs_bg_flags_to_raid_index(flags);
 190
 191         if (index >= BTRFS_NR_RAID_TYPES)
 192                 return NULL;
 193
 194         return btrfs_raid_array[index].raid_name;
 195 }
 196
 197 int btrfs_nr_parity_stripes(u64 type)
 198 {
 199         enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(type);
 200
 201         return btrfs_raid_array[index].nparity;
 202 }
 203
 204 /*
 205  * Fill @buf with textual description of @bg_flags, no more than @size_buf
 206  * bytes including terminating null byte.
 207  */
 208 void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
 209 {
 210         int i;
 211         int ret;
 212         char *bp = buf;
 213         u64 flags = bg_flags;
 214         u32 size_bp = size_buf;
 215
 216         if (!flags) {
 217                 strcpy(bp, "NONE");
 218                 return;
 219         }
 220
 221 #define DESCRIBE_FLAG(flag, desc)                                               \
 222         do {                                                            \
 223                 if (flags & (flag)) {                                   \
 224                         ret = snprintf(bp, size_bp, "%s|", (desc));     \
 225                         if (ret < 0 || ret >= size_bp)                  \
 226                                 goto out_overflow;                      \
 227                         size_bp -= ret;                                 \
 228                         bp += ret;                                      \
 229                         flags &= ~(flag);                               \
 230                 }                                                       \
 231         } while (0)
 232
 233         DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
 234         DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
 235         DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
 236
 237         DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
 238         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
 239                 DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
 240                               btrfs_raid_array[i].raid_name);
 241 #undef DESCRIBE_FLAG
 242
 243         if (flags) {
 244                 ret = snprintf(bp, size_bp, "0x%llx|", flags);
 245                 size_bp -= ret;
 246         }
 247
 248         if (size_bp < size_buf)
 249                 buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
 250
 251         /*
 252          * The text is trimmed, it's up to the caller to provide sufficiently
 253          * large buffer
 254          */
 255 out_overflow:;
 256 }
 257
 258 static int init_first_rw_device(struct btrfs_trans_handle *trans);
 259 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
 260 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
 261
 262 /*
 263  * Device locking
 264  * ==============
 265  *
 266  * There are several mutexes that protect manipulation of devices and low-level
 267  * structures like chunks but not block groups, extents or files
 268  *
 269  * uuid_mutex (global lock)
 270  * ------------------------
 271  * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 272  * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 273  * device) or requested by the device= mount option
 274  *
 275  * the mutex can be very coarse and can cover long-running operations
 276  *
 277  * protects: updates to fs_devices counters like missing devices, rw devices,
 278  * seeding, structure cloning, opening/closing devices at mount/umount time
 279  *
 280  * global::fs_devs - add, remove, updates to the global list
 281  *
 282  * does not protect: manipulation of the fs_devices::devices list in general
 283  * but in mount context it could be used to exclude list modifications by eg.
 284  * scan ioctl
 285  *
 286  * btrfs_device::name - renames (write side), read is RCU
 287  *
 288  * fs_devices::device_list_mutex (per-fs, with RCU)
 289  * ------------------------------------------------
 290  * protects updates to fs_devices::devices, ie. adding and deleting
 291  *
 292  * simple list traversal with read-only actions can be done with RCU protection
 293  *
 294  * may be used to exclude some operations from running concurrently without any
 295  * modifications to the list (see write_all_supers)
 296  *
 297  * Is not required at mount and close times, because our device list is
 298  * protected by the uuid_mutex at that point.
 299  *
 300  * balance_mutex
 301  * -------------
 302  * protects balance structures (status, state) and context accessed from
 303  * several places (internally, ioctl)
 304  *
 305  * chunk_mutex
 306  * -----------
 307  * protects chunks, adding or removing during allocation, trim or when a new
 308  * device is added/removed. Additionally it also protects post_commit_list of
 309  * individual devices, since they can be added to the transaction's
 310  * post_commit_list only with chunk_mutex held.
 311  *
 312  * cleaner_mutex
 313  * -------------
 314  * a big lock that is held by the cleaner thread and prevents running subvolume
 315  * cleaning together with relocation or delayed iputs
 316  *
 317  *
 318  * Lock nesting
 319  * ============
 320  *
 321  * uuid_mutex
 322  *   device_list_mutex
 323  *     chunk_mutex
 324  *   balance_mutex
 325  *
 326  *
 327  * Exclusive operations
 328  * ====================
 329  *
 330  * Maintains the exclusivity of the following operations that apply to the
 331  * whole filesystem and cannot run in parallel.
 332  *
 333  * - Balance (*)
 334  * - Device add
 335  * - Device remove
 336  * - Device replace (*)
 337  * - Resize
 338  *
 339  * The device operations (as above) can be in one of the following states:
 340  *
 341  * - Running state
 342  * - Paused state
 343  * - Completed state
 344  *
 345  * Only device operations marked with (*) can go into the Paused state for the
 346  * following reasons:
 347  *
 348  * - ioctl (only Balance can be Paused through ioctl)
 349  * - filesystem remounted as read-only
 350  * - filesystem unmounted and mounted as read-only
 351  * - system power-cycle and filesystem mounted as read-only
 352  * - filesystem or device errors leading to forced read-only
 353  *
 354  * The status of exclusive operation is set and cleared atomically.
 355  * During the course of Paused state, fs_info::exclusive_operation remains set.
 356  * A device operation in Paused or Running state can be canceled or resumed
 357  * either by ioctl (Balance only) or when remounted as read-write.
 358  * The exclusive status is cleared when the device operation is canceled or
 359  * completed.
 360  */
 361
/* Global mutex; the "Device locking" comment above lists what it protects. */
 362 DEFINE_MUTEX(uuid_mutex);
/* Global list head; fs_devices are linked onto it through their fs_list member. */
 363 static LIST_HEAD(fs_uuids);
/* Return a pointer to the file-local fs_uuids list head. */
 364 struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
 365 {
 366         return &fs_uuids;
 367 }
 368
 369 /*
 370  * Allocate new btrfs_fs_devices structure identified by a fsid.
 371  *
 372  * @fsid:    if not NULL, copy the UUID to fs_devices::fsid and to
 373  *           fs_devices::metadata_uuid
 374  *
 375  * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 376  * The returned struct is not linked onto any lists and can be destroyed with
 377  * kfree() right away.
 378  */
 379 static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
 380 {
 381         struct btrfs_fs_devices *fs_devs;
 382
 383         fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
 384         if (!fs_devs)
 385                 return ERR_PTR(-ENOMEM);
 386
 387         mutex_init(&fs_devs->device_list_mutex);
 388
 389         INIT_LIST_HEAD(&fs_devs->devices);
 390         INIT_LIST_HEAD(&fs_devs->alloc_list);
 391         INIT_LIST_HEAD(&fs_devs->fs_list);
 392         INIT_LIST_HEAD(&fs_devs->seed_list);
 393
 394         if (fsid) {
 395                 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
 396                 memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
 397         }
 398
 399         return fs_devs;
 400 }
 401
 402 static void btrfs_free_device(struct btrfs_device *device)
 403 {
 404         WARN_ON(!list_empty(&device->post_commit_list));
 405         rcu_string_free(device->name);
 406         extent_io_tree_release(&device->alloc_state);
 407         btrfs_destroy_dev_zone_info(device);
 408         kfree(device);
 409 }
 410
 411 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
 412 {
 413         struct btrfs_device *device;
 414
 415         WARN_ON(fs_devices->opened);
 416         while (!list_empty(&fs_devices->devices)) {
 417                 device = list_entry(fs_devices->devices.next,
 418                                     struct btrfs_device, dev_list);
 419                 list_del(&device->dev_list);
 420                 btrfs_free_device(device);
 421         }
 422         kfree(fs_devices);
 423 }
 424
 425 void __exit btrfs_cleanup_fs_uuids(void)
 426 {
 427         struct btrfs_fs_devices *fs_devices;
 428
 429         while (!list_empty(&fs_uuids)) {
 430                 fs_devices = list_entry(fs_uuids.next,
 431                                         struct btrfs_fs_devices, fs_list);
 432                 list_del(&fs_devices->fs_list);
 433                 free_fs_devices(fs_devices);
 434         }
 435 }
 436
 437 static bool match_fsid_fs_devices(const struct btrfs_fs_devices *fs_devices,
 438                                   const u8 *fsid, const u8 *metadata_fsid)
 439 {
 440         if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) != 0)
 441                 return false;
 442
 443         if (!metadata_fsid)
 444                 return true;
 445
 446         if (memcmp(metadata_fsid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE) != 0)
 447                 return false;
 448
 449         return true;
 450 }
 451
 452 static noinline struct btrfs_fs_devices *find_fsid(
 453                 const u8 *fsid, const u8 *metadata_fsid)
 454 {
 455         struct btrfs_fs_devices *fs_devices;
 456
 457         ASSERT(fsid);
 458
 459         /* Handle non-split brain cases */
 460         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 461                 if (match_fsid_fs_devices(fs_devices, fsid, metadata_fsid))
 462                         return fs_devices;
 463         }
 464         return NULL;
 465 }
 466
 467 static int
 468 btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
 469                       int flush, struct file **bdev_file,
 470                       struct btrfs_super_block **disk_super)
 471 {
 472         struct block_device *bdev;
 473         int ret;
 474
 475         *bdev_file = bdev_file_open_by_path(device_path, flags, holder, NULL);
 476
 477         if (IS_ERR(*bdev_file)) {
 478                 ret = PTR_ERR(*bdev_file);
 479                 btrfs_err(NULL, "failed to open device for path %s with flags 0x%x: %d",
 480                           device_path, flags, ret);
 481                 goto error;
 482         }
 483         bdev = file_bdev(*bdev_file);
 484
 485         if (flush)
 486                 sync_blockdev(bdev);
 487         if (holder) {
 488                 ret = set_blocksize(*bdev_file, BTRFS_BDEV_BLOCKSIZE);
 489                 if (ret) {
 490                         fput(*bdev_file);
 491                         goto error;
 492                 }
 493         }
 494         invalidate_bdev(bdev);
 495         *disk_super = btrfs_read_dev_super(bdev);
 496         if (IS_ERR(*disk_super)) {
 497                 ret = PTR_ERR(*disk_super);
 498                 fput(*bdev_file);
 499                 goto error;
 500         }
 501
 502         return 0;
 503
 504 error:
 505         *disk_super = NULL;
 506         *bdev_file = NULL;
 507         return ret;
 508 }
 509
 510 /*
 511  *  Search and remove all stale devices (which are not mounted).  When both
 512  *  inputs are NULL, it will search and release all stale devices.
 513  *
 514  *  @devt:         Optional. When provided, release only the unmounted devices
 515  *                 matching this devt.
 516  *  @skip_device:  Optional. Will skip this device when searching for the stale
 517  *                 devices.
 518  *
 519  *  Return:     0 for success, if @devt is 0, or if @devt does not match any
 520  *              device in the list.
 521  *              -EBUSY if @devt is a mounted device.
 522  */
 523 static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device)
 524 {
 525         struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
 526         struct btrfs_device *device, *tmp_device;
 527         int ret;
 528         bool freed = false;
 529
 530         lockdep_assert_held(&uuid_mutex);
 531
 532         /* Return good status if there is no instance of devt. */
 533         ret = 0;
 534         list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
 535
 536                 mutex_lock(&fs_devices->device_list_mutex);
 537                 list_for_each_entry_safe(device, tmp_device,
 538                                          &fs_devices->devices, dev_list) {
 539                         if (skip_device && skip_device == device)
 540                                 continue;
 541                         if (devt && devt != device->devt)
 542                                 continue;
 543                         if (fs_devices->opened) {
 544                                 if (devt)
 545                                         ret = -EBUSY;
 546                                 break;
 547                         }
 548
 549                         /* delete the stale device */
 550                         fs_devices->num_devices--;
 551                         list_del(&device->dev_list);
 552                         btrfs_free_device(device);
 553
 554                         freed = true;
 555                 }
 556                 mutex_unlock(&fs_devices->device_list_mutex);
 557
 558                 if (fs_devices->num_devices == 0) {
 559                         btrfs_sysfs_remove_fsid(fs_devices);
 560                         list_del(&fs_devices->fs_list);
 561                         free_fs_devices(fs_devices);
 562                 }
 563         }
 564
 565         /* If there is at least one freed device return 0. */
 566         if (freed)
 567                 return 0;
 568
 569         return ret;
 570 }
 571
 572 static struct btrfs_fs_devices *find_fsid_by_device(
 573                                         struct btrfs_super_block *disk_super,
 574                                         dev_t devt, bool *same_fsid_diff_dev)
 575 {
 576         struct btrfs_fs_devices *fsid_fs_devices;
 577         struct btrfs_fs_devices *devt_fs_devices;
 578         const bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
 579                                         BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
 580         bool found_by_devt = false;
 581
 582         /* Find the fs_device by the usual method, if found use it. */
 583         fsid_fs_devices = find_fsid(disk_super->fsid,
 584                     has_metadata_uuid ? disk_super->metadata_uuid : NULL);
 585
 586         /* The temp_fsid feature is supported only with single device filesystem. */
 587         if (btrfs_super_num_devices(disk_super) != 1)
 588                 return fsid_fs_devices;
 589
 590         /*
 591          * A seed device is an integral component of the sprout device, which
 592          * functions as a multi-device filesystem. So, temp-fsid feature is
 593          * not supported.
 594          */
 595         if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)
 596                 return fsid_fs_devices;
 597
 598         /* Try to find a fs_devices by matching devt. */
 599         list_for_each_entry(devt_fs_devices, &fs_uuids, fs_list) {
 600                 struct btrfs_device *device;
 601
 602                 list_for_each_entry(device, &devt_fs_devices->devices, dev_list) {
 603                         if (device->devt == devt) {
 604                                 found_by_devt = true;
 605                                 break;
 606                         }
 607                 }
 608                 if (found_by_devt)
 609                         break;
 610         }
 611
 612         if (found_by_devt) {
 613                 /* Existing device. */
 614                 if (fsid_fs_devices == NULL) {
 615                         if (devt_fs_devices->opened == 0) {
 616                                 /* Stale device. */
 617                                 return NULL;
 618                         } else {
 619                                 /* temp_fsid is mounting a subvol. */
 620                                 return devt_fs_devices;
 621                         }
 622                 } else {
 623                         /* Regular or temp_fsid device mounting a subvol. */
 624                         return devt_fs_devices;
 625                 }
 626         } else {
 627                 /* New device. */
 628                 if (fsid_fs_devices == NULL) {
 629                         return NULL;
 630                 } else {
 631                         /* sb::fsid is already used create a new temp_fsid. */
 632                         *same_fsid_diff_dev = true;
 633                         return NULL;
 634                 }
 635         }
 636
 637         /* Not reached. */
 638 }
 639
 640 /*
 641  * This is only used on mount, and we are protected from competing things
 642  * messing with our fs_devices by the uuid_mutex, thus we do not need the
 643  * fs_devices->device_list_mutex here.
 644  */
 645 static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
 646                         struct btrfs_device *device, blk_mode_t flags,
 647                         void *holder)
 648 {
 649         struct file *bdev_file;
 650         struct btrfs_super_block *disk_super;
 651         u64 devid;
 652         int ret;
 653
 654         if (device->bdev)
 655                 return -EINVAL;
 656         if (!device->name)
 657                 return -EINVAL;
 658
 659         ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
 660                                     &bdev_file, &disk_super);
 661         if (ret)
 662                 return ret;
 663
 664         devid = btrfs_stack_device_id(&disk_super->dev_item);
 665         if (devid != device->devid)
 666                 goto error_free_page;
 667
 668         if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
 669                 goto error_free_page;
 670
 671         device->generation = btrfs_super_generation(disk_super);
 672
 673         if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
 674                 if (btrfs_super_incompat_flags(disk_super) &
 675                     BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
 676                         pr_err(
 677                 "BTRFS: Invalid seeding and uuid-changed device detected\n");
 678                         goto error_free_page;
 679                 }
 680
 681                 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 682                 fs_devices->seeding = true;
 683         } else {
 684                 if (bdev_read_only(file_bdev(bdev_file)))
 685                         clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 686                 else
 687                         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 688         }
 689
 690         if (!bdev_nonrot(file_bdev(bdev_file)))
 691                 fs_devices->rotating = true;
 692
 693         if (bdev_max_discard_sectors(file_bdev(bdev_file)))
 694                 fs_devices->discardable = true;
 695
 696         device->bdev_file = bdev_file;
 697         device->bdev = file_bdev(bdev_file);
 698         clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
 699
 700         if (device->devt != device->bdev->bd_dev) {
 701                 btrfs_warn(NULL,
 702                            "device %s maj:min changed from %d:%d to %d:%d",
 703                            device->name->str, MAJOR(device->devt),
 704                            MINOR(device->devt), MAJOR(device->bdev->bd_dev),
 705                            MINOR(device->bdev->bd_dev));
 706
 707                 device->devt = device->bdev->bd_dev;
 708         }
 709
 710         fs_devices->open_devices++;
 711         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
 712             device->devid != BTRFS_DEV_REPLACE_DEVID) {
 713                 fs_devices->rw_devices++;
 714                 list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
 715         }
 716         btrfs_release_disk_super(disk_super);
 717
 718         return 0;
 719
 720 error_free_page:
 721         btrfs_release_disk_super(disk_super);
 722         fput(bdev_file);
 723
 724         return -EINVAL;
 725 }
 726
 727 const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb)
 728 {
 729         bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) &
 730                                   BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
 731
 732         return has_metadata_uuid ? sb->metadata_uuid : sb->fsid;
 733 }
 734
 735 /*
 736  * Add new device to list of registered devices
 737  *
 738  * Returns:
 739  * device pointer which was just added or updated when successful
 740  * error pointer when failed
 741  */
 742 static noinline struct btrfs_device *device_list_add(const char *path,
 743                            struct btrfs_super_block *disk_super,
 744                            bool *new_device_added)
 745 {
 746         struct btrfs_device *device;
 747         struct btrfs_fs_devices *fs_devices = NULL;
 748         struct rcu_string *name;
 749         u64 found_transid = btrfs_super_generation(disk_super);
 750         u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
 751         dev_t path_devt;
 752         int error;
 753         bool same_fsid_diff_dev = false;
 754         bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
 755                 BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
 756
 757         if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
 758                 btrfs_err(NULL,
 759 "device %s has incomplete metadata_uuid change, please use btrfstune to complete",
 760                           path);
 761                 return ERR_PTR(-EAGAIN);
 762         }
 763
 764         error = lookup_bdev(path, &path_devt);
 765         if (error) {
 766                 btrfs_err(NULL, "failed to lookup block device for path %s: %d",
 767                           path, error);
 768                 return ERR_PTR(error);
 769         }
 770
 771         fs_devices = find_fsid_by_device(disk_super, path_devt, &same_fsid_diff_dev);
 772
 773         if (!fs_devices) {
 774                 fs_devices = alloc_fs_devices(disk_super->fsid);
 775                 if (IS_ERR(fs_devices))
 776                         return ERR_CAST(fs_devices);
 777
 778                 if (has_metadata_uuid)
 779                         memcpy(fs_devices->metadata_uuid,
 780                                disk_super->metadata_uuid, BTRFS_FSID_SIZE);
 781
 782                 if (same_fsid_diff_dev) {
 783                         generate_random_uuid(fs_devices->fsid);
 784                         fs_devices->temp_fsid = true;
 785                 pr_info("BTRFS: device %s (%d:%d) using temp-fsid %pU\n",
 786                                 path, MAJOR(path_devt), MINOR(path_devt),
 787                                 fs_devices->fsid);
 788                 }
 789
 790                 mutex_lock(&fs_devices->device_list_mutex);
 791                 list_add(&fs_devices->fs_list, &fs_uuids);
 792
 793                 device = NULL;
 794         } else {
 795                 struct btrfs_dev_lookup_args args = {
 796                         .devid = devid,
 797                         .uuid = disk_super->dev_item.uuid,
 798                 };
 799
 800                 mutex_lock(&fs_devices->device_list_mutex);
 801                 device = btrfs_find_device(fs_devices, &args);
 802
 803                 if (found_transid > fs_devices->latest_generation) {
 804                         memcpy(fs_devices->fsid, disk_super->fsid,
 805                                         BTRFS_FSID_SIZE);
 806                         memcpy(fs_devices->metadata_uuid,
 807                                btrfs_sb_fsid_ptr(disk_super), BTRFS_FSID_SIZE);
 808                 }
 809         }
 810
 811         if (!device) {
 812                 unsigned int nofs_flag;
 813
 814                 if (fs_devices->opened) {
 815                         btrfs_err(NULL,
 816 "device %s (%d:%d) belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)",
 817                                   path, MAJOR(path_devt), MINOR(path_devt),
 818                                   fs_devices->fsid, current->comm,
 819                                   task_pid_nr(current));
 820                         mutex_unlock(&fs_devices->device_list_mutex);
 821                         return ERR_PTR(-EBUSY);
 822                 }
 823
 824                 nofs_flag = memalloc_nofs_save();
 825                 device = btrfs_alloc_device(NULL, &devid,
 826                                             disk_super->dev_item.uuid, path);
 827                 memalloc_nofs_restore(nofs_flag);
 828                 if (IS_ERR(device)) {
 829                         mutex_unlock(&fs_devices->device_list_mutex);
 830                         /* we can safely leave the fs_devices entry around */
 831                         return device;
 832                 }
 833
 834                 device->devt = path_devt;
 835
 836                 list_add_rcu(&device->dev_list, &fs_devices->devices);
 837                 fs_devices->num_devices++;
 838
 839                 device->fs_devices = fs_devices;
 840                 *new_device_added = true;
 841
 842                 if (disk_super->label[0])
 843                         pr_info(
 844 "BTRFS: device label %s devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n",
 845                                 disk_super->label, devid, found_transid, path,
 846                                 MAJOR(path_devt), MINOR(path_devt),
 847                                 current->comm, task_pid_nr(current));
 848                 else
 849                         pr_info(
 850 "BTRFS: device fsid %pU devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n",
 851                                 disk_super->fsid, devid, found_transid, path,
 852                                 MAJOR(path_devt), MINOR(path_devt),
 853                                 current->comm, task_pid_nr(current));
 854
 855         } else if (!device->name || strcmp(device->name->str, path)) {
 856                 /*
 857                  * When FS is already mounted.
 858                  * 1. If you are here and if the device->name is NULL that
 859                  *    means this device was missing at time of FS mount.
 860                  * 2. If you are here and if the device->name is different
 861                  *    from 'path' that means either
 862                  *      a. The same device disappeared and reappeared with
 863                  *         different name. or
 864                  *      b. The missing-disk-which-was-replaced, has
 865                  *         reappeared now.
 866                  *
 867                  * We must allow 1 and 2a above. But 2b would be a spurious
 868                  * and unintentional.
 869                  *
 870                  * Further in case of 1 and 2a above, the disk at 'path'
 871                  * would have missed some transaction when it was away and
 872                  * in case of 2a the stale bdev has to be updated as well.
 873                  * 2b must not be allowed at all time.
 874                  */
 875
 876                 /*
 877                  * For now, we do allow update to btrfs_fs_device through the
 878                  * btrfs dev scan cli after FS has been mounted.  We're still
 879                  * tracking a problem where systems fail mount by subvolume id
 880                  * when we reject replacement on a mounted FS.
 881                  */
 882                 if (!fs_devices->opened && found_transid < device->generation) {
 883                         /*
 884                          * That is if the FS is _not_ mounted and if you
 885                          * are here, that means there is more than one
 886                          * disk with same uuid and devid.We keep the one
 887                          * with larger generation number or the last-in if
 888                          * generation are equal.
 889                          */
 890                         mutex_unlock(&fs_devices->device_list_mutex);
 891                         btrfs_err(NULL,
 892 "device %s already registered with a higher generation, found %llu expect %llu",
 893                                   path, found_transid, device->generation);
 894                         return ERR_PTR(-EEXIST);
 895                 }
 896
 897                 /*
 898                  * We are going to replace the device path for a given devid,
 899                  * make sure it's the same device if the device is mounted
 900                  *
 901                  * NOTE: the device->fs_info may not be reliable here so pass
 902                  * in a NULL to message helpers instead. This avoids a possible
 903                  * use-after-free when the fs_info and fs_info->sb are already
 904                  * torn down.
 905                  */
 906                 if (device->bdev) {
 907                         if (device->devt != path_devt) {
 908                                 mutex_unlock(&fs_devices->device_list_mutex);
 909                                 btrfs_warn_in_rcu(NULL,
 910         "duplicate device %s devid %llu generation %llu scanned by %s (%d)",
 911                                                   path, devid, found_transid,
 912                                                   current->comm,
 913                                                   task_pid_nr(current));
 914                                 return ERR_PTR(-EEXIST);
 915                         }
 916                         btrfs_info_in_rcu(NULL,
 917         "devid %llu device path %s changed to %s scanned by %s (%d)",
 918                                           devid, btrfs_dev_name(device),
 919                                           path, current->comm,
 920                                           task_pid_nr(current));
 921                 }
 922
 923                 name = rcu_string_strdup(path, GFP_NOFS);
 924                 if (!name) {
 925                         mutex_unlock(&fs_devices->device_list_mutex);
 926                         return ERR_PTR(-ENOMEM);
 927                 }
 928                 rcu_string_free(device->name);
 929                 rcu_assign_pointer(device->name, name);
 930                 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
 931                         fs_devices->missing_devices--;
 932                         clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
 933                 }
 934                 device->devt = path_devt;
 935         }
 936
 937         /*
 938          * Unmount does not free the btrfs_device struct but would zero
 939          * generation along with most of the other members. So just update
 940          * it back. We need it to pick the disk with largest generation
 941          * (as above).
 942          */
 943         if (!fs_devices->opened) {
 944                 device->generation = found_transid;
 945                 fs_devices->latest_generation = max_t(u64, found_transid,
 946                                                 fs_devices->latest_generation);
 947         }
 948
 949         fs_devices->total_devices = btrfs_super_num_devices(disk_super);
 950
 951         mutex_unlock(&fs_devices->device_list_mutex);
 952         return device;
 953 }
 954
 955 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 956 {
 957         struct btrfs_fs_devices *fs_devices;
 958         struct btrfs_device *device;
 959         struct btrfs_device *orig_dev;
 960         int ret = 0;
 961
 962         lockdep_assert_held(&uuid_mutex);
 963
 964         fs_devices = alloc_fs_devices(orig->fsid);
 965         if (IS_ERR(fs_devices))
 966                 return fs_devices;
 967
 968         fs_devices->total_devices = orig->total_devices;
 969
 970         list_for_each_entry(orig_dev, &orig->devices, dev_list) {
 971                 const char *dev_path = NULL;
 972
 973                 /*
 974                  * This is ok to do without RCU read locked because we hold the
 975                  * uuid mutex so nothing we touch in here is going to disappear.
 976                  */
 977                 if (orig_dev->name)
 978                         dev_path = orig_dev->name->str;
 979
 980                 device = btrfs_alloc_device(NULL, &orig_dev->devid,
 981                                             orig_dev->uuid, dev_path);
 982                 if (IS_ERR(device)) {
 983                         ret = PTR_ERR(device);
 984                         goto error;
 985                 }
 986
 987                 if (orig_dev->zone_info) {
 988                         struct btrfs_zoned_device_info *zone_info;
 989
 990                         zone_info = btrfs_clone_dev_zone_info(orig_dev);
 991                         if (!zone_info) {
 992                                 btrfs_free_device(device);
 993                                 ret = -ENOMEM;
 994                                 goto error;
 995                         }
 996                         device->zone_info = zone_info;
 997                 }
 998
 999                 list_add(&device->dev_list, &fs_devices->devices);
1000                 device->fs_devices = fs_devices;
1001                 fs_devices->num_devices++;
1002         }
1003         return fs_devices;
1004 error:
1005         free_fs_devices(fs_devices);
1006         return ERR_PTR(ret);
1007 }
1008
1009 static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
1010                                       struct btrfs_device **latest_dev)
1011 {
1012         struct btrfs_device *device, *next;
1013
1014         /* This is the initialized path, it is safe to release the devices. */
1015         list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
1016                 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
1017                         if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
1018                                       &device->dev_state) &&
1019                             !test_bit(BTRFS_DEV_STATE_MISSING,
1020                                       &device->dev_state) &&
1021                             (!*latest_dev ||
1022                              device->generation > (*latest_dev)->generation)) {
1023                                 *latest_dev = device;
1024                         }
1025                         continue;
1026                 }
1027
1028                 /*
1029                  * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
1030                  * in btrfs_init_dev_replace() so just continue.
1031                  */
1032                 if (device->devid == BTRFS_DEV_REPLACE_DEVID)
1033                         continue;
1034
1035                 if (device->bdev_file) {
1036                         fput(device->bdev_file);
1037                         device->bdev = NULL;
1038                         device->bdev_file = NULL;
1039                         fs_devices->open_devices--;
1040                 }
1041                 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1042                         list_del_init(&device->dev_alloc_list);
1043                         clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
1044                         fs_devices->rw_devices--;
1045                 }
1046                 list_del_init(&device->dev_list);
1047                 fs_devices->num_devices--;
1048                 btrfs_free_device(device);
1049         }
1050
1051 }
1052
1053 /*
1054  * After we have read the system tree and know devids belonging to this
1055  * filesystem, remove the device which does not belong there.
1056  */
1057 void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
1058 {
1059         struct btrfs_device *latest_dev = NULL;
1060         struct btrfs_fs_devices *seed_dev;
1061
1062         mutex_lock(&uuid_mutex);
1063         __btrfs_free_extra_devids(fs_devices, &latest_dev);
1064
1065         list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
1066                 __btrfs_free_extra_devids(seed_dev, &latest_dev);
1067
1068         fs_devices->latest_dev = latest_dev;
1069
1070         mutex_unlock(&uuid_mutex);
1071 }
1072
1073 static void btrfs_close_bdev(struct btrfs_device *device)
1074 {
1075         if (!device->bdev)
1076                 return;
1077
1078         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1079                 sync_blockdev(device->bdev);
1080                 invalidate_bdev(device->bdev);
1081         }
1082
1083         fput(device->bdev_file);
1084 }
1085
1086 static void btrfs_close_one_device(struct btrfs_device *device)
1087 {
1088         struct btrfs_fs_devices *fs_devices = device->fs_devices;
1089
1090         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
1091             device->devid != BTRFS_DEV_REPLACE_DEVID) {
1092                 list_del_init(&device->dev_alloc_list);
1093                 fs_devices->rw_devices--;
1094         }
1095
1096         if (device->devid == BTRFS_DEV_REPLACE_DEVID)
1097                 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
1098
1099         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
1100                 clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
1101                 fs_devices->missing_devices--;
1102         }
1103
1104         btrfs_close_bdev(device);
1105         if (device->bdev) {
1106                 fs_devices->open_devices--;
1107                 device->bdev = NULL;
1108         }
1109         clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
1110         btrfs_destroy_dev_zone_info(device);
1111
1112         device->fs_info = NULL;
1113         atomic_set(&device->dev_stats_ccnt, 0);
1114         extent_io_tree_release(&device->alloc_state);
1115
1116         /*
1117          * Reset the flush error record. We might have a transient flush error
1118          * in this mount, and if so we aborted the current transaction and set
1119          * the fs to an error state, guaranteeing no super blocks can be further
1120          * committed. However that error might be transient and if we unmount the
1121          * filesystem and mount it again, we should allow the mount to succeed
1122          * (btrfs_check_rw_degradable() should not fail) - if after mounting the
1123          * filesystem again we still get flush errors, then we will again abort
1124          * any transaction and set the error state, guaranteeing no commits of
1125          * unsafe super blocks.
1126          */
1127         device->last_flush_error = 0;
1128
1129         /* Verify the device is back in a pristine state  */
1130         WARN_ON(test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
1131         WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1132         WARN_ON(!list_empty(&device->dev_alloc_list));
1133         WARN_ON(!list_empty(&device->post_commit_list));
1134 }
1135
1136 static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
1137 {
1138         struct btrfs_device *device, *tmp;
1139
1140         lockdep_assert_held(&uuid_mutex);
1141
1142         if (--fs_devices->opened > 0)
1143                 return;
1144
1145         list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
1146                 btrfs_close_one_device(device);
1147
1148         WARN_ON(fs_devices->open_devices);
1149         WARN_ON(fs_devices->rw_devices);
1150         fs_devices->opened = 0;
1151         fs_devices->seeding = false;
1152         fs_devices->fs_info = NULL;
1153 }
1154
1155 void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
1156 {
1157         LIST_HEAD(list);
1158         struct btrfs_fs_devices *tmp;
1159
1160         mutex_lock(&uuid_mutex);
1161         close_fs_devices(fs_devices);
1162         if (!fs_devices->opened) {
1163                 list_splice_init(&fs_devices->seed_list, &list);
1164
1165                 /*
1166                  * If the struct btrfs_fs_devices is not assembled with any
1167                  * other device, it can be re-initialized during the next mount
1168                  * without the needing device-scan step. Therefore, it can be
1169                  * fully freed.
1170                  */
1171                 if (fs_devices->num_devices == 1) {
1172                         list_del(&fs_devices->fs_list);
1173                         free_fs_devices(fs_devices);
1174                 }
1175         }
1176
1177
1178         list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
1179                 close_fs_devices(fs_devices);
1180                 list_del(&fs_devices->seed_list);
1181                 free_fs_devices(fs_devices);
1182         }
1183         mutex_unlock(&uuid_mutex);
1184 }
1185
1186 static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
1187                                 blk_mode_t flags, void *holder)
1188 {
1189         struct btrfs_device *device;
1190         struct btrfs_device *latest_dev = NULL;
1191         struct btrfs_device *tmp_device;
1192         int ret = 0;
1193
1194         list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
1195                                  dev_list) {
1196                 int ret2;
1197
1198                 ret2 = btrfs_open_one_device(fs_devices, device, flags, holder);
1199                 if (ret2 == 0 &&
1200                     (!latest_dev || device->generation > latest_dev->generation)) {
1201                         latest_dev = device;
1202                 } else if (ret2 == -ENODATA) {
1203                         fs_devices->num_devices--;
1204                         list_del(&device->dev_list);
1205                         btrfs_free_device(device);
1206                 }
1207                 if (ret == 0 && ret2 != 0)
1208                         ret = ret2;
1209         }
1210
1211         if (fs_devices->open_devices == 0) {
1212                 if (ret)
1213                         return ret;
1214                 return -EINVAL;
1215         }
1216
1217         fs_devices->opened = 1;
1218         fs_devices->latest_dev = latest_dev;
1219         fs_devices->total_rw_bytes = 0;
1220         fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
1221         fs_devices->read_policy = BTRFS_READ_POLICY_PID;
1222
1223         return 0;
1224 }
1225
1226 static int devid_cmp(void *priv, const struct list_head *a,
1227                      const struct list_head *b)
1228 {
1229         const struct btrfs_device *dev1, *dev2;
1230
1231         dev1 = list_entry(a, struct btrfs_device, dev_list);
1232         dev2 = list_entry(b, struct btrfs_device, dev_list);
1233
1234         if (dev1->devid < dev2->devid)
1235                 return -1;
1236         else if (dev1->devid > dev2->devid)
1237                 return 1;
1238         return 0;
1239 }
1240
1241 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
1242                        blk_mode_t flags, void *holder)
1243 {
1244         int ret;
1245
1246         lockdep_assert_held(&uuid_mutex);
1247         /*
1248          * The device_list_mutex cannot be taken here in case opening the
1249          * underlying device takes further locks like open_mutex.
1250          *
1251          * We also don't need the lock here as this is called during mount and
1252          * exclusion is provided by uuid_mutex
1253          */
1254
1255         if (fs_devices->opened) {
1256                 fs_devices->opened++;
1257                 ret = 0;
1258         } else {
1259                 list_sort(NULL, &fs_devices->devices, devid_cmp);
1260                 ret = open_fs_devices(fs_devices, flags, holder);
1261         }
1262
1263         return ret;
1264 }
1265
/*
 * Drop the reference on the page that backs @super. The page is looked up
 * from the address of @super itself.
 */
void btrfs_release_disk_super(struct btrfs_super_block *super)
{
	put_page(virt_to_page(super));
}
1272
1273 static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
1274                                                        u64 bytenr, u64 bytenr_orig)
1275 {
1276         struct btrfs_super_block *disk_super;
1277         struct page *page;
1278         void *p;
1279         pgoff_t index;
1280
1281         /* make sure our super fits in the device */
1282         if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
1283                 return ERR_PTR(-EINVAL);
1284
1285         /* make sure our super fits in the page */
1286         if (sizeof(*disk_super) > PAGE_SIZE)
1287                 return ERR_PTR(-EINVAL);
1288
1289         /* make sure our super doesn't straddle pages on disk */
1290         index = bytenr >> PAGE_SHIFT;
1291         if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
1292                 return ERR_PTR(-EINVAL);
1293
1294         /* pull in the page with our super */
1295         page = read_cache_page_gfp(bdev->bd_mapping, index, GFP_KERNEL);
1296
1297         if (IS_ERR(page))
1298                 return ERR_CAST(page);
1299
1300         p = page_address(page);
1301
1302         /* align our pointer to the offset of the super block */
1303         disk_super = p + offset_in_page(bytenr);
1304
1305         if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
1306             btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
1307                 btrfs_release_disk_super(p);
1308                 return ERR_PTR(-EINVAL);
1309         }
1310
1311         if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
1312                 disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;
1313
1314         return disk_super;
1315 }
1316
1317 int btrfs_forget_devices(dev_t devt)
1318 {
1319         int ret;
1320
1321         mutex_lock(&uuid_mutex);
1322         ret = btrfs_free_stale_devices(devt, NULL);
1323         mutex_unlock(&uuid_mutex);
1324
1325         return ret;
1326 }
1327
1328 static bool btrfs_skip_registration(struct btrfs_super_block *disk_super,
1329                                     const char *path, dev_t devt,
1330                                     bool mount_arg_dev)
1331 {
1332         struct btrfs_fs_devices *fs_devices;
1333
1334         /*
1335          * Do not skip device registration for mounted devices with matching
1336          * maj:min but different paths. Booting without initrd relies on
1337          * /dev/root initially, later replaced with the actual root device.
1338          * A successful scan ensures grub2-probe selects the correct device.
1339          */
1340         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
1341                 struct btrfs_device *device;
1342
1343                 mutex_lock(&fs_devices->device_list_mutex);
1344
1345                 if (!fs_devices->opened) {
1346                         mutex_unlock(&fs_devices->device_list_mutex);
1347                         continue;
1348                 }
1349
1350                 list_for_each_entry(device, &fs_devices->devices, dev_list) {
1351                         if (device->bdev && (device->bdev->bd_dev == devt) &&
1352                             strcmp(device->name->str, path) != 0) {
1353                                 mutex_unlock(&fs_devices->device_list_mutex);
1354
1355                                 /* Do not skip registration. */
1356                                 return false;
1357                         }
1358                 }
1359                 mutex_unlock(&fs_devices->device_list_mutex);
1360         }
1361
1362         if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 &&
1363             !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING))
1364                 return true;
1365
1366         return false;
1367 }
1368
1369 /*
1370  * Look for a btrfs signature on a device. This may be called out of the mount path
1371  * and we are not allowed to call set_blocksize during the scan. The superblock
1372  * is read via pagecache.
1373  *
1374  * With @mount_arg_dev it's a scan during mount time that will always register
1375  * the device or return an error. Multi-device and seeding devices are registered
1376  * in both cases.
1377  */
1378 struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
1379                                            bool mount_arg_dev)
1380 {
1381         struct btrfs_super_block *disk_super;
1382         bool new_device_added = false;
1383         struct btrfs_device *device = NULL;
1384         struct file *bdev_file;
1385         u64 bytenr;
1386         dev_t devt;
1387         int ret;
1388
1389         lockdep_assert_held(&uuid_mutex);
1390
1391         /*
1392          * Avoid an exclusive open here, as the systemd-udev may initiate the
1393          * device scan which may race with the user's mount or mkfs command,
1394          * resulting in failure.
1395          * Since the device scan is solely for reading purposes, there is no
1396          * need for an exclusive open. Additionally, the devices are read again
1397          * during the mount process. It is ok to get some inconsistent
1398          * values temporarily, as the device paths of the fsid are the only
1399          * required information for assembling the volume.
1400          */
1401         bdev_file = bdev_file_open_by_path(path, flags, NULL, NULL);
1402         if (IS_ERR(bdev_file))
1403                 return ERR_CAST(bdev_file);
1404
1405         /*
1406          * We would like to check all the super blocks, but doing so would
1407          * allow a mount to succeed after a mkfs from a different filesystem.
1408          * Currently, recovery from a bad primary btrfs superblock is done
1409          * using the userspace command 'btrfs check --super'.
1410          */
1411         ret = btrfs_sb_log_location_bdev(file_bdev(bdev_file), 0, READ, &bytenr);
1412         if (ret) {
1413                 device = ERR_PTR(ret);
1414                 goto error_bdev_put;
1415         }
1416
1417         disk_super = btrfs_read_disk_super(file_bdev(bdev_file), bytenr,
1418                                            btrfs_sb_offset(0));
1419         if (IS_ERR(disk_super)) {
1420                 device = ERR_CAST(disk_super);
1421                 goto error_bdev_put;
1422         }
1423
1424         devt = file_bdev(bdev_file)->bd_dev;
1425         if (btrfs_skip_registration(disk_super, path, devt, mount_arg_dev)) {
1426                 pr_debug("BTRFS: skip registering single non-seed device %s (%d:%d)\n",
1427                           path, MAJOR(devt), MINOR(devt));
1428
1429                 btrfs_free_stale_devices(devt, NULL);
1430
1431                 device = NULL;
1432                 goto free_disk_super;
1433         }
1434
1435         device = device_list_add(path, disk_super, &new_device_added);
1436         if (!IS_ERR(device) && new_device_added)
1437                 btrfs_free_stale_devices(device->devt, device);
1438
1439 free_disk_super:
1440         btrfs_release_disk_super(disk_super);
1441
1442 error_bdev_put:
1443         fput(bdev_file);
1444
1445         return device;
1446 }
1447
1448 /*
1449  * Try to find a chunk that intersects [start, start + len] range and when one
1450  * such is found, record the end of it in *start
1451  */
1452 static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
1453                                     u64 len)
1454 {
1455         u64 physical_start, physical_end;
1456
1457         lockdep_assert_held(&device->fs_info->chunk_mutex);
1458
1459         if (find_first_extent_bit(&device->alloc_state, *start,
1460                                   &physical_start, &physical_end,
1461                                   CHUNK_ALLOCATED, NULL)) {
1462
1463                 if (in_range(physical_start, *start, len) ||
1464                     in_range(*start, physical_start,
1465                              physical_end + 1 - physical_start)) {
1466                         *start = physical_end + 1;
1467                         return true;
1468                 }
1469         }
1470         return false;
1471 }
1472
1473 static u64 dev_extent_search_start(struct btrfs_device *device)
1474 {
1475         switch (device->fs_devices->chunk_alloc_policy) {
1476         case BTRFS_CHUNK_ALLOC_REGULAR:
1477                 return BTRFS_DEVICE_RANGE_RESERVED;
1478         case BTRFS_CHUNK_ALLOC_ZONED:
1479                 /*
1480                  * We don't care about the starting region like regular
1481                  * allocator, because we anyway use/reserve the first two zones
1482                  * for superblock logging.
1483                  */
1484                 return 0;
1485         default:
1486                 BUG();
1487         }
1488 }
1489
1490 static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
1491                                         u64 *hole_start, u64 *hole_size,
1492                                         u64 num_bytes)
1493 {
1494         u64 zone_size = device->zone_info->zone_size;
1495         u64 pos;
1496         int ret;
1497         bool changed = false;
1498
1499         ASSERT(IS_ALIGNED(*hole_start, zone_size));
1500
1501         while (*hole_size > 0) {
1502                 pos = btrfs_find_allocatable_zones(device, *hole_start,
1503                                                    *hole_start + *hole_size,
1504                                                    num_bytes);
1505                 if (pos != *hole_start) {
1506                         *hole_size = *hole_start + *hole_size - pos;
1507                         *hole_start = pos;
1508                         changed = true;
1509                         if (*hole_size < num_bytes)
1510                                 break;
1511                 }
1512
1513                 ret = btrfs_ensure_empty_zones(device, pos, num_bytes);
1514
1515                 /* Range is ensured to be empty */
1516                 if (!ret)
1517                         return changed;
1518
1519                 /* Given hole range was invalid (outside of device) */
1520                 if (ret == -ERANGE) {
1521                         *hole_start += *hole_size;
1522                         *hole_size = 0;
1523                         return true;
1524                 }
1525
1526                 *hole_start += zone_size;
1527                 *hole_size -= zone_size;
1528                 changed = true;
1529         }
1530
1531         return changed;
1532 }
1533
1534 /*
1535  * Check if specified hole is suitable for allocation.
1536  *
1537  * @device:     the device which we have the hole
1538  * @hole_start: starting position of the hole
1539  * @hole_size:  the size of the hole
1540  * @num_bytes:  the size of the free space that we need
1541  *
1542  * This function may modify @hole_start and @hole_size to reflect the suitable
1543  * position for allocation. Returns 1 if hole position is updated, 0 otherwise.
1544  */
1545 static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
1546                                   u64 *hole_size, u64 num_bytes)
1547 {
1548         bool changed = false;
1549         u64 hole_end = *hole_start + *hole_size;
1550
1551         for (;;) {
1552                 /*
1553                  * Check before we set max_hole_start, otherwise we could end up
1554                  * sending back this offset anyway.
1555                  */
1556                 if (contains_pending_extent(device, hole_start, *hole_size)) {
1557                         if (hole_end >= *hole_start)
1558                                 *hole_size = hole_end - *hole_start;
1559                         else
1560                                 *hole_size = 0;
1561                         changed = true;
1562                 }
1563
1564                 switch (device->fs_devices->chunk_alloc_policy) {
1565                 case BTRFS_CHUNK_ALLOC_REGULAR:
1566                         /* No extra check */
1567                         break;
1568                 case BTRFS_CHUNK_ALLOC_ZONED:
1569                         if (dev_extent_hole_check_zoned(device, hole_start,
1570                                                         hole_size, num_bytes)) {
1571                                 changed = true;
1572                                 /*
1573                                  * The changed hole can contain pending extent.
1574                                  * Loop again to check that.
1575                                  */
1576                                 continue;
1577                         }
1578                         break;
1579                 default:
1580                         BUG();
1581                 }
1582
1583                 break;
1584         }
1585
1586         return changed;
1587 }
1588
/*
 * Find free space in the specified device.
 *
 * @device:     the device which we search the free space in
 * @num_bytes:  the size of the free space that we need
 * @start:      store the start of the free space.
 * @len:        the size of the free space. that we find, or the size
 *              of the max free space if we don't find suitable free space.
 *              May be NULL if the caller does not need the size.
 *
 * The search begins at the offset returned by dev_extent_search_start() and
 * is bounded by device->total_bytes.
 *
 * This does a pretty simple search, the expectation is that it is called very
 * infrequently and that a given device has a small number of extents.
 *
 * @start is used to store the start of the free space if we find. But if we
 * don't find suitable free space, it will be used to store the start position
 * of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 *
 * NOTE: This function will search *commit* root of device tree, and does extra
 * check to ensure dev extents are not double allocated.
 * This makes the function safe to allocate dev extents but may not report
 * correct usable device space, as device extent freed in current transaction
 * is not reported as available.
 *
 * Return: 0 if a hole of at least @num_bytes was found, -ENOSPC if not (also
 * when the device is a replace target or the search start is already at or
 * past the end of the device), -ENOMEM if no path could be allocated, or a
 * negative error from the tree search.  @start (and @len, if given) are
 * written on every return path, including errors.
 */
static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
                                u64 *start, u64 *len)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct btrfs_root *root = fs_info->dev_root;
        struct btrfs_key key;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 search_start;
        u64 hole_size;
        u64 max_hole_start;
        u64 max_hole_size = 0;
        u64 extent_end;
        u64 search_end = device->total_bytes;
        int ret;
        int slot;
        struct extent_buffer *l;

        search_start = dev_extent_search_start(device);
        max_hole_start = search_start;

        WARN_ON(device->zone_info &&
                !IS_ALIGNED(num_bytes, device->zone_info->zone_size));

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto out;
        }
again:
        /* Re-entered with an updated search_start after a hole adjustment. */
        if (search_start >= search_end ||
                test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
                ret = -ENOSPC;
                goto out;
        }

        /* Read-only walk of the commit root, no tree locks taken. */
        path->reada = READA_FORWARD;
        path->search_commit_root = 1;
        path->skip_locking = 1;

        key.objectid = device->devid;
        key.offset = search_start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_backwards(root, &key, path);
        if (ret < 0)
                goto out;

        /* Walk the dev extent items forward, looking at the gaps between them. */
        while (search_start < search_end) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        /* End of this leaf: move on, or stop if it was the last. */
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                /* Skip items of other devices / other types. */
                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (key.type != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                if (key.offset > search_end)
                        break;

                /* There is a gap between search_start and this extent. */
                if (key.offset > search_start) {
                        hole_size = key.offset - search_start;
                        dev_extent_hole_check(device, &search_start, &hole_size,
                                              num_bytes);

                        if (hole_size > max_hole_size) {
                                max_hole_start = search_start;
                                max_hole_size = hole_size;
                        }

                        /*
                         * If this free space is greater than which we need,
                         * it must be the max free space that we have found
                         * until now, so max_hole_start must point to the start
                         * of this free space and the length of this free space
                         * is stored in max_hole_size. Thus, we return
                         * max_hole_start and max_hole_size and go back to the
                         * caller.
                         */
                        if (hole_size >= num_bytes) {
                                ret = 0;
                                goto out;
                        }
                }

                /* Continue the search after the end of this extent. */
                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (extent_end > search_start)
                        search_start = extent_end;
next:
                path->slots[0]++;
                cond_resched();
        }

        /*
         * At this point, search_start should be the end of
         * allocated dev extents, and when shrinking the device,
         * search_end may be smaller than search_start.
         */
        if (search_end > search_start) {
                hole_size = search_end - search_start;
                /*
                 * If the trailing hole had to be adjusted, restart the tree
                 * walk from the updated search_start.
                 */
                if (dev_extent_hole_check(device, &search_start, &hole_size,
                                          num_bytes)) {
                        btrfs_release_path(path);
                        goto again;
                }

                if (hole_size > max_hole_size) {
                        max_hole_start = search_start;
                        max_hole_size = hole_size;
                }
        }

        /* See above. */
        if (max_hole_size < num_bytes)
                ret = -ENOSPC;
        else
                ret = 0;

        ASSERT(max_hole_start + max_hole_size <= search_end);
out:
        /* Report the best hole seen so far, whatever the return value. */
        btrfs_free_path(path);
        *start = max_hole_start;
        if (len)
                *len = max_hole_size;
        return ret;
}
1758
1759 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1760                           struct btrfs_device *device,
1761                           u64 start, u64 *dev_extent_len)
1762 {
1763         struct btrfs_fs_info *fs_info = device->fs_info;
1764         struct btrfs_root *root = fs_info->dev_root;
1765         int ret;
1766         struct btrfs_path *path;
1767         struct btrfs_key key;
1768         struct btrfs_key found_key;
1769         struct extent_buffer *leaf = NULL;
1770         struct btrfs_dev_extent *extent = NULL;
1771
1772         path = btrfs_alloc_path();
1773         if (!path)
1774                 return -ENOMEM;
1775
1776         key.objectid = device->devid;
1777         key.offset = start;
1778         key.type = BTRFS_DEV_EXTENT_KEY;
1779 again:
1780         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1781         if (ret > 0) {
1782                 ret = btrfs_previous_item(root, path, key.objectid,
1783                                           BTRFS_DEV_EXTENT_KEY);
1784                 if (ret)
1785                         goto out;
1786                 leaf = path->nodes[0];
1787                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1788                 extent = btrfs_item_ptr(leaf, path->slots[0],
1789                                         struct btrfs_dev_extent);
1790                 BUG_ON(found_key.offset > start || found_key.offset +
1791                        btrfs_dev_extent_length(leaf, extent) < start);
1792                 key = found_key;
1793                 btrfs_release_path(path);
1794                 goto again;
1795         } else if (ret == 0) {
1796                 leaf = path->nodes[0];
1797                 extent = btrfs_item_ptr(leaf, path->slots[0],
1798                                         struct btrfs_dev_extent);
1799         } else {
1800                 goto out;
1801         }
1802
1803         *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1804
1805         ret = btrfs_del_item(trans, root, path);
1806         if (ret == 0)
1807                 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1808 out:
1809         btrfs_free_path(path);
1810         return ret;
1811 }
1812
1813 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1814 {
1815         struct rb_node *n;
1816         u64 ret = 0;
1817
1818         read_lock(&fs_info->mapping_tree_lock);
1819         n = rb_last(&fs_info->mapping_tree.rb_root);
1820         if (n) {
1821                 struct btrfs_chunk_map *map;
1822
1823                 map = rb_entry(n, struct btrfs_chunk_map, rb_node);
1824                 ret = map->start + map->chunk_len;
1825         }
1826         read_unlock(&fs_info->mapping_tree_lock);
1827
1828         return ret;
1829 }
1830
1831 static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1832                                     u64 *devid_ret)
1833 {
1834         int ret;
1835         struct btrfs_key key;
1836         struct btrfs_key found_key;
1837         struct btrfs_path *path;
1838
1839         path = btrfs_alloc_path();
1840         if (!path)
1841                 return -ENOMEM;
1842
1843         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1844         key.type = BTRFS_DEV_ITEM_KEY;
1845         key.offset = (u64)-1;
1846
1847         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1848         if (ret < 0)
1849                 goto error;
1850
1851         if (ret == 0) {
1852                 /* Corruption */
1853                 btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
1854                 ret = -EUCLEAN;
1855                 goto error;
1856         }
1857
1858         ret = btrfs_previous_item(fs_info->chunk_root, path,
1859                                   BTRFS_DEV_ITEMS_OBJECTID,
1860                                   BTRFS_DEV_ITEM_KEY);
1861         if (ret) {
1862                 *devid_ret = 1;
1863         } else {
1864                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1865                                       path->slots[0]);
1866                 *devid_ret = found_key.offset + 1;
1867         }
1868         ret = 0;
1869 error:
1870         btrfs_free_path(path);
1871         return ret;
1872 }
1873
1874 /*
1875  * the device information is stored in the chunk root
1876  * the btrfs_device struct should be fully filled in
1877  */
1878 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1879                             struct btrfs_device *device)
1880 {
1881         int ret;
1882         struct btrfs_path *path;
1883         struct btrfs_dev_item *dev_item;
1884         struct extent_buffer *leaf;
1885         struct btrfs_key key;
1886         unsigned long ptr;
1887
1888         path = btrfs_alloc_path();
1889         if (!path)
1890                 return -ENOMEM;
1891
1892         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1893         key.type = BTRFS_DEV_ITEM_KEY;
1894         key.offset = device->devid;
1895
1896         btrfs_reserve_chunk_metadata(trans, true);
1897         ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
1898                                       &key, sizeof(*dev_item));
1899         btrfs_trans_release_chunk_metadata(trans);
1900         if (ret)
1901                 goto out;
1902
1903         leaf = path->nodes[0];
1904         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1905
1906         btrfs_set_device_id(leaf, dev_item, device->devid);
1907         btrfs_set_device_generation(leaf, dev_item, 0);
1908         btrfs_set_device_type(leaf, dev_item, device->type);
1909         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1910         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1911         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1912         btrfs_set_device_total_bytes(leaf, dev_item,
1913                                      btrfs_device_get_disk_total_bytes(device));
1914         btrfs_set_device_bytes_used(leaf, dev_item,
1915                                     btrfs_device_get_bytes_used(device));
1916         btrfs_set_device_group(leaf, dev_item, 0);
1917         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1918         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1919         btrfs_set_device_start_offset(leaf, dev_item, 0);
1920
1921         ptr = btrfs_device_uuid(dev_item);
1922         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1923         ptr = btrfs_device_fsid(dev_item);
1924         write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
1925                             ptr, BTRFS_FSID_SIZE);
1926         btrfs_mark_buffer_dirty(trans, leaf);
1927
1928         ret = 0;
1929 out:
1930         btrfs_free_path(path);
1931         return ret;
1932 }
1933
1934 /*
1935  * Function to update ctime/mtime for a given device path.
1936  * Mainly used for ctime/mtime based probe like libblkid.
1937  *
1938  * We don't care about errors here, this is just to be kind to userspace.
1939  */
1940 static void update_dev_time(const char *device_path)
1941 {
1942         struct path path;
1943         int ret;
1944
1945         ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
1946         if (ret)
1947                 return;
1948
1949         inode_update_time(d_inode(path.dentry), S_MTIME | S_CTIME | S_VERSION);
1950         path_put(&path);
1951 }
1952
1953 static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
1954                              struct btrfs_device *device)
1955 {
1956         struct btrfs_root *root = device->fs_info->chunk_root;
1957         int ret;
1958         struct btrfs_path *path;
1959         struct btrfs_key key;
1960
1961         path = btrfs_alloc_path();
1962         if (!path)
1963                 return -ENOMEM;
1964
1965         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1966         key.type = BTRFS_DEV_ITEM_KEY;
1967         key.offset = device->devid;
1968
1969         btrfs_reserve_chunk_metadata(trans, false);
1970         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1971         btrfs_trans_release_chunk_metadata(trans);
1972         if (ret) {
1973                 if (ret > 0)
1974                         ret = -ENOENT;
1975                 goto out;
1976         }
1977
1978         ret = btrfs_del_item(trans, root, path);
1979 out:
1980         btrfs_free_path(path);
1981         return ret;
1982 }
1983
1984 /*
1985  * Verify that @num_devices satisfies the RAID profile constraints in the whole
1986  * filesystem. It's up to the caller to adjust that number regarding eg. device
1987  * replace.
1988  */
1989 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1990                 u64 num_devices)
1991 {
1992         u64 all_avail;
1993         unsigned seq;
1994         int i;
1995
1996         do {
1997                 seq = read_seqbegin(&fs_info->profiles_lock);
1998
1999                 all_avail = fs_info->avail_data_alloc_bits |
2000                             fs_info->avail_system_alloc_bits |
2001                             fs_info->avail_metadata_alloc_bits;
2002         } while (read_seqretry(&fs_info->profiles_lock, seq));
2003
2004         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
2005                 if (!(all_avail & btrfs_raid_array[i].bg_flag))
2006                         continue;
2007
2008                 if (num_devices < btrfs_raid_array[i].devs_min)
2009                         return btrfs_raid_array[i].mindev_error;
2010         }
2011
2012         return 0;
2013 }
2014
2015 static struct btrfs_device * btrfs_find_next_active_device(
2016                 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
2017 {
2018         struct btrfs_device *next_device;
2019
2020         list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
2021                 if (next_device != device &&
2022                     !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
2023                     && next_device->bdev)
2024                         return next_device;
2025         }
2026
2027         return NULL;
2028 }
2029
2030 /*
2031  * Helper function to check if the given device is part of s_bdev / latest_dev
2032  * and replace it with the provided or the next active device, in the context
2033  * where this function called, there should be always be another device (or
2034  * this_dev) which is active.
2035  */
2036 void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
2037                                             struct btrfs_device *next_device)
2038 {
2039         struct btrfs_fs_info *fs_info = device->fs_info;
2040
2041         if (!next_device)
2042                 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
2043                                                             device);
2044         ASSERT(next_device);
2045
2046         if (fs_info->sb->s_bdev &&
2047                         (fs_info->sb->s_bdev == device->bdev))
2048                 fs_info->sb->s_bdev = next_device->bdev;
2049
2050         if (fs_info->fs_devices->latest_dev->bdev == device->bdev)
2051                 fs_info->fs_devices->latest_dev = next_device;
2052 }
2053
2054 /*
2055  * Return btrfs_fs_devices::num_devices excluding the device that's being
2056  * currently replaced.
2057  */
2058 static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
2059 {
2060         u64 num_devices = fs_info->fs_devices->num_devices;
2061
2062         down_read(&fs_info->dev_replace.rwsem);
2063         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
2064                 ASSERT(num_devices > 1);
2065                 num_devices--;
2066         }
2067         up_read(&fs_info->dev_replace.rwsem);
2068
2069         return num_devices;
2070 }
2071
2072 static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info,
2073                                      struct block_device *bdev, int copy_num)
2074 {
2075         struct btrfs_super_block *disk_super;
2076         const size_t len = sizeof(disk_super->magic);
2077         const u64 bytenr = btrfs_sb_offset(copy_num);
2078         int ret;
2079
2080         disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr);
2081         if (IS_ERR(disk_super))
2082                 return;
2083
2084         memset(&disk_super->magic, 0, len);
2085         folio_mark_dirty(virt_to_folio(disk_super));
2086         btrfs_release_disk_super(disk_super);
2087
2088         ret = sync_blockdev_range(bdev, bytenr, bytenr + len - 1);
2089         if (ret)
2090                 btrfs_warn(fs_info, "error clearing superblock number %d (%d)",
2091                         copy_num, ret);
2092 }
2093
2094 void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_device *device)
2095 {
2096         int copy_num;
2097         struct block_device *bdev = device->bdev;
2098
2099         if (!bdev)
2100                 return;
2101
2102         for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
2103                 if (bdev_is_zoned(bdev))
2104                         btrfs_reset_sb_log_zones(bdev, copy_num);
2105                 else
2106                         btrfs_scratch_superblock(fs_info, bdev, copy_num);
2107         }
2108
2109         /* Notify udev that device has changed */
2110         btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
2111
2112         /* Update ctime/mtime for device path for libblkid */
2113         update_dev_time(device->name->str);
2114 }
2115
/*
 * Remove a device from a mounted filesystem.
 *
 * @fs_info:   the filesystem
 * @args:      lookup arguments identifying the device to remove
 * @bdev_file: on the success path, set to the removed device's bdev_file so
 *             that the caller can do the final release of the block device
 *             (not written on the early error returns)
 *
 * The device is taken off the allocation list, shrunk to zero, its dev item
 * is deleted from the chunk tree, it is unlinked from the device list and its
 * super blocks are scratched, then the transaction is committed.
 *
 * Return: 0 on success.  On failure either a negative errno (-EINVAL on
 * extent tree v2, -ENOENT if the device is not found, -ETXTBSY if it is
 * pinned by a swapfile, or an error from shrinking, starting/committing the
 * transaction or removing the dev item) or one of the BTRFS_ERROR_DEV_*
 * codes, including whatever btrfs_check_raid_min_devices() reports.
 */
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
                    struct btrfs_dev_lookup_args *args,
                    struct file **bdev_file)
{
        struct btrfs_trans_handle *trans;
        struct btrfs_device *device;
        struct btrfs_fs_devices *cur_devices;
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        u64 num_devices;
        int ret = 0;

        if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
                btrfs_err(fs_info, "device remove not supported on extent tree v2 yet");
                return -EINVAL;
        }

        /*
         * The device list in fs_devices is accessed without locks (neither
         * uuid_mutex nor device_list_mutex) as it won't change on a mounted
         * filesystem and another device rm cannot run.
         */
        num_devices = btrfs_num_devices(fs_info);

        /* Would the profiles in use still be satisfied with one device less? */
        ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
        if (ret)
                return ret;

        device = btrfs_find_device(fs_info->fs_devices, args);
        if (!device) {
                if (args->missing)
                        ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
                else
                        ret = -ENOENT;
                return ret;
        }

        if (btrfs_pinned_by_swapfile(fs_info, device)) {
                btrfs_warn_in_rcu(fs_info,
                  "cannot remove device %s (devid %llu) due to active swapfile",
                                  btrfs_dev_name(device), device->devid);
                return -ETXTBSY;
        }

        if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
                return BTRFS_ERROR_DEV_TGT_REPLACE;

        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
            fs_info->fs_devices->rw_devices == 1)
                return BTRFS_ERROR_DEV_ONLY_WRITABLE;

        /*
         * Take the device off the allocation list before shrinking it; this
         * is undone at error_undo if shrinking or starting the transaction
         * fails.
         */
        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                mutex_lock(&fs_info->chunk_mutex);
                list_del_init(&device->dev_alloc_list);
                device->fs_devices->rw_devices--;
                mutex_unlock(&fs_info->chunk_mutex);
        }

        ret = btrfs_shrink_device(device, 0);
        if (ret)
                goto error_undo;

        trans = btrfs_start_transaction(fs_info->chunk_root, 0);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto error_undo;
        }

        ret = btrfs_rm_dev_item(trans, device);
        if (ret) {
                /* Any error in dev item removal is critical */
                btrfs_crit(fs_info,
                           "failed to remove device item for devid %llu: %d",
                           device->devid, ret);
                /* The transaction is aborted; error_undo is not run here. */
                btrfs_abort_transaction(trans, ret);
                btrfs_end_transaction(trans);
                return ret;
        }

        clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
        btrfs_scrub_cancel_dev(device);

        /*
         * the device list mutex makes sure that we don't change
         * the device list while someone else is writing out all
         * the device supers. Whoever is writing all supers, should
         * lock the device list mutex before getting the number of
         * devices in the super block (super_copy). Conversely,
         * whoever updates the number of devices in the super block
         * (super_copy) should hold the device list mutex.
         */

        /*
         * In normal cases the cur_devices == fs_devices. But in case
         * of deleting a seed device, the cur_devices should point to
         * its own fs_devices listed under the fs_devices->seed_list.
         */
        cur_devices = device->fs_devices;
        mutex_lock(&fs_devices->device_list_mutex);
        list_del_rcu(&device->dev_list);

        cur_devices->num_devices--;
        cur_devices->total_devices--;
        /* Update total_devices of the parent fs_devices if it's seed */
        if (cur_devices != fs_devices)
                fs_devices->total_devices--;

        if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
                cur_devices->missing_devices--;

        /* Make sure s_bdev / latest_dev no longer refer to this device. */
        btrfs_assign_next_active_device(device, NULL);

        if (device->bdev_file) {
                cur_devices->open_devices--;
                /* remove sysfs entry */
                btrfs_sysfs_remove_device(device);
        }

        /* Keep the device count in the in-memory super block copy in sync. */
        num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
        btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
        mutex_unlock(&fs_devices->device_list_mutex);

        /*
         * At this point, the device is zero sized and detached from the
         * devices list.  All that's left is to zero out the old supers and
         * free the device.
         *
         * We cannot call btrfs_close_bdev() here because we're holding the sb
         * write lock, and fput() on the block device will pull in the
         * ->open_mutex on the block device and its dependencies.  Instead
         *  just flush the device and let the caller do the final bdev_release.
         */
        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                btrfs_scratch_superblocks(fs_info, device);
                if (device->bdev) {
                        sync_blockdev(device->bdev);
                        invalidate_bdev(device->bdev);
                }
        }

        /* Hand the open file to the caller before the device is freed. */
        *bdev_file = device->bdev_file;
        synchronize_rcu();
        btrfs_free_device(device);

        /*
         * This can happen if cur_devices is the private seed devices list.  We
         * cannot call close_fs_devices() here because it expects the uuid_mutex
         * to be held, but in fact we don't need that for the private
         * seed_devices, we can simply decrement cur_devices->opened and then
         * remove it from our list and free the fs_devices.
         */
        if (cur_devices->num_devices == 0) {
                list_del_init(&cur_devices->seed_list);
                ASSERT(cur_devices->opened == 1);
                cur_devices->opened--;
                free_fs_devices(cur_devices);
        }

        ret = btrfs_commit_transaction(trans);

        return ret;

error_undo:
        /* Put a writeable device back on the allocation list. */
        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                mutex_lock(&fs_info->chunk_mutex);
                list_add(&device->dev_alloc_list,
                         &fs_devices->alloc_list);
                device->fs_devices->rw_devices++;
                mutex_unlock(&fs_info->chunk_mutex);
        }
        return ret;
}
2287
2288 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
2289 {
2290         struct btrfs_fs_devices *fs_devices;
2291
2292         lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
2293
2294         /*
2295          * in case of fs with no seed, srcdev->fs_devices will point
2296          * to fs_devices of fs_info. However when the dev being replaced is
2297          * a seed dev it will point to the seed's local fs_devices. In short
2298          * srcdev will have its correct fs_devices in both the cases.
2299          */
2300         fs_devices = srcdev->fs_devices;
2301
2302         list_del_rcu(&srcdev->dev_list);
2303         list_del(&srcdev->dev_alloc_list);
2304         fs_devices->num_devices--;
2305         if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2306                 fs_devices->missing_devices--;
2307
2308         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2309                 fs_devices->rw_devices--;
2310
2311         if (srcdev->bdev)
2312                 fs_devices->open_devices--;
2313 }
2314
2315 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
2316 {
2317         struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2318
2319         mutex_lock(&uuid_mutex);
2320
2321         btrfs_close_bdev(srcdev);
2322         synchronize_rcu();
2323         btrfs_free_device(srcdev);
2324
2325         /* if this is no devs we rather delete the fs_devices */
2326         if (!fs_devices->num_devices) {
2327                 /*
2328                  * On a mounted FS, num_devices can't be zero unless it's a
2329                  * seed. In case of a seed device being replaced, the replace
2330                  * target added to the sprout FS, so there will be no more
2331                  * device left under the seed FS.
2332                  */
2333                 ASSERT(fs_devices->seeding);
2334
2335                 list_del_init(&fs_devices->seed_list);
2336                 close_fs_devices(fs_devices);
2337                 free_fs_devices(fs_devices);
2338         }
2339         mutex_unlock(&uuid_mutex);
2340 }
2341
2342 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
2343 {
2344         struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
2345
2346         mutex_lock(&fs_devices->device_list_mutex);
2347
2348         btrfs_sysfs_remove_device(tgtdev);
2349
2350         if (tgtdev->bdev)
2351                 fs_devices->open_devices--;
2352
2353         fs_devices->num_devices--;
2354
2355         btrfs_assign_next_active_device(tgtdev, NULL);
2356
2357         list_del_rcu(&tgtdev->dev_list);
2358
2359         mutex_unlock(&fs_devices->device_list_mutex);
2360
2361         btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev);
2362
2363         btrfs_close_bdev(tgtdev);
2364         synchronize_rcu();
2365         btrfs_free_device(tgtdev);
2366 }
2367
2368 /*
2369  * Populate args from device at path.
2370  *
2371  * @fs_info:    the filesystem
2372  * @args:       the args to populate
2373  * @path:       the path to the device
2374  *
2375  * This will read the super block of the device at @path and populate @args with
2376  * the devid, fsid, and uuid.  This is meant to be used for ioctls that need to
2377  * lookup a device to operate on, but need to do it before we take any locks.
2378  * This properly handles the special case of "missing" that a user may pass in,
2379  * and does some basic sanity checks.  The caller must make sure that @path is
2380  * properly NUL terminated before calling in, and must call
2381  * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and
2382  * uuid buffers.
2383  *
2384  * Return: 0 for success, -errno for failure
2385  */
2386 int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
2387                                  struct btrfs_dev_lookup_args *args,
2388                                  const char *path)
2389 {
2390         struct btrfs_super_block *disk_super;
2391         struct file *bdev_file;
2392         int ret;
2393
2394         if (!path || !path[0])
2395                 return -EINVAL;
2396         if (!strcmp(path, "missing")) {
2397                 args->missing = true;
2398                 return 0;
2399         }
2400
2401         args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL);
2402         args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL);
2403         if (!args->uuid || !args->fsid) {
2404                 btrfs_put_dev_args_from_path(args);
2405                 return -ENOMEM;
2406         }
2407
2408         ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0,
2409                                     &bdev_file, &disk_super);
2410         if (ret) {
2411                 btrfs_put_dev_args_from_path(args);
2412                 return ret;
2413         }
2414
2415         args->devid = btrfs_stack_device_id(&disk_super->dev_item);
2416         memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE);
2417         if (btrfs_fs_incompat(fs_info, METADATA_UUID))
2418                 memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE);
2419         else
2420                 memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
2421         btrfs_release_disk_super(disk_super);
2422         fput(bdev_file);
2423         return 0;
2424 }
2425
2426 /*
2427  * Only use this jointly with btrfs_get_dev_args_from_path() because we will
2428  * allocate our ->uuid and ->fsid pointers, everybody else uses local variables
2429  * that don't need to be freed.
2430  */
2431 void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args)
2432 {
2433         kfree(args->uuid);
2434         kfree(args->fsid);
2435         args->uuid = NULL;
2436         args->fsid = NULL;
2437 }
2438
2439 struct btrfs_device *btrfs_find_device_by_devspec(
2440                 struct btrfs_fs_info *fs_info, u64 devid,
2441                 const char *device_path)
2442 {
2443         BTRFS_DEV_LOOKUP_ARGS(args);
2444         struct btrfs_device *device;
2445         int ret;
2446
2447         if (devid) {
2448                 args.devid = devid;
2449                 device = btrfs_find_device(fs_info->fs_devices, &args);
2450                 if (!device)
2451                         return ERR_PTR(-ENOENT);
2452                 return device;
2453         }
2454
2455         ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path);
2456         if (ret)
2457                 return ERR_PTR(ret);
2458         device = btrfs_find_device(fs_info->fs_devices, &args);
2459         btrfs_put_dev_args_from_path(&args);
2460         if (!device)
2461                 return ERR_PTR(-ENOENT);
2462         return device;
2463 }
2464
2465 static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info)
2466 {
2467         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2468         struct btrfs_fs_devices *old_devices;
2469         struct btrfs_fs_devices *seed_devices;
2470
2471         lockdep_assert_held(&uuid_mutex);
2472         if (!fs_devices->seeding)
2473                 return ERR_PTR(-EINVAL);
2474
2475         /*
2476          * Private copy of the seed devices, anchored at
2477          * fs_info->fs_devices->seed_list
2478          */
2479         seed_devices = alloc_fs_devices(NULL);
2480         if (IS_ERR(seed_devices))
2481                 return seed_devices;
2482
2483         /*
2484          * It's necessary to retain a copy of the original seed fs_devices in
2485          * fs_uuids so that filesystems which have been seeded can successfully
2486          * reference the seed device from open_seed_devices. This also supports
2487          * multiple fs seed.
2488          */
2489         old_devices = clone_fs_devices(fs_devices);
2490         if (IS_ERR(old_devices)) {
2491                 kfree(seed_devices);
2492                 return old_devices;
2493         }
2494
2495         list_add(&old_devices->fs_list, &fs_uuids);
2496
2497         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2498         seed_devices->opened = 1;
2499         INIT_LIST_HEAD(&seed_devices->devices);
2500         INIT_LIST_HEAD(&seed_devices->alloc_list);
2501         mutex_init(&seed_devices->device_list_mutex);
2502
2503         return seed_devices;
2504 }
2505
2506 /*
2507  * Splice seed devices into the sprout fs_devices.
2508  * Generate a new fsid for the sprouted read-write filesystem.
2509  */
2510 static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info,
2511                                struct btrfs_fs_devices *seed_devices)
2512 {
2513         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2514         struct btrfs_super_block *disk_super = fs_info->super_copy;
2515         struct btrfs_device *device;
2516         u64 super_flags;
2517
2518         /*
2519          * We are updating the fsid, the thread leading to device_list_add()
2520          * could race, so uuid_mutex is needed.
2521          */
2522         lockdep_assert_held(&uuid_mutex);
2523
2524         /*
2525          * The threads listed below may traverse dev_list but can do that without
2526          * device_list_mutex:
2527          * - All device ops and balance - as we are in btrfs_exclop_start.
2528          * - Various dev_list readers - are using RCU.
2529          * - btrfs_ioctl_fitrim() - is using RCU.
2530          *
2531          * For-read threads as below are using device_list_mutex:
2532          * - Readonly scrub btrfs_scrub_dev()
2533          * - Readonly scrub btrfs_scrub_progress()
2534          * - btrfs_get_dev_stats()
2535          */
2536         lockdep_assert_held(&fs_devices->device_list_mutex);
2537
2538         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2539                               synchronize_rcu);
2540         list_for_each_entry(device, &seed_devices->devices, dev_list)
2541                 device->fs_devices = seed_devices;
2542
2543         fs_devices->seeding = false;
2544         fs_devices->num_devices = 0;
2545         fs_devices->open_devices = 0;
2546         fs_devices->missing_devices = 0;
2547         fs_devices->rotating = false;
2548         list_add(&seed_devices->seed_list, &fs_devices->seed_list);
2549
2550         generate_random_uuid(fs_devices->fsid);
2551         memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
2552         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2553
2554         super_flags = btrfs_super_flags(disk_super) &
2555                       ~BTRFS_SUPER_FLAG_SEEDING;
2556         btrfs_set_super_flags(disk_super, super_flags);
2557 }
2558
/*
 * Store the expected generation for seed devices in device items.
 *
 * Walks the BTRFS_DEV_ITEM_KEY items of the chunk root.  For every item whose
 * in-memory device belongs to a seeding fs_devices, the generation field of
 * the item is set to device->generation and the leaf is marked dirty.
 *
 * Return: 0 on success, -ENOMEM if the path cannot be allocated, or the
 * negative error returned by btrfs_search_slot() or btrfs_next_leaf().
 */
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
{
        BTRFS_DEV_LOOKUP_ARGS(args);
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *root = fs_info->chunk_root;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_dev_item *dev_item;
        struct btrfs_device *device;
        struct btrfs_key key;
        u8 fs_uuid[BTRFS_FSID_SIZE];
        u8 dev_uuid[BTRFS_UUID_SIZE];
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* Begin the walk at the smallest possible device item key. */
        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.offset = 0;
        key.type = BTRFS_DEV_ITEM_KEY;

        while (1) {
                /*
                 * Every (re)search is bracketed by a chunk metadata
                 * reservation that is released right after the search.
                 */
                btrfs_reserve_chunk_metadata(trans, false);
                ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
                btrfs_trans_release_chunk_metadata(trans);
                if (ret < 0)
                        goto error;

                leaf = path->nodes[0];
next_slot:
                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                        /* Ran off the end of this leaf, look at the next one. */
                        ret = btrfs_next_leaf(root, path);
                        if (ret > 0)
                                break;
                        if (ret < 0)
                                goto error;
                        /*
                         * Remember the key the next leaf position points at,
                         * drop the path and search again from that key.
                         */
                        leaf = path->nodes[0];
                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                        btrfs_release_path(path);
                        continue;
                }

                /* Stop as soon as the item is no longer a device item. */
                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
                    key.type != BTRFS_DEV_ITEM_KEY)
                        break;

                /* Build lookup args from the devid/uuid/fsid of the item. */
                dev_item = btrfs_item_ptr(leaf, path->slots[0],
                                          struct btrfs_dev_item);
                args.devid = btrfs_device_id(leaf, dev_item);
                read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
                                   BTRFS_UUID_SIZE);
                read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
                                   BTRFS_FSID_SIZE);
                args.uuid = dev_uuid;
                args.fsid = fs_uuid;
                device = btrfs_find_device(fs_info->fs_devices, &args);
                BUG_ON(!device); /* Logic error */

                /* Only items of devices owned by a seed are rewritten. */
                if (device->fs_devices->seeding) {
                        btrfs_set_device_generation(leaf, dev_item,
                                                    device->generation);
                        btrfs_mark_buffer_dirty(trans, leaf);
                }

                /* Same leaf, next item: no new search needed. */
                path->slots[0]++;
                goto next_slot;
        }
        ret = 0;
error:
        btrfs_free_path(path);
        return ret;
}
2636
/*
 * Add the block device at @device_path to a mounted filesystem.
 *
 * @fs_info:      the filesystem to add the device to
 * @device_path:  path of the block device
 *
 * The device is opened for writing, set up as a new btrfs_device, linked into
 * fs_info->fs_devices, accounted in the super block copy and recorded with a
 * device item, all within one transaction that is committed before returning.
 *
 * If the filesystem is a seed (fs_devices->seeding) the call also sprouts it:
 * s_umount and uuid_mutex are taken, the read-only flag of the super block is
 * cleared, the seed devices are moved to a private fs_devices and, after the
 * commit, the system chunks are relocated.
 *
 * Return: 0 on success, -EROFS if the filesystem is read-only and not a seed,
 * -EINVAL if btrfs_check_device_zone_type() rejects the device, -EEXIST if the
 * block device is already on the device list, or another negative errno from
 * one of the steps.
 */
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
{
        struct btrfs_root *root = fs_info->dev_root;
        struct btrfs_trans_handle *trans;
        struct btrfs_device *device;
        struct file *bdev_file;
        struct super_block *sb = fs_info->sb;
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct btrfs_fs_devices *seed_devices = NULL;
        u64 orig_super_total_bytes;
        u64 orig_super_num_devices;
        int ret = 0;
        bool seeding_dev = false;
        /* True while s_umount and uuid_mutex are held by this function. */
        bool locked = false;

        /* A read-only filesystem only accepts a device when it is a seed. */
        if (sb_rdonly(sb) && !fs_devices->seeding)
                return -EROFS;

        bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
                                        fs_info->bdev_holder, NULL);
        if (IS_ERR(bdev_file))
                return PTR_ERR(bdev_file);

        if (!btrfs_check_device_zone_type(fs_info, file_bdev(bdev_file))) {
                ret = -EINVAL;
                goto error;
        }

        /* Sprouting: s_umount is taken first, then uuid_mutex. */
        if (fs_devices->seeding) {
                seeding_dev = true;
                down_write(&sb->s_umount);
                mutex_lock(&uuid_mutex);
                locked = true;
        }

        sync_blockdev(file_bdev(bdev_file));

        /* Refuse a block device that is already part of this filesystem. */
        rcu_read_lock();
        list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
                if (device->bdev == file_bdev(bdev_file)) {
                        ret = -EEXIST;
                        rcu_read_unlock();
                        goto error;
                }
        }
        rcu_read_unlock();

        device = btrfs_alloc_device(fs_info, NULL, NULL, device_path);
        if (IS_ERR(device)) {
                /* we can safely leave the fs_devices entry around */
                ret = PTR_ERR(device);
                goto error;
        }

        device->fs_info = fs_info;
        device->bdev_file = bdev_file;
        device->bdev = file_bdev(bdev_file);
        ret = lookup_bdev(device_path, &device->devt);
        if (ret)
                goto error_free_device;

        ret = btrfs_get_dev_zone_info(device, false);
        if (ret)
                goto error_free_device;

        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto error_free_zone;
        }

        /*
         * Initialise the in-memory device: writeable, sized to the block
         * device rounded down to the sector size, I/O geometry taken from the
         * filesystem sector size.
         */
        set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
        device->generation = trans->transid;
        device->io_width = fs_info->sectorsize;
        device->io_align = fs_info->sectorsize;
        device->sector_size = fs_info->sectorsize;
        device->total_bytes =
                round_down(bdev_nr_bytes(device->bdev), fs_info->sectorsize);
        device->disk_total_bytes = device->total_bytes;
        device->commit_total_bytes = device->total_bytes;
        set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
        clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
        device->dev_stats_valid = 1;
        set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE);

        if (seeding_dev) {
                /* Undone by btrfs_set_sb_rdonly() at error_trans on failure. */
                btrfs_clear_sb_rdonly(sb);

                /* GFP_KERNEL allocation must not be under device_list_mutex */
                seed_devices = btrfs_init_sprout(fs_info);
                if (IS_ERR(seed_devices)) {
                        ret = PTR_ERR(seed_devices);
                        btrfs_abort_transaction(trans, ret);
                        goto error_trans;
                }
        }

        mutex_lock(&fs_devices->device_list_mutex);
        if (seeding_dev) {
                /* Move the seed devices away, the new device becomes latest. */
                btrfs_setup_sprout(fs_info, seed_devices);
                btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev,
                                                device);
        }

        device->fs_devices = fs_devices;

        /*
         * Link the device and account for it.  Everything done in this
         * section is reverted under error_sysfs.
         */
        mutex_lock(&fs_info->chunk_mutex);
        list_add_rcu(&device->dev_list, &fs_devices->devices);
        list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
        fs_devices->num_devices++;
        fs_devices->open_devices++;
        fs_devices->rw_devices++;
        fs_devices->total_devices++;
        fs_devices->total_rw_bytes += device->total_bytes;

        atomic64_add(device->total_bytes, &fs_info->free_chunk_space);

        if (!bdev_nonrot(device->bdev))
                fs_devices->rotating = true;

        /* The original super block values are kept for the error path. */
        orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
        btrfs_set_super_total_bytes(fs_info->super_copy,
                round_down(orig_super_total_bytes + device->total_bytes,
                           fs_info->sectorsize));

        orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
        btrfs_set_super_num_devices(fs_info->super_copy,
                                    orig_super_num_devices + 1);

        /*
         * we've got more storage, clear any full flags on the space
         * infos
         */
        btrfs_clear_space_info_full(fs_info);

        mutex_unlock(&fs_info->chunk_mutex);

        /* Add sysfs device entry */
        btrfs_sysfs_add_device(device);

        mutex_unlock(&fs_devices->device_list_mutex);

        if (seeding_dev) {
                mutex_lock(&fs_info->chunk_mutex);
                ret = init_first_rw_device(trans);
                mutex_unlock(&fs_info->chunk_mutex);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        goto error_sysfs;
                }
        }

        ret = btrfs_add_dev_item(trans, device);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto error_sysfs;
        }

        if (seeding_dev) {
                ret = btrfs_finish_sprout(trans);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        goto error_sysfs;
                }

                /*
                 * fs_devices now represents the newly sprouted filesystem and
                 * its fsid has been changed by btrfs_setup_sprout().
                 */
                btrfs_sysfs_update_sprout_fsid(fs_devices);
        }

        ret = btrfs_commit_transaction(trans);

        if (seeding_dev) {
                /* Released in the reverse order of acquisition. */
                mutex_unlock(&uuid_mutex);
                up_write(&sb->s_umount);
                locked = false;

                if (ret) /* transaction commit */
                        return ret;

                ret = btrfs_relocate_sys_chunks(fs_info);
                if (ret < 0)
                        btrfs_handle_fs_error(fs_info, ret,
                                    "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
                /* Commit the running transaction if there is one. */
                trans = btrfs_attach_transaction(root);
                if (IS_ERR(trans)) {
                        /* -ENOENT: nothing to commit, that is fine. */
                        if (PTR_ERR(trans) == -ENOENT)
                                return 0;
                        /*
                         * NOTE(review): this unwinds through error_sysfs after
                         * the first commit succeeded and with uuid_mutex and
                         * s_umount already dropped - confirm this is intended.
                         */
                        ret = PTR_ERR(trans);
                        trans = NULL;
                        goto error_sysfs;
                }
                ret = btrfs_commit_transaction(trans);
        }

        /*
         * Now that we have written a new super block to this device, check all
         * other fs_devices list if device_path alienates any other scanned
         * device.
         * We can ignore the return value as it typically returns -EINVAL and
         * only succeeds if the device was an alien.
         */
        btrfs_forget_devices(device->devt);

        /* Update ctime/mtime for blkid or udev */
        update_dev_time(device_path);

        return ret;

        /*
         * Error unwinding.  The labels fall through, each one undoing one
         * more step of the setup above.
         */
error_sysfs:
        /* Unlink the device and restore counters and super block values. */
        btrfs_sysfs_remove_device(device);
        mutex_lock(&fs_info->fs_devices->device_list_mutex);
        mutex_lock(&fs_info->chunk_mutex);
        list_del_rcu(&device->dev_list);
        list_del(&device->dev_alloc_list);
        fs_info->fs_devices->num_devices--;
        fs_info->fs_devices->open_devices--;
        fs_info->fs_devices->rw_devices--;
        fs_info->fs_devices->total_devices--;
        fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
        atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
        btrfs_set_super_total_bytes(fs_info->super_copy,
                                    orig_super_total_bytes);
        btrfs_set_super_num_devices(fs_info->super_copy,
                                    orig_super_num_devices);
        mutex_unlock(&fs_info->chunk_mutex);
        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
error_trans:
        /* A seed goes back to read-only. */
        if (seeding_dev)
                btrfs_set_sb_rdonly(sb);
        /* trans is NULL when btrfs_attach_transaction() failed above. */
        if (trans)
                btrfs_end_transaction(trans);
error_free_zone:
        btrfs_destroy_dev_zone_info(device);
error_free_device:
        btrfs_free_device(device);
error:
        fput(bdev_file);
        if (locked) {
                mutex_unlock(&uuid_mutex);
                up_write(&sb->s_umount);
        }
        return ret;
}
2883
2884 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2885                                         struct btrfs_device *device)
2886 {
2887         int ret;
2888         struct btrfs_path *path;
2889         struct btrfs_root *root = device->fs_info->chunk_root;
2890         struct btrfs_dev_item *dev_item;
2891         struct extent_buffer *leaf;
2892         struct btrfs_key key;
2893
2894         path = btrfs_alloc_path();
2895         if (!path)
2896                 return -ENOMEM;
2897
2898         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2899         key.type = BTRFS_DEV_ITEM_KEY;
2900         key.offset = device->devid;
2901
2902         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2903         if (ret < 0)
2904                 goto out;
2905
2906         if (ret > 0) {
2907                 ret = -ENOENT;
2908                 goto out;
2909         }
2910
2911         leaf = path->nodes[0];
2912         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2913
2914         btrfs_set_device_id(leaf, dev_item, device->devid);
2915         btrfs_set_device_type(leaf, dev_item, device->type);
2916         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2917         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2918         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2919         btrfs_set_device_total_bytes(leaf, dev_item,
2920                                      btrfs_device_get_disk_total_bytes(device));
2921         btrfs_set_device_bytes_used(leaf, dev_item,
2922                                     btrfs_device_get_bytes_used(device));
2923         btrfs_mark_buffer_dirty(trans, leaf);
2924
2925 out:
2926         btrfs_free_path(path);
2927         return ret;
2928 }
2929
2930 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2931                       struct btrfs_device *device, u64 new_size)
2932 {
2933         struct btrfs_fs_info *fs_info = device->fs_info;
2934         struct btrfs_super_block *super_copy = fs_info->super_copy;
2935         u64 old_total;
2936         u64 diff;
2937         int ret;
2938
2939         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2940                 return -EACCES;
2941
2942         new_size = round_down(new_size, fs_info->sectorsize);
2943
2944         mutex_lock(&fs_info->chunk_mutex);
2945         old_total = btrfs_super_total_bytes(super_copy);
2946         diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2947
2948         if (new_size <= device->total_bytes ||
2949             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2950                 mutex_unlock(&fs_info->chunk_mutex);
2951                 return -EINVAL;
2952         }
2953
2954         btrfs_set_super_total_bytes(super_copy,
2955                         round_down(old_total + diff, fs_info->sectorsize));
2956         device->fs_devices->total_rw_bytes += diff;
2957         atomic64_add(diff, &fs_info->free_chunk_space);
2958
2959         btrfs_device_set_total_bytes(device, new_size);
2960         btrfs_device_set_disk_total_bytes(device, new_size);
2961         btrfs_clear_space_info_full(device->fs_info);
2962         if (list_empty(&device->post_commit_list))
2963                 list_add_tail(&device->post_commit_list,
2964                               &trans->transaction->dev_update_list);
2965         mutex_unlock(&fs_info->chunk_mutex);
2966
2967         btrfs_reserve_chunk_metadata(trans, false);
2968         ret = btrfs_update_device(trans, device);
2969         btrfs_trans_release_chunk_metadata(trans);
2970
2971         return ret;
2972 }
2973
2974 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2975 {
2976         struct btrfs_fs_info *fs_info = trans->fs_info;
2977         struct btrfs_root *root = fs_info->chunk_root;
2978         int ret;
2979         struct btrfs_path *path;
2980         struct btrfs_key key;
2981
2982         path = btrfs_alloc_path();
2983         if (!path)
2984                 return -ENOMEM;
2985
2986         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2987         key.offset = chunk_offset;
2988         key.type = BTRFS_CHUNK_ITEM_KEY;
2989
2990         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2991         if (ret < 0)
2992                 goto out;
2993         else if (ret > 0) { /* Logic error or corruption */
2994                 btrfs_err(fs_info, "failed to lookup chunk %llu when freeing",
2995                           chunk_offset);
2996                 btrfs_abort_transaction(trans, -ENOENT);
2997                 ret = -EUCLEAN;
2998                 goto out;
2999         }
3000
3001         ret = btrfs_del_item(trans, root, path);
3002         if (ret < 0) {
3003                 btrfs_err(fs_info, "failed to delete chunk %llu item", chunk_offset);
3004                 btrfs_abort_transaction(trans, ret);
3005                 goto out;
3006         }
3007 out:
3008         btrfs_free_path(path);
3009         return ret;
3010 }
3011
3012 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
3013 {
3014         struct btrfs_super_block *super_copy = fs_info->super_copy;
3015         struct btrfs_disk_key *disk_key;
3016         struct btrfs_chunk *chunk;
3017         u8 *ptr;
3018         int ret = 0;
3019         u32 num_stripes;
3020         u32 array_size;
3021         u32 len = 0;
3022         u32 cur;
3023         struct btrfs_key key;
3024
3025         lockdep_assert_held(&fs_info->chunk_mutex);
3026         array_size = btrfs_super_sys_array_size(super_copy);
3027
3028         ptr = super_copy->sys_chunk_array;
3029         cur = 0;
3030
3031         while (cur < array_size) {
3032                 disk_key = (struct btrfs_disk_key *)ptr;
3033                 btrfs_disk_key_to_cpu(&key, disk_key);
3034
3035                 len = sizeof(*disk_key);
3036
3037                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
3038                         chunk = (struct btrfs_chunk *)(ptr + len);
3039                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
3040                         len += btrfs_chunk_item_size(num_stripes);
3041                 } else {
3042                         ret = -EIO;
3043                         break;
3044                 }
3045                 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
3046                     key.offset == chunk_offset) {
3047                         memmove(ptr, ptr + len, array_size - (cur + len));
3048                         array_size -= len;
3049                         btrfs_set_super_sys_array_size(super_copy, array_size);
3050                 } else {
3051                         ptr += len;
3052                         cur += len;
3053                 }
3054         }
3055         return ret;
3056 }
3057
3058 struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info,
3059                                                     u64 logical, u64 length)
3060 {
3061         struct rb_node *node = fs_info->mapping_tree.rb_root.rb_node;
3062         struct rb_node *prev = NULL;
3063         struct rb_node *orig_prev;
3064         struct btrfs_chunk_map *map;
3065         struct btrfs_chunk_map *prev_map = NULL;
3066
3067         while (node) {
3068                 map = rb_entry(node, struct btrfs_chunk_map, rb_node);
3069                 prev = node;
3070                 prev_map = map;
3071
3072                 if (logical < map->start) {
3073                         node = node->rb_left;
3074                 } else if (logical >= map->start + map->chunk_len) {
3075                         node = node->rb_right;
3076                 } else {
3077                         refcount_inc(&map->refs);
3078                         return map;
3079                 }
3080         }
3081
3082         if (!prev)
3083                 return NULL;
3084
3085         orig_prev = prev;
3086         while (prev && logical >= prev_map->start + prev_map->chunk_len) {
3087                 prev = rb_next(prev);
3088                 prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
3089         }
3090
3091         if (!prev) {
3092                 prev = orig_prev;
3093                 prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
3094                 while (prev && logical < prev_map->start) {
3095                         prev = rb_prev(prev);
3096                         prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
3097                 }
3098         }
3099
3100         if (prev) {
3101                 u64 end = logical + length;
3102
3103                 /*
3104                  * Caller can pass a U64_MAX length when it wants to get any
3105                  * chunk starting at an offset of 'logical' or higher, so deal
3106                  * with underflow by resetting the end offset to U64_MAX.
3107                  */
3108                 if (end < logical)
3109                         end = U64_MAX;
3110
3111                 if (end > prev_map->start &&
3112                     logical < prev_map->start + prev_map->chunk_len) {
3113                         refcount_inc(&prev_map->refs);
3114                         return prev_map;
3115                 }
3116         }
3117
3118         return NULL;
3119 }
3120
3121 struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info,
3122                                              u64 logical, u64 length)
3123 {
3124         struct btrfs_chunk_map *map;
3125
3126         read_lock(&fs_info->mapping_tree_lock);
3127         map = btrfs_find_chunk_map_nolock(fs_info, logical, length);
3128         read_unlock(&fs_info->mapping_tree_lock);
3129
3130         return map;
3131 }
3132
3133 /*
3134  * Find the mapping containing the given logical extent.
3135  *
3136  * @logical: Logical block offset in bytes.
3137  * @length: Length of extent in bytes.
3138  *
3139  * Return: Chunk mapping or ERR_PTR.
3140  */
3141 struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
3142                                             u64 logical, u64 length)
3143 {
3144         struct btrfs_chunk_map *map;
3145
3146         map = btrfs_find_chunk_map(fs_info, logical, length);
3147
3148         if (unlikely(!map)) {
3149                 btrfs_crit(fs_info,
3150                            "unable to find chunk map for logical %llu length %llu",
3151                            logical, length);
3152                 return ERR_PTR(-EINVAL);
3153         }
3154
3155         if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) {
3156                 btrfs_crit(fs_info,
3157                            "found a bad chunk map, wanted %llu-%llu, found %llu-%llu",
3158                            logical, logical + length, map->start,
3159                            map->start + map->chunk_len);
3160                 btrfs_free_chunk_map(map);
3161                 return ERR_PTR(-EINVAL);
3162         }
3163
3164         /* Callers are responsible for dropping the reference. */
3165         return map;
3166 }
3167
3168 static int remove_chunk_item(struct btrfs_trans_handle *trans,
3169                              struct btrfs_chunk_map *map, u64 chunk_offset)
3170 {
3171         int i;
3172
3173         /*
3174          * Removing chunk items and updating the device items in the chunks btree
3175          * requires holding the chunk_mutex.
3176          * See the comment at btrfs_chunk_alloc() for the details.
3177          */
3178         lockdep_assert_held(&trans->fs_info->chunk_mutex);
3179
3180         for (i = 0; i < map->num_stripes; i++) {
3181                 int ret;
3182
3183                 ret = btrfs_update_device(trans, map->stripes[i].dev);
3184                 if (ret)
3185                         return ret;
3186         }
3187
3188         return btrfs_free_chunk(trans, chunk_offset);
3189 }
3190
3191 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
3192 {
3193         struct btrfs_fs_info *fs_info = trans->fs_info;
3194         struct btrfs_chunk_map *map;
3195         u64 dev_extent_len = 0;
3196         int i, ret = 0;
3197         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
3198
3199         map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
3200         if (IS_ERR(map)) {
3201                 /*
3202                  * This is a logic error, but we don't want to just rely on the
3203                  * user having built with ASSERT enabled, so if ASSERT doesn't
3204                  * do anything we still error out.
3205                  */
3206                 ASSERT(0);
3207                 return PTR_ERR(map);
3208         }
3209
3210         /*
3211          * First delete the device extent items from the devices btree.
3212          * We take the device_list_mutex to avoid racing with the finishing phase
3213          * of a device replace operation. See the comment below before acquiring
3214          * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex
3215          * because that can result in a deadlock when deleting the device extent
3216          * items from the devices btree - COWing an extent buffer from the btree
3217          * may result in allocating a new metadata chunk, which would attempt to
3218          * lock again fs_info->chunk_mutex.
3219          */
3220         mutex_lock(&fs_devices->device_list_mutex);
3221         for (i = 0; i < map->num_stripes; i++) {
3222                 struct btrfs_device *device = map->stripes[i].dev;
3223                 ret = btrfs_free_dev_extent(trans, device,
3224                                             map->stripes[i].physical,
3225                                             &dev_extent_len);
3226                 if (ret) {
3227                         mutex_unlock(&fs_devices->device_list_mutex);
3228                         btrfs_abort_transaction(trans, ret);
3229                         goto out;
3230                 }
3231
3232                 if (device->bytes_used > 0) {
3233                         mutex_lock(&fs_info->chunk_mutex);
3234                         btrfs_device_set_bytes_used(device,
3235                                         device->bytes_used - dev_extent_len);
3236                         atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
3237                         btrfs_clear_space_info_full(fs_info);
3238                         mutex_unlock(&fs_info->chunk_mutex);
3239                 }
3240         }
3241         mutex_unlock(&fs_devices->device_list_mutex);
3242
3243         /*
3244          * We acquire fs_info->chunk_mutex for 2 reasons:
3245          *
3246          * 1) Just like with the first phase of the chunk allocation, we must
3247          *    reserve system space, do all chunk btree updates and deletions, and
3248          *    update the system chunk array in the superblock while holding this
3249          *    mutex. This is for similar reasons as explained on the comment at
3250          *    the top of btrfs_chunk_alloc();
3251          *
3252          * 2) Prevent races with the final phase of a device replace operation
3253          *    that replaces the device object associated with the map's stripes,
3254          *    because the device object's id can change at any time during that
3255          *    final phase of the device replace operation
3256          *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
3257          *    replaced device and then see it with an ID of
3258          *    BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
3259          *    the device item, which does not exists on the chunk btree.
3260          *    The finishing phase of device replace acquires both the
3261          *    device_list_mutex and the chunk_mutex, in that order, so we are
3262          *    safe by just acquiring the chunk_mutex.
3263          */
3264         trans->removing_chunk = true;
3265         mutex_lock(&fs_info->chunk_mutex);
3266
3267         check_system_chunk(trans, map->type);
3268
3269         ret = remove_chunk_item(trans, map, chunk_offset);
3270         /*
3271          * Normally we should not get -ENOSPC since we reserved space before
3272          * through the call to check_system_chunk().
3273          *
3274          * Despite our system space_info having enough free space, we may not
3275          * be able to allocate extents from its block groups, because all have
3276          * an incompatible profile, which will force us to allocate a new system
3277          * block group with the right profile, or right after we called
3278          * check_system_space() above, a scrub turned the only system block group
3279          * with enough free space into RO mode.
3280          * This is explained with more detail at do_chunk_alloc().
3281          *
3282          * So if we get -ENOSPC, allocate a new system chunk and retry once.
3283          */
3284         if (ret == -ENOSPC) {
3285                 const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
3286                 struct btrfs_block_group *sys_bg;
3287
3288                 sys_bg = btrfs_create_chunk(trans, sys_flags);
3289                 if (IS_ERR(sys_bg)) {
3290                         ret = PTR_ERR(sys_bg);
3291                         btrfs_abort_transaction(trans, ret);
3292                         goto out;
3293                 }
3294
3295                 ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
3296                 if (ret) {
3297                         btrfs_abort_transaction(trans, ret);
3298                         goto out;
3299                 }
3300
3301                 ret = remove_chunk_item(trans, map, chunk_offset);
3302                 if (ret) {
3303                         btrfs_abort_transaction(trans, ret);
3304                         goto out;
3305                 }
3306         } else if (ret) {
3307                 btrfs_abort_transaction(trans, ret);
3308                 goto out;
3309         }
3310
3311         trace_btrfs_chunk_free(fs_info, map, chunk_offset, map->chunk_len);
3312
3313         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
3314                 ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
3315                 if (ret) {
3316                         btrfs_abort_transaction(trans, ret);
3317                         goto out;
3318                 }
3319         }
3320
3321         mutex_unlock(&fs_info->chunk_mutex);
3322         trans->removing_chunk = false;
3323
3324         /*
3325          * We are done with chunk btree updates and deletions, so release the
3326          * system space we previously reserved (with check_system_chunk()).
3327          */
3328         btrfs_trans_release_chunk_metadata(trans);
3329
3330         ret = btrfs_remove_block_group(trans, map);
3331         if (ret) {
3332                 btrfs_abort_transaction(trans, ret);
3333                 goto out;
3334         }
3335
3336 out:
3337         if (trans->removing_chunk) {
3338                 mutex_unlock(&fs_info->chunk_mutex);
3339                 trans->removing_chunk = false;
3340         }
3341         /* once for us */
3342         btrfs_free_chunk_map(map);
3343         return ret;
3344 }
3345
3346 int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
3347 {
3348         struct btrfs_root *root = fs_info->chunk_root;
3349         struct btrfs_trans_handle *trans;
3350         struct btrfs_block_group *block_group;
3351         u64 length;
3352         int ret;
3353
3354         if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
3355                 btrfs_err(fs_info,
3356                           "relocate: not supported on extent tree v2 yet");
3357                 return -EINVAL;
3358         }
3359
3360         /*
3361          * Prevent races with automatic removal of unused block groups.
3362          * After we relocate and before we remove the chunk with offset
3363          * chunk_offset, automatic removal of the block group can kick in,
3364          * resulting in a failure when calling btrfs_remove_chunk() below.
3365          *
3366          * Make sure to acquire this mutex before doing a tree search (dev
3367          * or chunk trees) to find chunks. Otherwise the cleaner kthread might
3368          * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
3369          * we release the path used to search the chunk/dev tree and before
3370          * the current task acquires this mutex and calls us.
3371          */
3372         lockdep_assert_held(&fs_info->reclaim_bgs_lock);
3373
3374         /* step one, relocate all the extents inside this chunk */
3375         btrfs_scrub_pause(fs_info);
3376         ret = btrfs_relocate_block_group(fs_info, chunk_offset);
3377         btrfs_scrub_continue(fs_info);
3378         if (ret) {
3379                 /*
3380                  * If we had a transaction abort, stop all running scrubs.
3381                  * See transaction.c:cleanup_transaction() why we do it here.
3382                  */
3383                 if (BTRFS_FS_ERROR(fs_info))
3384                         btrfs_scrub_cancel(fs_info);
3385                 return ret;
3386         }
3387
3388         block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
3389         if (!block_group)
3390                 return -ENOENT;
3391         btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
3392         length = block_group->length;
3393         btrfs_put_block_group(block_group);
3394
3395         /*
3396          * On a zoned file system, discard the whole block group, this will
3397          * trigger a REQ_OP_ZONE_RESET operation on the device zone. If
3398          * resetting the zone fails, don't treat it as a fatal problem from the
3399          * filesystem's point of view.
3400          */
3401         if (btrfs_is_zoned(fs_info)) {
3402                 ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL);
3403                 if (ret)
3404                         btrfs_info(fs_info,
3405                                 "failed to reset zone %llu after relocation",
3406                                 chunk_offset);
3407         }
3408
3409         trans = btrfs_start_trans_remove_block_group(root->fs_info,
3410                                                      chunk_offset);
3411         if (IS_ERR(trans)) {
3412                 ret = PTR_ERR(trans);
3413                 btrfs_handle_fs_error(root->fs_info, ret, NULL);
3414                 return ret;
3415         }
3416
3417         /*
3418          * step two, delete the device extents and the
3419          * chunk tree entries
3420          */
3421         ret = btrfs_remove_chunk(trans, chunk_offset);
3422         btrfs_end_transaction(trans);
3423         return ret;
3424 }
3425
3426 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
3427 {
3428         struct btrfs_root *chunk_root = fs_info->chunk_root;
3429         struct btrfs_path *path;
3430         struct extent_buffer *leaf;
3431         struct btrfs_chunk *chunk;
3432         struct btrfs_key key;
3433         struct btrfs_key found_key;
3434         u64 chunk_type;
3435         bool retried = false;
3436         int failed = 0;
3437         int ret;
3438
3439         path = btrfs_alloc_path();
3440         if (!path)
3441                 return -ENOMEM;
3442
3443 again:
3444         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3445         key.offset = (u64)-1;
3446         key.type = BTRFS_CHUNK_ITEM_KEY;
3447
3448         while (1) {
3449                 mutex_lock(&fs_info->reclaim_bgs_lock);
3450                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3451                 if (ret < 0) {
3452                         mutex_unlock(&fs_info->reclaim_bgs_lock);
3453                         goto error;
3454                 }
3455                 if (ret == 0) {
3456                         /*
3457                          * On the first search we would find chunk tree with
3458                          * offset -1, which is not possible. On subsequent
3459                          * loops this would find an existing item on an invalid
3460                          * offset (one less than the previous one, wrong
3461                          * alignment and size).
3462                          */
3463                         ret = -EUCLEAN;
3464                         mutex_unlock(&fs_info->reclaim_bgs_lock);
3465                         goto error;
3466                 }
3467
3468                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
3469                                           key.type);
3470                 if (ret)
3471                         mutex_unlock(&fs_info->reclaim_bgs_lock);
3472                 if (ret < 0)
3473                         goto error;
3474                 if (ret > 0)
3475                         break;
3476
3477                 leaf = path->nodes[0];
3478                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3479
3480                 chunk = btrfs_item_ptr(leaf, path->slots[0],
3481                                        struct btrfs_chunk);
3482                 chunk_type = btrfs_chunk_type(leaf, chunk);
3483                 btrfs_release_path(path);
3484
3485                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
3486                         ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3487                         if (ret == -ENOSPC)
3488                                 failed++;
3489                         else
3490                                 BUG_ON(ret);
3491                 }
3492                 mutex_unlock(&fs_info->reclaim_bgs_lock);
3493
3494                 if (found_key.offset == 0)
3495                         break;
3496                 key.offset = found_key.offset - 1;
3497         }
3498         ret = 0;
3499         if (failed && !retried) {
3500                 failed = 0;
3501                 retried = true;
3502                 goto again;
3503         } else if (WARN_ON(failed && retried)) {
3504                 ret = -ENOSPC;
3505         }
3506 error:
3507         btrfs_free_path(path);
3508         return ret;
3509 }
3510
3511 /*
3512  * return 1 : allocate a data chunk successfully,
3513  * return <0: errors during allocating a data chunk,
3514  * return 0 : no need to allocate a data chunk.
3515  */
3516 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3517                                       u64 chunk_offset)
3518 {
3519         struct btrfs_block_group *cache;
3520         u64 bytes_used;
3521         u64 chunk_type;
3522
3523         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3524         ASSERT(cache);
3525         chunk_type = cache->flags;
3526         btrfs_put_block_group(cache);
3527
3528         if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
3529                 return 0;
3530
3531         spin_lock(&fs_info->data_sinfo->lock);
3532         bytes_used = fs_info->data_sinfo->bytes_used;
3533         spin_unlock(&fs_info->data_sinfo->lock);
3534
3535         if (!bytes_used) {
3536                 struct btrfs_trans_handle *trans;
3537                 int ret;
3538
3539                 trans = btrfs_join_transaction(fs_info->tree_root);
3540                 if (IS_ERR(trans))
3541                         return PTR_ERR(trans);
3542
3543                 ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
3544                 btrfs_end_transaction(trans);
3545                 if (ret < 0)
3546                         return ret;
3547                 return 1;
3548         }
3549
3550         return 0;
3551 }
3552
3553 static void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
3554                                            const struct btrfs_disk_balance_args *disk)
3555 {
3556         memset(cpu, 0, sizeof(*cpu));
3557
3558         cpu->profiles = le64_to_cpu(disk->profiles);
3559         cpu->usage = le64_to_cpu(disk->usage);
3560         cpu->devid = le64_to_cpu(disk->devid);
3561         cpu->pstart = le64_to_cpu(disk->pstart);
3562         cpu->pend = le64_to_cpu(disk->pend);
3563         cpu->vstart = le64_to_cpu(disk->vstart);
3564         cpu->vend = le64_to_cpu(disk->vend);
3565         cpu->target = le64_to_cpu(disk->target);
3566         cpu->flags = le64_to_cpu(disk->flags);
3567         cpu->limit = le64_to_cpu(disk->limit);
3568         cpu->stripes_min = le32_to_cpu(disk->stripes_min);
3569         cpu->stripes_max = le32_to_cpu(disk->stripes_max);
3570 }
3571
3572 static void btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
3573                                            const struct btrfs_balance_args *cpu)
3574 {
3575         memset(disk, 0, sizeof(*disk));
3576
3577         disk->profiles = cpu_to_le64(cpu->profiles);
3578         disk->usage = cpu_to_le64(cpu->usage);
3579         disk->devid = cpu_to_le64(cpu->devid);
3580         disk->pstart = cpu_to_le64(cpu->pstart);
3581         disk->pend = cpu_to_le64(cpu->pend);
3582         disk->vstart = cpu_to_le64(cpu->vstart);
3583         disk->vend = cpu_to_le64(cpu->vend);
3584         disk->target = cpu_to_le64(cpu->target);
3585         disk->flags = cpu_to_le64(cpu->flags);
3586         disk->limit = cpu_to_le64(cpu->limit);
3587         disk->stripes_min = cpu_to_le32(cpu->stripes_min);
3588         disk->stripes_max = cpu_to_le32(cpu->stripes_max);
3589 }
3590
3591 static int insert_balance_item(struct btrfs_fs_info *fs_info,
3592                                struct btrfs_balance_control *bctl)
3593 {
3594         struct btrfs_root *root = fs_info->tree_root;
3595         struct btrfs_trans_handle *trans;
3596         struct btrfs_balance_item *item;
3597         struct btrfs_disk_balance_args disk_bargs;
3598         struct btrfs_path *path;
3599         struct extent_buffer *leaf;
3600         struct btrfs_key key;
3601         int ret, err;
3602
3603         path = btrfs_alloc_path();
3604         if (!path)
3605                 return -ENOMEM;
3606
3607         trans = btrfs_start_transaction(root, 0);
3608         if (IS_ERR(trans)) {
3609                 btrfs_free_path(path);
3610                 return PTR_ERR(trans);
3611         }
3612
3613         key.objectid = BTRFS_BALANCE_OBJECTID;
3614         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3615         key.offset = 0;
3616
3617         ret = btrfs_insert_empty_item(trans, root, path, &key,
3618                                       sizeof(*item));
3619         if (ret)
3620                 goto out;
3621
3622         leaf = path->nodes[0];
3623         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3624
3625         memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3626
3627         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3628         btrfs_set_balance_data(leaf, item, &disk_bargs);
3629         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3630         btrfs_set_balance_meta(leaf, item, &disk_bargs);
3631         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3632         btrfs_set_balance_sys(leaf, item, &disk_bargs);
3633
3634         btrfs_set_balance_flags(leaf, item, bctl->flags);
3635
3636         btrfs_mark_buffer_dirty(trans, leaf);
3637 out:
3638         btrfs_free_path(path);
3639         err = btrfs_commit_transaction(trans);
3640         if (err && !ret)
3641                 ret = err;
3642         return ret;
3643 }
3644
3645 static int del_balance_item(struct btrfs_fs_info *fs_info)
3646 {
3647         struct btrfs_root *root = fs_info->tree_root;
3648         struct btrfs_trans_handle *trans;
3649         struct btrfs_path *path;
3650         struct btrfs_key key;
3651         int ret, err;
3652
3653         path = btrfs_alloc_path();
3654         if (!path)
3655                 return -ENOMEM;
3656
3657         trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
3658         if (IS_ERR(trans)) {
3659                 btrfs_free_path(path);
3660                 return PTR_ERR(trans);
3661         }
3662
3663         key.objectid = BTRFS_BALANCE_OBJECTID;
3664         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3665         key.offset = 0;
3666
3667         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3668         if (ret < 0)
3669                 goto out;
3670         if (ret > 0) {
3671                 ret = -ENOENT;
3672                 goto out;
3673         }
3674
3675         ret = btrfs_del_item(trans, root, path);
3676 out:
3677         btrfs_free_path(path);
3678         err = btrfs_commit_transaction(trans);
3679         if (err && !ret)
3680                 ret = err;
3681         return ret;
3682 }
3683
3684 /*
3685  * This is a heuristic used to reduce the number of chunks balanced on
3686  * resume after balance was interrupted.
3687  */
3688 static void update_balance_args(struct btrfs_balance_control *bctl)
3689 {
3690         /*
3691          * Turn on soft mode for chunk types that were being converted.
3692          */
3693         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3694                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3695         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3696                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3697         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3698                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3699
3700         /*
3701          * Turn on usage filter if is not already used.  The idea is
3702          * that chunks that we have already balanced should be
3703          * reasonably full.  Don't do it for chunks that are being
3704          * converted - that will keep us from relocating unconverted
3705          * (albeit full) chunks.
3706          */
3707         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3708             !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3709             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3710                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3711                 bctl->data.usage = 90;
3712         }
3713         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3714             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3715             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3716                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3717                 bctl->sys.usage = 90;
3718         }
3719         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3720             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3721             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3722                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3723                 bctl->meta.usage = 90;
3724         }
3725 }
3726
3727 /*
3728  * Clear the balance status in fs_info and delete the balance item from disk.
3729  */
3730 static void reset_balance_state(struct btrfs_fs_info *fs_info)
3731 {
3732         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3733         int ret;
3734
3735         ASSERT(fs_info->balance_ctl);
3736
3737         spin_lock(&fs_info->balance_lock);
3738         fs_info->balance_ctl = NULL;
3739         spin_unlock(&fs_info->balance_lock);
3740
3741         kfree(bctl);
3742         ret = del_balance_item(fs_info);
3743         if (ret)
3744                 btrfs_handle_fs_error(fs_info, ret, NULL);
3745 }
3746
3747 /*
3748  * Balance filters.  Return 1 if chunk should be filtered out
3749  * (should not be balanced).
3750  */
3751 static int chunk_profiles_filter(u64 chunk_type,
3752                                  struct btrfs_balance_args *bargs)
3753 {
3754         chunk_type = chunk_to_extended(chunk_type) &
3755                                 BTRFS_EXTENDED_PROFILE_MASK;
3756
3757         if (bargs->profiles & chunk_type)
3758                 return 0;
3759
3760         return 1;
3761 }
3762
3763 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
3764                               struct btrfs_balance_args *bargs)
3765 {
3766         struct btrfs_block_group *cache;
3767         u64 chunk_used;
3768         u64 user_thresh_min;
3769         u64 user_thresh_max;
3770         int ret = 1;
3771
3772         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3773         chunk_used = cache->used;
3774
3775         if (bargs->usage_min == 0)
3776                 user_thresh_min = 0;
3777         else
3778                 user_thresh_min = mult_perc(cache->length, bargs->usage_min);
3779
3780         if (bargs->usage_max == 0)
3781                 user_thresh_max = 1;
3782         else if (bargs->usage_max > 100)
3783                 user_thresh_max = cache->length;
3784         else
3785                 user_thresh_max = mult_perc(cache->length, bargs->usage_max);
3786
3787         if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3788                 ret = 0;
3789
3790         btrfs_put_block_group(cache);
3791         return ret;
3792 }
3793
3794 static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
3795                 u64 chunk_offset, struct btrfs_balance_args *bargs)
3796 {
3797         struct btrfs_block_group *cache;
3798         u64 chunk_used, user_thresh;
3799         int ret = 1;
3800
3801         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3802         chunk_used = cache->used;
3803
3804         if (bargs->usage_min == 0)
3805                 user_thresh = 1;
3806         else if (bargs->usage > 100)
3807                 user_thresh = cache->length;
3808         else
3809                 user_thresh = mult_perc(cache->length, bargs->usage);
3810
3811         if (chunk_used < user_thresh)
3812                 ret = 0;
3813
3814         btrfs_put_block_group(cache);
3815         return ret;
3816 }
3817
3818 static int chunk_devid_filter(struct extent_buffer *leaf,
3819                               struct btrfs_chunk *chunk,
3820                               struct btrfs_balance_args *bargs)
3821 {
3822         struct btrfs_stripe *stripe;
3823         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3824         int i;
3825
3826         for (i = 0; i < num_stripes; i++) {
3827                 stripe = btrfs_stripe_nr(chunk, i);
3828                 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3829                         return 0;
3830         }
3831
3832         return 1;
3833 }
3834
3835 static u64 calc_data_stripes(u64 type, int num_stripes)
3836 {
3837         const int index = btrfs_bg_flags_to_raid_index(type);
3838         const int ncopies = btrfs_raid_array[index].ncopies;
3839         const int nparity = btrfs_raid_array[index].nparity;
3840
3841         return (num_stripes - nparity) / ncopies;
3842 }
3843
3844 /* [pstart, pend) */
3845 static int chunk_drange_filter(struct extent_buffer *leaf,
3846                                struct btrfs_chunk *chunk,
3847                                struct btrfs_balance_args *bargs)
3848 {
3849         struct btrfs_stripe *stripe;
3850         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3851         u64 stripe_offset;
3852         u64 stripe_length;
3853         u64 type;
3854         int factor;
3855         int i;
3856
3857         if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3858                 return 0;
3859
3860         type = btrfs_chunk_type(leaf, chunk);
3861         factor = calc_data_stripes(type, num_stripes);
3862
3863         for (i = 0; i < num_stripes; i++) {
3864                 stripe = btrfs_stripe_nr(chunk, i);
3865                 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
3866                         continue;
3867
3868                 stripe_offset = btrfs_stripe_offset(leaf, stripe);
3869                 stripe_length = btrfs_chunk_length(leaf, chunk);
3870                 stripe_length = div_u64(stripe_length, factor);
3871
3872                 if (stripe_offset < bargs->pend &&
3873                     stripe_offset + stripe_length > bargs->pstart)
3874                         return 0;
3875         }
3876
3877         return 1;
3878 }
3879
3880 /* [vstart, vend) */
3881 static int chunk_vrange_filter(struct extent_buffer *leaf,
3882                                struct btrfs_chunk *chunk,
3883                                u64 chunk_offset,
3884                                struct btrfs_balance_args *bargs)
3885 {
3886         if (chunk_offset < bargs->vend &&
3887             chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3888                 /* at least part of the chunk is inside this vrange */
3889                 return 0;
3890
3891         return 1;
3892 }
3893
3894 static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3895                                struct btrfs_chunk *chunk,
3896                                struct btrfs_balance_args *bargs)
3897 {
3898         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3899
3900         if (bargs->stripes_min <= num_stripes
3901                         && num_stripes <= bargs->stripes_max)
3902                 return 0;
3903
3904         return 1;
3905 }
3906
3907 static int chunk_soft_convert_filter(u64 chunk_type,
3908                                      struct btrfs_balance_args *bargs)
3909 {
3910         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3911                 return 0;
3912
3913         chunk_type = chunk_to_extended(chunk_type) &
3914                                 BTRFS_EXTENDED_PROFILE_MASK;
3915
3916         if (bargs->target == chunk_type)
3917                 return 1;
3918
3919         return 0;
3920 }
3921
3922 static int should_balance_chunk(struct extent_buffer *leaf,
3923                                 struct btrfs_chunk *chunk, u64 chunk_offset)
3924 {
3925         struct btrfs_fs_info *fs_info = leaf->fs_info;
3926         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3927         struct btrfs_balance_args *bargs = NULL;
3928         u64 chunk_type = btrfs_chunk_type(leaf, chunk);
3929
3930         /* type filter */
3931         if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
3932               (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
3933                 return 0;
3934         }
3935
3936         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3937                 bargs = &bctl->data;
3938         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3939                 bargs = &bctl->sys;
3940         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3941                 bargs = &bctl->meta;
3942
3943         /* profiles filter */
3944         if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
3945             chunk_profiles_filter(chunk_type, bargs)) {
3946                 return 0;
3947         }
3948
3949         /* usage filter */
3950         if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
3951             chunk_usage_filter(fs_info, chunk_offset, bargs)) {
3952                 return 0;
3953         } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3954             chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
3955                 return 0;
3956         }
3957
3958         /* devid filter */
3959         if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
3960             chunk_devid_filter(leaf, chunk, bargs)) {
3961                 return 0;
3962         }
3963
3964         /* drange filter, makes sense only with devid filter */
3965         if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
3966             chunk_drange_filter(leaf, chunk, bargs)) {
3967                 return 0;
3968         }
3969
3970         /* vrange filter */
3971         if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
3972             chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
3973                 return 0;
3974         }
3975
3976         /* stripes filter */
3977         if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
3978             chunk_stripes_range_filter(leaf, chunk, bargs)) {
3979                 return 0;
3980         }
3981
3982         /* soft profile changing mode */
3983         if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3984             chunk_soft_convert_filter(chunk_type, bargs)) {
3985                 return 0;
3986         }
3987
3988         /*
3989          * limited by count, must be the last filter
3990          */
3991         if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
3992                 if (bargs->limit == 0)
3993                         return 0;
3994                 else
3995                         bargs->limit--;
3996         } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
3997                 /*
3998                  * Same logic as the 'limit' filter; the minimum cannot be
3999                  * determined here because we do not have the global information
4000                  * about the count of all chunks that satisfy the filters.
4001                  */
4002                 if (bargs->limit_max == 0)
4003                         return 0;
4004                 else
4005                         bargs->limit_max--;
4006         }
4007
4008         return 1;
4009 }
4010
4011 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
4012 {
4013         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4014         struct btrfs_root *chunk_root = fs_info->chunk_root;
4015         u64 chunk_type;
4016         struct btrfs_chunk *chunk;
4017         struct btrfs_path *path = NULL;
4018         struct btrfs_key key;
4019         struct btrfs_key found_key;
4020         struct extent_buffer *leaf;
4021         int slot;
4022         int ret;
4023         int enospc_errors = 0;
4024         bool counting = true;
4025         /* The single value limit and min/max limits use the same bytes in the */
4026         u64 limit_data = bctl->data.limit;
4027         u64 limit_meta = bctl->meta.limit;
4028         u64 limit_sys = bctl->sys.limit;
4029         u32 count_data = 0;
4030         u32 count_meta = 0;
4031         u32 count_sys = 0;
4032         int chunk_reserved = 0;
4033
4034         path = btrfs_alloc_path();
4035         if (!path) {
4036                 ret = -ENOMEM;
4037                 goto error;
4038         }
4039
4040         /* zero out stat counters */
4041         spin_lock(&fs_info->balance_lock);
4042         memset(&bctl->stat, 0, sizeof(bctl->stat));
4043         spin_unlock(&fs_info->balance_lock);
4044 again:
4045         if (!counting) {
4046                 /*
4047                  * The single value limit and min/max limits use the same bytes
4048                  * in the
4049                  */
4050                 bctl->data.limit = limit_data;
4051                 bctl->meta.limit = limit_meta;
4052                 bctl->sys.limit = limit_sys;
4053         }
4054         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
4055         key.offset = (u64)-1;
4056         key.type = BTRFS_CHUNK_ITEM_KEY;
4057
4058         while (1) {
4059                 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
4060                     atomic_read(&fs_info->balance_cancel_req)) {
4061                         ret = -ECANCELED;
4062                         goto error;
4063                 }
4064
4065                 mutex_lock(&fs_info->reclaim_bgs_lock);
4066                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
4067                 if (ret < 0) {
4068                         mutex_unlock(&fs_info->reclaim_bgs_lock);
4069                         goto error;
4070                 }
4071
4072                 /*
4073                  * this shouldn't happen, it means the last relocate
4074                  * failed
4075                  */
4076                 if (ret == 0)
4077                         BUG(); /* FIXME break ? */
4078
4079                 ret = btrfs_previous_item(chunk_root, path, 0,
4080                                           BTRFS_CHUNK_ITEM_KEY);
4081                 if (ret) {
4082                         mutex_unlock(&fs_info->reclaim_bgs_lock);
4083                         ret = 0;
4084                         break;
4085                 }
4086
4087                 leaf = path->nodes[0];
4088                 slot = path->slots[0];
4089                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
4090
4091                 if (found_key.objectid != key.objectid) {
4092                         mutex_unlock(&fs_info->reclaim_bgs_lock);
4093                         break;
4094                 }
4095
4096                 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
4097                 chunk_type = btrfs_chunk_type(leaf, chunk);
4098
4099                 if (!counting) {
4100                         spin_lock(&fs_info->balance_lock);
4101                         bctl->stat.considered++;
4102                         spin_unlock(&fs_info->balance_lock);
4103                 }
4104
4105                 ret = should_balance_chunk(leaf, chunk, found_key.offset);
4106
4107                 btrfs_release_path(path);
4108                 if (!ret) {
4109                         mutex_unlock(&fs_info->reclaim_bgs_lock);
4110                         goto loop;
4111                 }
4112
4113                 if (counting) {
4114                         mutex_unlock(&fs_info->reclaim_bgs_lock);
4115                         spin_lock(&fs_info->balance_lock);
4116                         bctl->stat.expected++;
4117                         spin_unlock(&fs_info->balance_lock);
4118
4119                         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
4120                                 count_data++;
4121                         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
4122                                 count_sys++;
4123                         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
4124                                 count_meta++;
4125
4126                         goto loop;
4127                 }
4128
4129                 /*
4130                  * Apply limit_min filter, no need to check if the LIMITS
4131                  * filter is used, limit_min is 0 by default
4132                  */
4133                 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
4134                                         count_data < bctl->data.limit_min)
4135                                 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
4136                                         count_meta < bctl->meta.limit_min)
4137                                 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
4138                                         count_sys < bctl->sys.limit_min)) {
4139                         mutex_unlock(&fs_info->reclaim_bgs_lock);
4140                         goto loop;
4141                 }
4142
4143                 if (!chunk_reserved) {
4144                         /*
4145                          * We may be relocating the only data chunk we have,
4146                          * which could potentially end up with losing data's
4147                          * raid profile, so lets allocate an empty one in
4148                          * advance.
4149                          */
4150                         ret = btrfs_may_alloc_data_chunk(fs_info,
4151                                                          found_key.offset);
4152                         if (ret < 0) {
4153                                 mutex_unlock(&fs_info->reclaim_bgs_lock);
4154                                 goto error;
4155                         } else if (ret == 1) {
4156                                 chunk_reserved = 1;
4157                         }
4158                 }
4159
4160                 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
4161                 mutex_unlock(&fs_info->reclaim_bgs_lock);
4162                 if (ret == -ENOSPC) {
4163                         enospc_errors++;
4164                 } else if (ret == -ETXTBSY) {
4165                         btrfs_info(fs_info,
4166            "skipping relocation of block group %llu due to active swapfile",
4167                                    found_key.offset);
4168                         ret = 0;
4169                 } else if (ret) {
4170                         goto error;
4171                 } else {
4172                         spin_lock(&fs_info->balance_lock);
4173                         bctl->stat.completed++;
4174                         spin_unlock(&fs_info->balance_lock);
4175                 }
4176 loop:
4177                 if (found_key.offset == 0)
4178                         break;
4179                 key.offset = found_key.offset - 1;
4180         }
4181
4182         if (counting) {
4183                 btrfs_release_path(path);
4184                 counting = false;
4185                 goto again;
4186         }
4187 error:
4188         btrfs_free_path(path);
4189         if (enospc_errors) {
4190                 btrfs_info(fs_info, "%d enospc errors during balance",
4191                            enospc_errors);
4192                 if (!ret)
4193                         ret = -ENOSPC;
4194         }
4195
4196         return ret;
4197 }
4198
4199 /*
4200  * See if a given profile is valid and reduced.
4201  *
4202  * @flags:     profile to validate
4203  * @extended:  if true @flags is treated as an extended profile
4204  */
4205 static int alloc_profile_is_valid(u64 flags, int extended)
4206 {
4207         u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
4208                                BTRFS_BLOCK_GROUP_PROFILE_MASK);
4209
4210         flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
4211
4212         /* 1) check that all other bits are zeroed */
4213         if (flags & ~mask)
4214                 return 0;
4215
4216         /* 2) see if profile is reduced */
4217         if (flags == 0)
4218                 return !extended; /* "0" is valid for usual profiles */
4219
4220         return has_single_bit_set(flags);
4221 }
4222
4223 /*
4224  * Validate target profile against allowed profiles and return true if it's OK.
4225  * Otherwise print the error message and return false.
4226  */
4227 static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
4228                 const struct btrfs_balance_args *bargs,
4229                 u64 allowed, const char *type)
4230 {
4231         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
4232                 return true;
4233
4234         /* Profile is valid and does not have bits outside of the allowed set */
4235         if (alloc_profile_is_valid(bargs->target, 1) &&
4236             (bargs->target & ~allowed) == 0)
4237                 return true;
4238
4239         btrfs_err(fs_info, "balance: invalid convert %s profile %s",
4240                         type, btrfs_bg_type_to_raid_name(bargs->target));
4241         return false;
4242 }
4243
4244 /*
4245  * Fill @buf with textual description of balance filter flags @bargs, up to
4246  * @size_buf including the terminating null. The output may be trimmed if it
4247  * does not fit into the provided buffer.
4248  */
4249 static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
4250                                  u32 size_buf)
4251 {
4252         int ret;
4253         u32 size_bp = size_buf;
4254         char *bp = buf;
4255         u64 flags = bargs->flags;
4256         char tmp_buf[128] = {'\0'};
4257
4258         if (!flags)
4259                 return;
4260
4261 #define CHECK_APPEND_NOARG(a)                                           \
4262         do {                                                            \
4263                 ret = snprintf(bp, size_bp, (a));                       \
4264                 if (ret < 0 || ret >= size_bp)                          \
4265                         goto out_overflow;                              \
4266                 size_bp -= ret;                                         \
4267                 bp += ret;                                              \
4268         } while (0)
4269
4270 #define CHECK_APPEND_1ARG(a, v1)                                        \
4271         do {                                                            \
4272                 ret = snprintf(bp, size_bp, (a), (v1));                 \
4273                 if (ret < 0 || ret >= size_bp)                          \
4274                         goto out_overflow;                              \
4275                 size_bp -= ret;                                         \
4276                 bp += ret;                                              \
4277         } while (0)
4278
4279 #define CHECK_APPEND_2ARG(a, v1, v2)                                    \
4280         do {                                                            \
4281                 ret = snprintf(bp, size_bp, (a), (v1), (v2));           \
4282                 if (ret < 0 || ret >= size_bp)                          \
4283                         goto out_overflow;                              \
4284                 size_bp -= ret;                                         \
4285                 bp += ret;                                              \
4286         } while (0)
4287
4288         if (flags & BTRFS_BALANCE_ARGS_CONVERT)
4289                 CHECK_APPEND_1ARG("convert=%s,",
4290                                   btrfs_bg_type_to_raid_name(bargs->target));
4291
4292         if (flags & BTRFS_BALANCE_ARGS_SOFT)
4293                 CHECK_APPEND_NOARG("soft,");
4294
4295         if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
4296                 btrfs_describe_block_groups(bargs->profiles, tmp_buf,
4297                                             sizeof(tmp_buf));
4298                 CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
4299         }
4300
4301         if (flags & BTRFS_BALANCE_ARGS_USAGE)
4302                 CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
4303
4304         if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
4305                 CHECK_APPEND_2ARG("usage=%u..%u,",
4306                                   bargs->usage_min, bargs->usage_max);
4307
4308         if (flags & BTRFS_BALANCE_ARGS_DEVID)
4309                 CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
4310
4311         if (flags & BTRFS_BALANCE_ARGS_DRANGE)
4312                 CHECK_APPEND_2ARG("drange=%llu..%llu,",
4313                                   bargs->pstart, bargs->pend);
4314
4315         if (flags & BTRFS_BALANCE_ARGS_VRANGE)
4316                 CHECK_APPEND_2ARG("vrange=%llu..%llu,",
4317                                   bargs->vstart, bargs->vend);
4318
4319         if (flags & BTRFS_BALANCE_ARGS_LIMIT)
4320                 CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
4321
4322         if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
4323                 CHECK_APPEND_2ARG("limit=%u..%u,",
4324                                 bargs->limit_min, bargs->limit_max);
4325
4326         if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
4327                 CHECK_APPEND_2ARG("stripes=%u..%u,",
4328                                   bargs->stripes_min, bargs->stripes_max);
4329
4330 #undef CHECK_APPEND_2ARG
4331 #undef CHECK_APPEND_1ARG
4332 #undef CHECK_APPEND_NOARG
4333
4334 out_overflow:
4335
4336         if (size_bp < size_buf)
4337                 buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
4338         else
4339                 buf[0] = '\0';
4340 }
4341
4342 static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
4343 {
4344         u32 size_buf = 1024;
4345         char tmp_buf[192] = {'\0'};
4346         char *buf;
4347         char *bp;
4348         u32 size_bp = size_buf;
4349         int ret;
4350         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4351
4352         buf = kzalloc(size_buf, GFP_KERNEL);
4353         if (!buf)
4354                 return;
4355
4356         bp = buf;
4357
4358 #define CHECK_APPEND_1ARG(a, v1)                                        \
4359         do {                                                            \
4360                 ret = snprintf(bp, size_bp, (a), (v1));                 \
4361                 if (ret < 0 || ret >= size_bp)                          \
4362                         goto out_overflow;                              \
4363                 size_bp -= ret;                                         \
4364                 bp += ret;                                              \
4365         } while (0)
4366
4367         if (bctl->flags & BTRFS_BALANCE_FORCE)
4368                 CHECK_APPEND_1ARG("%s", "-f ");
4369
4370         if (bctl->flags & BTRFS_BALANCE_DATA) {
4371                 describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
4372                 CHECK_APPEND_1ARG("-d%s ", tmp_buf);
4373         }
4374
4375         if (bctl->flags & BTRFS_BALANCE_METADATA) {
4376                 describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
4377                 CHECK_APPEND_1ARG("-m%s ", tmp_buf);
4378         }
4379
4380         if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
4381                 describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
4382                 CHECK_APPEND_1ARG("-s%s ", tmp_buf);
4383         }
4384
4385 #undef CHECK_APPEND_1ARG
4386
4387 out_overflow:
4388
4389         if (size_bp < size_buf)
4390                 buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
4391         btrfs_info(fs_info, "balance: %s %s",
4392                    (bctl->flags & BTRFS_BALANCE_RESUME) ?
4393                    "resume" : "start", buf);
4394
4395         kfree(buf);
4396 }
4397
4398 /*
4399  * Should be called with balance mutexe held
4400  */
4401 int btrfs_balance(struct btrfs_fs_info *fs_info,
4402                   struct btrfs_balance_control *bctl,
4403                   struct btrfs_ioctl_balance_args *bargs)
4404 {
4405         u64 meta_target, data_target;
4406         u64 allowed;
4407         int mixed = 0;
4408         int ret;
4409         u64 num_devices;
4410         unsigned seq;
4411         bool reducing_redundancy;
4412         bool paused = false;
4413         int i;
4414
4415         if (btrfs_fs_closing(fs_info) ||
4416             atomic_read(&fs_info->balance_pause_req) ||
4417             btrfs_should_cancel_balance(fs_info)) {
4418                 ret = -EINVAL;
4419                 goto out;
4420         }
4421
4422         allowed = btrfs_super_incompat_flags(fs_info->super_copy);
4423         if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
4424                 mixed = 1;
4425
4426         /*
4427          * In case of mixed groups both data and meta should be picked,
4428          * and identical options should be given for both of them.
4429          */
4430         allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
4431         if (mixed && (bctl->flags & allowed)) {
4432                 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
4433                     !(bctl->flags & BTRFS_BALANCE_METADATA) ||
4434                     memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
4435                         btrfs_err(fs_info,
4436           "balance: mixed groups data and metadata options must be the same");
4437                         ret = -EINVAL;
4438                         goto out;
4439                 }
4440         }
4441
4442         /*
4443          * rw_devices will not change at the moment, device add/delete/replace
4444          * are exclusive
4445          */
4446         num_devices = fs_info->fs_devices->rw_devices;
4447
4448         /*
4449          * SINGLE profile on-disk has no profile bit, but in-memory we have a
4450          * special bit for it, to make it easier to distinguish.  Thus we need
4451          * to set it manually, or balance would refuse the profile.
4452          */
4453         allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
4454         for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
4455                 if (num_devices >= btrfs_raid_array[i].devs_min)
4456                         allowed |= btrfs_raid_array[i].bg_flag;
4457
4458         if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
4459             !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
4460             !validate_convert_profile(fs_info, &bctl->sys,  allowed, "system")) {
4461                 ret = -EINVAL;
4462                 goto out;
4463         }
4464
4465         /*
4466          * Allow to reduce metadata or system integrity only if force set for
4467          * profiles with redundancy (copies, parity)
4468          */
4469         allowed = 0;
4470         for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
4471                 if (btrfs_raid_array[i].ncopies >= 2 ||
4472                     btrfs_raid_array[i].tolerated_failures >= 1)
4473                         allowed |= btrfs_raid_array[i].bg_flag;
4474         }
4475         do {
4476                 seq = read_seqbegin(&fs_info->profiles_lock);
4477
4478                 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4479                      (fs_info->avail_system_alloc_bits & allowed) &&
4480                      !(bctl->sys.target & allowed)) ||
4481                     ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4482                      (fs_info->avail_metadata_alloc_bits & allowed) &&
4483                      !(bctl->meta.target & allowed)))
4484                         reducing_redundancy = true;
4485                 else
4486                         reducing_redundancy = false;
4487
4488                 /* if we're not converting, the target field is uninitialized */
4489                 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4490                         bctl->meta.target : fs_info->avail_metadata_alloc_bits;
4491                 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4492                         bctl->data.target : fs_info->avail_data_alloc_bits;
4493         } while (read_seqretry(&fs_info->profiles_lock, seq));
4494
4495         if (reducing_redundancy) {
4496                 if (bctl->flags & BTRFS_BALANCE_FORCE) {
4497                         btrfs_info(fs_info,
4498                            "balance: force reducing metadata redundancy");
4499                 } else {
4500                         btrfs_err(fs_info,
4501         "balance: reduces metadata redundancy, use --force if you want this");
4502                         ret = -EINVAL;
4503                         goto out;
4504                 }
4505         }
4506
4507         if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
4508                 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
4509                 btrfs_warn(fs_info,
4510         "balance: metadata profile %s has lower redundancy than data profile %s",
4511                                 btrfs_bg_type_to_raid_name(meta_target),
4512                                 btrfs_bg_type_to_raid_name(data_target));
4513         }
4514
4515         ret = insert_balance_item(fs_info, bctl);
4516         if (ret && ret != -EEXIST)
4517                 goto out;
4518
4519         if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
4520                 BUG_ON(ret == -EEXIST);
4521                 BUG_ON(fs_info->balance_ctl);
4522                 spin_lock(&fs_info->balance_lock);
4523                 fs_info->balance_ctl = bctl;
4524                 spin_unlock(&fs_info->balance_lock);
4525         } else {
4526                 BUG_ON(ret != -EEXIST);
4527                 spin_lock(&fs_info->balance_lock);
4528                 update_balance_args(bctl);
4529                 spin_unlock(&fs_info->balance_lock);
4530         }
4531
4532         ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4533         set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4534         describe_balance_start_or_resume(fs_info);
4535         mutex_unlock(&fs_info->balance_mutex);
4536
4537         ret = __btrfs_balance(fs_info);
4538
4539         mutex_lock(&fs_info->balance_mutex);
4540         if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) {
4541                 btrfs_info(fs_info, "balance: paused");
4542                 btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
4543                 paused = true;
4544         }
4545         /*
4546          * Balance can be canceled by:
4547          *
4548          * - Regular cancel request
4549          *   Then ret == -ECANCELED and balance_cancel_req > 0
4550          *
4551          * - Fatal signal to "btrfs" process
4552          *   Either the signal caught by wait_reserve_ticket() and callers
4553          *   got -EINTR, or caught by btrfs_should_cancel_balance() and
4554          *   got -ECANCELED.
4555          *   Either way, in this case balance_cancel_req = 0, and
4556          *   ret == -EINTR or ret == -ECANCELED.
4557          *
4558          * So here we only check the return value to catch canceled balance.
4559          */
4560         else if (ret == -ECANCELED || ret == -EINTR)
4561                 btrfs_info(fs_info, "balance: canceled");
4562         else
4563                 btrfs_info(fs_info, "balance: ended with status: %d", ret);
4564
4565         clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4566
4567         if (bargs) {
4568                 memset(bargs, 0, sizeof(*bargs));
4569                 btrfs_update_ioctl_balance_args(fs_info, bargs);
4570         }
4571
4572         /* We didn't pause, we can clean everything up. */
4573         if (!paused) {
4574                 reset_balance_state(fs_info);
4575                 btrfs_exclop_finish(fs_info);
4576         }
4577
4578         wake_up(&fs_info->balance_wait_q);
4579
4580         return ret;
4581 out:
4582         if (bctl->flags & BTRFS_BALANCE_RESUME)
4583                 reset_balance_state(fs_info);
4584         else
4585                 kfree(bctl);
4586         btrfs_exclop_finish(fs_info);
4587
4588         return ret;
4589 }
4590
4591 static int balance_kthread(void *data)
4592 {
4593         struct btrfs_fs_info *fs_info = data;
4594         int ret = 0;
4595
4596         sb_start_write(fs_info->sb);
4597         mutex_lock(&fs_info->balance_mutex);
4598         if (fs_info->balance_ctl)
4599                 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
4600         mutex_unlock(&fs_info->balance_mutex);
4601         sb_end_write(fs_info->sb);
4602
4603         return ret;
4604 }
4605
4606 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
4607 {
4608         struct task_struct *tsk;
4609
4610         mutex_lock(&fs_info->balance_mutex);
4611         if (!fs_info->balance_ctl) {
4612                 mutex_unlock(&fs_info->balance_mutex);
4613                 return 0;
4614         }
4615         mutex_unlock(&fs_info->balance_mutex);
4616
4617         if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
4618                 btrfs_info(fs_info, "balance: resume skipped");
4619                 return 0;
4620         }
4621
4622         spin_lock(&fs_info->super_lock);
4623         ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
4624         fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
4625         spin_unlock(&fs_info->super_lock);
4626         /*
4627          * A ro->rw remount sequence should continue with the paused balance
4628          * regardless of who pauses it, system or the user as of now, so set
4629          * the resume flag.
4630          */
4631         spin_lock(&fs_info->balance_lock);
4632         fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
4633         spin_unlock(&fs_info->balance_lock);
4634
4635         tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
4636         return PTR_ERR_OR_ZERO(tsk);
4637 }
4638
4639 int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
4640 {
4641         struct btrfs_balance_control *bctl;
4642         struct btrfs_balance_item *item;
4643         struct btrfs_disk_balance_args disk_bargs;
4644         struct btrfs_path *path;
4645         struct extent_buffer *leaf;
4646         struct btrfs_key key;
4647         int ret;
4648
4649         path = btrfs_alloc_path();
4650         if (!path)
4651                 return -ENOMEM;
4652
4653         key.objectid = BTRFS_BALANCE_OBJECTID;
4654         key.type = BTRFS_TEMPORARY_ITEM_KEY;
4655         key.offset = 0;
4656
4657         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4658         if (ret < 0)
4659                 goto out;
4660         if (ret > 0) { /* ret = -ENOENT; */
4661                 ret = 0;
4662                 goto out;
4663         }
4664
4665         bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
4666         if (!bctl) {
4667                 ret = -ENOMEM;
4668                 goto out;
4669         }
4670
4671         leaf = path->nodes[0];
4672         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
4673
4674         bctl->flags = btrfs_balance_flags(leaf, item);
4675         bctl->flags |= BTRFS_BALANCE_RESUME;
4676
4677         btrfs_balance_data(leaf, item, &disk_bargs);
4678         btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
4679         btrfs_balance_meta(leaf, item, &disk_bargs);
4680         btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
4681         btrfs_balance_sys(leaf, item, &disk_bargs);
4682         btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
4683
4684         /*
4685          * This should never happen, as the paused balance state is recovered
4686          * during mount without any chance of other exclusive ops to collide.
4687          *
4688          * This gives the exclusive op status to balance and keeps in paused
4689          * state until user intervention (cancel or umount). If the ownership
4690          * cannot be assigned, show a message but do not fail. The balance
4691          * is in a paused state and must have fs_info::balance_ctl properly
4692          * set up.
4693          */
4694         if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED))
4695                 btrfs_warn(fs_info,
4696         "balance: cannot set exclusive op status, resume manually");
4697
4698         btrfs_release_path(path);
4699
4700         mutex_lock(&fs_info->balance_mutex);
4701         BUG_ON(fs_info->balance_ctl);
4702         spin_lock(&fs_info->balance_lock);
4703         fs_info->balance_ctl = bctl;
4704         spin_unlock(&fs_info->balance_lock);
4705         mutex_unlock(&fs_info->balance_mutex);
4706 out:
4707         btrfs_free_path(path);
4708         return ret;
4709 }
4710
4711 int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
4712 {
4713         int ret = 0;
4714
4715         mutex_lock(&fs_info->balance_mutex);
4716         if (!fs_info->balance_ctl) {
4717                 mutex_unlock(&fs_info->balance_mutex);
4718                 return -ENOTCONN;
4719         }
4720
4721         if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4722                 atomic_inc(&fs_info->balance_pause_req);
4723                 mutex_unlock(&fs_info->balance_mutex);
4724
4725                 wait_event(fs_info->balance_wait_q,
4726                            !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4727
4728                 mutex_lock(&fs_info->balance_mutex);
4729                 /* we are good with balance_ctl ripped off from under us */
4730                 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4731                 atomic_dec(&fs_info->balance_pause_req);
4732         } else {
4733                 ret = -ENOTCONN;
4734         }
4735
4736         mutex_unlock(&fs_info->balance_mutex);
4737         return ret;
4738 }
4739
4740 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
4741 {
4742         mutex_lock(&fs_info->balance_mutex);
4743         if (!fs_info->balance_ctl) {
4744                 mutex_unlock(&fs_info->balance_mutex);
4745                 return -ENOTCONN;
4746         }
4747
4748         /*
4749          * A paused balance with the item stored on disk can be resumed at
4750          * mount time if the mount is read-write. Otherwise it's still paused
4751          * and we must not allow cancelling as it deletes the item.
4752          */
4753         if (sb_rdonly(fs_info->sb)) {
4754                 mutex_unlock(&fs_info->balance_mutex);
4755                 return -EROFS;
4756         }
4757
4758         atomic_inc(&fs_info->balance_cancel_req);
4759         /*
4760          * if we are running just wait and return, balance item is
4761          * deleted in btrfs_balance in this case
4762          */
4763         if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4764                 mutex_unlock(&fs_info->balance_mutex);
4765                 wait_event(fs_info->balance_wait_q,
4766                            !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4767                 mutex_lock(&fs_info->balance_mutex);
4768         } else {
4769                 mutex_unlock(&fs_info->balance_mutex);
4770                 /*
4771                  * Lock released to allow other waiters to continue, we'll
4772                  * reexamine the status again.
4773                  */
4774                 mutex_lock(&fs_info->balance_mutex);
4775
4776                 if (fs_info->balance_ctl) {
4777                         reset_balance_state(fs_info);
4778                         btrfs_exclop_finish(fs_info);
4779                         btrfs_info(fs_info, "balance: canceled");
4780                 }
4781         }
4782
4783         ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4784         atomic_dec(&fs_info->balance_cancel_req);
4785         mutex_unlock(&fs_info->balance_mutex);
4786         return 0;
4787 }
4788
4789 /*
4790  * shrinking a device means finding all of the device extents past
4791  * the new size, and then following the back refs to the chunks.
4792  * The chunk relocation code actually frees the device extent
4793  */
4794 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
4795 {
4796         struct btrfs_fs_info *fs_info = device->fs_info;
4797         struct btrfs_root *root = fs_info->dev_root;
4798         struct btrfs_trans_handle *trans;
4799         struct btrfs_dev_extent *dev_extent = NULL;
4800         struct btrfs_path *path;
4801         u64 length;
4802         u64 chunk_offset;
4803         int ret;
4804         int slot;
4805         int failed = 0;
4806         bool retried = false;
4807         struct extent_buffer *l;
4808         struct btrfs_key key;
4809         struct btrfs_super_block *super_copy = fs_info->super_copy;
4810         u64 old_total = btrfs_super_total_bytes(super_copy);
4811         u64 old_size = btrfs_device_get_total_bytes(device);
4812         u64 diff;
4813         u64 start;
4814         u64 free_diff = 0;
4815
4816         new_size = round_down(new_size, fs_info->sectorsize);
4817         start = new_size;
4818         diff = round_down(old_size - new_size, fs_info->sectorsize);
4819
4820         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
4821                 return -EINVAL;
4822
4823         path = btrfs_alloc_path();
4824         if (!path)
4825                 return -ENOMEM;
4826
4827         path->reada = READA_BACK;
4828
4829         trans = btrfs_start_transaction(root, 0);
4830         if (IS_ERR(trans)) {
4831                 btrfs_free_path(path);
4832                 return PTR_ERR(trans);
4833         }
4834
4835         mutex_lock(&fs_info->chunk_mutex);
4836
4837         btrfs_device_set_total_bytes(device, new_size);
4838         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
4839                 device->fs_devices->total_rw_bytes -= diff;
4840
4841                 /*
4842                  * The new free_chunk_space is new_size - used, so we have to
4843                  * subtract the delta of the old free_chunk_space which included
4844                  * old_size - used.  If used > new_size then just subtract this
4845                  * entire device's free space.
4846                  */
4847                 if (device->bytes_used < new_size)
4848                         free_diff = (old_size - device->bytes_used) -
4849                                     (new_size - device->bytes_used);
4850                 else
4851                         free_diff = old_size - device->bytes_used;
4852                 atomic64_sub(free_diff, &fs_info->free_chunk_space);
4853         }
4854
4855         /*
4856          * Once the device's size has been set to the new size, ensure all
4857          * in-memory chunks are synced to disk so that the loop below sees them
4858          * and relocates them accordingly.
4859          */
4860         if (contains_pending_extent(device, &start, diff)) {
4861                 mutex_unlock(&fs_info->chunk_mutex);
4862                 ret = btrfs_commit_transaction(trans);
4863                 if (ret)
4864                         goto done;
4865         } else {
4866                 mutex_unlock(&fs_info->chunk_mutex);
4867                 btrfs_end_transaction(trans);
4868         }
4869
4870 again:
4871         key.objectid = device->devid;
4872         key.offset = (u64)-1;
4873         key.type = BTRFS_DEV_EXTENT_KEY;
4874
4875         do {
4876                 mutex_lock(&fs_info->reclaim_bgs_lock);
4877                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4878                 if (ret < 0) {
4879                         mutex_unlock(&fs_info->reclaim_bgs_lock);
4880                         goto done;
4881                 }
4882
4883                 ret = btrfs_previous_item(root, path, 0, key.type);
4884                 if (ret) {
4885                         mutex_unlock(&fs_info->reclaim_bgs_lock);
4886                         if (ret < 0)
4887                                 goto done;
4888                         ret = 0;
4889                         btrfs_release_path(path);
4890                         break;
4891                 }
4892
4893                 l = path->nodes[0];
4894                 slot = path->slots[0];
4895                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
4896
4897                 if (key.objectid != device->devid) {
4898                         mutex_unlock(&fs_info->reclaim_bgs_lock);
4899                         btrfs_release_path(path);
4900                         break;
4901                 }
4902
4903                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
4904                 length = btrfs_dev_extent_length(l, dev_extent);
4905
4906                 if (key.offset + length <= new_size) {
4907                         mutex_unlock(&fs_info->reclaim_bgs_lock);
4908                         btrfs_release_path(path);
4909                         break;
4910                 }
4911
4912                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
4913                 btrfs_release_path(path);
4914
4915                 /*
4916                  * We may be relocating the only data chunk we have,
4917                  * which could potentially end up with losing data's
4918                  * raid profile, so lets allocate an empty one in
4919                  * advance.
4920                  */
4921                 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
4922                 if (ret < 0) {
4923                         mutex_unlock(&fs_info->reclaim_bgs_lock);
4924                         goto done;
4925                 }
4926
4927                 ret = btrfs_relocate_chunk(fs_info, chunk_offset);
4928                 mutex_unlock(&fs_info->reclaim_bgs_lock);
4929                 if (ret == -ENOSPC) {
4930                         failed++;
4931                 } else if (ret) {
4932                         if (ret == -ETXTBSY) {
4933                                 btrfs_warn(fs_info,
4934                    "could not shrink block group %llu due to active swapfile",
4935                                            chunk_offset);
4936                         }
4937                         goto done;
4938                 }
4939         } while (key.offset-- > 0);
4940
4941         if (failed && !retried) {
4942                 failed = 0;
4943                 retried = true;
4944                 goto again;
4945         } else if (failed && retried) {
4946                 ret = -ENOSPC;
4947                 goto done;
4948         }
4949
4950         /* Shrinking succeeded, else we would be at "done". */
4951         trans = btrfs_start_transaction(root, 0);
4952         if (IS_ERR(trans)) {
4953                 ret = PTR_ERR(trans);
4954                 goto done;
4955         }
4956
4957         mutex_lock(&fs_info->chunk_mutex);
4958         /* Clear all state bits beyond the shrunk device size */
4959         clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
4960                           CHUNK_STATE_MASK);
4961
4962         btrfs_device_set_disk_total_bytes(device, new_size);
4963         if (list_empty(&device->post_commit_list))
4964                 list_add_tail(&device->post_commit_list,
4965                               &trans->transaction->dev_update_list);
4966
4967         WARN_ON(diff > old_total);
4968         btrfs_set_super_total_bytes(super_copy,
4969                         round_down(old_total - diff, fs_info->sectorsize));
4970         mutex_unlock(&fs_info->chunk_mutex);
4971
4972         btrfs_reserve_chunk_metadata(trans, false);
4973         /* Now btrfs_update_device() will change the on-disk size. */
4974         ret = btrfs_update_device(trans, device);
4975         btrfs_trans_release_chunk_metadata(trans);
4976         if (ret < 0) {
4977                 btrfs_abort_transaction(trans, ret);
4978                 btrfs_end_transaction(trans);
4979         } else {
4980                 ret = btrfs_commit_transaction(trans);
4981         }
4982 done:
4983         btrfs_free_path(path);
4984         if (ret) {
4985                 mutex_lock(&fs_info->chunk_mutex);
4986                 btrfs_device_set_total_bytes(device, old_size);
4987                 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
4988                         device->fs_devices->total_rw_bytes += diff;
4989                         atomic64_add(free_diff, &fs_info->free_chunk_space);
4990                 }
4991                 mutex_unlock(&fs_info->chunk_mutex);
4992         }
4993         return ret;
4994 }
4995
4996 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
4997                            struct btrfs_key *key,
4998                            struct btrfs_chunk *chunk, int item_size)
4999 {
5000         struct btrfs_super_block *super_copy = fs_info->super_copy;
5001         struct btrfs_disk_key disk_key;
5002         u32 array_size;
5003         u8 *ptr;
5004
5005         lockdep_assert_held(&fs_info->chunk_mutex);
5006
5007         array_size = btrfs_super_sys_array_size(super_copy);
5008         if (array_size + item_size + sizeof(disk_key)
5009                         > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
5010                 return -EFBIG;
5011
5012         ptr = super_copy->sys_chunk_array + array_size;
5013         btrfs_cpu_key_to_disk(&disk_key, key);
5014         memcpy(ptr, &disk_key, sizeof(disk_key));
5015         ptr += sizeof(disk_key);
5016         memcpy(ptr, chunk, item_size);
5017         item_size += sizeof(disk_key);
5018         btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
5019
5020         return 0;
5021 }
5022
5023 /*
5024  * sort the devices in descending order by max_avail, total_avail
5025  */
5026 static int btrfs_cmp_device_info(const void *a, const void *b)
5027 {
5028         const struct btrfs_device_info *di_a = a;
5029         const struct btrfs_device_info *di_b = b;
5030
5031         if (di_a->max_avail > di_b->max_avail)
5032                 return -1;
5033         if (di_a->max_avail < di_b->max_avail)
5034                 return 1;
5035         if (di_a->total_avail > di_b->total_avail)
5036                 return -1;
5037         if (di_a->total_avail < di_b->total_avail)
5038                 return 1;
5039         return 0;
5040 }
5041
5042 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
5043 {
5044         if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
5045                 return;
5046
5047         btrfs_set_fs_incompat(info, RAID56);
5048 }
5049
5050 static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
5051 {
5052         if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
5053                 return;
5054
5055         btrfs_set_fs_incompat(info, RAID1C34);
5056 }
5057
5058 /*
5059  * Structure used internally for btrfs_create_chunk() function.
5060  * Wraps needed parameters.
5061  */
5062 struct alloc_chunk_ctl {
5063         u64 start;
5064         u64 type;
5065         /* Total number of stripes to allocate */
5066         int num_stripes;
5067         /* sub_stripes info for map */
5068         int sub_stripes;
5069         /* Stripes per device */
5070         int dev_stripes;
5071         /* Maximum number of devices to use */
5072         int devs_max;
5073         /* Minimum number of devices to use */
5074         int devs_min;
5075         /* ndevs has to be a multiple of this */
5076         int devs_increment;
5077         /* Number of copies */
5078         int ncopies;
5079         /* Number of stripes worth of bytes to store parity information */
5080         int nparity;
5081         u64 max_stripe_size;
5082         u64 max_chunk_size;
5083         u64 dev_extent_min;
5084         u64 stripe_size;
5085         u64 chunk_size;
5086         int ndevs;
5087 };
5088
5089 static void init_alloc_chunk_ctl_policy_regular(
5090                                 struct btrfs_fs_devices *fs_devices,
5091                                 struct alloc_chunk_ctl *ctl)
5092 {
5093         struct btrfs_space_info *space_info;
5094
5095         space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type);
5096         ASSERT(space_info);
5097
5098         ctl->max_chunk_size = READ_ONCE(space_info->chunk_size);
5099         ctl->max_stripe_size = min_t(u64, ctl->max_chunk_size, SZ_1G);
5100
5101         if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM)
5102                 ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);
5103
5104         /* We don't want a chunk larger than 10% of writable space */
5105         ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10),
5106                                   ctl->max_chunk_size);
5107         ctl->dev_extent_min = btrfs_stripe_nr_to_offset(ctl->dev_stripes);
5108 }
5109
5110 static void init_alloc_chunk_ctl_policy_zoned(
5111                                       struct btrfs_fs_devices *fs_devices,
5112                                       struct alloc_chunk_ctl *ctl)
5113 {
5114         u64 zone_size = fs_devices->fs_info->zone_size;
5115         u64 limit;
5116         int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
5117         int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
5118         u64 min_chunk_size = min_data_stripes * zone_size;
5119         u64 type = ctl->type;
5120
5121         ctl->max_stripe_size = zone_size;
5122         if (type & BTRFS_BLOCK_GROUP_DATA) {
5123                 ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
5124                                                  zone_size);
5125         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
5126                 ctl->max_chunk_size = ctl->max_stripe_size;
5127         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
5128                 ctl->max_chunk_size = 2 * ctl->max_stripe_size;
5129                 ctl->devs_max = min_t(int, ctl->devs_max,
5130                                       BTRFS_MAX_DEVS_SYS_CHUNK);
5131         } else {
5132                 BUG();
5133         }
5134
5135         /* We don't want a chunk larger than 10% of writable space */
5136         limit = max(round_down(mult_perc(fs_devices->total_rw_bytes, 10),
5137                                zone_size),
5138                     min_chunk_size);
5139         ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
5140         ctl->dev_extent_min = zone_size * ctl->dev_stripes;
5141 }
5142
5143 static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
5144                                  struct alloc_chunk_ctl *ctl)
5145 {
5146         int index = btrfs_bg_flags_to_raid_index(ctl->type);
5147
5148         ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
5149         ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
5150         ctl->devs_max = btrfs_raid_array[index].devs_max;
5151         if (!ctl->devs_max)
5152                 ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
5153         ctl->devs_min = btrfs_raid_array[index].devs_min;
5154         ctl->devs_increment = btrfs_raid_array[index].devs_increment;
5155         ctl->ncopies = btrfs_raid_array[index].ncopies;
5156         ctl->nparity = btrfs_raid_array[index].nparity;
5157         ctl->ndevs = 0;
5158
5159         switch (fs_devices->chunk_alloc_policy) {
5160         case BTRFS_CHUNK_ALLOC_REGULAR:
5161                 init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
5162                 break;
5163         case BTRFS_CHUNK_ALLOC_ZONED:
5164                 init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
5165                 break;
5166         default:
5167                 BUG();
5168         }
5169 }
5170
5171 static int gather_device_info(struct btrfs_fs_devices *fs_devices,
5172                               struct alloc_chunk_ctl *ctl,
5173                               struct btrfs_device_info *devices_info)
5174 {
5175         struct btrfs_fs_info *info = fs_devices->fs_info;
5176         struct btrfs_device *device;
5177         u64 total_avail;
5178         u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
5179         int ret;
5180         int ndevs = 0;
5181         u64 max_avail;
5182         u64 dev_offset;
5183
5184         /*
5185          * in the first pass through the devices list, we gather information
5186          * about the available holes on each device.
5187          */
5188         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
5189                 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
5190                         WARN(1, KERN_ERR
5191                                "BTRFS: read-only device in alloc_list\n");
5192                         continue;
5193                 }
5194
5195                 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
5196                                         &device->dev_state) ||
5197                     test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
5198                         continue;
5199
5200                 if (device->total_bytes > device->bytes_used)
5201                         total_avail = device->total_bytes - device->bytes_used;
5202                 else
5203                         total_avail = 0;
5204
5205                 /* If there is no space on this device, skip it. */
5206                 if (total_avail < ctl->dev_extent_min)
5207                         continue;
5208
5209                 ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
5210                                            &max_avail);
5211                 if (ret && ret != -ENOSPC)
5212                         return ret;
5213
5214                 if (ret == 0)
5215                         max_avail = dev_extent_want;
5216
5217                 if (max_avail < ctl->dev_extent_min) {
5218                         if (btrfs_test_opt(info, ENOSPC_DEBUG))
5219                                 btrfs_debug(info,
5220                         "%s: devid %llu has no free space, have=%llu want=%llu",
5221                                             __func__, device->devid, max_avail,
5222                                             ctl->dev_extent_min);
5223                         continue;
5224                 }
5225
5226                 if (ndevs == fs_devices->rw_devices) {
5227                         WARN(1, "%s: found more than %llu devices\n",
5228                              __func__, fs_devices->rw_devices);
5229                         break;
5230                 }
5231                 devices_info[ndevs].dev_offset = dev_offset;
5232                 devices_info[ndevs].max_avail = max_avail;
5233                 devices_info[ndevs].total_avail = total_avail;
5234                 devices_info[ndevs].dev = device;
5235                 ++ndevs;
5236         }
5237         ctl->ndevs = ndevs;
5238
5239         /*
5240          * now sort the devices by hole size / available space
5241          */
5242         sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
5243              btrfs_cmp_device_info, NULL);
5244
5245         return 0;
5246 }
5247
5248 static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
5249                                       struct btrfs_device_info *devices_info)
5250 {
5251         /* Number of stripes that count for block group size */
5252         int data_stripes;
5253
5254         /*
5255          * The primary goal is to maximize the number of stripes, so use as
5256          * many devices as possible, even if the stripes are not maximum sized.
5257          *
5258          * The DUP profile stores more than one stripe per device, the
5259          * max_avail is the total size so we have to adjust.
5260          */
5261         ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
5262                                    ctl->dev_stripes);
5263         ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5264
5265         /* This will have to be fixed for RAID1 and RAID10 over more drives */
5266         data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5267
5268         /*
5269          * Use the number of data stripes to figure out how big this chunk is
5270          * really going to be in terms of logical address space, and compare
5271          * that answer with the max chunk size. If it's higher, we try to
5272          * reduce stripe_size.
5273          */
5274         if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
5275                 /*
5276                  * Reduce stripe_size, round it up to a 16MB boundary again and
5277                  * then use it, unless it ends up being even bigger than the
5278                  * previous value we had already.
5279                  */
5280                 ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
5281                                                         data_stripes), SZ_16M),
5282                                        ctl->stripe_size);
5283         }
5284
5285         /* Stripe size should not go beyond 1G. */
5286         ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G);
5287
5288         /* Align to BTRFS_STRIPE_LEN */
5289         ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
5290         ctl->chunk_size = ctl->stripe_size * data_stripes;
5291
5292         return 0;
5293 }
5294
5295 static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
5296                                     struct btrfs_device_info *devices_info)
5297 {
5298         u64 zone_size = devices_info[0].dev->zone_info->zone_size;
5299         /* Number of stripes that count for block group size */
5300         int data_stripes;
5301
5302         /*
5303          * It should hold because:
5304          *    dev_extent_min == dev_extent_want == zone_size * dev_stripes
5305          */
5306         ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);
5307
5308         ctl->stripe_size = zone_size;
5309         ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5310         data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5311
5312         /* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */
5313         if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
5314                 ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
5315                                              ctl->stripe_size) + ctl->nparity,
5316                                      ctl->dev_stripes);
5317                 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5318                 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5319                 ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
5320         }
5321
5322         ctl->chunk_size = ctl->stripe_size * data_stripes;
5323
5324         return 0;
5325 }
5326
5327 static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
5328                               struct alloc_chunk_ctl *ctl,
5329                               struct btrfs_device_info *devices_info)
5330 {
5331         struct btrfs_fs_info *info = fs_devices->fs_info;
5332
5333         /*
5334          * Round down to number of usable stripes, devs_increment can be any
5335          * number so we can't use round_down() that requires power of 2, while
5336          * rounddown is safe.
5337          */
5338         ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
5339
5340         if (ctl->ndevs < ctl->devs_min) {
5341                 if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
5342                         btrfs_debug(info,
5343         "%s: not enough devices with free space: have=%d minimum required=%d",
5344                                     __func__, ctl->ndevs, ctl->devs_min);
5345                 }
5346                 return -ENOSPC;
5347         }
5348
5349         ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
5350
5351         switch (fs_devices->chunk_alloc_policy) {
5352         case BTRFS_CHUNK_ALLOC_REGULAR:
5353                 return decide_stripe_size_regular(ctl, devices_info);
5354         case BTRFS_CHUNK_ALLOC_ZONED:
5355                 return decide_stripe_size_zoned(ctl, devices_info);
5356         default:
5357                 BUG();
5358         }
5359 }
5360
5361 static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int bits)
5362 {
5363         for (int i = 0; i < map->num_stripes; i++) {
5364                 struct btrfs_io_stripe *stripe = &map->stripes[i];
5365                 struct btrfs_device *device = stripe->dev;
5366
5367                 set_extent_bit(&device->alloc_state, stripe->physical,
5368                                stripe->physical + map->stripe_size - 1,
5369                                bits | EXTENT_NOWAIT, NULL);
5370         }
5371 }
5372
5373 static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits)
5374 {
5375         for (int i = 0; i < map->num_stripes; i++) {
5376                 struct btrfs_io_stripe *stripe = &map->stripes[i];
5377                 struct btrfs_device *device = stripe->dev;
5378
5379                 __clear_extent_bit(&device->alloc_state, stripe->physical,
5380                                    stripe->physical + map->stripe_size - 1,
5381                                    bits | EXTENT_NOWAIT,
5382                                    NULL, NULL);
5383         }
5384 }
5385
5386 void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map)
5387 {
5388         write_lock(&fs_info->mapping_tree_lock);
5389         rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
5390         RB_CLEAR_NODE(&map->rb_node);
5391         chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
5392         write_unlock(&fs_info->mapping_tree_lock);
5393
5394         /* Once for the tree reference. */
5395         btrfs_free_chunk_map(map);
5396 }
5397
5398 EXPORT_FOR_TESTS
5399 int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map)
5400 {
5401         struct rb_node **p;
5402         struct rb_node *parent = NULL;
5403         bool leftmost = true;
5404
5405         write_lock(&fs_info->mapping_tree_lock);
5406         p = &fs_info->mapping_tree.rb_root.rb_node;
5407         while (*p) {
5408                 struct btrfs_chunk_map *entry;
5409
5410                 parent = *p;
5411                 entry = rb_entry(parent, struct btrfs_chunk_map, rb_node);
5412
5413                 if (map->start < entry->start) {
5414                         p = &(*p)->rb_left;
5415                 } else if (map->start > entry->start) {
5416                         p = &(*p)->rb_right;
5417                         leftmost = false;
5418                 } else {
5419                         write_unlock(&fs_info->mapping_tree_lock);
5420                         return -EEXIST;
5421                 }
5422         }
5423         rb_link_node(&map->rb_node, parent, p);
5424         rb_insert_color_cached(&map->rb_node, &fs_info->mapping_tree, leftmost);
5425         chunk_map_device_set_bits(map, CHUNK_ALLOCATED);
5426         chunk_map_device_clear_bits(map, CHUNK_TRIMMED);
5427         write_unlock(&fs_info->mapping_tree_lock);
5428
5429         return 0;
5430 }
5431
5432 EXPORT_FOR_TESTS
5433 struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp)
5434 {
5435         struct btrfs_chunk_map *map;
5436
5437         map = kmalloc(btrfs_chunk_map_size(num_stripes), gfp);
5438         if (!map)
5439                 return NULL;
5440
5441         refcount_set(&map->refs, 1);
5442         RB_CLEAR_NODE(&map->rb_node);
5443
5444         return map;
5445 }
5446
5447 static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
5448                         struct alloc_chunk_ctl *ctl,
5449                         struct btrfs_device_info *devices_info)
5450 {
5451         struct btrfs_fs_info *info = trans->fs_info;
5452         struct btrfs_chunk_map *map;
5453         struct btrfs_block_group *block_group;
5454         u64 start = ctl->start;
5455         u64 type = ctl->type;
5456         int ret;
5457
5458         map = btrfs_alloc_chunk_map(ctl->num_stripes, GFP_NOFS);
5459         if (!map)
5460                 return ERR_PTR(-ENOMEM);
5461
5462         map->start = start;
5463         map->chunk_len = ctl->chunk_size;
5464         map->stripe_size = ctl->stripe_size;
5465         map->type = type;
5466         map->io_align = BTRFS_STRIPE_LEN;
5467         map->io_width = BTRFS_STRIPE_LEN;
5468         map->sub_stripes = ctl->sub_stripes;
5469         map->num_stripes = ctl->num_stripes;
5470
5471         for (int i = 0; i < ctl->ndevs; i++) {
5472                 for (int j = 0; j < ctl->dev_stripes; j++) {
5473                         int s = i * ctl->dev_stripes + j;
5474                         map->stripes[s].dev = devices_info[i].dev;
5475                         map->stripes[s].physical = devices_info[i].dev_offset +
5476                                                    j * ctl->stripe_size;
5477                 }
5478         }
5479
5480         trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
5481
5482         ret = btrfs_add_chunk_map(info, map);
5483         if (ret) {
5484                 btrfs_free_chunk_map(map);
5485                 return ERR_PTR(ret);
5486         }
5487
5488         block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size);
5489         if (IS_ERR(block_group)) {
5490                 btrfs_remove_chunk_map(info, map);
5491                 return block_group;
5492         }
5493
5494         for (int i = 0; i < map->num_stripes; i++) {
5495                 struct btrfs_device *dev = map->stripes[i].dev;
5496
5497                 btrfs_device_set_bytes_used(dev,
5498                                             dev->bytes_used + ctl->stripe_size);
5499                 if (list_empty(&dev->post_commit_list))
5500                         list_add_tail(&dev->post_commit_list,
5501                                       &trans->transaction->dev_update_list);
5502         }
5503
5504         atomic64_sub(ctl->stripe_size * map->num_stripes,
5505                      &info->free_chunk_space);
5506
5507         check_raid56_incompat_flag(info, type);
5508         check_raid1c34_incompat_flag(info, type);
5509
5510         return block_group;
5511 }
5512
5513 struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
5514                                             u64 type)
5515 {
5516         struct btrfs_fs_info *info = trans->fs_info;
5517         struct btrfs_fs_devices *fs_devices = info->fs_devices;
5518         struct btrfs_device_info *devices_info = NULL;
5519         struct alloc_chunk_ctl ctl;
5520         struct btrfs_block_group *block_group;
5521         int ret;
5522
5523         lockdep_assert_held(&info->chunk_mutex);
5524
5525         if (!alloc_profile_is_valid(type, 0)) {
5526                 ASSERT(0);
5527                 return ERR_PTR(-EINVAL);
5528         }
5529
5530         if (list_empty(&fs_devices->alloc_list)) {
5531                 if (btrfs_test_opt(info, ENOSPC_DEBUG))
5532                         btrfs_debug(info, "%s: no writable device", __func__);
5533                 return ERR_PTR(-ENOSPC);
5534         }
5535
5536         if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
5537                 btrfs_err(info, "invalid chunk type 0x%llx requested", type);
5538                 ASSERT(0);
5539                 return ERR_PTR(-EINVAL);
5540         }
5541
5542         ctl.start = find_next_chunk(info);
5543         ctl.type = type;
5544         init_alloc_chunk_ctl(fs_devices, &ctl);
5545
5546         devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
5547                                GFP_NOFS);
5548         if (!devices_info)
5549                 return ERR_PTR(-ENOMEM);
5550
5551         ret = gather_device_info(fs_devices, &ctl, devices_info);
5552         if (ret < 0) {
5553                 block_group = ERR_PTR(ret);
5554                 goto out;
5555         }
5556
5557         ret = decide_stripe_size(fs_devices, &ctl, devices_info);
5558         if (ret < 0) {
5559                 block_group = ERR_PTR(ret);
5560                 goto out;
5561         }
5562
5563         block_group = create_chunk(trans, &ctl, devices_info);
5564
5565 out:
5566         kfree(devices_info);
5567         return block_group;
5568 }
5569
5570 /*
5571  * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the
5572  * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
5573  * chunks.
5574  *
5575  * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
5576  * phases.
5577  */
5578 int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
5579                                      struct btrfs_block_group *bg)
5580 {
5581         struct btrfs_fs_info *fs_info = trans->fs_info;
5582         struct btrfs_root *chunk_root = fs_info->chunk_root;
5583         struct btrfs_key key;
5584         struct btrfs_chunk *chunk;
5585         struct btrfs_stripe *stripe;
5586         struct btrfs_chunk_map *map;
5587         size_t item_size;
5588         int i;
5589         int ret;
5590
5591         /*
5592          * We take the chunk_mutex for 2 reasons:
5593          *
5594          * 1) Updates and insertions in the chunk btree must be done while holding
5595          *    the chunk_mutex, as well as updating the system chunk array in the
5596          *    superblock. See the comment on top of btrfs_chunk_alloc() for the
5597          *    details;
5598          *
5599          * 2) To prevent races with the final phase of a device replace operation
5600          *    that replaces the device object associated with the map's stripes,
5601          *    because the device object's id can change at any time during that
5602          *    final phase of the device replace operation
5603          *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
5604          *    replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
5605          *    which would cause a failure when updating the device item, which does
5606          *    not exists, or persisting a stripe of the chunk item with such ID.
5607          *    Here we can't use the device_list_mutex because our caller already
5608          *    has locked the chunk_mutex, and the final phase of device replace
5609          *    acquires both mutexes - first the device_list_mutex and then the
5610          *    chunk_mutex. Using any of those two mutexes protects us from a
5611          *    concurrent device replace.
5612          */
5613         lockdep_assert_held(&fs_info->chunk_mutex);
5614
5615         map = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
5616         if (IS_ERR(map)) {
5617                 ret = PTR_ERR(map);
5618                 btrfs_abort_transaction(trans, ret);
5619                 return ret;
5620         }
5621
5622         item_size = btrfs_chunk_item_size(map->num_stripes);
5623
5624         chunk = kzalloc(item_size, GFP_NOFS);
5625         if (!chunk) {
5626                 ret = -ENOMEM;
5627                 btrfs_abort_transaction(trans, ret);
5628                 goto out;
5629         }
5630
5631         for (i = 0; i < map->num_stripes; i++) {
5632                 struct btrfs_device *device = map->stripes[i].dev;
5633
5634                 ret = btrfs_update_device(trans, device);
5635                 if (ret)
5636                         goto out;
5637         }
5638
5639         stripe = &chunk->stripe;
5640         for (i = 0; i < map->num_stripes; i++) {
5641                 struct btrfs_device *device = map->stripes[i].dev;
5642                 const u64 dev_offset = map->stripes[i].physical;
5643
5644                 btrfs_set_stack_stripe_devid(stripe, device->devid);
5645                 btrfs_set_stack_stripe_offset(stripe, dev_offset);
5646                 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
5647                 stripe++;
5648         }
5649
5650         btrfs_set_stack_chunk_length(chunk, bg->length);
5651         btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID);
5652         btrfs_set_stack_chunk_stripe_len(chunk, BTRFS_STRIPE_LEN);
5653         btrfs_set_stack_chunk_type(chunk, map->type);
5654         btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
5655         btrfs_set_stack_chunk_io_align(chunk, BTRFS_STRIPE_LEN);
5656         btrfs_set_stack_chunk_io_width(chunk, BTRFS_STRIPE_LEN);
5657         btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
5658         btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
5659
5660         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
5661         key.type = BTRFS_CHUNK_ITEM_KEY;
5662         key.offset = bg->start;
5663
5664         ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
5665         if (ret)
5666                 goto out;
5667
5668         set_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, &bg->runtime_flags);
5669
5670         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
5671                 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
5672                 if (ret)
5673                         goto out;
5674         }
5675
5676 out:
5677         kfree(chunk);
5678         btrfs_free_chunk_map(map);
5679         return ret;
5680 }
5681
5682 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
5683 {
5684         struct btrfs_fs_info *fs_info = trans->fs_info;
5685         u64 alloc_profile;
5686         struct btrfs_block_group *meta_bg;
5687         struct btrfs_block_group *sys_bg;
5688
5689         /*
5690          * When adding a new device for sprouting, the seed device is read-only
5691          * so we must first allocate a metadata and a system chunk. But before
5692          * adding the block group items to the extent, device and chunk btrees,
5693          * we must first:
5694          *
5695          * 1) Create both chunks without doing any changes to the btrees, as
5696          *    otherwise we would get -ENOSPC since the block groups from the
5697          *    seed device are read-only;
5698          *
5699          * 2) Add the device item for the new sprout device - finishing the setup
5700          *    of a new block group requires updating the device item in the chunk
5701          *    btree, so it must exist when we attempt to do it. The previous step
5702          *    ensures this does not fail with -ENOSPC.
5703          *
5704          * After that we can add the block group items to their btrees:
5705          * update existing device item in the chunk btree, add a new block group
5706          * item to the extent btree, add a new chunk item to the chunk btree and
5707          * finally add the new device extent items to the devices btree.
5708          */
5709
5710         alloc_profile = btrfs_metadata_alloc_profile(fs_info);
5711         meta_bg = btrfs_create_chunk(trans, alloc_profile);
5712         if (IS_ERR(meta_bg))
5713                 return PTR_ERR(meta_bg);
5714
5715         alloc_profile = btrfs_system_alloc_profile(fs_info);
5716         sys_bg = btrfs_create_chunk(trans, alloc_profile);
5717         if (IS_ERR(sys_bg))
5718                 return PTR_ERR(sys_bg);
5719
5720         return 0;
5721 }
5722
5723 static inline int btrfs_chunk_max_errors(struct btrfs_chunk_map *map)
5724 {
5725         const int index = btrfs_bg_flags_to_raid_index(map->type);
5726
5727         return btrfs_raid_array[index].tolerated_failures;
5728 }
5729
5730 bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
5731 {
5732         struct btrfs_chunk_map *map;
5733         int miss_ndevs = 0;
5734         int i;
5735         bool ret = true;
5736
5737         map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
5738         if (IS_ERR(map))
5739                 return false;
5740
5741         for (i = 0; i < map->num_stripes; i++) {
5742                 if (test_bit(BTRFS_DEV_STATE_MISSING,
5743                                         &map->stripes[i].dev->dev_state)) {
5744                         miss_ndevs++;
5745                         continue;
5746                 }
5747                 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
5748                                         &map->stripes[i].dev->dev_state)) {
5749                         ret = false;
5750                         goto end;
5751                 }
5752         }
5753
5754         /*
5755          * If the number of missing devices is larger than max errors, we can
5756          * not write the data into that chunk successfully.
5757          */
5758         if (miss_ndevs > btrfs_chunk_max_errors(map))
5759                 ret = false;
5760 end:
5761         btrfs_free_chunk_map(map);
5762         return ret;
5763 }
5764
5765 void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info)
5766 {
5767         write_lock(&fs_info->mapping_tree_lock);
5768         while (!RB_EMPTY_ROOT(&fs_info->mapping_tree.rb_root)) {
5769                 struct btrfs_chunk_map *map;
5770                 struct rb_node *node;
5771
5772                 node = rb_first_cached(&fs_info->mapping_tree);
5773                 map = rb_entry(node, struct btrfs_chunk_map, rb_node);
5774                 rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
5775                 RB_CLEAR_NODE(&map->rb_node);
5776                 chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
5777                 /* Once for the tree ref. */
5778                 btrfs_free_chunk_map(map);
5779                 cond_resched_rwlock_write(&fs_info->mapping_tree_lock);
5780         }
5781         write_unlock(&fs_info->mapping_tree_lock);
5782 }
5783
5784 static int btrfs_chunk_map_num_copies(const struct btrfs_chunk_map *map)
5785 {
5786         enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(map->type);
5787
5788         if (map->type & BTRFS_BLOCK_GROUP_RAID5)
5789                 return 2;
5790
5791         /*
5792          * There could be two corrupted data stripes, we need to loop retry in
5793          * order to rebuild the correct data.
5794          *
5795          * Fail a stripe at a time on every retry except the stripe under
5796          * reconstruction.
5797          */
5798         if (map->type & BTRFS_BLOCK_GROUP_RAID6)
5799                 return map->num_stripes;
5800
5801         /* Non-RAID56, use their ncopies from btrfs_raid_array. */
5802         return btrfs_raid_array[index].ncopies;
5803 }
5804
5805 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5806 {
5807         struct btrfs_chunk_map *map;
5808         int ret;
5809
5810         map = btrfs_get_chunk_map(fs_info, logical, len);
5811         if (IS_ERR(map))
5812                 /*
5813                  * We could return errors for these cases, but that could get
5814                  * ugly and we'd probably do the same thing which is just not do
5815                  * anything else and exit, so return 1 so the callers don't try
5816                  * to use other copies.
5817                  */
5818                 return 1;
5819
5820         ret = btrfs_chunk_map_num_copies(map);
5821         btrfs_free_chunk_map(map);
5822         return ret;
5823 }
5824
5825 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
5826                                     u64 logical)
5827 {
5828         struct btrfs_chunk_map *map;
5829         unsigned long len = fs_info->sectorsize;
5830
5831         if (!btrfs_fs_incompat(fs_info, RAID56))
5832                 return len;
5833
5834         map = btrfs_get_chunk_map(fs_info, logical, len);
5835
5836         if (!WARN_ON(IS_ERR(map))) {
5837                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5838                         len = btrfs_stripe_nr_to_offset(nr_data_stripes(map));
5839                 btrfs_free_chunk_map(map);
5840         }
5841         return len;
5842 }
5843
5844 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5845 {
5846         struct btrfs_chunk_map *map;
5847         int ret = 0;
5848
5849         if (!btrfs_fs_incompat(fs_info, RAID56))
5850                 return 0;
5851
5852         map = btrfs_get_chunk_map(fs_info, logical, len);
5853
5854         if (!WARN_ON(IS_ERR(map))) {
5855                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5856                         ret = 1;
5857                 btrfs_free_chunk_map(map);
5858         }
5859         return ret;
5860 }
5861
5862 static int find_live_mirror(struct btrfs_fs_info *fs_info,
5863                             struct btrfs_chunk_map *map, int first,
5864                             int dev_replace_is_ongoing)
5865 {
5866         const enum btrfs_read_policy policy = READ_ONCE(fs_info->fs_devices->read_policy);
5867         int i;
5868         int num_stripes;
5869         int preferred_mirror;
5870         int tolerance;
5871         struct btrfs_device *srcdev;
5872
5873         ASSERT((map->type &
5874                  (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
5875
5876         if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5877                 num_stripes = map->sub_stripes;
5878         else
5879                 num_stripes = map->num_stripes;
5880
5881         switch (policy) {
5882         default:
5883                 /* Shouldn't happen, just warn and use pid instead of failing */
5884                 btrfs_warn_rl(fs_info, "unknown read_policy type %u, reset to pid",
5885                               policy);
5886                 WRITE_ONCE(fs_info->fs_devices->read_policy, BTRFS_READ_POLICY_PID);
5887                 fallthrough;
5888         case BTRFS_READ_POLICY_PID:
5889                 preferred_mirror = first + (current->pid % num_stripes);
5890                 break;
5891         }
5892
5893         if (dev_replace_is_ongoing &&
5894             fs_info->dev_replace.cont_reading_from_srcdev_mode ==
5895              BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
5896                 srcdev = fs_info->dev_replace.srcdev;
5897         else
5898                 srcdev = NULL;
5899
5900         /*
5901          * try to avoid the drive that is the source drive for a
5902          * dev-replace procedure, only choose it if no other non-missing
5903          * mirror is available
5904          */
5905         for (tolerance = 0; tolerance < 2; tolerance++) {
5906                 if (map->stripes[preferred_mirror].dev->bdev &&
5907                     (tolerance || map->stripes[preferred_mirror].dev != srcdev))
5908                         return preferred_mirror;
5909                 for (i = first; i < first + num_stripes; i++) {
5910                         if (map->stripes[i].dev->bdev &&
5911                             (tolerance || map->stripes[i].dev != srcdev))
5912                                 return i;
5913                 }
5914         }
5915
5916         /* we couldn't find one that doesn't fail.  Just return something
5917          * and the io error handling code will clean up eventually
5918          */
5919         return preferred_mirror;
5920 }
5921
5922 static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
5923                                                        u64 logical,
5924                                                        u16 total_stripes)
5925 {
5926         struct btrfs_io_context *bioc;
5927
5928         bioc = kzalloc(
5929                  /* The size of btrfs_io_context */
5930                 sizeof(struct btrfs_io_context) +
5931                 /* Plus the variable array for the stripes */
5932                 sizeof(struct btrfs_io_stripe) * (total_stripes),
5933                 GFP_NOFS);
5934
5935         if (!bioc)
5936                 return NULL;
5937
5938         refcount_set(&bioc->refs, 1);
5939
5940         bioc->fs_info = fs_info;
5941         bioc->replace_stripe_src = -1;
5942         bioc->full_stripe_logical = (u64)-1;
5943         bioc->logical = logical;
5944
5945         return bioc;
5946 }
5947
5948 void btrfs_get_bioc(struct btrfs_io_context *bioc)
5949 {
5950         WARN_ON(!refcount_read(&bioc->refs));
5951         refcount_inc(&bioc->refs);
5952 }
5953
5954 void btrfs_put_bioc(struct btrfs_io_context *bioc)
5955 {
5956         if (!bioc)
5957                 return;
5958         if (refcount_dec_and_test(&bioc->refs))
5959                 kfree(bioc);
5960 }
5961
5962 /*
5963  * Please note that, discard won't be sent to target device of device
5964  * replace.
5965  */
5966 struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
5967                                                u64 logical, u64 *length_ret,
5968                                                u32 *num_stripes)
5969 {
5970         struct btrfs_chunk_map *map;
5971         struct btrfs_discard_stripe *stripes;
5972         u64 length = *length_ret;
5973         u64 offset;
5974         u32 stripe_nr;
5975         u32 stripe_nr_end;
5976         u32 stripe_cnt;
5977         u64 stripe_end_offset;
5978         u64 stripe_offset;
5979         u32 stripe_index;
5980         u32 factor = 0;
5981         u32 sub_stripes = 0;
5982         u32 stripes_per_dev = 0;
5983         u32 remaining_stripes = 0;
5984         u32 last_stripe = 0;
5985         int ret;
5986         int i;
5987
5988         map = btrfs_get_chunk_map(fs_info, logical, length);
5989         if (IS_ERR(map))
5990                 return ERR_CAST(map);
5991
5992         /* we don't discard raid56 yet */
5993         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5994                 ret = -EOPNOTSUPP;
5995                 goto out_free_map;
5996         }
5997
5998         offset = logical - map->start;
5999         length = min_t(u64, map->start + map->chunk_len - logical, length);
6000         *length_ret = length;
6001
6002         /*
6003          * stripe_nr counts the total number of stripes we have to stride
6004          * to get to this block
6005          */
6006         stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
6007
6008         /* stripe_offset is the offset of this block in its stripe */
6009         stripe_offset = offset - btrfs_stripe_nr_to_offset(stripe_nr);
6010
6011         stripe_nr_end = round_up(offset + length, BTRFS_STRIPE_LEN) >>
6012                         BTRFS_STRIPE_LEN_SHIFT;
6013         stripe_cnt = stripe_nr_end - stripe_nr;
6014         stripe_end_offset = btrfs_stripe_nr_to_offset(stripe_nr_end) -
6015                             (offset + length);
6016         /*
6017          * after this, stripe_nr is the number of stripes on this
6018          * device we have to walk to find the data, and stripe_index is
6019          * the number of our device in the stripe array
6020          */
6021         *num_stripes = 1;
6022         stripe_index = 0;
6023         if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
6024                          BTRFS_BLOCK_GROUP_RAID10)) {
6025                 if (map->type & BTRFS_BLOCK_GROUP_RAID0)
6026                         sub_stripes = 1;
6027                 else
6028                         sub_stripes = map->sub_stripes;
6029
6030                 factor = map->num_stripes / sub_stripes;
6031                 *num_stripes = min_t(u64, map->num_stripes,
6032                                     sub_stripes * stripe_cnt);
6033                 stripe_index = stripe_nr % factor;
6034                 stripe_nr /= factor;
6035                 stripe_index *= sub_stripes;
6036
6037                 remaining_stripes = stripe_cnt % factor;
6038                 stripes_per_dev = stripe_cnt / factor;
6039                 last_stripe = ((stripe_nr_end - 1) % factor) * sub_stripes;
6040         } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
6041                                 BTRFS_BLOCK_GROUP_DUP)) {
6042                 *num_stripes = map->num_stripes;
6043         } else {
6044                 stripe_index = stripe_nr % map->num_stripes;
6045                 stripe_nr /= map->num_stripes;
6046         }
6047
6048         stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS);
6049         if (!stripes) {
6050                 ret = -ENOMEM;
6051                 goto out_free_map;
6052         }
6053
6054         for (i = 0; i < *num_stripes; i++) {
6055                 stripes[i].physical =
6056                         map->stripes[stripe_index].physical +
6057                         stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
6058                 stripes[i].dev = map->stripes[stripe_index].dev;
6059
6060                 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
6061                                  BTRFS_BLOCK_GROUP_RAID10)) {
6062                         stripes[i].length = btrfs_stripe_nr_to_offset(stripes_per_dev);
6063
6064                         if (i / sub_stripes < remaining_stripes)
6065                                 stripes[i].length += BTRFS_STRIPE_LEN;
6066
6067                         /*
6068                          * Special for the first stripe and
6069                          * the last stripe:
6070                          *
6071                          * |-------|...|-------|
6072                          *     |----------|
6073                          *    off     end_off
6074                          */
6075                         if (i < sub_stripes)
6076                                 stripes[i].length -= stripe_offset;
6077
6078                         if (stripe_index >= last_stripe &&
6079                             stripe_index <= (last_stripe +
6080                                              sub_stripes - 1))
6081                                 stripes[i].length -= stripe_end_offset;
6082
6083                         if (i == sub_stripes - 1)
6084                                 stripe_offset = 0;
6085                 } else {
6086                         stripes[i].length = length;
6087                 }
6088
6089                 stripe_index++;
6090                 if (stripe_index == map->num_stripes) {
6091                         stripe_index = 0;
6092                         stripe_nr++;
6093                 }
6094         }
6095
6096         btrfs_free_chunk_map(map);
6097         return stripes;
6098 out_free_map:
6099         btrfs_free_chunk_map(map);
6100         return ERR_PTR(ret);
6101 }
6102
6103 static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
6104 {
6105         struct btrfs_block_group *cache;
6106         bool ret;
6107
6108         /* Non zoned filesystem does not use "to_copy" flag */
6109         if (!btrfs_is_zoned(fs_info))
6110                 return false;
6111
6112         cache = btrfs_lookup_block_group(fs_info, logical);
6113
6114         ret = test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
6115
6116         btrfs_put_block_group(cache);
6117         return ret;
6118 }
6119
6120 static void handle_ops_on_dev_replace(struct btrfs_io_context *bioc,
6121                                       struct btrfs_dev_replace *dev_replace,
6122                                       u64 logical,
6123                                       struct btrfs_io_geometry *io_geom)
6124 {
6125         u64 srcdev_devid = dev_replace->srcdev->devid;
6126         /*
6127          * At this stage, num_stripes is still the real number of stripes,
6128          * excluding the duplicated stripes.
6129          */
6130         int num_stripes = io_geom->num_stripes;
6131         int max_errors = io_geom->max_errors;
6132         int nr_extra_stripes = 0;
6133         int i;
6134
6135         /*
6136          * A block group which has "to_copy" set will eventually be copied by
6137          * the dev-replace process. We can avoid cloning IO here.
6138          */
6139         if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
6140                 return;
6141
6142         /*
6143          * Duplicate the write operations while the dev-replace procedure is
6144          * running. Since the copying of the old disk to the new disk takes
6145          * place at run time while the filesystem is mounted writable, the
6146          * regular write operations to the old disk have to be duplicated to go
6147          * to the new disk as well.
6148          *
6149          * Note that device->missing is handled by the caller, and that the
6150          * write to the old disk is already set up in the stripes array.
6151          */
6152         for (i = 0; i < num_stripes; i++) {
6153                 struct btrfs_io_stripe *old = &bioc->stripes[i];
6154                 struct btrfs_io_stripe *new = &bioc->stripes[num_stripes + nr_extra_stripes];
6155
6156                 if (old->dev->devid != srcdev_devid)
6157                         continue;
6158
6159                 new->physical = old->physical;
6160                 new->dev = dev_replace->tgtdev;
6161                 if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
6162                         bioc->replace_stripe_src = i;
6163                 nr_extra_stripes++;
6164         }
6165
6166         /* We can only have at most 2 extra nr_stripes (for DUP). */
6167         ASSERT(nr_extra_stripes <= 2);
6168         /*
6169          * For GET_READ_MIRRORS, we can only return at most 1 extra stripe for
6170          * replace.
6171          * If we have 2 extra stripes, only choose the one with smaller physical.
6172          */
6173         if (io_geom->op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) {
6174                 struct btrfs_io_stripe *first = &bioc->stripes[num_stripes];
6175                 struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1];
6176
6177                 /* Only DUP can have two extra stripes. */
6178                 ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP);
6179
6180                 /*
6181                  * Swap the last stripe stripes and reduce @nr_extra_stripes.
6182                  * The extra stripe would still be there, but won't be accessed.
6183                  */
6184                 if (first->physical > second->physical) {
6185                         swap(second->physical, first->physical);
6186                         swap(second->dev, first->dev);
6187                         nr_extra_stripes--;
6188                 }
6189         }
6190
6191         io_geom->num_stripes = num_stripes + nr_extra_stripes;
6192         io_geom->max_errors = max_errors + nr_extra_stripes;
6193         bioc->replace_nr_stripes = nr_extra_stripes;
6194 }
6195
6196 static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset,
6197                             struct btrfs_io_geometry *io_geom)
6198 {
6199         /*
6200          * Stripe_nr is the stripe where this block falls.  stripe_offset is
6201          * the offset of this block in its stripe.
6202          */
6203         io_geom->stripe_offset = offset & BTRFS_STRIPE_LEN_MASK;
6204         io_geom->stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
6205         ASSERT(io_geom->stripe_offset < U32_MAX);
6206
6207         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6208                 unsigned long full_stripe_len =
6209                         btrfs_stripe_nr_to_offset(nr_data_stripes(map));
6210
6211                 /*
6212                  * For full stripe start, we use previously calculated
6213                  * @stripe_nr. Align it to nr_data_stripes, then multiply with
6214                  * STRIPE_LEN.
6215                  *
6216                  * By this we can avoid u64 division completely.  And we have
6217                  * to go rounddown(), not round_down(), as nr_data_stripes is
6218                  * not ensured to be power of 2.
6219                  */
6220                 io_geom->raid56_full_stripe_start = btrfs_stripe_nr_to_offset(
6221                         rounddown(io_geom->stripe_nr, nr_data_stripes(map)));
6222
6223                 ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset);
6224                 ASSERT(io_geom->raid56_full_stripe_start <= offset);
6225                 /*
6226                  * For writes to RAID56, allow to write a full stripe set, but
6227                  * no straddling of stripe sets.
6228                  */
6229                 if (io_geom->op == BTRFS_MAP_WRITE)
6230                         return full_stripe_len - (offset - io_geom->raid56_full_stripe_start);
6231         }
6232
6233         /*
6234          * For other RAID types and for RAID56 reads, allow a single stripe (on
6235          * a single disk).
6236          */
6237         if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK)
6238                 return BTRFS_STRIPE_LEN - io_geom->stripe_offset;
6239         return U64_MAX;
6240 }
6241
6242 static int set_io_stripe(struct btrfs_fs_info *fs_info, u64 logical,
6243                          u64 *length, struct btrfs_io_stripe *dst,
6244                          struct btrfs_chunk_map *map,
6245                          struct btrfs_io_geometry *io_geom)
6246 {
6247         dst->dev = map->stripes[io_geom->stripe_index].dev;
6248
6249         if (io_geom->op == BTRFS_MAP_READ &&
6250             btrfs_need_stripe_tree_update(fs_info, map->type))
6251                 return btrfs_get_raid_extent_offset(fs_info, logical, length,
6252                                                     map->type,
6253                                                     io_geom->stripe_index, dst);
6254
6255         dst->physical = map->stripes[io_geom->stripe_index].physical +
6256                         io_geom->stripe_offset +
6257                         btrfs_stripe_nr_to_offset(io_geom->stripe_nr);
6258         return 0;
6259 }
6260
6261 static bool is_single_device_io(struct btrfs_fs_info *fs_info,
6262                                 const struct btrfs_io_stripe *smap,
6263                                 const struct btrfs_chunk_map *map,
6264                                 int num_alloc_stripes,
6265                                 enum btrfs_map_op op, int mirror_num)
6266 {
6267         if (!smap)
6268                 return false;
6269
6270         if (num_alloc_stripes != 1)
6271                 return false;
6272
6273         if (btrfs_need_stripe_tree_update(fs_info, map->type) && op != BTRFS_MAP_READ)
6274                 return false;
6275
6276         if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)
6277                 return false;
6278
6279         return true;
6280 }
6281
6282 static void map_blocks_raid0(const struct btrfs_chunk_map *map,
6283                              struct btrfs_io_geometry *io_geom)
6284 {
6285         io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes;
6286         io_geom->stripe_nr /= map->num_stripes;
6287         if (io_geom->op == BTRFS_MAP_READ)
6288                 io_geom->mirror_num = 1;
6289 }
6290
6291 static void map_blocks_raid1(struct btrfs_fs_info *fs_info,
6292                              struct btrfs_chunk_map *map,
6293                              struct btrfs_io_geometry *io_geom,
6294                              bool dev_replace_is_ongoing)
6295 {
6296         if (io_geom->op != BTRFS_MAP_READ) {
6297                 io_geom->num_stripes = map->num_stripes;
6298                 return;
6299         }
6300
6301         if (io_geom->mirror_num) {
6302                 io_geom->stripe_index = io_geom->mirror_num - 1;
6303                 return;
6304         }
6305
6306         io_geom->stripe_index = find_live_mirror(fs_info, map, 0,
6307                                                  dev_replace_is_ongoing);
6308         io_geom->mirror_num = io_geom->stripe_index + 1;
6309 }
6310
6311 static void map_blocks_dup(const struct btrfs_chunk_map *map,
6312                            struct btrfs_io_geometry *io_geom)
6313 {
6314         if (io_geom->op != BTRFS_MAP_READ) {
6315                 io_geom->num_stripes = map->num_stripes;
6316                 return;
6317         }
6318
6319         if (io_geom->mirror_num) {
6320                 io_geom->stripe_index = io_geom->mirror_num - 1;
6321                 return;
6322         }
6323
6324         io_geom->mirror_num = 1;
6325 }
6326
6327 static void map_blocks_raid10(struct btrfs_fs_info *fs_info,
6328                               struct btrfs_chunk_map *map,
6329                               struct btrfs_io_geometry *io_geom,
6330                               bool dev_replace_is_ongoing)
6331 {
6332         u32 factor = map->num_stripes / map->sub_stripes;
6333         int old_stripe_index;
6334
6335         io_geom->stripe_index = (io_geom->stripe_nr % factor) * map->sub_stripes;
6336         io_geom->stripe_nr /= factor;
6337
6338         if (io_geom->op != BTRFS_MAP_READ) {
6339                 io_geom->num_stripes = map->sub_stripes;
6340                 return;
6341         }
6342
6343         if (io_geom->mirror_num) {
6344                 io_geom->stripe_index += io_geom->mirror_num - 1;
6345                 return;
6346         }
6347
6348         old_stripe_index = io_geom->stripe_index;
6349         io_geom->stripe_index = find_live_mirror(fs_info, map,
6350                                                  io_geom->stripe_index,
6351                                                  dev_replace_is_ongoing);
6352         io_geom->mirror_num = io_geom->stripe_index - old_stripe_index + 1;
6353 }
6354
6355 static void map_blocks_raid56_write(struct btrfs_chunk_map *map,
6356                                     struct btrfs_io_geometry *io_geom,
6357                                     u64 logical, u64 *length)
6358 {
6359         int data_stripes = nr_data_stripes(map);
6360
6361         /*
6362          * Needs full stripe mapping.
6363          *
6364          * Push stripe_nr back to the start of the full stripe For those cases
6365          * needing a full stripe, @stripe_nr is the full stripe number.
6366          *
6367          * Originally we go raid56_full_stripe_start / full_stripe_len, but
6368          * that can be expensive.  Here we just divide @stripe_nr with
6369          * @data_stripes.
6370          */
6371         io_geom->stripe_nr /= data_stripes;
6372
6373         /* RAID[56] write or recovery. Return all stripes */
6374         io_geom->num_stripes = map->num_stripes;
6375         io_geom->max_errors = btrfs_chunk_max_errors(map);
6376
6377         /* Return the length to the full stripe end. */
6378         *length = min(logical + *length,
6379                       io_geom->raid56_full_stripe_start + map->start +
6380                       btrfs_stripe_nr_to_offset(data_stripes)) -
6381                 logical;
6382         io_geom->stripe_index = 0;
6383         io_geom->stripe_offset = 0;
6384 }
6385
6386 static void map_blocks_raid56_read(struct btrfs_chunk_map *map,
6387                                    struct btrfs_io_geometry *io_geom)
6388 {
6389         int data_stripes = nr_data_stripes(map);
6390
6391         ASSERT(io_geom->mirror_num <= 1);
6392         /* Just grab the data stripe directly. */
6393         io_geom->stripe_index = io_geom->stripe_nr % data_stripes;
6394         io_geom->stripe_nr /= data_stripes;
6395
6396         /* We distribute the parity blocks across stripes. */
6397         io_geom->stripe_index =
6398                 (io_geom->stripe_nr + io_geom->stripe_index) % map->num_stripes;
6399
6400         if (io_geom->op == BTRFS_MAP_READ && io_geom->mirror_num < 1)
6401                 io_geom->mirror_num = 1;
6402 }
6403
6404 static void map_blocks_single(const struct btrfs_chunk_map *map,
6405                               struct btrfs_io_geometry *io_geom)
6406 {
6407         io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes;
6408         io_geom->stripe_nr /= map->num_stripes;
6409         io_geom->mirror_num = io_geom->stripe_index + 1;
6410 }
6411
6412 /*
6413  * Map one logical range to one or more physical ranges.
6414  *
6415  * @length:             (Mandatory) mapped length of this run.
6416  *                      One logical range can be split into different segments
6417  *                      due to factors like zones and RAID0/5/6/10 stripe
6418  *                      boundaries.
6419  *
6420  * @bioc_ret:           (Mandatory) returned btrfs_io_context structure.
6421  *                      which has one or more physical ranges (btrfs_io_stripe)
6422  *                      recorded inside.
6423  *                      Caller should call btrfs_put_bioc() to free it after use.
6424  *
6425  * @smap:               (Optional) single physical range optimization.
6426  *                      If the map request can be fulfilled by one single
6427  *                      physical range, and this is parameter is not NULL,
6428  *                      then @bioc_ret would be NULL, and @smap would be
6429  *                      updated.
6430  *
6431  * @mirror_num_ret:     (Mandatory) returned mirror number if the original
6432  *                      value is 0.
6433  *
6434  *                      Mirror number 0 means to choose any live mirrors.
6435  *
6436  *                      For non-RAID56 profiles, non-zero mirror_num means
6437  *                      the Nth mirror. (e.g. mirror_num 1 means the first
6438  *                      copy).
6439  *
6440  *                      For RAID56 profile, mirror 1 means rebuild from P and
6441  *                      the remaining data stripes.
6442  *
6443  *                      For RAID6 profile, mirror > 2 means mark another
6444  *                      data/P stripe error and rebuild from the remaining
6445  *                      stripes..
6446  */
6447 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6448                     u64 logical, u64 *length,
6449                     struct btrfs_io_context **bioc_ret,
6450                     struct btrfs_io_stripe *smap, int *mirror_num_ret)
6451 {
6452         struct btrfs_chunk_map *map;
6453         struct btrfs_io_geometry io_geom = { 0 };
6454         u64 map_offset;
6455         int ret = 0;
6456         int num_copies;
6457         struct btrfs_io_context *bioc = NULL;
6458         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
6459         int dev_replace_is_ongoing = 0;
6460         u16 num_alloc_stripes;
6461         u64 max_len;
6462
6463         ASSERT(bioc_ret);
6464
6465         io_geom.mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
6466         io_geom.num_stripes = 1;
6467         io_geom.stripe_index = 0;
6468         io_geom.op = op;
6469
6470         map = btrfs_get_chunk_map(fs_info, logical, *length);
6471         if (IS_ERR(map))
6472                 return PTR_ERR(map);
6473
6474         num_copies = btrfs_chunk_map_num_copies(map);
6475         if (io_geom.mirror_num > num_copies)
6476                 return -EINVAL;
6477
6478         map_offset = logical - map->start;
6479         io_geom.raid56_full_stripe_start = (u64)-1;
6480         max_len = btrfs_max_io_len(map, map_offset, &io_geom);
6481         *length = min_t(u64, map->chunk_len - map_offset, max_len);
6482
6483         down_read(&dev_replace->rwsem);
6484         dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
6485         /*
6486          * Hold the semaphore for read during the whole operation, write is
6487          * requested at commit time but must wait.
6488          */
6489         if (!dev_replace_is_ongoing)
6490                 up_read(&dev_replace->rwsem);
6491
6492         switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
6493         case BTRFS_BLOCK_GROUP_RAID0:
6494                 map_blocks_raid0(map, &io_geom);
6495                 break;
6496         case BTRFS_BLOCK_GROUP_RAID1:
6497         case BTRFS_BLOCK_GROUP_RAID1C3:
6498         case BTRFS_BLOCK_GROUP_RAID1C4:
6499                 map_blocks_raid1(fs_info, map, &io_geom, dev_replace_is_ongoing);
6500                 break;
6501         case BTRFS_BLOCK_GROUP_DUP:
6502                 map_blocks_dup(map, &io_geom);
6503                 break;
6504         case BTRFS_BLOCK_GROUP_RAID10:
6505                 map_blocks_raid10(fs_info, map, &io_geom, dev_replace_is_ongoing);
6506                 break;
6507         case BTRFS_BLOCK_GROUP_RAID5:
6508         case BTRFS_BLOCK_GROUP_RAID6:
6509                 if (op != BTRFS_MAP_READ || io_geom.mirror_num > 1)
6510                         map_blocks_raid56_write(map, &io_geom, logical, length);
6511                 else
6512                         map_blocks_raid56_read(map, &io_geom);
6513                 break;
6514         default:
6515                 /*
6516                  * After this, stripe_nr is the number of stripes on this
6517                  * device we have to walk to find the data, and stripe_index is
6518                  * the number of our device in the stripe array
6519                  */
6520                 map_blocks_single(map, &io_geom);
6521                 break;
6522         }
6523         if (io_geom.stripe_index >= map->num_stripes) {
6524                 btrfs_crit(fs_info,
6525                            "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
6526                            io_geom.stripe_index, map->num_stripes);
6527                 ret = -EINVAL;
6528                 goto out;
6529         }
6530
6531         num_alloc_stripes = io_geom.num_stripes;
6532         if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
6533             op != BTRFS_MAP_READ)
6534                 /*
6535                  * For replace case, we need to add extra stripes for extra
6536                  * duplicated stripes.
6537                  *
6538                  * For both WRITE and GET_READ_MIRRORS, we may have at most
6539                  * 2 more stripes (DUP types, otherwise 1).
6540                  */
6541                 num_alloc_stripes += 2;
6542
6543         /*
6544          * If this I/O maps to a single device, try to return the device and
6545          * physical block information on the stack instead of allocating an
6546          * I/O context structure.
6547          */
6548         if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, op,
6549                                 io_geom.mirror_num)) {
6550                 ret = set_io_stripe(fs_info, logical, length, smap, map, &io_geom);
6551                 if (mirror_num_ret)
6552                         *mirror_num_ret = io_geom.mirror_num;
6553                 *bioc_ret = NULL;
6554                 goto out;
6555         }
6556
6557         bioc = alloc_btrfs_io_context(fs_info, logical, num_alloc_stripes);
6558         if (!bioc) {
6559                 ret = -ENOMEM;
6560                 goto out;
6561         }
6562         bioc->map_type = map->type;
6563
6564         /*
6565          * For RAID56 full map, we need to make sure the stripes[] follows the
6566          * rule that data stripes are all ordered, then followed with P and Q
6567          * (if we have).
6568          *
6569          * It's still mostly the same as other profiles, just with extra rotation.
6570          */
6571         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
6572             (op != BTRFS_MAP_READ || io_geom.mirror_num > 1)) {
6573                 /*
6574                  * For RAID56 @stripe_nr is already the number of full stripes
6575                  * before us, which is also the rotation value (needs to modulo
6576                  * with num_stripes).
6577                  *
6578                  * In this case, we just add @stripe_nr with @i, then do the
6579                  * modulo, to reduce one modulo call.
6580                  */
6581                 bioc->full_stripe_logical = map->start +
6582                         btrfs_stripe_nr_to_offset(io_geom.stripe_nr *
6583                                                   nr_data_stripes(map));
6584                 for (int i = 0; i < io_geom.num_stripes; i++) {
6585                         struct btrfs_io_stripe *dst = &bioc->stripes[i];
6586                         u32 stripe_index;
6587
6588                         stripe_index = (i + io_geom.stripe_nr) % io_geom.num_stripes;
6589                         dst->dev = map->stripes[stripe_index].dev;
6590                         dst->physical =
6591                                 map->stripes[stripe_index].physical +
6592                                 io_geom.stripe_offset +
6593                                 btrfs_stripe_nr_to_offset(io_geom.stripe_nr);
6594                 }
6595         } else {
6596                 /*
6597                  * For all other non-RAID56 profiles, just copy the target
6598                  * stripe into the bioc.
6599                  */
6600                 for (int i = 0; i < io_geom.num_stripes; i++) {
6601                         ret = set_io_stripe(fs_info, logical, length,
6602                                             &bioc->stripes[i], map, &io_geom);
6603                         if (ret < 0)
6604                                 break;
6605                         io_geom.stripe_index++;
6606                 }
6607         }
6608
6609         if (ret) {
6610                 *bioc_ret = NULL;
6611                 btrfs_put_bioc(bioc);
6612                 goto out;
6613         }
6614
6615         if (op != BTRFS_MAP_READ)
6616                 io_geom.max_errors = btrfs_chunk_max_errors(map);
6617
6618         if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
6619             op != BTRFS_MAP_READ) {
6620                 handle_ops_on_dev_replace(bioc, dev_replace, logical, &io_geom);
6621         }
6622
6623         *bioc_ret = bioc;
6624         bioc->num_stripes = io_geom.num_stripes;
6625         bioc->max_errors = io_geom.max_errors;
6626         bioc->mirror_num = io_geom.mirror_num;
6627
6628 out:
6629         if (dev_replace_is_ongoing) {
6630                 lockdep_assert_held(&dev_replace->rwsem);
6631                 /* Unlock and let waiting writers proceed */
6632                 up_read(&dev_replace->rwsem);
6633         }
6634         btrfs_free_chunk_map(map);
6635         return ret;
6636 }
6637
6638 static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
6639                                       const struct btrfs_fs_devices *fs_devices)
6640 {
6641         if (args->fsid == NULL)
6642                 return true;
6643         if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0)
6644                 return true;
6645         return false;
6646 }
6647
6648 static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
6649                                   const struct btrfs_device *device)
6650 {
6651         if (args->missing) {
6652                 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
6653                     !device->bdev)
6654                         return true;
6655                 return false;
6656         }
6657
6658         if (device->devid != args->devid)
6659                 return false;
6660         if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0)
6661                 return false;
6662         return true;
6663 }
6664
6665 /*
6666  * Find a device specified by @devid or @uuid in the list of @fs_devices, or
6667  * return NULL.
6668  *
6669  * If devid and uuid are both specified, the match must be exact, otherwise
6670  * only devid is used.
6671  */
6672 struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
6673                                        const struct btrfs_dev_lookup_args *args)
6674 {
6675         struct btrfs_device *device;
6676         struct btrfs_fs_devices *seed_devs;
6677
6678         if (dev_args_match_fs_devices(args, fs_devices)) {
6679                 list_for_each_entry(device, &fs_devices->devices, dev_list) {
6680                         if (dev_args_match_device(args, device))
6681                                 return device;
6682                 }
6683         }
6684
6685         list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
6686                 if (!dev_args_match_fs_devices(args, seed_devs))
6687                         continue;
6688                 list_for_each_entry(device, &seed_devs->devices, dev_list) {
6689                         if (dev_args_match_device(args, device))
6690                                 return device;
6691                 }
6692         }
6693
6694         return NULL;
6695 }
6696
6697 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
6698                                             u64 devid, u8 *dev_uuid)
6699 {
6700         struct btrfs_device *device;
6701         unsigned int nofs_flag;
6702
6703         /*
6704          * We call this under the chunk_mutex, so we want to use NOFS for this
6705          * allocation, however we don't want to change btrfs_alloc_device() to
6706          * always do NOFS because we use it in a lot of other GFP_KERNEL safe
6707          * places.
6708          */
6709
6710         nofs_flag = memalloc_nofs_save();
6711         device = btrfs_alloc_device(NULL, &devid, dev_uuid, NULL);
6712         memalloc_nofs_restore(nofs_flag);
6713         if (IS_ERR(device))
6714                 return device;
6715
6716         list_add(&device->dev_list, &fs_devices->devices);
6717         device->fs_devices = fs_devices;
6718         fs_devices->num_devices++;
6719
6720         set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
6721         fs_devices->missing_devices++;
6722
6723         return device;
6724 }
6725
6726 /*
6727  * Allocate new device struct, set up devid and UUID.
6728  *
6729  * @fs_info:    used only for generating a new devid, can be NULL if
6730  *              devid is provided (i.e. @devid != NULL).
6731  * @devid:      a pointer to devid for this device.  If NULL a new devid
6732  *              is generated.
6733  * @uuid:       a pointer to UUID for this device.  If NULL a new UUID
6734  *              is generated.
6735  * @path:       a pointer to device path if available, NULL otherwise.
6736  *
6737  * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
6738  * on error.  Returned struct is not linked onto any lists and must be
6739  * destroyed with btrfs_free_device.
6740  */
6741 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
6742                                         const u64 *devid, const u8 *uuid,
6743                                         const char *path)
6744 {
6745         struct btrfs_device *dev;
6746         u64 tmp;
6747
6748         if (WARN_ON(!devid && !fs_info))
6749                 return ERR_PTR(-EINVAL);
6750
6751         dev = kzalloc(sizeof(*dev), GFP_KERNEL);
6752         if (!dev)
6753                 return ERR_PTR(-ENOMEM);
6754
6755         INIT_LIST_HEAD(&dev->dev_list);
6756         INIT_LIST_HEAD(&dev->dev_alloc_list);
6757         INIT_LIST_HEAD(&dev->post_commit_list);
6758
6759         atomic_set(&dev->dev_stats_ccnt, 0);
6760         btrfs_device_data_ordered_init(dev);
6761         extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE);
6762
6763         if (devid)
6764                 tmp = *devid;
6765         else {
6766                 int ret;
6767
6768                 ret = find_next_devid(fs_info, &tmp);
6769                 if (ret) {
6770                         btrfs_free_device(dev);
6771                         return ERR_PTR(ret);
6772                 }
6773         }
6774         dev->devid = tmp;
6775
6776         if (uuid)
6777                 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
6778         else
6779                 generate_random_uuid(dev->uuid);
6780
6781         if (path) {
6782                 struct rcu_string *name;
6783
6784                 name = rcu_string_strdup(path, GFP_KERNEL);
6785                 if (!name) {
6786                         btrfs_free_device(dev);
6787                         return ERR_PTR(-ENOMEM);
6788                 }
6789                 rcu_assign_pointer(dev->name, name);
6790         }
6791
6792         return dev;
6793 }
6794
6795 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
6796                                         u64 devid, u8 *uuid, bool error)
6797 {
6798         if (error)
6799                 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
6800                               devid, uuid);
6801         else
6802                 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
6803                               devid, uuid);
6804 }
6805
6806 u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map)
6807 {
6808         const int data_stripes = calc_data_stripes(map->type, map->num_stripes);
6809
6810         return div_u64(map->chunk_len, data_stripes);
6811 }
6812
6813 #if BITS_PER_LONG == 32
6814 /*
6815  * Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE
6816  * can't be accessed on 32bit systems.
6817  *
6818  * This function do mount time check to reject the fs if it already has
6819  * metadata chunk beyond that limit.
6820  */
6821 static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
6822                                   u64 logical, u64 length, u64 type)
6823 {
6824         if (!(type & BTRFS_BLOCK_GROUP_METADATA))
6825                 return 0;
6826
6827         if (logical + length < MAX_LFS_FILESIZE)
6828                 return 0;
6829
6830         btrfs_err_32bit_limit(fs_info);
6831         return -EOVERFLOW;
6832 }
6833
6834 /*
6835  * This is to give early warning for any metadata chunk reaching
6836  * BTRFS_32BIT_EARLY_WARN_THRESHOLD.
6837  * Although we can still access the metadata, it's not going to be possible
6838  * once the limit is reached.
6839  */
6840 static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
6841                                   u64 logical, u64 length, u64 type)
6842 {
6843         if (!(type & BTRFS_BLOCK_GROUP_METADATA))
6844                 return;
6845
6846         if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD)
6847                 return;
6848
6849         btrfs_warn_32bit_limit(fs_info);
6850 }
6851 #endif
6852
6853 static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info,
6854                                                   u64 devid, u8 *uuid)
6855 {
6856         struct btrfs_device *dev;
6857
6858         if (!btrfs_test_opt(fs_info, DEGRADED)) {
6859                 btrfs_report_missing_device(fs_info, devid, uuid, true);
6860                 return ERR_PTR(-ENOENT);
6861         }
6862
6863         dev = add_missing_dev(fs_info->fs_devices, devid, uuid);
6864         if (IS_ERR(dev)) {
6865                 btrfs_err(fs_info, "failed to init missing device %llu: %ld",
6866                           devid, PTR_ERR(dev));
6867                 return dev;
6868         }
6869         btrfs_report_missing_device(fs_info, devid, uuid, false);
6870
6871         return dev;
6872 }
6873
6874 static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
6875                           struct btrfs_chunk *chunk)
6876 {
6877         BTRFS_DEV_LOOKUP_ARGS(args);
6878         struct btrfs_fs_info *fs_info = leaf->fs_info;
6879         struct btrfs_chunk_map *map;
6880         u64 logical;
6881         u64 length;
6882         u64 devid;
6883         u64 type;
6884         u8 uuid[BTRFS_UUID_SIZE];
6885         int index;
6886         int num_stripes;
6887         int ret;
6888         int i;
6889
6890         logical = key->offset;
6891         length = btrfs_chunk_length(leaf, chunk);
6892         type = btrfs_chunk_type(leaf, chunk);
6893         index = btrfs_bg_flags_to_raid_index(type);
6894         num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
6895
6896 #if BITS_PER_LONG == 32
6897         ret = check_32bit_meta_chunk(fs_info, logical, length, type);
6898         if (ret < 0)
6899                 return ret;
6900         warn_32bit_meta_chunk(fs_info, logical, length, type);
6901 #endif
6902
6903         /*
6904          * Only need to verify chunk item if we're reading from sys chunk array,
6905          * as chunk item in tree block is already verified by tree-checker.
6906          */
6907         if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
6908                 ret = btrfs_check_chunk_valid(leaf, chunk, logical);
6909                 if (ret)
6910                         return ret;
6911         }
6912
6913         map = btrfs_find_chunk_map(fs_info, logical, 1);
6914
6915         /* already mapped? */
6916         if (map && map->start <= logical && map->start + map->chunk_len > logical) {
6917                 btrfs_free_chunk_map(map);
6918                 return 0;
6919         } else if (map) {
6920                 btrfs_free_chunk_map(map);
6921         }
6922
6923         map = btrfs_alloc_chunk_map(num_stripes, GFP_NOFS);
6924         if (!map)
6925                 return -ENOMEM;
6926
6927         map->start = logical;
6928         map->chunk_len = length;
6929         map->num_stripes = num_stripes;
6930         map->io_width = btrfs_chunk_io_width(leaf, chunk);
6931         map->io_align = btrfs_chunk_io_align(leaf, chunk);
6932         map->type = type;
6933         /*
6934          * We can't use the sub_stripes value, as for profiles other than
6935          * RAID10, they may have 0 as sub_stripes for filesystems created by
6936          * older mkfs (<v5.4).
6937          * In that case, it can cause divide-by-zero errors later.
6938          * Since currently sub_stripes is fixed for each profile, let's
6939          * use the trusted value instead.
6940          */
6941         map->sub_stripes = btrfs_raid_array[index].sub_stripes;
6942         map->verified_stripes = 0;
6943         map->stripe_size = btrfs_calc_stripe_length(map);
6944         for (i = 0; i < num_stripes; i++) {
6945                 map->stripes[i].physical =
6946                         btrfs_stripe_offset_nr(leaf, chunk, i);
6947                 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
6948                 args.devid = devid;
6949                 read_extent_buffer(leaf, uuid, (unsigned long)
6950                                    btrfs_stripe_dev_uuid_nr(chunk, i),
6951                                    BTRFS_UUID_SIZE);
6952                 args.uuid = uuid;
6953                 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args);
6954                 if (!map->stripes[i].dev) {
6955                         map->stripes[i].dev = handle_missing_device(fs_info,
6956                                                                     devid, uuid);
6957                         if (IS_ERR(map->stripes[i].dev)) {
6958                                 ret = PTR_ERR(map->stripes[i].dev);
6959                                 btrfs_free_chunk_map(map);
6960                                 return ret;
6961                         }
6962                 }
6963
6964                 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
6965                                 &(map->stripes[i].dev->dev_state));
6966         }
6967
6968         ret = btrfs_add_chunk_map(fs_info, map);
6969         if (ret < 0) {
6970                 btrfs_err(fs_info,
6971                           "failed to add chunk map, start=%llu len=%llu: %d",
6972                           map->start, map->chunk_len, ret);
6973         }
6974
6975         return ret;
6976 }
6977
6978 static void fill_device_from_item(struct extent_buffer *leaf,
6979                                  struct btrfs_dev_item *dev_item,
6980                                  struct btrfs_device *device)
6981 {
6982         unsigned long ptr;
6983
6984         device->devid = btrfs_device_id(leaf, dev_item);
6985         device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
6986         device->total_bytes = device->disk_total_bytes;
6987         device->commit_total_bytes = device->disk_total_bytes;
6988         device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
6989         device->commit_bytes_used = device->bytes_used;
6990         device->type = btrfs_device_type(leaf, dev_item);
6991         device->io_align = btrfs_device_io_align(leaf, dev_item);
6992         device->io_width = btrfs_device_io_width(leaf, dev_item);
6993         device->sector_size = btrfs_device_sector_size(leaf, dev_item);
6994         WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
6995         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
6996
6997         ptr = btrfs_device_uuid(dev_item);
6998         read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
6999 }
7000
7001 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
7002                                                   u8 *fsid)
7003 {
7004         struct btrfs_fs_devices *fs_devices;
7005         int ret;
7006
7007         lockdep_assert_held(&uuid_mutex);
7008         ASSERT(fsid);
7009
7010         /* This will match only for multi-device seed fs */
7011         list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
7012                 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
7013                         return fs_devices;
7014
7015
7016         fs_devices = find_fsid(fsid, NULL);
7017         if (!fs_devices) {
7018                 if (!btrfs_test_opt(fs_info, DEGRADED))
7019                         return ERR_PTR(-ENOENT);
7020
7021                 fs_devices = alloc_fs_devices(fsid);
7022                 if (IS_ERR(fs_devices))
7023                         return fs_devices;
7024
7025                 fs_devices->seeding = true;
7026                 fs_devices->opened = 1;
7027                 return fs_devices;
7028         }
7029
7030         /*
7031          * Upon first call for a seed fs fsid, just create a private copy of the
7032          * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
7033          */
7034         fs_devices = clone_fs_devices(fs_devices);
7035         if (IS_ERR(fs_devices))
7036                 return fs_devices;
7037
7038         ret = open_fs_devices(fs_devices, BLK_OPEN_READ, fs_info->bdev_holder);
7039         if (ret) {
7040                 free_fs_devices(fs_devices);
7041                 return ERR_PTR(ret);
7042         }
7043
7044         if (!fs_devices->seeding) {
7045                 close_fs_devices(fs_devices);
7046                 free_fs_devices(fs_devices);
7047                 return ERR_PTR(-EINVAL);
7048         }
7049
7050         list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
7051
7052         return fs_devices;
7053 }
7054
7055 static int read_one_dev(struct extent_buffer *leaf,
7056                         struct btrfs_dev_item *dev_item)
7057 {
7058         BTRFS_DEV_LOOKUP_ARGS(args);
7059         struct btrfs_fs_info *fs_info = leaf->fs_info;
7060         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7061         struct btrfs_device *device;
7062         u64 devid;
7063         int ret;
7064         u8 fs_uuid[BTRFS_FSID_SIZE];
7065         u8 dev_uuid[BTRFS_UUID_SIZE];
7066
7067         devid = btrfs_device_id(leaf, dev_item);
7068         args.devid = devid;
7069         read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
7070                            BTRFS_UUID_SIZE);
7071         read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
7072                            BTRFS_FSID_SIZE);
7073         args.uuid = dev_uuid;
7074         args.fsid = fs_uuid;
7075
7076         if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
7077                 fs_devices = open_seed_devices(fs_info, fs_uuid);
7078                 if (IS_ERR(fs_devices))
7079                         return PTR_ERR(fs_devices);
7080         }
7081
7082         device = btrfs_find_device(fs_info->fs_devices, &args);
7083         if (!device) {
7084                 if (!btrfs_test_opt(fs_info, DEGRADED)) {
7085                         btrfs_report_missing_device(fs_info, devid,
7086                                                         dev_uuid, true);
7087                         return -ENOENT;
7088                 }
7089
7090                 device = add_missing_dev(fs_devices, devid, dev_uuid);
7091                 if (IS_ERR(device)) {
7092                         btrfs_err(fs_info,
7093                                 "failed to add missing dev %llu: %ld",
7094                                 devid, PTR_ERR(device));
7095                         return PTR_ERR(device);
7096                 }
7097                 btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
7098         } else {
7099                 if (!device->bdev) {
7100                         if (!btrfs_test_opt(fs_info, DEGRADED)) {
7101                                 btrfs_report_missing_device(fs_info,
7102                                                 devid, dev_uuid, true);
7103                                 return -ENOENT;
7104                         }
7105                         btrfs_report_missing_device(fs_info, devid,
7106                                                         dev_uuid, false);
7107                 }
7108
7109                 if (!device->bdev &&
7110                     !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
7111                         /*
7112                          * this happens when a device that was properly setup
7113                          * in the device info lists suddenly goes bad.
7114                          * device->bdev is NULL, and so we have to set
7115                          * device->missing to one here
7116                          */
7117                         device->fs_devices->missing_devices++;
7118                         set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
7119                 }
7120
7121                 /* Move the device to its own fs_devices */
7122                 if (device->fs_devices != fs_devices) {
7123                         ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
7124                                                         &device->dev_state));
7125
7126                         list_move(&device->dev_list, &fs_devices->devices);
7127                         device->fs_devices->num_devices--;
7128                         fs_devices->num_devices++;
7129
7130                         device->fs_devices->missing_devices--;
7131                         fs_devices->missing_devices++;
7132
7133                         device->fs_devices = fs_devices;
7134                 }
7135         }
7136
7137         if (device->fs_devices != fs_info->fs_devices) {
7138                 BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
7139                 if (device->generation !=
7140                     btrfs_device_generation(leaf, dev_item))
7141                         return -EINVAL;
7142         }
7143
7144         fill_device_from_item(leaf, dev_item, device);
7145         if (device->bdev) {
7146                 u64 max_total_bytes = bdev_nr_bytes(device->bdev);
7147
7148                 if (device->total_bytes > max_total_bytes) {
7149                         btrfs_err(fs_info,
7150                         "device total_bytes should be at most %llu but found %llu",
7151                                   max_total_bytes, device->total_bytes);
7152                         return -EINVAL;
7153                 }
7154         }
7155         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
7156         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
7157            !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
7158                 device->fs_devices->total_rw_bytes += device->total_bytes;
7159                 atomic64_add(device->total_bytes - device->bytes_used,
7160                                 &fs_info->free_chunk_space);
7161         }
7162         ret = 0;
7163         return ret;
7164 }
7165
7166 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
7167 {
7168         struct btrfs_super_block *super_copy = fs_info->super_copy;
7169         struct extent_buffer *sb;
7170         struct btrfs_disk_key *disk_key;
7171         struct btrfs_chunk *chunk;
7172         u8 *array_ptr;
7173         unsigned long sb_array_offset;
7174         int ret = 0;
7175         u32 num_stripes;
7176         u32 array_size;
7177         u32 len = 0;
7178         u32 cur_offset;
7179         u64 type;
7180         struct btrfs_key key;
7181
7182         ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
7183
7184         /*
7185          * We allocated a dummy extent, just to use extent buffer accessors.
7186          * There will be unused space after BTRFS_SUPER_INFO_SIZE, but
7187          * that's fine, we will not go beyond system chunk array anyway.
7188          */
7189         sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET);
7190         if (!sb)
7191                 return -ENOMEM;
7192         set_extent_buffer_uptodate(sb);
7193
7194         write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
7195         array_size = btrfs_super_sys_array_size(super_copy);
7196
7197         array_ptr = super_copy->sys_chunk_array;
7198         sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
7199         cur_offset = 0;
7200
7201         while (cur_offset < array_size) {
7202                 disk_key = (struct btrfs_disk_key *)array_ptr;
7203                 len = sizeof(*disk_key);
7204                 if (cur_offset + len > array_size)
7205                         goto out_short_read;
7206
7207                 btrfs_disk_key_to_cpu(&key, disk_key);
7208
7209                 array_ptr += len;
7210                 sb_array_offset += len;
7211                 cur_offset += len;
7212
7213                 if (key.type != BTRFS_CHUNK_ITEM_KEY) {
7214                         btrfs_err(fs_info,
7215                             "unexpected item type %u in sys_array at offset %u",
7216                                   (u32)key.type, cur_offset);
7217                         ret = -EIO;
7218                         break;
7219                 }
7220
7221                 chunk = (struct btrfs_chunk *)sb_array_offset;
7222                 /*
7223                  * At least one btrfs_chunk with one stripe must be present,
7224                  * exact stripe count check comes afterwards
7225                  */
7226                 len = btrfs_chunk_item_size(1);
7227                 if (cur_offset + len > array_size)
7228                         goto out_short_read;
7229
7230                 num_stripes = btrfs_chunk_num_stripes(sb, chunk);
7231                 if (!num_stripes) {
7232                         btrfs_err(fs_info,
7233                         "invalid number of stripes %u in sys_array at offset %u",
7234                                   num_stripes, cur_offset);
7235                         ret = -EIO;
7236                         break;
7237                 }
7238
7239                 type = btrfs_chunk_type(sb, chunk);
7240                 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
7241                         btrfs_err(fs_info,
7242                         "invalid chunk type %llu in sys_array at offset %u",
7243                                   type, cur_offset);
7244                         ret = -EIO;
7245                         break;
7246                 }
7247
7248                 len = btrfs_chunk_item_size(num_stripes);
7249                 if (cur_offset + len > array_size)
7250                         goto out_short_read;
7251
7252                 ret = read_one_chunk(&key, sb, chunk);
7253                 if (ret)
7254                         break;
7255
7256                 array_ptr += len;
7257                 sb_array_offset += len;
7258                 cur_offset += len;
7259         }
7260         clear_extent_buffer_uptodate(sb);
7261         free_extent_buffer_stale(sb);
7262         return ret;
7263
7264 out_short_read:
7265         btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
7266                         len, cur_offset);
7267         clear_extent_buffer_uptodate(sb);
7268         free_extent_buffer_stale(sb);
7269         return -EIO;
7270 }
7271
7272 /*
7273  * Check if all chunks in the fs are OK for read-write degraded mount
7274  *
7275  * If the @failing_dev is specified, it's accounted as missing.
7276  *
7277  * Return true if all chunks meet the minimal RW mount requirements.
7278  * Return false if any chunk doesn't meet the minimal RW mount requirements.
7279  */
7280 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
7281                                         struct btrfs_device *failing_dev)
7282 {
7283         struct btrfs_chunk_map *map;
7284         u64 next_start;
7285         bool ret = true;
7286
7287         map = btrfs_find_chunk_map(fs_info, 0, U64_MAX);
7288         /* No chunk at all? Return false anyway */
7289         if (!map) {
7290                 ret = false;
7291                 goto out;
7292         }
7293         while (map) {
7294                 int missing = 0;
7295                 int max_tolerated;
7296                 int i;
7297
7298                 max_tolerated =
7299                         btrfs_get_num_tolerated_disk_barrier_failures(
7300                                         map->type);
7301                 for (i = 0; i < map->num_stripes; i++) {
7302                         struct btrfs_device *dev = map->stripes[i].dev;
7303
7304                         if (!dev || !dev->bdev ||
7305                             test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
7306                             dev->last_flush_error)
7307                                 missing++;
7308                         else if (failing_dev && failing_dev == dev)
7309                                 missing++;
7310                 }
7311                 if (missing > max_tolerated) {
7312                         if (!failing_dev)
7313                                 btrfs_warn(fs_info,
7314         "chunk %llu missing %d devices, max tolerance is %d for writable mount",
7315                                    map->start, missing, max_tolerated);
7316                         btrfs_free_chunk_map(map);
7317                         ret = false;
7318                         goto out;
7319                 }
7320                 next_start = map->start + map->chunk_len;
7321                 btrfs_free_chunk_map(map);
7322
7323                 map = btrfs_find_chunk_map(fs_info, next_start, U64_MAX - next_start);
7324         }
7325 out:
7326         return ret;
7327 }
7328
/*
 * Call btrfs_readahead_node_child() for every slot of @node, using the item
 * count stored in the node's header.
 */
static void readahead_tree_node_children(struct extent_buffer *node)
{
        const int nr_children = btrfs_header_nritems(node);
        int slot = 0;

        while (slot < nr_children) {
                btrfs_readahead_node_child(node, slot);
                slot++;
        }
}
7337
7338 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
7339 {
7340         struct btrfs_root *root = fs_info->chunk_root;
7341         struct btrfs_path *path;
7342         struct extent_buffer *leaf;
7343         struct btrfs_key key;
7344         struct btrfs_key found_key;
7345         int ret;
7346         int slot;
7347         int iter_ret = 0;
7348         u64 total_dev = 0;
7349         u64 last_ra_node = 0;
7350
7351         path = btrfs_alloc_path();
7352         if (!path)
7353                 return -ENOMEM;
7354
7355         /*
7356          * uuid_mutex is needed only if we are mounting a sprout FS
7357          * otherwise we don't need it.
7358          */
7359         mutex_lock(&uuid_mutex);
7360
7361         /*
7362          * It is possible for mount and umount to race in such a way that
7363          * we execute this code path, but open_fs_devices failed to clear
7364          * total_rw_bytes. We certainly want it cleared before reading the
7365          * device items, so clear it here.
7366          */
7367         fs_info->fs_devices->total_rw_bytes = 0;
7368
7369         /*
7370          * Lockdep complains about possible circular locking dependency between
7371          * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores
7372          * used for freeze procection of a fs (struct super_block.s_writers),
7373          * which we take when starting a transaction, and extent buffers of the
7374          * chunk tree if we call read_one_dev() while holding a lock on an
7375          * extent buffer of the chunk tree. Since we are mounting the filesystem
7376          * and at this point there can't be any concurrent task modifying the
7377          * chunk tree, to keep it simple, just skip locking on the chunk tree.
7378          */
7379         ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
7380         path->skip_locking = 1;
7381
7382         /*
7383          * Read all device items, and then all the chunk items. All
7384          * device items are found before any chunk item (their object id
7385          * is smaller than the lowest possible object id for a chunk
7386          * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
7387          */
7388         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
7389         key.offset = 0;
7390         key.type = 0;
7391         btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
7392                 struct extent_buffer *node = path->nodes[1];
7393
7394                 leaf = path->nodes[0];
7395                 slot = path->slots[0];
7396
7397                 if (node) {
7398                         if (last_ra_node != node->start) {
7399                                 readahead_tree_node_children(node);
7400                                 last_ra_node = node->start;
7401                         }
7402                 }
7403                 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
7404                         struct btrfs_dev_item *dev_item;
7405                         dev_item = btrfs_item_ptr(leaf, slot,
7406                                                   struct btrfs_dev_item);
7407                         ret = read_one_dev(leaf, dev_item);
7408                         if (ret)
7409                                 goto error;
7410                         total_dev++;
7411                 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
7412                         struct btrfs_chunk *chunk;
7413
7414                         /*
7415                          * We are only called at mount time, so no need to take
7416                          * fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
7417                          * we always lock first fs_info->chunk_mutex before
7418                          * acquiring any locks on the chunk tree. This is a
7419                          * requirement for chunk allocation, see the comment on
7420                          * top of btrfs_chunk_alloc() for details.
7421                          */
7422                         chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
7423                         ret = read_one_chunk(&found_key, leaf, chunk);
7424                         if (ret)
7425                                 goto error;
7426                 }
7427         }
7428         /* Catch error found during iteration */
7429         if (iter_ret < 0) {
7430                 ret = iter_ret;
7431                 goto error;
7432         }
7433
7434         /*
7435          * After loading chunk tree, we've got all device information,
7436          * do another round of validation checks.
7437          */
7438         if (total_dev != fs_info->fs_devices->total_devices) {
7439                 btrfs_warn(fs_info,
7440 "super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
7441                           btrfs_super_num_devices(fs_info->super_copy),
7442                           total_dev);
7443                 fs_info->fs_devices->total_devices = total_dev;
7444                 btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
7445         }
7446         if (btrfs_super_total_bytes(fs_info->super_copy) <
7447             fs_info->fs_devices->total_rw_bytes) {
7448                 btrfs_err(fs_info,
7449         "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
7450                           btrfs_super_total_bytes(fs_info->super_copy),
7451                           fs_info->fs_devices->total_rw_bytes);
7452                 ret = -EINVAL;
7453                 goto error;
7454         }
7455         ret = 0;
7456 error:
7457         mutex_unlock(&uuid_mutex);
7458
7459         btrfs_free_path(path);
7460         return ret;
7461 }
7462
7463 int btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
7464 {
7465         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7466         struct btrfs_device *device;
7467         int ret = 0;
7468
7469         fs_devices->fs_info = fs_info;
7470
7471         mutex_lock(&fs_devices->device_list_mutex);
7472         list_for_each_entry(device, &fs_devices->devices, dev_list)
7473                 device->fs_info = fs_info;
7474
7475         list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7476                 list_for_each_entry(device, &seed_devs->devices, dev_list) {
7477                         device->fs_info = fs_info;
7478                         ret = btrfs_get_dev_zone_info(device, false);
7479                         if (ret)
7480                                 break;
7481                 }
7482
7483                 seed_devs->fs_info = fs_info;
7484         }
7485         mutex_unlock(&fs_devices->device_list_mutex);
7486
7487         return ret;
7488 }
7489
7490 static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
7491                                  const struct btrfs_dev_stats_item *ptr,
7492                                  int index)
7493 {
7494         u64 val;
7495
7496         read_extent_buffer(eb, &val,
7497                            offsetof(struct btrfs_dev_stats_item, values) +
7498                             ((unsigned long)ptr) + (index * sizeof(u64)),
7499                            sizeof(val));
7500         return val;
7501 }
7502
7503 static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
7504                                       struct btrfs_dev_stats_item *ptr,
7505                                       int index, u64 val)
7506 {
7507         write_extent_buffer(eb, &val,
7508                             offsetof(struct btrfs_dev_stats_item, values) +
7509                              ((unsigned long)ptr) + (index * sizeof(u64)),
7510                             sizeof(val));
7511 }
7512
7513 static int btrfs_device_init_dev_stats(struct btrfs_device *device,
7514                                        struct btrfs_path *path)
7515 {
7516         struct btrfs_dev_stats_item *ptr;
7517         struct extent_buffer *eb;
7518         struct btrfs_key key;
7519         int item_size;
7520         int i, ret, slot;
7521
7522         if (!device->fs_info->dev_root)
7523                 return 0;
7524
7525         key.objectid = BTRFS_DEV_STATS_OBJECTID;
7526         key.type = BTRFS_PERSISTENT_ITEM_KEY;
7527         key.offset = device->devid;
7528         ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
7529         if (ret) {
7530                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7531                         btrfs_dev_stat_set(device, i, 0);
7532                 device->dev_stats_valid = 1;
7533                 btrfs_release_path(path);
7534                 return ret < 0 ? ret : 0;
7535         }
7536         slot = path->slots[0];
7537         eb = path->nodes[0];
7538         item_size = btrfs_item_size(eb, slot);
7539
7540         ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
7541
7542         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7543                 if (item_size >= (1 + i) * sizeof(__le64))
7544                         btrfs_dev_stat_set(device, i,
7545                                            btrfs_dev_stats_value(eb, ptr, i));
7546                 else
7547                         btrfs_dev_stat_set(device, i, 0);
7548         }
7549
7550         device->dev_stats_valid = 1;
7551         btrfs_dev_stat_print_on_load(device);
7552         btrfs_release_path(path);
7553
7554         return 0;
7555 }
7556
7557 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
7558 {
7559         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7560         struct btrfs_device *device;
7561         struct btrfs_path *path = NULL;
7562         int ret = 0;
7563
7564         path = btrfs_alloc_path();
7565         if (!path)
7566                 return -ENOMEM;
7567
7568         mutex_lock(&fs_devices->device_list_mutex);
7569         list_for_each_entry(device, &fs_devices->devices, dev_list) {
7570                 ret = btrfs_device_init_dev_stats(device, path);
7571                 if (ret)
7572                         goto out;
7573         }
7574         list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7575                 list_for_each_entry(device, &seed_devs->devices, dev_list) {
7576                         ret = btrfs_device_init_dev_stats(device, path);
7577                         if (ret)
7578                                 goto out;
7579                 }
7580         }
7581 out:
7582         mutex_unlock(&fs_devices->device_list_mutex);
7583
7584         btrfs_free_path(path);
7585         return ret;
7586 }
7587
7588 static int update_dev_stat_item(struct btrfs_trans_handle *trans,
7589                                 struct btrfs_device *device)
7590 {
7591         struct btrfs_fs_info *fs_info = trans->fs_info;
7592         struct btrfs_root *dev_root = fs_info->dev_root;
7593         struct btrfs_path *path;
7594         struct btrfs_key key;
7595         struct extent_buffer *eb;
7596         struct btrfs_dev_stats_item *ptr;
7597         int ret;
7598         int i;
7599
7600         key.objectid = BTRFS_DEV_STATS_OBJECTID;
7601         key.type = BTRFS_PERSISTENT_ITEM_KEY;
7602         key.offset = device->devid;
7603
7604         path = btrfs_alloc_path();
7605         if (!path)
7606                 return -ENOMEM;
7607         ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
7608         if (ret < 0) {
7609                 btrfs_warn_in_rcu(fs_info,
7610                         "error %d while searching for dev_stats item for device %s",
7611                                   ret, btrfs_dev_name(device));
7612                 goto out;
7613         }
7614
7615         if (ret == 0 &&
7616             btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
7617                 /* need to delete old one and insert a new one */
7618                 ret = btrfs_del_item(trans, dev_root, path);
7619                 if (ret != 0) {
7620                         btrfs_warn_in_rcu(fs_info,
7621                                 "delete too small dev_stats item for device %s failed %d",
7622                                           btrfs_dev_name(device), ret);
7623                         goto out;
7624                 }
7625                 ret = 1;
7626         }
7627
7628         if (ret == 1) {
7629                 /* need to insert a new item */
7630                 btrfs_release_path(path);
7631                 ret = btrfs_insert_empty_item(trans, dev_root, path,
7632                                               &key, sizeof(*ptr));
7633                 if (ret < 0) {
7634                         btrfs_warn_in_rcu(fs_info,
7635                                 "insert dev_stats item for device %s failed %d",
7636                                 btrfs_dev_name(device), ret);
7637                         goto out;
7638                 }
7639         }
7640
7641         eb = path->nodes[0];
7642         ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
7643         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7644                 btrfs_set_dev_stats_value(eb, ptr, i,
7645                                           btrfs_dev_stat_read(device, i));
7646         btrfs_mark_buffer_dirty(trans, eb);
7647
7648 out:
7649         btrfs_free_path(path);
7650         return ret;
7651 }
7652
7653 /*
7654  * called from commit_transaction. Writes all changed device stats to disk.
7655  */
7656 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
7657 {
7658         struct btrfs_fs_info *fs_info = trans->fs_info;
7659         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7660         struct btrfs_device *device;
7661         int stats_cnt;
7662         int ret = 0;
7663
7664         mutex_lock(&fs_devices->device_list_mutex);
7665         list_for_each_entry(device, &fs_devices->devices, dev_list) {
7666                 stats_cnt = atomic_read(&device->dev_stats_ccnt);
7667                 if (!device->dev_stats_valid || stats_cnt == 0)
7668                         continue;
7669
7670
7671                 /*
7672                  * There is a LOAD-LOAD control dependency between the value of
7673                  * dev_stats_ccnt and updating the on-disk values which requires
7674                  * reading the in-memory counters. Such control dependencies
7675                  * require explicit read memory barriers.
7676                  *
7677                  * This memory barriers pairs with smp_mb__before_atomic in
7678                  * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
7679                  * barrier implied by atomic_xchg in
7680                  * btrfs_dev_stats_read_and_reset
7681                  */
7682                 smp_rmb();
7683
7684                 ret = update_dev_stat_item(trans, device);
7685                 if (!ret)
7686                         atomic_sub(stats_cnt, &device->dev_stats_ccnt);
7687         }
7688         mutex_unlock(&fs_devices->device_list_mutex);
7689
7690         return ret;
7691 }
7692
7693 void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
7694 {
7695         btrfs_dev_stat_inc(dev, index);
7696
7697         if (!dev->dev_stats_valid)
7698                 return;
7699         btrfs_err_rl_in_rcu(dev->fs_info,
7700                 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7701                            btrfs_dev_name(dev),
7702                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7703                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7704                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7705                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7706                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7707 }
7708
7709 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
7710 {
7711         int i;
7712
7713         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7714                 if (btrfs_dev_stat_read(dev, i) != 0)
7715                         break;
7716         if (i == BTRFS_DEV_STAT_VALUES_MAX)
7717                 return; /* all values == 0, suppress message */
7718
7719         btrfs_info_in_rcu(dev->fs_info,
7720                 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7721                btrfs_dev_name(dev),
7722                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7723                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7724                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7725                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7726                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7727 }
7728
7729 int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
7730                         struct btrfs_ioctl_get_dev_stats *stats)
7731 {
7732         BTRFS_DEV_LOOKUP_ARGS(args);
7733         struct btrfs_device *dev;
7734         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7735         int i;
7736
7737         mutex_lock(&fs_devices->device_list_mutex);
7738         args.devid = stats->devid;
7739         dev = btrfs_find_device(fs_info->fs_devices, &args);
7740         mutex_unlock(&fs_devices->device_list_mutex);
7741
7742         if (!dev) {
7743                 btrfs_warn(fs_info, "get dev_stats failed, device not found");
7744                 return -ENODEV;
7745         } else if (!dev->dev_stats_valid) {
7746                 btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
7747                 return -ENODEV;
7748         } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
7749                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7750                         if (stats->nr_items > i)
7751                                 stats->values[i] =
7752                                         btrfs_dev_stat_read_and_reset(dev, i);
7753                         else
7754                                 btrfs_dev_stat_set(dev, i, 0);
7755                 }
7756                 btrfs_info(fs_info, "device stats zeroed by %s (%d)",
7757                            current->comm, task_pid_nr(current));
7758         } else {
7759                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7760                         if (stats->nr_items > i)
7761                                 stats->values[i] = btrfs_dev_stat_read(dev, i);
7762         }
7763         if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
7764                 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
7765         return 0;
7766 }
7767
7768 /*
7769  * Update the size and bytes used for each device where it changed.  This is
7770  * delayed since we would otherwise get errors while writing out the
7771  * superblocks.
7772  *
7773  * Must be invoked during transaction commit.
7774  */
7775 void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
7776 {
7777         struct btrfs_device *curr, *next;
7778
7779         ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
7780
7781         if (list_empty(&trans->dev_update_list))
7782                 return;
7783
7784         /*
7785          * We don't need the device_list_mutex here.  This list is owned by the
7786          * transaction and the transaction must complete before the device is
7787          * released.
7788          */
7789         mutex_lock(&trans->fs_info->chunk_mutex);
7790         list_for_each_entry_safe(curr, next, &trans->dev_update_list,
7791                                  post_commit_list) {
7792                 list_del_init(&curr->post_commit_list);
7793                 curr->commit_total_bytes = curr->disk_total_bytes;
7794                 curr->commit_bytes_used = curr->bytes_used;
7795         }
7796         mutex_unlock(&trans->fs_info->chunk_mutex);
7797 }
7798
7799 /*
7800  * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
7801  */
7802 int btrfs_bg_type_to_factor(u64 flags)
7803 {
7804         const int index = btrfs_bg_flags_to_raid_index(flags);
7805
7806         return btrfs_raid_array[index].ncopies;
7807 }
7808
7809
7810
7811 static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
7812                                  u64 chunk_offset, u64 devid,
7813                                  u64 physical_offset, u64 physical_len)
7814 {
7815         struct btrfs_dev_lookup_args args = { .devid = devid };
7816         struct btrfs_chunk_map *map;
7817         struct btrfs_device *dev;
7818         u64 stripe_len;
7819         bool found = false;
7820         int ret = 0;
7821         int i;
7822
7823         map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
7824         if (!map) {
7825                 btrfs_err(fs_info,
7826 "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
7827                           physical_offset, devid);
7828                 ret = -EUCLEAN;
7829                 goto out;
7830         }
7831
7832         stripe_len = btrfs_calc_stripe_length(map);
7833         if (physical_len != stripe_len) {
7834                 btrfs_err(fs_info,
7835 "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
7836                           physical_offset, devid, map->start, physical_len,
7837                           stripe_len);
7838                 ret = -EUCLEAN;
7839                 goto out;
7840         }
7841
7842         /*
7843          * Very old mkfs.btrfs (before v4.1) will not respect the reserved
7844          * space. Although kernel can handle it without problem, better to warn
7845          * the users.
7846          */
7847         if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED)
7848                 btrfs_warn(fs_info,
7849                 "devid %llu physical %llu len %llu inside the reserved space",
7850                            devid, physical_offset, physical_len);
7851
7852         for (i = 0; i < map->num_stripes; i++) {
7853                 if (map->stripes[i].dev->devid == devid &&
7854                     map->stripes[i].physical == physical_offset) {
7855                         found = true;
7856                         if (map->verified_stripes >= map->num_stripes) {
7857                                 btrfs_err(fs_info,
7858                                 "too many dev extents for chunk %llu found",
7859                                           map->start);
7860                                 ret = -EUCLEAN;
7861                                 goto out;
7862                         }
7863                         map->verified_stripes++;
7864                         break;
7865                 }
7866         }
7867         if (!found) {
7868                 btrfs_err(fs_info,
7869         "dev extent physical offset %llu devid %llu has no corresponding chunk",
7870                         physical_offset, devid);
7871                 ret = -EUCLEAN;
7872         }
7873
7874         /* Make sure no dev extent is beyond device boundary */
7875         dev = btrfs_find_device(fs_info->fs_devices, &args);
7876         if (!dev) {
7877                 btrfs_err(fs_info, "failed to find devid %llu", devid);
7878                 ret = -EUCLEAN;
7879                 goto out;
7880         }
7881
7882         if (physical_offset + physical_len > dev->disk_total_bytes) {
7883                 btrfs_err(fs_info,
7884 "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
7885                           devid, physical_offset, physical_len,
7886                           dev->disk_total_bytes);
7887                 ret = -EUCLEAN;
7888                 goto out;
7889         }
7890
7891         if (dev->zone_info) {
7892                 u64 zone_size = dev->zone_info->zone_size;
7893
7894                 if (!IS_ALIGNED(physical_offset, zone_size) ||
7895                     !IS_ALIGNED(physical_len, zone_size)) {
7896                         btrfs_err(fs_info,
7897 "zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
7898                                   devid, physical_offset, physical_len);
7899                         ret = -EUCLEAN;
7900                         goto out;
7901                 }
7902         }
7903
7904 out:
7905         btrfs_free_chunk_map(map);
7906         return ret;
7907 }
7908
7909 static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
7910 {
7911         struct rb_node *node;
7912         int ret = 0;
7913
7914         read_lock(&fs_info->mapping_tree_lock);
7915         for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) {
7916                 struct btrfs_chunk_map *map;
7917
7918                 map = rb_entry(node, struct btrfs_chunk_map, rb_node);
7919                 if (map->num_stripes != map->verified_stripes) {
7920                         btrfs_err(fs_info,
7921                         "chunk %llu has missing dev extent, have %d expect %d",
7922                                   map->start, map->verified_stripes, map->num_stripes);
7923                         ret = -EUCLEAN;
7924                         goto out;
7925                 }
7926         }
7927 out:
7928         read_unlock(&fs_info->mapping_tree_lock);
7929         return ret;
7930 }
7931
7932 /*
7933  * Ensure that all dev extents are mapped to correct chunk, otherwise
7934  * later chunk allocation/free would cause unexpected behavior.
7935  *
7936  * NOTE: This will iterate through the whole device tree, which should be of
7937  * the same size level as the chunk tree.  This slightly increases mount time.
7938  */
7939 int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
7940 {
7941         struct btrfs_path *path;
7942         struct btrfs_root *root = fs_info->dev_root;
7943         struct btrfs_key key;
7944         u64 prev_devid = 0;
7945         u64 prev_dev_ext_end = 0;
7946         int ret = 0;
7947
7948         /*
7949          * We don't have a dev_root because we mounted with ignorebadroots and
7950          * failed to load the root, so we want to skip the verification in this
7951          * case for sure.
7952          *
7953          * However if the dev root is fine, but the tree itself is corrupted
7954          * we'd still fail to mount.  This verification is only to make sure
7955          * writes can happen safely, so instead just bypass this check
7956          * completely in the case of IGNOREBADROOTS.
7957          */
7958         if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
7959                 return 0;
7960
7961         key.objectid = 1;
7962         key.type = BTRFS_DEV_EXTENT_KEY;
7963         key.offset = 0;
7964
7965         path = btrfs_alloc_path();
7966         if (!path)
7967                 return -ENOMEM;
7968
7969         path->reada = READA_FORWARD;
7970         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7971         if (ret < 0)
7972                 goto out;
7973
7974         if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
7975                 ret = btrfs_next_leaf(root, path);
7976                 if (ret < 0)
7977                         goto out;
7978                 /* No dev extents at all? Not good */
7979                 if (ret > 0) {
7980                         ret = -EUCLEAN;
7981                         goto out;
7982                 }
7983         }
7984         while (1) {
7985                 struct extent_buffer *leaf = path->nodes[0];
7986                 struct btrfs_dev_extent *dext;
7987                 int slot = path->slots[0];
7988                 u64 chunk_offset;
7989                 u64 physical_offset;
7990                 u64 physical_len;
7991                 u64 devid;
7992
7993                 btrfs_item_key_to_cpu(leaf, &key, slot);
7994                 if (key.type != BTRFS_DEV_EXTENT_KEY)
7995                         break;
7996                 devid = key.objectid;
7997                 physical_offset = key.offset;
7998
7999                 dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
8000                 chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
8001                 physical_len = btrfs_dev_extent_length(leaf, dext);
8002
8003                 /* Check if this dev extent overlaps with the previous one */
8004                 if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
8005                         btrfs_err(fs_info,
8006 "dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
8007                                   devid, physical_offset, prev_dev_ext_end);
8008                         ret = -EUCLEAN;
8009                         goto out;
8010                 }
8011
8012                 ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
8013                                             physical_offset, physical_len);
8014                 if (ret < 0)
8015                         goto out;
8016                 prev_devid = devid;
8017                 prev_dev_ext_end = physical_offset + physical_len;
8018
8019                 ret = btrfs_next_item(root, path);
8020                 if (ret < 0)
8021                         goto out;
8022                 if (ret > 0) {
8023                         ret = 0;
8024                         break;
8025                 }
8026         }
8027
8028         /* Ensure all chunks have corresponding dev extents */
8029         ret = verify_chunk_dev_extent_mapping(fs_info);
8030 out:
8031         btrfs_free_path(path);
8032         return ret;
8033 }
8034
8035 /*
8036  * Check whether the given block group or device is pinned by any inode being
8037  * used as a swapfile.
8038  */
8039 bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
8040 {
8041         struct btrfs_swapfile_pin *sp;
8042         struct rb_node *node;
8043
8044         spin_lock(&fs_info->swapfile_pins_lock);
8045         node = fs_info->swapfile_pins.rb_node;
8046         while (node) {
8047                 sp = rb_entry(node, struct btrfs_swapfile_pin, node);
8048                 if (ptr < sp->ptr)
8049                         node = node->rb_left;
8050                 else if (ptr > sp->ptr)
8051                         node = node->rb_right;
8052                 else
8053                         break;
8054         }
8055         spin_unlock(&fs_info->swapfile_pins_lock);
8056         return node != NULL;
8057 }
8058
8059 static int relocating_repair_kthread(void *data)
8060 {
8061         struct btrfs_block_group *cache = data;
8062         struct btrfs_fs_info *fs_info = cache->fs_info;
8063         u64 target;
8064         int ret = 0;
8065
8066         target = cache->start;
8067         btrfs_put_block_group(cache);
8068
8069         sb_start_write(fs_info->sb);
8070         if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
8071                 btrfs_info(fs_info,
8072                            "zoned: skip relocating block group %llu to repair: EBUSY",
8073                            target);
8074                 sb_end_write(fs_info->sb);
8075                 return -EBUSY;
8076         }
8077
8078         mutex_lock(&fs_info->reclaim_bgs_lock);
8079
8080         /* Ensure block group still exists */
8081         cache = btrfs_lookup_block_group(fs_info, target);
8082         if (!cache)
8083                 goto out;
8084
8085         if (!test_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags))
8086                 goto out;
8087
8088         ret = btrfs_may_alloc_data_chunk(fs_info, target);
8089         if (ret < 0)
8090                 goto out;
8091
8092         btrfs_info(fs_info,
8093                    "zoned: relocating block group %llu to repair IO failure",
8094                    target);
8095         ret = btrfs_relocate_chunk(fs_info, target);
8096
8097 out:
8098         if (cache)
8099                 btrfs_put_block_group(cache);
8100         mutex_unlock(&fs_info->reclaim_bgs_lock);
8101         btrfs_exclop_finish(fs_info);
8102         sb_end_write(fs_info->sb);
8103
8104         return ret;
8105 }
8106
8107 bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
8108 {
8109         struct btrfs_block_group *cache;
8110
8111         if (!btrfs_is_zoned(fs_info))
8112                 return false;
8113
8114         /* Do not attempt to repair in degraded state */
8115         if (btrfs_test_opt(fs_info, DEGRADED))
8116                 return true;
8117
8118         cache = btrfs_lookup_block_group(fs_info, logical);
8119         if (!cache)
8120                 return true;
8121
8122         if (test_and_set_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) {
8123                 btrfs_put_block_group(cache);
8124                 return true;
8125         }
8126
8127         kthread_run(relocating_repair_kthread, cache,
8128                     "btrfs-relocating-repair");
8129
8130         return true;
8131 }
8132
8133 static void map_raid56_repair_block(struct btrfs_io_context *bioc,
8134                                     struct btrfs_io_stripe *smap,
8135                                     u64 logical)
8136 {
8137         int data_stripes = nr_bioc_data_stripes(bioc);
8138         int i;
8139
8140         for (i = 0; i < data_stripes; i++) {
8141                 u64 stripe_start = bioc->full_stripe_logical +
8142                                    btrfs_stripe_nr_to_offset(i);
8143
8144                 if (logical >= stripe_start &&
8145                     logical < stripe_start + BTRFS_STRIPE_LEN)
8146                         break;
8147         }
8148         ASSERT(i < data_stripes);
8149         smap->dev = bioc->stripes[i].dev;
8150         smap->physical = bioc->stripes[i].physical +
8151                         ((logical - bioc->full_stripe_logical) &
8152                          BTRFS_STRIPE_LEN_MASK);
8153 }
8154
8155 /*
8156  * Map a repair write into a single device.
8157  *
8158  * A repair write is triggered by read time repair or scrub, which would only
8159  * update the contents of a single device.
8160  * Not update any other mirrors nor go through RMW path.
8161  *
8162  * Callers should ensure:
8163  *
8164  * - Call btrfs_bio_counter_inc_blocked() first
8165  * - The range does not cross stripe boundary
8166  * - Has a valid @mirror_num passed in.
8167  */
8168 int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
8169                            struct btrfs_io_stripe *smap, u64 logical,
8170                            u32 length, int mirror_num)
8171 {
8172         struct btrfs_io_context *bioc = NULL;
8173         u64 map_length = length;
8174         int mirror_ret = mirror_num;
8175         int ret;
8176
8177         ASSERT(mirror_num > 0);
8178
8179         ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length,
8180                               &bioc, smap, &mirror_ret);
8181         if (ret < 0)
8182                 return ret;
8183
8184         /* The map range should not cross stripe boundary. */
8185         ASSERT(map_length >= length);
8186
8187         /* Already mapped to single stripe. */
8188         if (!bioc)
8189                 goto out;
8190
8191         /* Map the RAID56 multi-stripe writes to a single one. */
8192         if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
8193                 map_raid56_repair_block(bioc, smap, logical);
8194                 goto out;
8195         }
8196
8197         ASSERT(mirror_num <= bioc->num_stripes);
8198         smap->dev = bioc->stripes[mirror_num - 1].dev;
8199         smap->physical = bioc->stripes[mirror_num - 1].physical;
8200 out:
8201         btrfs_put_bioc(bioc);
8202         ASSERT(smap->dev);
8203         return 0;
8204 }