module/zfs/metaslab.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or https://opensource.org/licenses/CDDL-1.0.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
  24  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  25  * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
  26  * Copyright (c) 2017, Intel Corporation.
  27  */
  28
  29 #include <sys/zfs_context.h>
  30 #include <sys/dmu.h>
  31 #include <sys/dmu_tx.h>
  32 #include <sys/space_map.h>
  33 #include <sys/metaslab_impl.h>
  34 #include <sys/vdev_impl.h>
  35 #include <sys/vdev_draid.h>
  36 #include <sys/zio.h>
  37 #include <sys/spa_impl.h>
  38 #include <sys/zfeature.h>
  39 #include <sys/vdev_indirect_mapping.h>
  40 #include <sys/zap.h>
  41 #include <sys/btree.h>
  42
  43 #define WITH_DF_BLOCK_ALLOCATOR
  44
  45 #define GANG_ALLOCATION(flags) \
  46         ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
  47
  48 /*
  49  * Metaslab granularity, in bytes. This is roughly similar to what would be
  50  * referred to as the "stripe size" in traditional RAID arrays. In normal
  51  * operation, we will try to write this amount of data to each disk before
  52  * moving on to the next top-level vdev.
  53  */
  54 static uint64_t metaslab_aliquot = 1024 * 1024;
  55
  56 /*
  57  * For testing, make some blocks above a certain size be gang blocks.
  58  */
  59 uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;
  60
  61 /*
  62  * Of blocks of size >= metaslab_force_ganging, actually gang them this often.
  63  */
  64 uint_t metaslab_force_ganging_pct = 3;
  65
  66 /*
  67  * In pools where the log space map feature is not enabled we touch
  68  * multiple metaslabs (and their respective space maps) with each
  69  * transaction group. Thus, we benefit from having a small space map
  70  * block size since it allows us to issue more I/O operations scattered
  71  * around the disk. So a sane default for the space map block size
  72  * is 8~16K.
  73  */
  74 int zfs_metaslab_sm_blksz_no_log = (1 << 14);
  75
  76 /*
  77  * When the log space map feature is enabled, we accumulate a lot of
  78  * changes per metaslab that are flushed once in a while so we benefit
  79  * from a bigger block size like 128K for the metaslab space maps.
  80  */
  81 int zfs_metaslab_sm_blksz_with_log = (1 << 17);
  82
  83 /*
  84  * The in-core space map representation is more compact than its on-disk form.
  85  * The zfs_condense_pct determines how much more compact the in-core
  86  * space map representation must be before we compact it on-disk.
  87  * Values should be greater than or equal to 100.
  88  */
  89 uint_t zfs_condense_pct = 200;
  90
  91 /*
  92  * Condensing a metaslab is not guaranteed to actually reduce the amount of
  93  * space used on disk. In particular, a space map uses data in increments of
  94  * MAX(1 << ashift, space_map_blksz), so a metaslab might use the
  95  * same number of blocks after condensing. Since the goal of condensing is to
  96  * reduce the number of IOPs required to read the space map, we only want to
  97  * condense when we can be sure we will reduce the number of blocks used by the
  98  * space map. Unfortunately, we cannot precisely compute whether or not this is
  99  * the case in metaslab_should_condense since we are holding ms_lock. Instead,
 100  * we apply the following heuristic: do not condense a spacemap unless the
 101  * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
 102  * blocks.
 103  */
 104 static const int zfs_metaslab_condense_block_threshold = 4;
 105
 106 /*
 107  * The zfs_mg_noalloc_threshold defines which metaslab groups should
 108  * be eligible for allocation. The value is defined as a percentage of
 109  * free space. Metaslab groups that have more free space than
 110  * zfs_mg_noalloc_threshold are always eligible for allocations. Once
 111  * a metaslab group's free space is less than or equal to the
 112  * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
 113  * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
 114  * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
 115  * groups are allowed to accept allocations. Gang blocks are always
 116  * eligible to allocate on any metaslab group. The default value of 0 means
 117  * no metaslab group will be excluded based on this criterion.
 118  */
 119 static uint_t zfs_mg_noalloc_threshold = 0;
 120
 121 /*
 122  * Metaslab groups are considered eligible for allocations if their
 123  * fragmentation metric (measured as a percentage) is less than or
 124  * equal to zfs_mg_fragmentation_threshold. If a metaslab group
 125  * exceeds this threshold then it will be skipped unless all metaslab
 126  * groups within the metaslab class have also crossed this threshold.
 127  *
 128  * This tunable was introduced to avoid edge cases where we continue
 129  * allocating from very fragmented disks in our pool while other, less
 130  * fragmented disks, exist. On the other hand, if all disks in the
 131  * pool are uniformly approaching the threshold, the threshold can
 132  * be a speed bump in performance, where we keep switching the disks
 133  * that we allocate from (e.g. we allocate some segments from disk A,
 134  * pushing it past the threshold, while freeing segments from disk
 135  * B, bringing its fragmentation below the threshold).
 136  *
 137  * Empirically, we've seen that our vdev selection for allocations is
 138  * good enough that fragmentation increases uniformly across all vdevs
 139  * the majority of the time. Thus we set the threshold percentage high
 140  * enough to avoid hitting the speed bump on pools that are being pushed
 141  * to the edge.
 142  */
 143 static uint_t zfs_mg_fragmentation_threshold = 95;
 144
 145 /*
 146  * Allow metaslabs to keep their active state as long as their fragmentation
 147  * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
 148  * active metaslab that exceeds this threshold will no longer keep its active
 149  * status allowing better metaslabs to be selected.
 150  */
 151 static uint_t zfs_metaslab_fragmentation_threshold = 70;
 152
 153 /*
 154  * When set will load all metaslabs when pool is first opened.
 155  */
 156 int metaslab_debug_load = B_FALSE;
 157
 158 /*
 159  * When set will prevent metaslabs from being unloaded.
 160  */
 161 static int metaslab_debug_unload = B_FALSE;
 162
 163 /*
 164  * Minimum size which forces the dynamic allocator to change
 165  * its allocation strategy.  Once the space map cannot satisfy
 166  * an allocation of this size then it switches to using a more
 167  * aggressive strategy (i.e. search by size rather than offset).
 168  */
 169 uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
 170
 171 /*
 172  * The minimum free space, in percent, which must be available
 173  * in a space map to continue allocations in a first-fit fashion.
 174  * Once the space map's free space drops below this level we dynamically
 175  * switch to using best-fit allocations.
 176  */
 177 uint_t metaslab_df_free_pct = 4;
 178
 179 /*
 180  * Maximum distance to search forward from the last offset. Without this
 181  * limit, fragmented pools can see >100,000 iterations and
 182  * metaslab_block_picker() becomes the performance limiting factor on
 183  * high-performance storage.
 184  *
 185  * With the default setting of 16MB, we typically see less than 500
 186  * iterations, even with very fragmented, ashift=9 pools. The maximum number
 187  * of iterations possible is:
 188  *     metaslab_df_max_search / (2 * (1<<ashift))
 189  * With the default setting of 16MB this is 16*1024 (with ashift=9) or
 190  * 2048 (with ashift=12).
 191  */
 192 static uint_t metaslab_df_max_search = 16 * 1024 * 1024;
 193
 194 /*
 195  * Forces the metaslab_block_picker function to search for at least this many
 196  * segments forwards until giving up on finding a segment that the allocation
 197  * will fit into.
 198  */
 199 static const uint32_t metaslab_min_search_count = 100;
 200
 201 /*
 202  * If we are not searching forward (due to metaslab_df_max_search,
 203  * metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable
 204  * controls what segment is used.  If it is set, we will use the largest free
 205  * segment.  If it is not set, we will use a segment of exactly the requested
 206  * size (or larger).
 207  */
 208 static int metaslab_df_use_largest_segment = B_FALSE;
 209
 210 /*
 211  * Percentage of all cpus that can be used by the metaslab taskq.
 212  */
 213 int metaslab_load_pct = 50;
 214
 215 /*
 216  * These tunables control how long a metaslab will remain loaded after the
 217  * last allocation from it.  A metaslab can't be unloaded until at least
 218  * metaslab_unload_delay TXG's and metaslab_unload_delay_ms milliseconds
 219  * have elapsed.  However, zfs_metaslab_mem_limit may cause it to be
 220  * unloaded sooner.  These settings are intended to be generous -- to keep
 221  * metaslabs loaded for a long time, reducing the rate of metaslab loading.
 222  */
 223 static uint_t metaslab_unload_delay = 32;
 224 static uint_t metaslab_unload_delay_ms = 10 * 60 * 1000; /* ten minutes */
 225
 226 /*
 227  * Max number of metaslabs per group to preload.
 228  */
 229 uint_t metaslab_preload_limit = 10;
 230
 231 /*
 232  * Enable/disable preloading of metaslab.
 233  */
 234 static int metaslab_preload_enabled = B_TRUE;
 235
 236 /*
 237  * Enable/disable fragmentation weighting on metaslabs.
 238  */
 239 static int metaslab_fragmentation_factor_enabled = B_TRUE;
 240
 241 /*
 242  * Enable/disable lba weighting (i.e. outer tracks are given preference).
 243  */
 244 static int metaslab_lba_weighting_enabled = B_TRUE;
 245
 246 /*
 247  * Enable/disable metaslab group biasing.
 248  */
 249 static int metaslab_bias_enabled = B_TRUE;
 250
 251 /*
 252  * Enable/disable remapping of indirect DVAs to their concrete vdevs.
 253  */
 254 static const boolean_t zfs_remap_blkptr_enable = B_TRUE;
 255
 256 /*
 257  * Enable/disable segment-based metaslab selection.
 258  */
 259 static int zfs_metaslab_segment_weight_enabled = B_TRUE;
 260
 261 /*
 262  * When using segment-based metaslab selection, we will continue
 263  * allocating from the active metaslab until we have exhausted
 264  * zfs_metaslab_switch_threshold of its buckets.
 265  */
 266 static int zfs_metaslab_switch_threshold = 2;
 267
 268 /*
 269  * Internal switch to enable/disable the metaslab allocation tracing
 270  * facility.
 271  */
 272 static const boolean_t metaslab_trace_enabled = B_FALSE;
 273
 274 /*
 275  * Maximum entries that the metaslab allocation tracing facility will keep
 276  * in a given list when running in non-debug mode. We limit the number
 277  * of entries in non-debug mode to prevent us from using up too much memory.
 278  * The limit should be sufficiently large that we don't expect any allocation
 279  * to ever exceed this value. In debug mode, the system will panic if this
 280  * limit is ever reached allowing for further investigation.
 281  */
 282 static const uint64_t metaslab_trace_max_entries = 5000;
 283
 284 /*
 285  * Maximum number of metaslabs per group that can be disabled
 286  * simultaneously.
 287  */
 288 static const int max_disabled_ms = 3;
 289
 290 /*
 291  * Time (in seconds) to respect ms_max_size when the metaslab is not loaded.
 292  * To avoid 64-bit overflow, don't set above UINT32_MAX.
 293  */
 294 static uint64_t zfs_metaslab_max_size_cache_sec = 1 * 60 * 60; /* 1 hour */
 295
 296 /*
 297  * Maximum percentage of memory to use on storing loaded metaslabs. If loading
 298  * a metaslab would take it over this percentage, the oldest selected metaslab
 299  * is automatically unloaded.
 300  */
 301 static uint_t zfs_metaslab_mem_limit = 25;
 302
 303 /*
 304  * Force the per-metaslab range trees to use 64-bit integers to store
 305  * segments. Used for debugging purposes.
 306  */
 307 static const boolean_t zfs_metaslab_force_large_segs = B_FALSE;
 308
 309 /*
 310  * By default we only store segments over a certain size in the size-sorted
 311  * metaslab trees (ms_allocatable_by_size and
 312  * ms_unflushed_frees_by_size). This dramatically reduces memory usage and
 313  * improves load and unload times at the cost of causing us to use slightly
 314  * larger segments than we would otherwise in some cases.
 315  */
 316 static const uint32_t metaslab_by_size_min_shift = 14;
 317
 318 /*
 319  * If not set, we will first try normal allocation.  If that fails then
 320  * we will do a gang allocation.  If that fails then we will do a "try hard"
 321  * gang allocation.  If that fails then we will have a multi-layer gang
 322  * block.
 323  *
 324  * If set, we will first try normal allocation.  If that fails then
 325  * we will do a "try hard" allocation.  If that fails we will do a gang
 326  * allocation.  If that fails we will do a "try hard" gang allocation.  If
 327  * that fails then we will have a multi-layer gang block.
 328  */
 329 static int zfs_metaslab_try_hard_before_gang = B_FALSE;
 330
 331 /*
 332  * When not trying hard, we only consider the best zfs_metaslab_find_max_tries
 333  * metaslabs.  This improves performance, especially when there are many
 334  * metaslabs per vdev and the allocation can't actually be satisfied (so we
 335  * would otherwise iterate all the metaslabs).  If there is a metaslab with a
 336  * worse weight but it can actually satisfy the allocation, we won't find it
 337  * until trying hard.  This may happen if the worse metaslab is not loaded
 338  * (and the true weight is better than we have calculated), or due to weight
 339  * bucketization.  E.g. we are looking for a 60K segment, and the best
 340  * metaslabs all have free segments in the 32-63K bucket, but the best
 341  * zfs_metaslab_find_max_tries metaslabs have ms_max_size <60KB, and a
 342  * subsequent metaslab has ms_max_size >60KB (but fewer segments in this
 343  * bucket, and therefore a lower weight).
 344  */
 345 static uint_t zfs_metaslab_find_max_tries = 100;
 346
 347 static uint64_t metaslab_weight(metaslab_t *, boolean_t);
 348 static void metaslab_set_fragmentation(metaslab_t *, boolean_t);
 349 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
 350 static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
 351
 352 static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
 353 static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
 354 static void metaslab_flush_update(metaslab_t *, dmu_tx_t *);
 355 static unsigned int metaslab_idx_func(multilist_t *, void *);
 356 static void metaslab_evict(metaslab_t *, uint64_t);
 357 static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg);
 358 kmem_cache_t *metaslab_alloc_trace_cache;
 359
 360 typedef struct metaslab_stats {
 361         kstat_named_t metaslabstat_trace_over_limit;
 362         kstat_named_t metaslabstat_reload_tree;
 363         kstat_named_t metaslabstat_too_many_tries;
 364         kstat_named_t metaslabstat_try_hard;
 365 } metaslab_stats_t;
 366
 367 static metaslab_stats_t metaslab_stats = {
 368         { "trace_over_limit",           KSTAT_DATA_UINT64 },
 369         { "reload_tree",                KSTAT_DATA_UINT64 },
 370         { "too_many_tries",             KSTAT_DATA_UINT64 },
 371         { "try_hard",                   KSTAT_DATA_UINT64 },
 372 };
 373
 374 #define METASLABSTAT_BUMP(stat) \
 375         atomic_inc_64(&metaslab_stats.stat.value.ui64);
 376
 377
 378 static kstat_t *metaslab_ksp;
 379
 380 void
 381 metaslab_stat_init(void)
 382 {
 383         ASSERT(metaslab_alloc_trace_cache == NULL);
 384         metaslab_alloc_trace_cache = kmem_cache_create(
 385             "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
 386             0, NULL, NULL, NULL, NULL, NULL, 0);
 387         metaslab_ksp = kstat_create("zfs", 0, "metaslab_stats",
 388             "misc", KSTAT_TYPE_NAMED, sizeof (metaslab_stats) /
 389             sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
 390         if (metaslab_ksp != NULL) {
 391                 metaslab_ksp->ks_data = &metaslab_stats;
 392                 kstat_install(metaslab_ksp);
 393         }
 394 }
 395
 396 void
 397 metaslab_stat_fini(void)
 398 {
 399         if (metaslab_ksp != NULL) {
 400                 kstat_delete(metaslab_ksp);
 401                 metaslab_ksp = NULL;
 402         }
 403
 404         kmem_cache_destroy(metaslab_alloc_trace_cache);
 405         metaslab_alloc_trace_cache = NULL;
 406 }
 407
 408 /*
 409  * ==========================================================================
 410  * Metaslab classes
 411  * ==========================================================================
 412  */
 413 metaslab_class_t *
 414 metaslab_class_create(spa_t *spa, const metaslab_ops_t *ops)
 415 {
 416         metaslab_class_t *mc;
 417
 418         mc = kmem_zalloc(offsetof(metaslab_class_t,
 419             mc_allocator[spa->spa_alloc_count]), KM_SLEEP);
 420
 421         mc->mc_spa = spa;
 422         mc->mc_ops = ops;
 423         mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
 424         multilist_create(&mc->mc_metaslab_txg_list, sizeof (metaslab_t),
 425             offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func);
 426         for (int i = 0; i < spa->spa_alloc_count; i++) {
 427                 metaslab_class_allocator_t *mca = &mc->mc_allocator[i];
 428                 mca->mca_rotor = NULL;
 429                 zfs_refcount_create_tracked(&mca->mca_alloc_slots);
 430         }
 431
 432         return (mc);
 433 }
 434
 435 void
 436 metaslab_class_destroy(metaslab_class_t *mc)
 437 {
 438         spa_t *spa = mc->mc_spa;
 439
 440         ASSERT(mc->mc_alloc == 0);
 441         ASSERT(mc->mc_deferred == 0);
 442         ASSERT(mc->mc_space == 0);
 443         ASSERT(mc->mc_dspace == 0);
 444
 445         for (int i = 0; i < spa->spa_alloc_count; i++) {
 446                 metaslab_class_allocator_t *mca = &mc->mc_allocator[i];
 447                 ASSERT(mca->mca_rotor == NULL);
 448                 zfs_refcount_destroy(&mca->mca_alloc_slots);
 449         }
 450         mutex_destroy(&mc->mc_lock);
 451         multilist_destroy(&mc->mc_metaslab_txg_list);
 452         kmem_free(mc, offsetof(metaslab_class_t,
 453             mc_allocator[spa->spa_alloc_count]));
 454 }
 455
 456 int
 457 metaslab_class_validate(metaslab_class_t *mc)
 458 {
 459         metaslab_group_t *mg;
 460         vdev_t *vd;
 461
 462         /*
 463          * Must hold one of the spa_config locks.
 464          */
 465         ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
 466             spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
 467
 468         if ((mg = mc->mc_allocator[0].mca_rotor) == NULL)
 469                 return (0);
 470
 471         do {
 472                 vd = mg->mg_vd;
 473                 ASSERT(vd->vdev_mg != NULL);
 474                 ASSERT3P(vd->vdev_top, ==, vd);
 475                 ASSERT3P(mg->mg_class, ==, mc);
 476                 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
 477         } while ((mg = mg->mg_next) != mc->mc_allocator[0].mca_rotor);
 478
 479         return (0);
 480 }
 481
 482 static void
 483 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
 484     int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
 485 {
 486         atomic_add_64(&mc->mc_alloc, alloc_delta);
 487         atomic_add_64(&mc->mc_deferred, defer_delta);
 488         atomic_add_64(&mc->mc_space, space_delta);
 489         atomic_add_64(&mc->mc_dspace, dspace_delta);
 490 }
 491
 492 uint64_t
 493 metaslab_class_get_alloc(metaslab_class_t *mc)
 494 {
 495         return (mc->mc_alloc);
 496 }
 497
 498 uint64_t
 499 metaslab_class_get_deferred(metaslab_class_t *mc)
 500 {
 501         return (mc->mc_deferred);
 502 }
 503
 504 uint64_t
 505 metaslab_class_get_space(metaslab_class_t *mc)
 506 {
 507         return (mc->mc_space);
 508 }
 509
 510 uint64_t
 511 metaslab_class_get_dspace(metaslab_class_t *mc)
 512 {
 513         return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
 514 }
 515
 516 void
 517 metaslab_class_histogram_verify(metaslab_class_t *mc)
 518 {
 519         spa_t *spa = mc->mc_spa;
 520         vdev_t *rvd = spa->spa_root_vdev;
 521         uint64_t *mc_hist;
 522         int i;
 523
 524         if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
 525                 return;
 526
 527         mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
 528             KM_SLEEP);
 529
 530         mutex_enter(&mc->mc_lock);
 531         for (int c = 0; c < rvd->vdev_children; c++) {
 532                 vdev_t *tvd = rvd->vdev_child[c];
 533                 metaslab_group_t *mg = vdev_get_mg(tvd, mc);
 534
 535                 /*
 536                  * Skip any holes, uninitialized top-levels, or
 537                  * vdevs that are not in this metalab class.
 538                  */
 539                 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 540                     mg->mg_class != mc) {
 541                         continue;
 542                 }
 543
 544                 IMPLY(mg == mg->mg_vd->vdev_log_mg,
 545                     mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
 546
 547                 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
 548                         mc_hist[i] += mg->mg_histogram[i];
 549         }
 550
 551         for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
 552                 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
 553         }
 554
 555         mutex_exit(&mc->mc_lock);
 556         kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
 557 }
 558
 559 /*
 560  * Calculate the metaslab class's fragmentation metric. The metric
 561  * is weighted based on the space contribution of each metaslab group.
 562  * The return value will be a number between 0 and 100 (inclusive), or
 563  * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
 564  * zfs_frag_table for more information about the metric.
 565  */
 566 uint64_t
 567 metaslab_class_fragmentation(metaslab_class_t *mc)
 568 {
 569         vdev_t *rvd = mc->mc_spa->spa_root_vdev;
 570         uint64_t fragmentation = 0;
 571
 572         spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
 573
 574         for (int c = 0; c < rvd->vdev_children; c++) {
 575                 vdev_t *tvd = rvd->vdev_child[c];
 576                 metaslab_group_t *mg = tvd->vdev_mg;
 577
 578                 /*
 579                  * Skip any holes, uninitialized top-levels,
 580                  * or vdevs that are not in this metalab class.
 581                  */
 582                 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 583                     mg->mg_class != mc) {
 584                         continue;
 585                 }
 586
 587                 /*
 588                  * If a metaslab group does not contain a fragmentation
 589                  * metric then just bail out.
 590                  */
 591                 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
 592                         spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 593                         return (ZFS_FRAG_INVALID);
 594                 }
 595
 596                 /*
 597                  * Determine how much this metaslab_group is contributing
 598                  * to the overall pool fragmentation metric.
 599                  */
 600                 fragmentation += mg->mg_fragmentation *
 601                     metaslab_group_get_space(mg);
 602         }
 603         fragmentation /= metaslab_class_get_space(mc);
 604
 605         ASSERT3U(fragmentation, <=, 100);
 606         spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 607         return (fragmentation);
 608 }
 609
 610 /*
 611  * Calculate the amount of expandable space that is available in
 612  * this metaslab class. If a device is expanded then its expandable
 613  * space will be the amount of allocatable space that is currently not
 614  * part of this metaslab class.
 615  */
 616 uint64_t
 617 metaslab_class_expandable_space(metaslab_class_t *mc)
 618 {
 619         vdev_t *rvd = mc->mc_spa->spa_root_vdev;
 620         uint64_t space = 0;
 621
 622         spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
 623         for (int c = 0; c < rvd->vdev_children; c++) {
 624                 vdev_t *tvd = rvd->vdev_child[c];
 625                 metaslab_group_t *mg = tvd->vdev_mg;
 626
 627                 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 628                     mg->mg_class != mc) {
 629                         continue;
 630                 }
 631
 632                 /*
 633                  * Calculate if we have enough space to add additional
 634                  * metaslabs. We report the expandable space in terms
 635                  * of the metaslab size since that's the unit of expansion.
 636                  */
 637                 space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
 638                     1ULL << tvd->vdev_ms_shift);
 639         }
 640         spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 641         return (space);
 642 }
 643
 644 void
 645 metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
 646 {
 647         multilist_t *ml = &mc->mc_metaslab_txg_list;
 648         for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
 649                 multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
 650                 metaslab_t *msp = multilist_sublist_head(mls);
 651                 multilist_sublist_unlock(mls);
 652                 while (msp != NULL) {
 653                         mutex_enter(&msp->ms_lock);
 654
 655                         /*
 656                          * If the metaslab has been removed from the list
 657                          * (which could happen if we were at the memory limit
 658                          * and it was evicted during this loop), then we can't
 659                          * proceed and we should restart the sublist.
 660                          */
 661                         if (!multilist_link_active(&msp->ms_class_txg_node)) {
 662                                 mutex_exit(&msp->ms_lock);
 663                                 i--;
 664                                 break;
 665                         }
 666                         mls = multilist_sublist_lock(ml, i);
 667                         metaslab_t *next_msp = multilist_sublist_next(mls, msp);
 668                         multilist_sublist_unlock(mls);
 669                         if (txg >
 670                             msp->ms_selected_txg + metaslab_unload_delay &&
 671                             gethrtime() > msp->ms_selected_time +
 672                             (uint64_t)MSEC2NSEC(metaslab_unload_delay_ms)) {
 673                                 metaslab_evict(msp, txg);
 674                         } else {
 675                                 /*
 676                                  * Once we've hit a metaslab selected too
 677                                  * recently to evict, we're done evicting for
 678                                  * now.
 679                                  */
 680                                 mutex_exit(&msp->ms_lock);
 681                                 break;
 682                         }
 683                         mutex_exit(&msp->ms_lock);
 684                         msp = next_msp;
 685                 }
 686         }
 687 }
 688
 689 static int
 690 metaslab_compare(const void *x1, const void *x2)
 691 {
 692         const metaslab_t *m1 = (const metaslab_t *)x1;
 693         const metaslab_t *m2 = (const metaslab_t *)x2;
 694
 695         int sort1 = 0;
 696         int sort2 = 0;
 697         if (m1->ms_allocator != -1 && m1->ms_primary)
 698                 sort1 = 1;
 699         else if (m1->ms_allocator != -1 && !m1->ms_primary)
 700                 sort1 = 2;
 701         if (m2->ms_allocator != -1 && m2->ms_primary)
 702                 sort2 = 1;
 703         else if (m2->ms_allocator != -1 && !m2->ms_primary)
 704                 sort2 = 2;
 705
 706         /*
 707          * Sort inactive metaslabs first, then primaries, then secondaries. When
 708          * selecting a metaslab to allocate from, an allocator first tries its
 709          * primary, then secondary active metaslab. If it doesn't have active
 710          * metaslabs, or can't allocate from them, it searches for an inactive
 711          * metaslab to activate. If it can't find a suitable one, it will steal
 712          * a primary or secondary metaslab from another allocator.
 713          */
 714         if (sort1 < sort2)
 715                 return (-1);
 716         if (sort1 > sort2)
 717                 return (1);
 718
 719         int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight);
 720         if (likely(cmp))
 721                 return (cmp);
 722
 723         IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
 724
 725         return (TREE_CMP(m1->ms_start, m2->ms_start));
 726 }
 727
 728 /*
 729  * ==========================================================================
 730  * Metaslab groups
 731  * ==========================================================================
 732  */
 733 /*
 734  * Update the allocatable flag and the metaslab group's capacity.
 735  * The allocatable flag is set to true if the group is activated, its free
 736  * capacity is above zfs_mg_noalloc_threshold, and its fragmentation is
 737  * unknown or at most zfs_mg_fragmentation_threshold. If a metaslab group
 738  * transitions from allocatable to non-allocatable or vice versa then the
 739  * metaslab group's class is updated to reflect the transition.
 740  */
 741 static void
 742 metaslab_group_alloc_update(metaslab_group_t *mg)
 743 {
 744         vdev_t *vd = mg->mg_vd;
 745         metaslab_class_t *mc = mg->mg_class;
 746         vdev_stat_t *vs = &vd->vdev_stat;
 747         boolean_t was_allocatable;
 748         boolean_t was_initialized;
 749
 750         ASSERT(vd == vd->vdev_top);
 751         ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
 752             SCL_ALLOC);
 753
 754         mutex_enter(&mg->mg_lock);
 755         was_allocatable = mg->mg_allocatable;
 756         was_initialized = mg->mg_initialized;
 757
 758         mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
 759             (vs->vs_space + 1);
 760
 761         mutex_enter(&mc->mc_lock);
 762
 763         /*
 764          * If the metaslab group was just added then it won't
 765          * have any space until we finish syncing out this txg.
 766          * At that point we will consider it initialized and available
 767          * for allocations.  We also don't consider non-activated
 768          * metaslab groups (e.g. vdevs that are in the middle of being removed)
 769          * to be initialized, because they can't be used for allocation.
 770          */
 771         mg->mg_initialized = metaslab_group_initialized(mg);
 772         if (!was_initialized && mg->mg_initialized) {
 773                 mc->mc_groups++;
 774         } else if (was_initialized && !mg->mg_initialized) {
 775                 ASSERT3U(mc->mc_groups, >, 0);
 776                 mc->mc_groups--;
 777         }
 778         if (mg->mg_initialized)
 779                 mg->mg_no_free_space = B_FALSE;
 780
 781         /*
 782          * A metaslab group is considered allocatable if it has plenty
 783          * of free space or is not heavily fragmented. We only take
 784          * fragmentation into account if the metaslab group has a valid
 785          * fragmentation metric (i.e. a value between 0 and 100).
 786          */
 787         mg->mg_allocatable = (mg->mg_activation_count > 0 &&
 788             mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
 789             (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
 790             mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
 791
 792         /*
 793          * The mc_alloc_groups maintains a count of the number of
 794          * groups in this metaslab class that are still above the
 795          * zfs_mg_noalloc_threshold. This is used by the allocating
 796          * threads to determine if they should avoid allocations to
 797          * a given group. The allocator will avoid allocations to a group
 798          * if that group has reached or is below the zfs_mg_noalloc_threshold
 799          * and there are still other groups that are above the threshold.
 800          * When a group transitions from allocatable to non-allocatable or
 801          * vice versa we update the metaslab class to reflect that change.
 802          * When the mc_alloc_groups value drops to 0 that means that all
 803          * groups have reached the zfs_mg_noalloc_threshold making all groups
 804          * eligible for allocations. This effectively means that all devices
 805          * are balanced again.
 806          */
 807         if (was_allocatable && !mg->mg_allocatable)
 808                 mc->mc_alloc_groups--;
 809         else if (!was_allocatable && mg->mg_allocatable)
 810                 mc->mc_alloc_groups++;
 811         mutex_exit(&mc->mc_lock);
 812
 813         mutex_exit(&mg->mg_lock);
 814 }
 815
 816 int
 817 metaslab_sort_by_flushed(const void *va, const void *vb)
 818 {
 819         const metaslab_t *a = va;
 820         const metaslab_t *b = vb;
 821
 822         int cmp = TREE_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg);
 823         if (likely(cmp))
 824                 return (cmp);
 825
 826         uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id;
 827         uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id;
 828         cmp = TREE_CMP(a_vdev_id, b_vdev_id);
 829         if (cmp)
 830                 return (cmp);
 831
 832         return (TREE_CMP(a->ms_id, b->ms_id));
 833 }
 834
 835 metaslab_group_t *
 836 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
 837 {
 838         metaslab_group_t *mg;
 839
 840         mg = kmem_zalloc(offsetof(metaslab_group_t,
 841             mg_allocator[allocators]), KM_SLEEP);
 842         mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
 843         mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL);
 844         cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL);
 845         avl_create(&mg->mg_metaslab_tree, metaslab_compare,
 846             sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node));
 847         mg->mg_vd = vd;
 848         mg->mg_class = mc;
 849         mg->mg_activation_count = 0;
 850         mg->mg_initialized = B_FALSE;
 851         mg->mg_no_free_space = B_TRUE;
 852         mg->mg_allocators = allocators;
 853
 854         for (int i = 0; i < allocators; i++) {
 855                 metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
 856                 zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth);
 857         }
 858
 859         mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
 860             maxclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC);
 861
 862         return (mg);
 863 }
 864
 865 void
 866 metaslab_group_destroy(metaslab_group_t *mg)
 867 {
 868         ASSERT(mg->mg_prev == NULL);
 869         ASSERT(mg->mg_next == NULL);
 870         /*
 871          * We may have gone below zero with the activation count
 872          * either because we never activated in the first place or
 873          * because we're done, and possibly removing the vdev.
 874          */
 875         ASSERT(mg->mg_activation_count <= 0);
 876
 877         taskq_destroy(mg->mg_taskq);
 878         avl_destroy(&mg->mg_metaslab_tree);
 879         mutex_destroy(&mg->mg_lock);
 880         mutex_destroy(&mg->mg_ms_disabled_lock);
 881         cv_destroy(&mg->mg_ms_disabled_cv);
 882
 883         for (int i = 0; i < mg->mg_allocators; i++) {
 884                 metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
 885                 zfs_refcount_destroy(&mga->mga_alloc_queue_depth);
 886         }
 887         kmem_free(mg, offsetof(metaslab_group_t,
 888             mg_allocator[mg->mg_allocators]));
 889 }
 890
 891 void
 892 metaslab_group_activate(metaslab_group_t *mg)
 893 {
 894         metaslab_class_t *mc = mg->mg_class;
 895         spa_t *spa = mc->mc_spa;
 896         metaslab_group_t *mgprev, *mgnext;
 897
 898         ASSERT3U(spa_config_held(spa, SCL_ALLOC, RW_WRITER), !=, 0);
 899
 900         ASSERT(mg->mg_prev == NULL);
 901         ASSERT(mg->mg_next == NULL);
 902         ASSERT(mg->mg_activation_count <= 0);
 903
 904         if (++mg->mg_activation_count <= 0)
 905                 return;
 906
 907         mg->mg_aliquot = metaslab_aliquot * MAX(1,
 908             vdev_get_ndisks(mg->mg_vd) - vdev_get_nparity(mg->mg_vd));
 909         metaslab_group_alloc_update(mg);
 910
 911         if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) {
 912                 mg->mg_prev = mg;
 913                 mg->mg_next = mg;
 914         } else {
 915                 mgnext = mgprev->mg_next;
 916                 mg->mg_prev = mgprev;
 917                 mg->mg_next = mgnext;
 918                 mgprev->mg_next = mg;
 919                 mgnext->mg_prev = mg;
 920         }
 921         for (int i = 0; i < spa->spa_alloc_count; i++) {
 922                 mc->mc_allocator[i].mca_rotor = mg;
 923                 mg = mg->mg_next;
 924         }
 925 }
 926
/*
 * Passivate a metaslab group and remove it from the allocation rotor.
 * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
 * a metaslab group. This function will momentarily drop spa_config_locks
 * that are lower than the SCL_ALLOC lock (see comment below).
 *
 * The activation count is decremented first; the rest of the work only
 * happens when that brings the count to exactly zero.
 */
void
metaslab_group_passivate(metaslab_group_t *mg)
{
        metaslab_class_t *mc = mg->mg_class;
        spa_t *spa = mc->mc_spa;
        metaslab_group_t *mgprev, *mgnext;
        /* Snapshot of the config locks held as writer on entry. */
        int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);

        ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
            (SCL_ALLOC | SCL_ZIO));

        if (--mg->mg_activation_count != 0) {
                /*
                 * The group was not active to begin with: it must not be
                 * on any rotor or linked into the ring, and its count has
                 * gone negative.
                 */
                for (int i = 0; i < spa->spa_alloc_count; i++)
                        ASSERT(mc->mc_allocator[i].mca_rotor != mg);
                ASSERT(mg->mg_prev == NULL);
                ASSERT(mg->mg_next == NULL);
                ASSERT(mg->mg_activation_count < 0);
                return;
        }

        /*
         * The spa_config_lock is an array of rwlocks, ordered as
         * follows (from highest to lowest):
         *      SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
         *      SCL_ZIO > SCL_FREE > SCL_VDEV
         * (For more information about the spa_config_lock see spa_misc.c)
         * The higher the lock, the broader its coverage. When we passivate
         * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
         * config locks. However, the metaslab group's taskq might be trying
         * to preload metaslabs so we must drop the SCL_ZIO lock and any
         * lower locks to allow the I/O to complete. At a minimum,
         * we continue to hold the SCL_ALLOC lock, which prevents any future
         * allocations from taking place and any changes to the vdev tree.
         */
        spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
        taskq_wait_outstanding(mg->mg_taskq, 0);
        /* Re-take exactly the set of locks that was released above. */
        spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
        metaslab_group_alloc_update(mg);
        /* Passivate each allocator's primary and secondary metaslab. */
        for (int i = 0; i < mg->mg_allocators; i++) {
                metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
                metaslab_t *msp = mga->mga_primary;
                if (msp != NULL) {
                        mutex_enter(&msp->ms_lock);
                        metaslab_passivate(msp,
                            metaslab_weight_from_range_tree(msp));
                        mutex_exit(&msp->ms_lock);
                }
                msp = mga->mga_secondary;
                if (msp != NULL) {
                        mutex_enter(&msp->ms_lock);
                        metaslab_passivate(msp,
                            metaslab_weight_from_range_tree(msp));
                        mutex_exit(&msp->ms_lock);
                }
        }

        mgprev = mg->mg_prev;
        mgnext = mg->mg_next;

        /*
         * Unlink the group from the circular list. If it was the only
         * member, the rotors that pointed at it end up NULL.
         */
        if (mg == mgnext) {
                mgnext = NULL;
        } else {
                mgprev->mg_next = mgnext;
                mgnext->mg_prev = mgprev;
        }
        for (int i = 0; i < spa->spa_alloc_count; i++) {
                if (mc->mc_allocator[i].mca_rotor == mg)
                        mc->mc_allocator[i].mca_rotor = mgnext;
        }

        mg->mg_prev = NULL;
        mg->mg_next = NULL;
}
1006
1007 boolean_t
1008 metaslab_group_initialized(metaslab_group_t *mg)
1009 {
1010         vdev_t *vd = mg->mg_vd;
1011         vdev_stat_t *vs = &vd->vdev_stat;
1012
1013         return (vs->vs_space != 0 && mg->mg_activation_count > 0);
1014 }
1015
1016 uint64_t
1017 metaslab_group_get_space(metaslab_group_t *mg)
1018 {
1019         /*
1020          * Note that the number of nodes in mg_metaslab_tree may be one less
1021          * than vdev_ms_count, due to the embedded log metaslab.
1022          */
1023         mutex_enter(&mg->mg_lock);
1024         uint64_t ms_count = avl_numnodes(&mg->mg_metaslab_tree);
1025         mutex_exit(&mg->mg_lock);
1026         return ((1ULL << mg->mg_vd->vdev_ms_shift) * ms_count);
1027 }
1028
1029 void
1030 metaslab_group_histogram_verify(metaslab_group_t *mg)
1031 {
1032         uint64_t *mg_hist;
1033         avl_tree_t *t = &mg->mg_metaslab_tree;
1034         uint64_t ashift = mg->mg_vd->vdev_ashift;
1035
1036         if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
1037                 return;
1038
1039         mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
1040             KM_SLEEP);
1041
1042         ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
1043             SPACE_MAP_HISTOGRAM_SIZE + ashift);
1044
1045         mutex_enter(&mg->mg_lock);
1046         for (metaslab_t *msp = avl_first(t);
1047             msp != NULL; msp = AVL_NEXT(t, msp)) {
1048                 VERIFY3P(msp->ms_group, ==, mg);
1049                 /* skip if not active */
1050                 if (msp->ms_sm == NULL)
1051                         continue;
1052
1053                 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
1054                         mg_hist[i + ashift] +=
1055                             msp->ms_sm->sm_phys->smp_histogram[i];
1056                 }
1057         }
1058
1059         for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
1060                 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
1061
1062         mutex_exit(&mg->mg_lock);
1063
1064         kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
1065 }
1066
1067 static void
1068 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
1069 {
1070         metaslab_class_t *mc = mg->mg_class;
1071         uint64_t ashift = mg->mg_vd->vdev_ashift;
1072
1073         ASSERT(MUTEX_HELD(&msp->ms_lock));
1074         if (msp->ms_sm == NULL)
1075                 return;
1076
1077         mutex_enter(&mg->mg_lock);
1078         mutex_enter(&mc->mc_lock);
1079         for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
1080                 IMPLY(mg == mg->mg_vd->vdev_log_mg,
1081                     mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
1082                 mg->mg_histogram[i + ashift] +=
1083                     msp->ms_sm->sm_phys->smp_histogram[i];
1084                 mc->mc_histogram[i + ashift] +=
1085                     msp->ms_sm->sm_phys->smp_histogram[i];
1086         }
1087         mutex_exit(&mc->mc_lock);
1088         mutex_exit(&mg->mg_lock);
1089 }
1090
1091 void
1092 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
1093 {
1094         metaslab_class_t *mc = mg->mg_class;
1095         uint64_t ashift = mg->mg_vd->vdev_ashift;
1096
1097         ASSERT(MUTEX_HELD(&msp->ms_lock));
1098         if (msp->ms_sm == NULL)
1099                 return;
1100
1101         mutex_enter(&mg->mg_lock);
1102         mutex_enter(&mc->mc_lock);
1103         for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
1104                 ASSERT3U(mg->mg_histogram[i + ashift], >=,
1105                     msp->ms_sm->sm_phys->smp_histogram[i]);
1106                 ASSERT3U(mc->mc_histogram[i + ashift], >=,
1107                     msp->ms_sm->sm_phys->smp_histogram[i]);
1108                 IMPLY(mg == mg->mg_vd->vdev_log_mg,
1109                     mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
1110
1111                 mg->mg_histogram[i + ashift] -=
1112                     msp->ms_sm->sm_phys->smp_histogram[i];
1113                 mc->mc_histogram[i + ashift] -=
1114                     msp->ms_sm->sm_phys->smp_histogram[i];
1115         }
1116         mutex_exit(&mc->mc_lock);
1117         mutex_exit(&mg->mg_lock);
1118 }
1119
1120 static void
1121 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
1122 {
1123         ASSERT(msp->ms_group == NULL);
1124         mutex_enter(&mg->mg_lock);
1125         msp->ms_group = mg;
1126         msp->ms_weight = 0;
1127         avl_add(&mg->mg_metaslab_tree, msp);
1128         mutex_exit(&mg->mg_lock);
1129
1130         mutex_enter(&msp->ms_lock);
1131         metaslab_group_histogram_add(mg, msp);
1132         mutex_exit(&msp->ms_lock);
1133 }
1134
1135 static void
1136 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
1137 {
1138         mutex_enter(&msp->ms_lock);
1139         metaslab_group_histogram_remove(mg, msp);
1140         mutex_exit(&msp->ms_lock);
1141
1142         mutex_enter(&mg->mg_lock);
1143         ASSERT(msp->ms_group == mg);
1144         avl_remove(&mg->mg_metaslab_tree, msp);
1145
1146         metaslab_class_t *mc = msp->ms_group->mg_class;
1147         multilist_sublist_t *mls =
1148             multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
1149         if (multilist_link_active(&msp->ms_class_txg_node))
1150                 multilist_sublist_remove(mls, msp);
1151         multilist_sublist_unlock(mls);
1152
1153         msp->ms_group = NULL;
1154         mutex_exit(&mg->mg_lock);
1155 }
1156
1157 static void
1158 metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
1159 {
1160         ASSERT(MUTEX_HELD(&msp->ms_lock));
1161         ASSERT(MUTEX_HELD(&mg->mg_lock));
1162         ASSERT(msp->ms_group == mg);
1163
1164         avl_remove(&mg->mg_metaslab_tree, msp);
1165         msp->ms_weight = weight;
1166         avl_add(&mg->mg_metaslab_tree, msp);
1167
1168 }
1169
1170 static void
1171 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
1172 {
1173         /*
1174          * Although in principle the weight can be any value, in
1175          * practice we do not use values in the range [1, 511].
1176          */
1177         ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
1178         ASSERT(MUTEX_HELD(&msp->ms_lock));
1179
1180         mutex_enter(&mg->mg_lock);
1181         metaslab_group_sort_impl(mg, msp, weight);
1182         mutex_exit(&mg->mg_lock);
1183 }
1184
1185 /*
1186  * Calculate the fragmentation for a given metaslab group. We can use
1187  * a simple average here since all metaslabs within the group must have
1188  * the same size. The return value will be a value between 0 and 100
1189  * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this
1190  * group have a fragmentation metric.
1191  */
1192 uint64_t
1193 metaslab_group_fragmentation(metaslab_group_t *mg)
1194 {
1195         vdev_t *vd = mg->mg_vd;
1196         uint64_t fragmentation = 0;
1197         uint64_t valid_ms = 0;
1198
1199         for (int m = 0; m < vd->vdev_ms_count; m++) {
1200                 metaslab_t *msp = vd->vdev_ms[m];
1201
1202                 if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
1203                         continue;
1204                 if (msp->ms_group != mg)
1205                         continue;
1206
1207                 valid_ms++;
1208                 fragmentation += msp->ms_fragmentation;
1209         }
1210
1211         if (valid_ms <= mg->mg_vd->vdev_ms_count / 2)
1212                 return (ZFS_FRAG_INVALID);
1213
1214         fragmentation /= valid_ms;
1215         ASSERT3U(fragmentation, <=, 100);
1216         return (fragmentation);
1217 }
1218
/*
 * Determine if a given metaslab group should skip allocations. A metaslab
 * group should avoid allocations if its free capacity is less than the
 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
 * that can still handle allocations. If the allocation throttle is enabled
 * then we skip allocations to devices that have reached their maximum
 * allocation queue depth unless the selected metaslab group is the only
 * eligible group remaining.
 *
 * 'rotor' is the group at which the scan over the following groups stops,
 * 'allocator' selects the mg_allocator[] slot whose queue depth is
 * examined, and 'd' scales the queue depth limit (qmax * (4 + d) / 4).
 * Returns B_TRUE if the group may be used for this allocation.
 */
static boolean_t
metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
    int flags, uint64_t psize, int allocator, int d)
{
        spa_t *spa = mg->mg_vd->vdev_spa;
        metaslab_class_t *mc = mg->mg_class;

        /*
         * We can only consider skipping this metaslab group if it's
         * in the normal, special or dedup metaslab class and there are
         * other metaslab groups to select from. Otherwise, we always
         * consider it eligible for allocations.
         */
        if ((mc != spa_normal_class(spa) &&
            mc != spa_special_class(spa) &&
            mc != spa_dedup_class(spa)) ||
            mc->mc_groups <= 1)
                return (B_TRUE);

        /*
         * If the metaslab group's mg_allocatable flag is set (see comments
         * in metaslab_group_alloc_update() for more information) and
         * the allocation throttle is disabled then allow allocations to this
         * device. However, if the allocation throttle is enabled then
         * check if we have reached our allocation limit (mga_alloc_queue_depth)
         * to determine if we should allow allocations to this metaslab group.
         * If all metaslab groups are no longer considered allocatable
         * (mc_alloc_groups == 0) or we're trying to allocate the smallest
         * gang block size then we allow allocations on this metaslab group
         * regardless of the mg_allocatable or throttle settings.
         */
        if (mg->mg_allocatable) {
                metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
                int64_t qdepth;
                uint64_t qmax = mga->mga_cur_max_alloc_queue_depth;

                if (!mc->mc_alloc_throttle_enabled)
                        return (B_TRUE);

                /*
                 * If this metaslab group does not have any free space, then
                 * there is no point in looking further.
                 */
                if (mg->mg_no_free_space)
                        return (B_FALSE);

                /*
                 * Some allocations (e.g., those coming from device removal
                 * where the allocations are not even counted in the
                 * metaslab allocation queues) are allowed to bypass
                 * the throttle.
                 */
                if (flags & METASLAB_DONT_THROTTLE)
                        return (B_TRUE);

                /*
                 * Relax allocation throttling for ditto blocks.  Due to
                 * random imbalances in allocation it tends to push copies
                 * to one vdev, that looks a bit better at the moment.
                 */
                qmax = qmax * (4 + d) / 4;

                qdepth = zfs_refcount_count(&mga->mga_alloc_queue_depth);

                /*
                 * If this metaslab group is below its qmax or it's
                 * the only allocatable metaslab group, then attempt
                 * to allocate from it.
                 */
                if (qdepth < qmax || mc->mc_alloc_groups == 1)
                        return (B_TRUE);
                ASSERT3U(mc->mc_alloc_groups, >, 1);

                /*
                 * Since this metaslab group is at or over its qmax, we
                 * need to determine if there are metaslab groups after this
                 * one that might be able to handle this allocation. This is
                 * racy since we can't hold the locks for all metaslab
                 * groups at the same time when we make this check.
                 */
                for (metaslab_group_t *mgp = mg->mg_next;
                    mgp != rotor; mgp = mgp->mg_next) {
                        metaslab_group_allocator_t *mgap =
                            &mgp->mg_allocator[allocator];
                        /* Same relaxed limit as applied to 'mg' above. */
                        qmax = mgap->mga_cur_max_alloc_queue_depth;
                        qmax = qmax * (4 + d) / 4;
                        qdepth =
                            zfs_refcount_count(&mgap->mga_alloc_queue_depth);

                        /*
                         * If there is another metaslab group that
                         * might be able to handle the allocation, then
                         * we return false so that we skip this group.
                         */
                        if (qdepth < qmax && !mgp->mg_no_free_space)
                                return (B_FALSE);
                }

                /*
                 * We didn't find another group to handle the allocation
                 * so we can't skip this metaslab group even though
                 * we are at or over our qmax.
                 */
                return (B_TRUE);

        } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
                return (B_TRUE);
        }
        return (B_FALSE);
}
1339
1340 /*
1341  * ==========================================================================
1342  * Range tree callbacks
1343  * ==========================================================================
1344  */
1345
1346 /*
1347  * Comparison function for the private size-ordered tree using 32-bit
1348  * ranges. Tree is sorted by size, larger sizes at the end of the tree.
1349  */
1350 __attribute__((always_inline)) inline
1351 static int
1352 metaslab_rangesize32_compare(const void *x1, const void *x2)
1353 {
1354         const range_seg32_t *r1 = x1;
1355         const range_seg32_t *r2 = x2;
1356
1357         uint64_t rs_size1 = r1->rs_end - r1->rs_start;
1358         uint64_t rs_size2 = r2->rs_end - r2->rs_start;
1359
1360         int cmp = TREE_CMP(rs_size1, rs_size2);
1361
1362         return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
1363 }
1364
1365 /*
1366  * Comparison function for the private size-ordered tree using 64-bit
1367  * ranges. Tree is sorted by size, larger sizes at the end of the tree.
1368  */
1369 __attribute__((always_inline)) inline
1370 static int
1371 metaslab_rangesize64_compare(const void *x1, const void *x2)
1372 {
1373         const range_seg64_t *r1 = x1;
1374         const range_seg64_t *r2 = x2;
1375
1376         uint64_t rs_size1 = r1->rs_end - r1->rs_start;
1377         uint64_t rs_size2 = r2->rs_end - r2->rs_start;
1378
1379         int cmp = TREE_CMP(rs_size1, rs_size2);
1380
1381         return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
1382 }
1383
/*
 * Argument passed to the metaslab range tree callbacks (metaslab_rt_*).
 */
typedef struct metaslab_rt_arg {
        /* B-tree of segments that the callbacks create and maintain. */
        zfs_btree_t *mra_bt;
        /* Segments shorter than (1 << mra_floor_shift) are left out. */
        uint32_t mra_floor_shift;
} metaslab_rt_arg_t;
1388
/*
 * Context handed to metaslab_size_sorted_add() through range_tree_walk().
 */
struct mssa_arg {
        /* Range tree being walked. */
        range_tree_t *rt;
        /* Callback argument of that range tree. */
        metaslab_rt_arg_t *mra;
};
1393
1394 static void
1395 metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size)
1396 {
1397         struct mssa_arg *mssap = arg;
1398         range_tree_t *rt = mssap->rt;
1399         metaslab_rt_arg_t *mrap = mssap->mra;
1400         range_seg_max_t seg = {0};
1401         rs_set_start(&seg, rt, start);
1402         rs_set_end(&seg, rt, start + size);
1403         metaslab_rt_add(rt, &seg, mrap);
1404 }
1405
1406 static void
1407 metaslab_size_tree_full_load(range_tree_t *rt)
1408 {
1409         metaslab_rt_arg_t *mrap = rt->rt_arg;
1410         METASLABSTAT_BUMP(metaslabstat_reload_tree);
1411         ASSERT0(zfs_btree_numnodes(mrap->mra_bt));
1412         mrap->mra_floor_shift = 0;
1413         struct mssa_arg arg = {0};
1414         arg.rt = rt;
1415         arg.mra = mrap;
1416         range_tree_walk(rt, metaslab_size_sorted_add, &arg);
1417 }
1418
1419
/*
 * Instantiate the find-in-buffer helpers for the 32-bit and 64-bit segment
 * layouts, each paired with its size comparator. metaslab_rt_create()
 * hands the matching helper to zfs_btree_create().
 */
ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize32_in_buf,
    range_seg32_t, metaslab_rangesize32_compare)

ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize64_in_buf,
    range_seg64_t, metaslab_rangesize64_compare)
1425
1426 /*
1427  * Create any block allocator specific components. The current allocators
1428  * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
1429  */
1430 static void
1431 metaslab_rt_create(range_tree_t *rt, void *arg)
1432 {
1433         metaslab_rt_arg_t *mrap = arg;
1434         zfs_btree_t *size_tree = mrap->mra_bt;
1435
1436         size_t size;
1437         int (*compare) (const void *, const void *);
1438         bt_find_in_buf_f bt_find;
1439         switch (rt->rt_type) {
1440         case RANGE_SEG32:
1441                 size = sizeof (range_seg32_t);
1442                 compare = metaslab_rangesize32_compare;
1443                 bt_find = metaslab_rt_find_rangesize32_in_buf;
1444                 break;
1445         case RANGE_SEG64:
1446                 size = sizeof (range_seg64_t);
1447                 compare = metaslab_rangesize64_compare;
1448                 bt_find = metaslab_rt_find_rangesize64_in_buf;
1449                 break;
1450         default:
1451                 panic("Invalid range seg type %d", rt->rt_type);
1452         }
1453         zfs_btree_create(size_tree, compare, bt_find, size);
1454         mrap->mra_floor_shift = metaslab_by_size_min_shift;
1455 }
1456
1457 static void
1458 metaslab_rt_destroy(range_tree_t *rt, void *arg)
1459 {
1460         (void) rt;
1461         metaslab_rt_arg_t *mrap = arg;
1462         zfs_btree_t *size_tree = mrap->mra_bt;
1463
1464         zfs_btree_destroy(size_tree);
1465         kmem_free(mrap, sizeof (*mrap));
1466 }
1467
1468 static void
1469 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
1470 {
1471         metaslab_rt_arg_t *mrap = arg;
1472         zfs_btree_t *size_tree = mrap->mra_bt;
1473
1474         if (rs_get_end(rs, rt) - rs_get_start(rs, rt) <
1475             (1ULL << mrap->mra_floor_shift))
1476                 return;
1477
1478         zfs_btree_add(size_tree, rs);
1479 }
1480
1481 static void
1482 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
1483 {
1484         metaslab_rt_arg_t *mrap = arg;
1485         zfs_btree_t *size_tree = mrap->mra_bt;
1486
1487         if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1ULL <<
1488             mrap->mra_floor_shift))
1489                 return;
1490
1491         zfs_btree_remove(size_tree, rs);
1492 }
1493
1494 static void
1495 metaslab_rt_vacate(range_tree_t *rt, void *arg)
1496 {
1497         metaslab_rt_arg_t *mrap = arg;
1498         zfs_btree_t *size_tree = mrap->mra_bt;
1499         zfs_btree_clear(size_tree);
1500         zfs_btree_destroy(size_tree);
1501
1502         metaslab_rt_create(rt, arg);
1503 }
1504
/*
 * Callback table that wires the metaslab_rt_*() functions into a range
 * tree.
 */
static const range_tree_ops_t metaslab_rt_ops = {
        .rtop_create = metaslab_rt_create,
        .rtop_destroy = metaslab_rt_destroy,
        .rtop_add = metaslab_rt_add,
        .rtop_remove = metaslab_rt_remove,
        .rtop_vacate = metaslab_rt_vacate
};
1512
1513 /*
1514  * ==========================================================================
1515  * Common allocator routines
1516  * ==========================================================================
1517  */
1518
1519 /*
1520  * Return the maximum contiguous segment within the metaslab.
1521  */
1522 uint64_t
1523 metaslab_largest_allocatable(metaslab_t *msp)
1524 {
1525         zfs_btree_t *t = &msp->ms_allocatable_by_size;
1526         range_seg_t *rs;
1527
1528         if (t == NULL)
1529                 return (0);
1530         if (zfs_btree_numnodes(t) == 0)
1531                 metaslab_size_tree_full_load(msp->ms_allocatable);
1532
1533         rs = zfs_btree_last(t, NULL);
1534         if (rs == NULL)
1535                 return (0);
1536
1537         return (rs_get_end(rs, msp->ms_allocatable) - rs_get_start(rs,
1538             msp->ms_allocatable));
1539 }
1540
1541 /*
1542  * Return the maximum contiguous segment within the unflushed frees of this
1543  * metaslab.
1544  */
1545 static uint64_t
1546 metaslab_largest_unflushed_free(metaslab_t *msp)
1547 {
1548         ASSERT(MUTEX_HELD(&msp->ms_lock));
1549
1550         if (msp->ms_unflushed_frees == NULL)
1551                 return (0);
1552
1553         if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0)
1554                 metaslab_size_tree_full_load(msp->ms_unflushed_frees);
1555         range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size,
1556             NULL);
1557         if (rs == NULL)
1558                 return (0);
1559
1560         /*
1561          * When a range is freed from the metaslab, that range is added to
1562          * both the unflushed frees and the deferred frees. While the block
1563          * will eventually be usable, if the metaslab were loaded the range
1564          * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE
1565          * txgs had passed.  As a result, when attempting to estimate an upper
1566          * bound for the largest currently-usable free segment in the
1567          * metaslab, we need to not consider any ranges currently in the defer
1568          * trees. This algorithm approximates the largest available chunk in
1569          * the largest range in the unflushed_frees tree by taking the first
1570          * chunk.  While this may be a poor estimate, it should only remain so
1571          * briefly and should eventually self-correct as frees are no longer
1572          * deferred. Similar logic applies to the ms_freed tree. See
1573          * metaslab_load() for more details.
1574          *
1575          * There are two primary sources of inaccuracy in this estimate. Both
1576          * are tolerated for performance reasons. The first source is that we
1577          * only check the largest segment for overlaps. Smaller segments may
1578          * have more favorable overlaps with the other trees, resulting in
1579          * larger usable chunks.  Second, we only look at the first chunk in
1580          * the largest segment; there may be other usable chunks in the
1581          * largest segment, but we ignore them.
1582          */
1583         uint64_t rstart = rs_get_start(rs, msp->ms_unflushed_frees);
1584         uint64_t rsize = rs_get_end(rs, msp->ms_unflushed_frees) - rstart;
1585         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1586                 uint64_t start = 0;
1587                 uint64_t size = 0;
1588                 boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart,
1589                     rsize, &start, &size);
1590                 if (found) {
1591                         if (rstart == start)
1592                                 return (0);
1593                         rsize = start - rstart;
1594                 }
1595         }
1596
1597         uint64_t start = 0;
1598         uint64_t size = 0;
1599         boolean_t found = range_tree_find_in(msp->ms_freed, rstart,
1600             rsize, &start, &size);
1601         if (found)
1602                 rsize = start - rstart;
1603
1604         return (rsize);
1605 }
1606
1607 static range_seg_t *
1608 metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start,
1609     uint64_t size, zfs_btree_index_t *where)
1610 {
1611         range_seg_t *rs;
1612         range_seg_max_t rsearch;
1613
1614         rs_set_start(&rsearch, rt, start);
1615         rs_set_end(&rsearch, rt, start + size);
1616
1617         rs = zfs_btree_find(t, &rsearch, where);
1618         if (rs == NULL) {
1619                 rs = zfs_btree_next(t, where, where);
1620         }
1621
1622         return (rs);
1623 }
1624
1625 #if defined(WITH_DF_BLOCK_ALLOCATOR) || \
1626     defined(WITH_CF_BLOCK_ALLOCATOR)
1627
1628 /*
1629  * This is a helper function that can be used by the allocator to find a
1630  * suitable block to allocate. This will search the specified B-tree looking
1631  * for a block that matches the specified criteria.
1632  */
1633 static uint64_t
1634 metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size,
1635     uint64_t max_search)
1636 {
1637         if (*cursor == 0)
1638                 *cursor = rt->rt_start;
1639         zfs_btree_t *bt = &rt->rt_root;
1640         zfs_btree_index_t where;
1641         range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size, &where);
1642         uint64_t first_found;
1643         int count_searched = 0;
1644
1645         if (rs != NULL)
1646                 first_found = rs_get_start(rs, rt);
1647
1648         while (rs != NULL && (rs_get_start(rs, rt) - first_found <=
1649             max_search || count_searched < metaslab_min_search_count)) {
1650                 uint64_t offset = rs_get_start(rs, rt);
1651                 if (offset + size <= rs_get_end(rs, rt)) {
1652                         *cursor = offset + size;
1653                         return (offset);
1654                 }
1655                 rs = zfs_btree_next(bt, &where, &where);
1656                 count_searched++;
1657         }
1658
1659         *cursor = 0;
1660         return (-1ULL);
1661 }
1662 #endif /* WITH_DF/CF_BLOCK_ALLOCATOR */
1663
1664 #if defined(WITH_DF_BLOCK_ALLOCATOR)
1665 /*
1666  * ==========================================================================
1667  * Dynamic Fit (df) block allocator
1668  *
1669  * Search for a free chunk of at least this size, starting from the last
1670  * offset (for this alignment of block) looking for up to
1671  * metaslab_df_max_search bytes (16MB).  If a large enough free chunk is not
1672  * found within 16MB, then return a free chunk of exactly the requested size (or
1673  * larger).
1674  *
1675  * If it seems like searching from the last offset will be unproductive, skip
1676  * that and just return a free chunk of exactly the requested size (or larger).
1677  * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct.  This
1678  * mechanism is probably not very useful and may be removed in the future.
1679  *
1680  * The behavior when not searching can be changed to return the largest free
1681  * chunk, instead of a free chunk of exactly the requested size, by setting
1682  * metaslab_df_use_largest_segment.
1683  * ==========================================================================
1684  */
1685 static uint64_t
1686 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
1687 {
1688         /*
1689          * Find the largest power of 2 block size that evenly divides the
1690          * requested size. This is used to try to allocate blocks with similar
1691          * alignment from the same area of the metaslab (i.e. same cursor
1692          * bucket) but it does not guarantee that other allocations sizes
1693          * may exist in the same region.
1694          */
1695         uint64_t align = size & -size;
1696         uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1697         range_tree_t *rt = msp->ms_allocatable;
1698         uint_t free_pct = range_tree_space(rt) * 100 / msp->ms_size;
1699         uint64_t offset;
1700
1701         ASSERT(MUTEX_HELD(&msp->ms_lock));
1702
1703         /*
1704          * If we're running low on space, find a segment based on size,
1705          * rather than iterating based on offset.
1706          */
1707         if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold ||
1708             free_pct < metaslab_df_free_pct) {
1709                 offset = -1;
1710         } else {
1711                 offset = metaslab_block_picker(rt,
1712                     cursor, size, metaslab_df_max_search);
1713         }
1714
1715         if (offset == -1) {
1716                 range_seg_t *rs;
1717                 if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0)
1718                         metaslab_size_tree_full_load(msp->ms_allocatable);
1719
1720                 if (metaslab_df_use_largest_segment) {
1721                         /* use largest free segment */
1722                         rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL);
1723                 } else {
1724                         zfs_btree_index_t where;
1725                         /* use segment of this size, or next largest */
1726                         rs = metaslab_block_find(&msp->ms_allocatable_by_size,
1727                             rt, msp->ms_start, size, &where);
1728                 }
1729                 if (rs != NULL && rs_get_start(rs, rt) + size <= rs_get_end(rs,
1730                     rt)) {
1731                         offset = rs_get_start(rs, rt);
1732                         *cursor = offset + size;
1733                 }
1734         }
1735
1736         return (offset);
1737 }
1738
/*
 * Allocator operations table compiled in when WITH_DF_BLOCK_ALLOCATOR is
 * defined; the only initializer supplied is metaslab_df_alloc().
 */
const metaslab_ops_t zfs_metaslab_ops = {
        metaslab_df_alloc
};
1742 #endif /* WITH_DF_BLOCK_ALLOCATOR */
1743
1744 #if defined(WITH_CF_BLOCK_ALLOCATOR)
1745 /*
1746  * ==========================================================================
1747  * Cursor fit block allocator -
1748  * Select the largest region in the metaslab, set the cursor to the beginning
1749  * of the range and the cursor_end to the end of the range. As allocations
1750  * are made advance the cursor. Continue allocating from the cursor until
1751  * the range is exhausted and then find a new range.
1752  * ==========================================================================
1753  */
1754 static uint64_t
1755 metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
1756 {
1757         range_tree_t *rt = msp->ms_allocatable;
1758         zfs_btree_t *t = &msp->ms_allocatable_by_size;
1759         uint64_t *cursor = &msp->ms_lbas[0];
1760         uint64_t *cursor_end = &msp->ms_lbas[1];
1761         uint64_t offset = 0;
1762
1763         ASSERT(MUTEX_HELD(&msp->ms_lock));
1764
1765         ASSERT3U(*cursor_end, >=, *cursor);
1766
1767         if ((*cursor + size) > *cursor_end) {
1768                 range_seg_t *rs;
1769
1770                 if (zfs_btree_numnodes(t) == 0)
1771                         metaslab_size_tree_full_load(msp->ms_allocatable);
1772                 rs = zfs_btree_last(t, NULL);
1773                 if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) <
1774                     size)
1775                         return (-1ULL);
1776
1777                 *cursor = rs_get_start(rs, rt);
1778                 *cursor_end = rs_get_end(rs, rt);
1779         }
1780
1781         offset = *cursor;
1782         *cursor += size;
1783
1784         return (offset);
1785 }
1786
/*
 * Allocator operations table compiled in when WITH_CF_BLOCK_ALLOCATOR is
 * defined; the only initializer supplied is metaslab_cf_alloc().
 */
const metaslab_ops_t zfs_metaslab_ops = {
        metaslab_cf_alloc
};
1790 #endif /* WITH_CF_BLOCK_ALLOCATOR */
1791
1792 #if defined(WITH_NDF_BLOCK_ALLOCATOR)
1793 /*
1794  * ==========================================================================
1795  * New dynamic fit allocator -
1796  * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
1797  * contiguous blocks. If no region is found then just use the largest segment
1798  * that remains.
1799  * ==========================================================================
1800  */
1801
/*
 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
 * to request from the allocator.  metaslab_ndf_alloc() adds this value to
 * highbit64(size) to form the shift used for the end of its size-tree
 * search key (capped at the largest allocatable segment) when the lookup at
 * the cursor does not produce a large enough segment.
 */
uint64_t metaslab_ndf_clump_shift = 4;
1807
1808 static uint64_t
1809 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
1810 {
1811         zfs_btree_t *t = &msp->ms_allocatable->rt_root;
1812         range_tree_t *rt = msp->ms_allocatable;
1813         zfs_btree_index_t where;
1814         range_seg_t *rs;
1815         range_seg_max_t rsearch;
1816         uint64_t hbit = highbit64(size);
1817         uint64_t *cursor = &msp->ms_lbas[hbit - 1];
1818         uint64_t max_size = metaslab_largest_allocatable(msp);
1819
1820         ASSERT(MUTEX_HELD(&msp->ms_lock));
1821
1822         if (max_size < size)
1823                 return (-1ULL);
1824
1825         rs_set_start(&rsearch, rt, *cursor);
1826         rs_set_end(&rsearch, rt, *cursor + size);
1827
1828         rs = zfs_btree_find(t, &rsearch, &where);
1829         if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) {
1830                 t = &msp->ms_allocatable_by_size;
1831
1832                 rs_set_start(&rsearch, rt, 0);
1833                 rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit +
1834                     metaslab_ndf_clump_shift)));
1835
1836                 rs = zfs_btree_find(t, &rsearch, &where);
1837                 if (rs == NULL)
1838                         rs = zfs_btree_next(t, &where, &where);
1839                 ASSERT(rs != NULL);
1840         }
1841
1842         if ((rs_get_end(rs, rt) - rs_get_start(rs, rt)) >= size) {
1843                 *cursor = rs_get_start(rs, rt) + size;
1844                 return (rs_get_start(rs, rt));
1845         }
1846         return (-1ULL);
1847 }
1848
/*
 * Allocator operations table compiled in when WITH_NDF_BLOCK_ALLOCATOR is
 * defined; the only initializer supplied is metaslab_ndf_alloc().
 */
const metaslab_ops_t zfs_metaslab_ops = {
        metaslab_ndf_alloc
};
1852 #endif /* WITH_NDF_BLOCK_ALLOCATOR */
1853
1854
1855 /*
1856  * ==========================================================================
1857  * Metaslabs
1858  * ==========================================================================
1859  */
1860
1861 /*
1862  * Wait for any in-progress metaslab loads to complete.
1863  */
1864 static void
1865 metaslab_load_wait(metaslab_t *msp)
1866 {
1867         ASSERT(MUTEX_HELD(&msp->ms_lock));
1868
1869         while (msp->ms_loading) {
1870                 ASSERT(!msp->ms_loaded);
1871                 cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1872         }
1873 }
1874
1875 /*
1876  * Wait for any in-progress flushing to complete.
1877  */
1878 static void
1879 metaslab_flush_wait(metaslab_t *msp)
1880 {
1881         ASSERT(MUTEX_HELD(&msp->ms_lock));
1882
1883         while (msp->ms_flushing)
1884                 cv_wait(&msp->ms_flush_cv, &msp->ms_lock);
1885 }
1886
1887 static unsigned int
1888 metaslab_idx_func(multilist_t *ml, void *arg)
1889 {
1890         metaslab_t *msp = arg;
1891
1892         /*
1893          * ms_id values are allocated sequentially, so full 64bit
1894          * division would be a waste of time, so limit it to 32 bits.
1895          */
1896         return ((unsigned int)msp->ms_id % multilist_get_num_sublists(ml));
1897 }
1898
1899 uint64_t
1900 metaslab_allocated_space(metaslab_t *msp)
1901 {
1902         return (msp->ms_allocated_space);
1903 }
1904
1905 /*
1906  * Verify that the space accounting on disk matches the in-core range_trees.
1907  */
1908 static void
1909 metaslab_verify_space(metaslab_t *msp, uint64_t txg)
1910 {
1911         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1912         uint64_t allocating = 0;
1913         uint64_t sm_free_space, msp_free_space;
1914
1915         ASSERT(MUTEX_HELD(&msp->ms_lock));
1916         ASSERT(!msp->ms_condensing);
1917
1918         if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
1919                 return;
1920
1921         /*
1922          * We can only verify the metaslab space when we're called
1923          * from syncing context with a loaded metaslab that has an
1924          * allocated space map. Calling this in non-syncing context
1925          * does not provide a consistent view of the metaslab since
1926          * we're performing allocations in the future.
1927          */
1928         if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
1929             !msp->ms_loaded)
1930                 return;
1931
1932         /*
1933          * Even though the smp_alloc field can get negative,
1934          * when it comes to a metaslab's space map, that should
1935          * never be the case.
1936          */
1937         ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
1938
1939         ASSERT3U(space_map_allocated(msp->ms_sm), >=,
1940             range_tree_space(msp->ms_unflushed_frees));
1941
1942         ASSERT3U(metaslab_allocated_space(msp), ==,
1943             space_map_allocated(msp->ms_sm) +
1944             range_tree_space(msp->ms_unflushed_allocs) -
1945             range_tree_space(msp->ms_unflushed_frees));
1946
1947         sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
1948
1949         /*
1950          * Account for future allocations since we would have
1951          * already deducted that space from the ms_allocatable.
1952          */
1953         for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
1954                 allocating +=
1955                     range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
1956         }
1957         ASSERT3U(allocating + msp->ms_allocated_this_txg, ==,
1958             msp->ms_allocating_total);
1959
1960         ASSERT3U(msp->ms_deferspace, ==,
1961             range_tree_space(msp->ms_defer[0]) +
1962             range_tree_space(msp->ms_defer[1]));
1963
1964         msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
1965             msp->ms_deferspace + range_tree_space(msp->ms_freed);
1966
1967         VERIFY3U(sm_free_space, ==, msp_free_space);
1968 }
1969
1970 static void
1971 metaslab_aux_histograms_clear(metaslab_t *msp)
1972 {
1973         /*
1974          * Auxiliary histograms are only cleared when resetting them,
1975          * which can only happen while the metaslab is loaded.
1976          */
1977         ASSERT(msp->ms_loaded);
1978
1979         memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist));
1980         for (int t = 0; t < TXG_DEFER_SIZE; t++)
1981                 memset(msp->ms_deferhist[t], 0, sizeof (msp->ms_deferhist[t]));
1982 }
1983
1984 static void
1985 metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
1986     range_tree_t *rt)
1987 {
1988         /*
1989          * This is modeled after space_map_histogram_add(), so refer to that
1990          * function for implementation details. We want this to work like
1991          * the space map histogram, and not the range tree histogram, as we
1992          * are essentially constructing a delta that will be later subtracted
1993          * from the space map histogram.
1994          */
1995         int idx = 0;
1996         for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
1997                 ASSERT3U(i, >=, idx + shift);
1998                 histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
1999
2000                 if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
2001                         ASSERT3U(idx + shift, ==, i);
2002                         idx++;
2003                         ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
2004                 }
2005         }
2006 }
2007
2008 /*
2009  * Called at every sync pass that the metaslab gets synced.
2010  *
2011  * The reason is that we want our auxiliary histograms to be updated
2012  * wherever the metaslab's space map histogram is updated. This way
2013  * we stay consistent on which parts of the metaslab space map's
2014  * histogram are currently not available for allocations (e.g because
2015  * they are in the defer, freed, and freeing trees).
2016  */
2017 static void
2018 metaslab_aux_histograms_update(metaslab_t *msp)
2019 {
2020         space_map_t *sm = msp->ms_sm;
2021         ASSERT(sm != NULL);
2022
2023         /*
2024          * This is similar to the metaslab's space map histogram updates
2025          * that take place in metaslab_sync(). The only difference is that
2026          * we only care about segments that haven't made it into the
2027          * ms_allocatable tree yet.
2028          */
2029         if (msp->ms_loaded) {
2030                 metaslab_aux_histograms_clear(msp);
2031
2032                 metaslab_aux_histogram_add(msp->ms_synchist,
2033                     sm->sm_shift, msp->ms_freed);
2034
2035                 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2036                         metaslab_aux_histogram_add(msp->ms_deferhist[t],
2037                             sm->sm_shift, msp->ms_defer[t]);
2038                 }
2039         }
2040
2041         metaslab_aux_histogram_add(msp->ms_synchist,
2042             sm->sm_shift, msp->ms_freeing);
2043 }
2044
2045 /*
2046  * Called every time we are done syncing (writing to) the metaslab,
2047  * i.e. at the end of each sync pass.
2048  * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
2049  */
2050 static void
2051 metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
2052 {
2053         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2054         space_map_t *sm = msp->ms_sm;
2055
2056         if (sm == NULL) {
2057                 /*
2058                  * We came here from metaslab_init() when creating/opening a
2059                  * pool, looking at a metaslab that hasn't had any allocations
2060                  * yet.
2061                  */
2062                 return;
2063         }
2064
2065         /*
2066          * This is similar to the actions that we take for the ms_freed
2067          * and ms_defer trees in metaslab_sync_done().
2068          */
2069         uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
2070         if (defer_allowed) {
2071                 memcpy(msp->ms_deferhist[hist_index], msp->ms_synchist,
2072                     sizeof (msp->ms_synchist));
2073         } else {
2074                 memset(msp->ms_deferhist[hist_index], 0,
2075                     sizeof (msp->ms_deferhist[hist_index]));
2076         }
2077         memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist));
2078 }
2079
2080 /*
2081  * Ensure that the metaslab's weight and fragmentation are consistent
2082  * with the contents of the histogram (either the range tree's histogram
2083  * or the space map's depending whether the metaslab is loaded).
2084  */
2085 static void
2086 metaslab_verify_weight_and_frag(metaslab_t *msp)
2087 {
2088         ASSERT(MUTEX_HELD(&msp->ms_lock));
2089
2090         if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
2091                 return;
2092
2093         /*
2094          * We can end up here from vdev_remove_complete(), in which case we
2095          * cannot do these assertions because we hold spa config locks and
2096          * thus we are not allowed to read from the DMU.
2097          *
2098          * We check if the metaslab group has been removed and if that's
2099          * the case we return immediately as that would mean that we are
2100          * here from the aforementioned code path.
2101          */
2102         if (msp->ms_group == NULL)
2103                 return;
2104
2105         /*
2106          * Devices being removed always return a weight of 0 and leave
2107          * fragmentation and ms_max_size as is - there is nothing for
2108          * us to verify here.
2109          */
2110         vdev_t *vd = msp->ms_group->mg_vd;
2111         if (vd->vdev_removing)
2112                 return;
2113
2114         /*
2115          * If the metaslab is dirty it probably means that we've done
2116          * some allocations or frees that have changed our histograms
2117          * and thus the weight.
2118          */
2119         for (int t = 0; t < TXG_SIZE; t++) {
2120                 if (txg_list_member(&vd->vdev_ms_list, msp, t))
2121                         return;
2122         }
2123
2124         /*
2125          * This verification checks that our in-memory state is consistent
2126          * with what's on disk. If the pool is read-only then there aren't
2127          * any changes and we just have the initially-loaded state.
2128          */
2129         if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
2130                 return;
2131
2132         /* some extra verification for in-core tree if you can */
2133         if (msp->ms_loaded) {
2134                 range_tree_stat_verify(msp->ms_allocatable);
2135                 VERIFY(space_map_histogram_verify(msp->ms_sm,
2136                     msp->ms_allocatable));
2137         }
2138
2139         uint64_t weight = msp->ms_weight;
2140         uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2141         boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
2142         uint64_t frag = msp->ms_fragmentation;
2143         uint64_t max_segsize = msp->ms_max_size;
2144
2145         msp->ms_weight = 0;
2146         msp->ms_fragmentation = 0;
2147
2148         /*
2149          * This function is used for verification purposes and thus should
2150          * not introduce any side-effects/mutations on the system's state.
2151          *
2152          * Regardless of whether metaslab_weight() thinks this metaslab
2153          * should be active or not, we want to ensure that the actual weight
2154          * (and therefore the value of ms_weight) would be the same if it
2155          * was to be recalculated at this point.
2156          *
2157          * In addition we set the nodirty flag so metaslab_weight() does
2158          * not dirty the metaslab for future TXGs (e.g. when trying to
2159          * force condensing to upgrade the metaslab spacemaps).
2160          */
2161         msp->ms_weight = metaslab_weight(msp, B_TRUE) | was_active;
2162
2163         VERIFY3U(max_segsize, ==, msp->ms_max_size);
2164
2165         /*
2166          * If the weight type changed then there is no point in doing
2167          * verification. Revert fields to their original values.
2168          */
2169         if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
2170             (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
2171                 msp->ms_fragmentation = frag;
2172                 msp->ms_weight = weight;
2173                 return;
2174         }
2175
2176         VERIFY3U(msp->ms_fragmentation, ==, frag);
2177         VERIFY3U(msp->ms_weight, ==, weight);
2178 }
2179
/*
 * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from
 * this class that was used longest ago, and attempt to unload it.  We don't
 * want to spend too much time in this loop to prevent performance
 * degradation, and we expect that most of the time this operation will
 * succeed. Between that and the normal unloading processing during txg sync,
 * we expect this to keep the metaslab memory usage under control.
 *
 * "Over the limit" is evaluated as: the number of in-use entries of
 * zfs_btree_leaf_cache times its entry size exceeds zfs_metaslab_mem_limit
 * percent of arc_all_memory().  Candidates are taken from
 * mc->mc_metaslab_txg_list.  When _KERNEL is not defined this function does
 * nothing.
 */
static void
metaslab_potentially_evict(metaslab_class_t *mc)
{
#ifdef _KERNEL
        uint64_t allmem = arc_all_memory();
        uint64_t inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache);
        uint64_t size = spl_kmem_cache_entry_size(zfs_btree_leaf_cache);
        uint_t tries = 0;
        /*
         * Visit randomly chosen sublists while we remain over the limit,
         * giving up after twice as many tries as there are sublists.
         */
        for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size &&
            tries < multilist_get_num_sublists(&mc->mc_metaslab_txg_list) * 2;
            tries++) {
                unsigned int idx = multilist_get_random_index(
                    &mc->mc_metaslab_txg_list);
                multilist_sublist_t *mls =
                    multilist_sublist_lock(&mc->mc_metaslab_txg_list, idx);
                metaslab_t *msp = multilist_sublist_head(mls);
                multilist_sublist_unlock(mls);
                /* Walk this sublist from its head while still over the limit. */
                while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 <
                    inuse * size) {
                        /*
                         * Re-take the sublist lock for this step; it was
                         * dropped after fetching msp.
                         */
                        VERIFY3P(mls, ==, multilist_sublist_lock(
                            &mc->mc_metaslab_txg_list, idx));
                        ASSERT3U(idx, ==,
                            metaslab_idx_func(&mc->mc_metaslab_txg_list, msp));

                        /*
                         * msp's list node is no longer active, so it cannot
                         * be used to continue the walk; abandon this
                         * sublist.
                         */
                        if (!multilist_link_active(&msp->ms_class_txg_node)) {
                                multilist_sublist_unlock(mls);
                                break;
                        }
                        /*
                         * Capture the successor while the sublist lock is
                         * held, then drop that lock before touching ms_lock.
                         */
                        metaslab_t *next_msp = multilist_sublist_next(mls, msp);
                        multilist_sublist_unlock(mls);
                        /*
                         * If the metaslab is currently loading there are two
                         * cases. If it's the metaslab we're evicting, we
                         * can't continue on or we'll panic when we attempt to
                         * recursively lock the mutex. If it's another
                         * metaslab that's loading, it can be safely skipped,
                         * since we know it's very new and therefore not a
                         * good eviction candidate. We check later once the
                         * lock is held that the metaslab is fully loaded
                         * before actually unloading it.
                         */
                        if (msp->ms_loading) {
                                msp = next_msp;
                                /* Refresh the usage figure for the loop test. */
                                inuse =
                                    spl_kmem_cache_inuse(zfs_btree_leaf_cache);
                                continue;
                        }
                        /*
                         * We can't unload metaslabs with no spacemap because
                         * they're not ready to be unloaded yet. We can't
                         * unload metaslabs with outstanding allocations
                         * because doing so could cause the metaslab's weight
                         * to decrease while it's unloaded, which violates an
                         * invariant that we use to prevent unnecessary
                         * loading. We also don't unload metaslabs that are
                         * currently active because they are high-weight
                         * metaslabs that are likely to be used in the near
                         * future.
                         */
                        mutex_enter(&msp->ms_lock);
                        if (msp->ms_allocator == -1 && msp->ms_sm != NULL &&
                            msp->ms_allocating_total == 0) {
                                metaslab_unload(msp);
                        }
                        mutex_exit(&msp->ms_lock);
                        msp = next_msp;
                        /* Re-sample usage to see whether we are still over. */
                        inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache);
                }
        }
#else
        /* Reference the otherwise unused names in non-kernel builds. */
        (void) mc, (void) zfs_metaslab_mem_limit;
#endif
}
2261
2262 static int
2263 metaslab_load_impl(metaslab_t *msp)
2264 {
2265         int error = 0;
2266
2267         ASSERT(MUTEX_HELD(&msp->ms_lock));
2268         ASSERT(msp->ms_loading);
2269         ASSERT(!msp->ms_condensing);
2270
2271         /*
2272          * We temporarily drop the lock to unblock other operations while we
2273          * are reading the space map. Therefore, metaslab_sync() and
2274          * metaslab_sync_done() can run at the same time as we do.
2275          *
2276          * If we are using the log space maps, metaslab_sync() can't write to
2277          * the metaslab's space map while we are loading as we only write to
2278          * it when we are flushing the metaslab, and that can't happen while
2279          * we are loading it.
2280          *
2281          * If we are not using log space maps though, metaslab_sync() can
2282          * append to the space map while we are loading. Therefore we load
2283          * only entries that existed when we started the load. Additionally,
2284          * metaslab_sync_done() has to wait for the load to complete because
2285          * there are potential races like metaslab_load() loading parts of the
2286          * space map that are currently being appended by metaslab_sync(). If
2287          * we didn't, the ms_allocatable would have entries that
2288          * metaslab_sync_done() would try to re-add later.
2289          *
2290          * That's why before dropping the lock we remember the synced length
2291          * of the metaslab and read up to that point of the space map,
2292          * ignoring entries appended by metaslab_sync() that happen after we
2293          * drop the lock.
2294          */
2295         uint64_t length = msp->ms_synced_length;
2296         mutex_exit(&msp->ms_lock);
2297
2298         hrtime_t load_start = gethrtime();
2299         metaslab_rt_arg_t *mrap;
2300         if (msp->ms_allocatable->rt_arg == NULL) {
2301                 mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
2302         } else {
2303                 mrap = msp->ms_allocatable->rt_arg;
2304                 msp->ms_allocatable->rt_ops = NULL;
2305                 msp->ms_allocatable->rt_arg = NULL;
2306         }
2307         mrap->mra_bt = &msp->ms_allocatable_by_size;
2308         mrap->mra_floor_shift = metaslab_by_size_min_shift;
2309
2310         if (msp->ms_sm != NULL) {
2311                 error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
2312                     SM_FREE, length);
2313
2314                 /* Now, populate the size-sorted tree. */
2315                 metaslab_rt_create(msp->ms_allocatable, mrap);
2316                 msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
2317                 msp->ms_allocatable->rt_arg = mrap;
2318
2319                 struct mssa_arg arg = {0};
2320                 arg.rt = msp->ms_allocatable;
2321                 arg.mra = mrap;
2322                 range_tree_walk(msp->ms_allocatable, metaslab_size_sorted_add,
2323                     &arg);
2324         } else {
2325                 /*
2326                  * Add the size-sorted tree first, since we don't need to load
2327                  * the metaslab from the spacemap.
2328                  */
2329                 metaslab_rt_create(msp->ms_allocatable, mrap);
2330                 msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
2331                 msp->ms_allocatable->rt_arg = mrap;
2332                 /*
2333                  * The space map has not been allocated yet, so treat
2334                  * all the space in the metaslab as free and add it to the
2335                  * ms_allocatable tree.
2336                  */
2337                 range_tree_add(msp->ms_allocatable,
2338                     msp->ms_start, msp->ms_size);
2339
2340                 if (msp->ms_new) {
2341                         /*
2342                          * If the ms_sm doesn't exist, this means that this
2343                          * metaslab hasn't gone through metaslab_sync() and
2344                          * thus has never been dirtied. So we shouldn't
2345                          * expect any unflushed allocs or frees from previous
2346                          * TXGs.
2347                          */
2348                         ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
2349                         ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
2350                 }
2351         }
2352
2353         /*
2354          * We need to grab the ms_sync_lock to prevent metaslab_sync() from
2355          * changing the ms_sm (or log_sm) and the metaslab's range trees
2356          * while we are about to use them and populate the ms_allocatable.
2357          * The ms_lock is insufficient for this because metaslab_sync() doesn't
2358          * hold the ms_lock while writing the ms_checkpointing tree to disk.
2359          */
2360         mutex_enter(&msp->ms_sync_lock);
2361         mutex_enter(&msp->ms_lock);
2362
2363         ASSERT(!msp->ms_condensing);
2364         ASSERT(!msp->ms_flushing);
2365
2366         if (error != 0) {
2367                 mutex_exit(&msp->ms_sync_lock);
2368                 return (error);
2369         }
2370
2371         ASSERT3P(msp->ms_group, !=, NULL);
2372         msp->ms_loaded = B_TRUE;
2373
2374         /*
2375          * Apply all the unflushed changes to ms_allocatable right
2376          * away so any manipulations we do below have a clear view
2377          * of what is allocated and what is free.
2378          */
2379         range_tree_walk(msp->ms_unflushed_allocs,
2380             range_tree_remove, msp->ms_allocatable);
2381         range_tree_walk(msp->ms_unflushed_frees,
2382             range_tree_add, msp->ms_allocatable);
2383
2384         ASSERT3P(msp->ms_group, !=, NULL);
2385         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2386         if (spa_syncing_log_sm(spa) != NULL) {
2387                 ASSERT(spa_feature_is_enabled(spa,
2388                     SPA_FEATURE_LOG_SPACEMAP));
2389
2390                 /*
2391                  * If we use a log space map we add all the segments
2392                  * that are in ms_unflushed_frees so they are available
2393                  * for allocation.
2394                  *
2395                  * ms_allocatable needs to contain all free segments
2396                  * that are ready for allocations (thus not segments
2397                  * from ms_freeing, ms_freed, and the ms_defer trees).
2398                  * But if we grab the lock in this code path at a sync
2399                  * pass later than 1, then it also contains the
2400                  * segments of ms_freed (they were added to it earlier
2401                  * in this path through ms_unflushed_frees). So we
2402                  * need to remove all the segments that exist in
2403                  * ms_freed from ms_allocatable as they will be added
2404                  * later in metaslab_sync_done().
2405                  *
2406                  * When there's no log space map, the ms_allocatable
2407                  * correctly doesn't contain any segments that exist
2408                  * in ms_freed [see ms_synced_length].
2409                  */
2410                 range_tree_walk(msp->ms_freed,
2411                     range_tree_remove, msp->ms_allocatable);
2412         }
2413
2414         /*
2415          * If we are not using the log space map, ms_allocatable
2416          * contains the segments that exist in the ms_defer trees
2417          * [see ms_synced_length]. Thus we need to remove them
2418          * from ms_allocatable as they will be added again in
2419          * metaslab_sync_done().
2420          *
2421          * If we are using the log space map, ms_allocatable still
2422          * contains the segments that exist in the ms_defer trees.
2423          * Not because it read them through the ms_sm though. But
2424          * because these segments are part of ms_unflushed_frees
2425          * whose segments we add to ms_allocatable earlier in this
2426          * code path.
2427          */
2428         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2429                 range_tree_walk(msp->ms_defer[t],
2430                     range_tree_remove, msp->ms_allocatable);
2431         }
2432
2433         /*
2434          * Call metaslab_recalculate_weight_and_sort() now that the
2435          * metaslab is loaded so we get the metaslab's real weight.
2436          *
2437          * Unless this metaslab was created with older software and
2438          * has not yet been converted to use segment-based weight, we
2439          * expect the new weight to be better or equal to the weight
2440          * that the metaslab had while it was not loaded. This is
2441          * because the old weight does not take into account the
2442          * consolidation of adjacent segments between TXGs. [see
2443          * comment for ms_synchist and ms_deferhist[] for more info]
2444          */
2445         uint64_t weight = msp->ms_weight;
2446         uint64_t max_size = msp->ms_max_size;
2447         metaslab_recalculate_weight_and_sort(msp);
2448         if (!WEIGHT_IS_SPACEBASED(weight))
2449                 ASSERT3U(weight, <=, msp->ms_weight);
2450         msp->ms_max_size = metaslab_largest_allocatable(msp);
2451         ASSERT3U(max_size, <=, msp->ms_max_size);
2452         hrtime_t load_end = gethrtime();
2453         msp->ms_load_time = load_end;
2454         zfs_dbgmsg("metaslab_load: txg %llu, spa %s, vdev_id %llu, "
2455             "ms_id %llu, smp_length %llu, "
2456             "unflushed_allocs %llu, unflushed_frees %llu, "
2457             "freed %llu, defer %llu + %llu, unloaded time %llu ms, "
2458             "loading_time %lld ms, ms_max_size %llu, "
2459             "max size error %lld, "
2460             "old_weight %llx, new_weight %llx",
2461             (u_longlong_t)spa_syncing_txg(spa), spa_name(spa),
2462             (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
2463             (u_longlong_t)msp->ms_id,
2464             (u_longlong_t)space_map_length(msp->ms_sm),
2465             (u_longlong_t)range_tree_space(msp->ms_unflushed_allocs),
2466             (u_longlong_t)range_tree_space(msp->ms_unflushed_frees),
2467             (u_longlong_t)range_tree_space(msp->ms_freed),
2468             (u_longlong_t)range_tree_space(msp->ms_defer[0]),
2469             (u_longlong_t)range_tree_space(msp->ms_defer[1]),
2470             (longlong_t)((load_start - msp->ms_unload_time) / 1000000),
2471             (longlong_t)((load_end - load_start) / 1000000),
2472             (u_longlong_t)msp->ms_max_size,
2473             (u_longlong_t)msp->ms_max_size - max_size,
2474             (u_longlong_t)weight, (u_longlong_t)msp->ms_weight);
2475
2476         metaslab_verify_space(msp, spa_syncing_txg(spa));
2477         mutex_exit(&msp->ms_sync_lock);
2478         return (0);
2479 }
2480
2481 int
2482 metaslab_load(metaslab_t *msp)
2483 {
2484         ASSERT(MUTEX_HELD(&msp->ms_lock));
2485
2486         /*
2487          * There may be another thread loading the same metaslab, if that's
2488          * the case just wait until the other thread is done and return.
2489          */
2490         metaslab_load_wait(msp);
2491         if (msp->ms_loaded)
2492                 return (0);
2493         VERIFY(!msp->ms_loading);
2494         ASSERT(!msp->ms_condensing);
2495
2496         /*
2497          * We set the loading flag BEFORE potentially dropping the lock to
2498          * wait for an ongoing flush (see ms_flushing below). This way other
2499          * threads know that there is already a thread that is loading this
2500          * metaslab.
2501          */
2502         msp->ms_loading = B_TRUE;
2503
2504         /*
2505          * Wait for any in-progress flushing to finish as we drop the ms_lock
2506          * both here (during space_map_load()) and in metaslab_flush() (when
2507          * we flush our changes to the ms_sm).
2508          */
2509         if (msp->ms_flushing)
2510                 metaslab_flush_wait(msp);
2511
2512         /*
2513          * In the possibility that we were waiting for the metaslab to be
2514          * flushed (where we temporarily dropped the ms_lock), ensure that
2515          * no one else loaded the metaslab somehow.
2516          */
2517         ASSERT(!msp->ms_loaded);
2518
2519         /*
2520          * If we're loading a metaslab in the normal class, consider evicting
2521          * another one to keep our memory usage under the limit defined by the
2522          * zfs_metaslab_mem_limit tunable.
2523          */
2524         if (spa_normal_class(msp->ms_group->mg_class->mc_spa) ==
2525             msp->ms_group->mg_class) {
2526                 metaslab_potentially_evict(msp->ms_group->mg_class);
2527         }
2528
2529         int error = metaslab_load_impl(msp);
2530
2531         ASSERT(MUTEX_HELD(&msp->ms_lock));
2532         msp->ms_loading = B_FALSE;
2533         cv_broadcast(&msp->ms_load_cv);
2534
2535         return (error);
2536 }
2537
2538 void
2539 metaslab_unload(metaslab_t *msp)
2540 {
2541         ASSERT(MUTEX_HELD(&msp->ms_lock));
2542
2543         /*
2544          * This can happen if a metaslab is selected for eviction (in
2545          * metaslab_potentially_evict) and then unloaded during spa_sync (via
2546          * metaslab_class_evict_old).
2547          */
2548         if (!msp->ms_loaded)
2549                 return;
2550
2551         range_tree_vacate(msp->ms_allocatable, NULL, NULL);
2552         msp->ms_loaded = B_FALSE;
2553         msp->ms_unload_time = gethrtime();
2554
2555         msp->ms_activation_weight = 0;
2556         msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
2557
2558         if (msp->ms_group != NULL) {
2559                 metaslab_class_t *mc = msp->ms_group->mg_class;
2560                 multilist_sublist_t *mls =
2561                     multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
2562                 if (multilist_link_active(&msp->ms_class_txg_node))
2563                         multilist_sublist_remove(mls, msp);
2564                 multilist_sublist_unlock(mls);
2565
2566                 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2567                 zfs_dbgmsg("metaslab_unload: txg %llu, spa %s, vdev_id %llu, "
2568                     "ms_id %llu, weight %llx, "
2569                     "selected txg %llu (%llu ms ago), alloc_txg %llu, "
2570                     "loaded %llu ms ago, max_size %llu",
2571                     (u_longlong_t)spa_syncing_txg(spa), spa_name(spa),
2572                     (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
2573                     (u_longlong_t)msp->ms_id,
2574                     (u_longlong_t)msp->ms_weight,
2575                     (u_longlong_t)msp->ms_selected_txg,
2576                     (u_longlong_t)(msp->ms_unload_time -
2577                     msp->ms_selected_time) / 1000 / 1000,
2578                     (u_longlong_t)msp->ms_alloc_txg,
2579                     (u_longlong_t)(msp->ms_unload_time -
2580                     msp->ms_load_time) / 1000 / 1000,
2581                     (u_longlong_t)msp->ms_max_size);
2582         }
2583
2584         /*
2585          * We explicitly recalculate the metaslab's weight based on its space
2586          * map (as it is now not loaded). We want unload metaslabs to always
2587          * have their weights calculated from the space map histograms, while
2588          * loaded ones have it calculated from their in-core range tree
2589          * [see metaslab_load()]. This way, the weight reflects the information
2590          * available in-core, whether it is loaded or not.
2591          *
2592          * If ms_group == NULL means that we came here from metaslab_fini(),
2593          * at which point it doesn't make sense for us to do the recalculation
2594          * and the sorting.
2595          */
2596         if (msp->ms_group != NULL)
2597                 metaslab_recalculate_weight_and_sort(msp);
2598 }
2599
2600 /*
2601  * We want to optimize the memory use of the per-metaslab range
2602  * trees. To do this, we store the segments in the range trees in
2603  * units of sectors, zero-indexing from the start of the metaslab. If
2604  * the vdev_ms_shift - the vdev_ashift is less than 32, we can store
2605  * the ranges using two uint32_ts, rather than two uint64_ts.
2606  */
2607 range_seg_type_t
2608 metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp,
2609     uint64_t *start, uint64_t *shift)
2610 {
2611         if (vdev->vdev_ms_shift - vdev->vdev_ashift < 32 &&
2612             !zfs_metaslab_force_large_segs) {
2613                 *shift = vdev->vdev_ashift;
2614                 *start = msp->ms_start;
2615                 return (RANGE_SEG32);
2616         } else {
2617                 *shift = 0;
2618                 *start = 0;
2619                 return (RANGE_SEG64);
2620         }
2621 }
2622
2623 void
2624 metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg)
2625 {
2626         ASSERT(MUTEX_HELD(&msp->ms_lock));
2627         metaslab_class_t *mc = msp->ms_group->mg_class;
2628         multilist_sublist_t *mls =
2629             multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
2630         if (multilist_link_active(&msp->ms_class_txg_node))
2631                 multilist_sublist_remove(mls, msp);
2632         msp->ms_selected_txg = txg;
2633         msp->ms_selected_time = gethrtime();
2634         multilist_sublist_insert_tail(mls, msp);
2635         multilist_sublist_unlock(mls);
2636 }
2637
2638 void
2639 metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
2640     int64_t defer_delta, int64_t space_delta)
2641 {
2642         vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
2643
2644         ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
2645         ASSERT(vd->vdev_ms_count != 0);
2646
2647         metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
2648             vdev_deflated_space(vd, space_delta));
2649 }
2650
2651 int
2652 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
2653     uint64_t txg, metaslab_t **msp)
2654 {
2655         vdev_t *vd = mg->mg_vd;
2656         spa_t *spa = vd->vdev_spa;
2657         objset_t *mos = spa->spa_meta_objset;
2658         metaslab_t *ms;
2659         int error;
2660
2661         ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
2662         mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
2663         mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
2664         cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
2665         cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL);
2666         multilist_link_init(&ms->ms_class_txg_node);
2667
2668         ms->ms_id = id;
2669         ms->ms_start = id << vd->vdev_ms_shift;
2670         ms->ms_size = 1ULL << vd->vdev_ms_shift;
2671         ms->ms_allocator = -1;
2672         ms->ms_new = B_TRUE;
2673
2674         vdev_ops_t *ops = vd->vdev_ops;
2675         if (ops->vdev_op_metaslab_init != NULL)
2676                 ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size);
2677
2678         /*
2679          * We only open space map objects that already exist. All others
2680          * will be opened when we finally allocate an object for it. For
2681          * readonly pools there is no need to open the space map object.
2682          *
2683          * Note:
2684          * When called from vdev_expand(), we can't call into the DMU as
2685          * we are holding the spa_config_lock as a writer and we would
2686          * deadlock [see relevant comment in vdev_metaslab_init()]. in
2687          * that case, the object parameter is zero though, so we won't
2688          * call into the DMU.
2689          */
2690         if (object != 0 && !(spa->spa_mode == SPA_MODE_READ &&
2691             !spa->spa_read_spacemaps)) {
2692                 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
2693                     ms->ms_size, vd->vdev_ashift);
2694
2695                 if (error != 0) {
2696                         kmem_free(ms, sizeof (metaslab_t));
2697                         return (error);
2698                 }
2699
2700                 ASSERT(ms->ms_sm != NULL);
2701                 ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
2702         }
2703
2704         uint64_t shift, start;
2705         range_seg_type_t type =
2706             metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
2707
2708         ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift);
2709         for (int t = 0; t < TXG_SIZE; t++) {
2710                 ms->ms_allocating[t] = range_tree_create(NULL, type,
2711                     NULL, start, shift);
2712         }
2713         ms->ms_freeing = range_tree_create(NULL, type, NULL, start, shift);
2714         ms->ms_freed = range_tree_create(NULL, type, NULL, start, shift);
2715         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2716                 ms->ms_defer[t] = range_tree_create(NULL, type, NULL,
2717                     start, shift);
2718         }
2719         ms->ms_checkpointing =
2720             range_tree_create(NULL, type, NULL, start, shift);
2721         ms->ms_unflushed_allocs =
2722             range_tree_create(NULL, type, NULL, start, shift);
2723
2724         metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
2725         mrap->mra_bt = &ms->ms_unflushed_frees_by_size;
2726         mrap->mra_floor_shift = metaslab_by_size_min_shift;
2727         ms->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops,
2728             type, mrap, start, shift);
2729
2730         ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift);
2731
2732         metaslab_group_add(mg, ms);
2733         metaslab_set_fragmentation(ms, B_FALSE);
2734
2735         /*
2736          * If we're opening an existing pool (txg == 0) or creating
2737          * a new one (txg == TXG_INITIAL), all space is available now.
2738          * If we're adding space to an existing pool, the new space
2739          * does not become available until after this txg has synced.
2740          * The metaslab's weight will also be initialized when we sync
2741          * out this txg. This ensures that we don't attempt to allocate
2742          * from it before we have initialized it completely.
2743          */
2744         if (txg <= TXG_INITIAL) {
2745                 metaslab_sync_done(ms, 0);
2746                 metaslab_space_update(vd, mg->mg_class,
2747                     metaslab_allocated_space(ms), 0, 0);
2748         }
2749
2750         if (txg != 0) {
2751                 vdev_dirty(vd, 0, NULL, txg);
2752                 vdev_dirty(vd, VDD_METASLAB, ms, txg);
2753         }
2754
2755         *msp = ms;
2756
2757         return (0);
2758 }
2759
2760 static void
2761 metaslab_fini_flush_data(metaslab_t *msp)
2762 {
2763         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2764
2765         if (metaslab_unflushed_txg(msp) == 0) {
2766                 ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL),
2767                     ==, NULL);
2768                 return;
2769         }
2770         ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
2771
2772         mutex_enter(&spa->spa_flushed_ms_lock);
2773         avl_remove(&spa->spa_metaslabs_by_flushed, msp);
2774         mutex_exit(&spa->spa_flushed_ms_lock);
2775
2776         spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp));
2777         spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp),
2778             metaslab_unflushed_dirty(msp));
2779 }
2780
2781 uint64_t
2782 metaslab_unflushed_changes_memused(metaslab_t *ms)
2783 {
2784         return ((range_tree_numsegs(ms->ms_unflushed_allocs) +
2785             range_tree_numsegs(ms->ms_unflushed_frees)) *
2786             ms->ms_unflushed_allocs->rt_root.bt_elem_size);
2787 }
2788
2789 void
2790 metaslab_fini(metaslab_t *msp)
2791 {
2792         metaslab_group_t *mg = msp->ms_group;
2793         vdev_t *vd = mg->mg_vd;
2794         spa_t *spa = vd->vdev_spa;
2795
2796         metaslab_fini_flush_data(msp);
2797
2798         metaslab_group_remove(mg, msp);
2799
2800         mutex_enter(&msp->ms_lock);
2801         VERIFY(msp->ms_group == NULL);
2802
2803         /*
2804          * If this metaslab hasn't been through metaslab_sync_done() yet its
2805          * space hasn't been accounted for in its vdev and doesn't need to be
2806          * subtracted.
2807          */
2808         if (!msp->ms_new) {
2809                 metaslab_space_update(vd, mg->mg_class,
2810                     -metaslab_allocated_space(msp), 0, -msp->ms_size);
2811
2812         }
2813         space_map_close(msp->ms_sm);
2814         msp->ms_sm = NULL;
2815
2816         metaslab_unload(msp);
2817
2818         range_tree_destroy(msp->ms_allocatable);
2819         range_tree_destroy(msp->ms_freeing);
2820         range_tree_destroy(msp->ms_freed);
2821
2822         ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
2823             metaslab_unflushed_changes_memused(msp));
2824         spa->spa_unflushed_stats.sus_memused -=
2825             metaslab_unflushed_changes_memused(msp);
2826         range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
2827         range_tree_destroy(msp->ms_unflushed_allocs);
2828         range_tree_destroy(msp->ms_checkpointing);
2829         range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
2830         range_tree_destroy(msp->ms_unflushed_frees);
2831
2832         for (int t = 0; t < TXG_SIZE; t++) {
2833                 range_tree_destroy(msp->ms_allocating[t]);
2834         }
2835         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2836                 range_tree_destroy(msp->ms_defer[t]);
2837         }
2838         ASSERT0(msp->ms_deferspace);
2839
2840         for (int t = 0; t < TXG_SIZE; t++)
2841                 ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
2842
2843         range_tree_vacate(msp->ms_trim, NULL, NULL);
2844         range_tree_destroy(msp->ms_trim);
2845
2846         mutex_exit(&msp->ms_lock);
2847         cv_destroy(&msp->ms_load_cv);
2848         cv_destroy(&msp->ms_flush_cv);
2849         mutex_destroy(&msp->ms_lock);
2850         mutex_destroy(&msp->ms_sync_lock);
2851         ASSERT3U(msp->ms_allocator, ==, -1);
2852
2853         kmem_free(msp, sizeof (metaslab_t));
2854 }
2855
#define FRAGMENTATION_TABLE_SIZE        17

/*
 * Segment-size based fragmentation metric from which every metaslab
 * derives its own fragmentation value: the space in each bucket of the
 * spacemap histogram is multiplied by the bucket's entry in this table,
 * the products are summed up, and the sum is divided by the total amount
 * of free space in the metaslab (i.e. the free space in all buckets).
 * A high result therefore means that most of the free space consists of
 * small segments, and a low result that most of it is in large segments.
 * A 10% change in fragmentation equates to approximately double the
 * number of segments.
 *
 * The table defines 0% fragmented space using 16MB segments. Testing has
 * shown that segments that are greater than or equal to 16MB do not
 * suffer from drastic performance problems, and the rest of the table is
 * derived from that value. The fragmentation value is never stored on
 * disk, so these calculations may be changed in the future.
 *
 * The table has one more slot than the sizes listed up to 16M. That last
 * slot is written out explicitly below; it is the one larger buckets are
 * clamped to when the table is indexed.
 */
static const int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
        100,    /* 512B */
        100,    /* 1K   */
        98,     /* 2K   */
        95,     /* 4K   */
        90,     /* 8K   */
        80,     /* 16K  */
        70,     /* 32K  */
        60,     /* 64K  */
        50,     /* 128K */
        40,     /* 256K */
        30,     /* 512K */
        20,     /* 1M   */
        15,     /* 2M   */
        10,     /* 4M   */
        5,      /* 8M   */
        0,      /* 16M  */
        0       /* 32M and larger */
};
2895
2896 /*
2897  * Calculate the metaslab's fragmentation metric and set ms_fragmentation.
2898  * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not
2899  * been upgraded and does not support this metric. Otherwise, the return
2900  * value should be in the range [0, 100].
2901  */
2902 static void
2903 metaslab_set_fragmentation(metaslab_t *msp, boolean_t nodirty)
2904 {
2905         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2906         uint64_t fragmentation = 0;
2907         uint64_t total = 0;
2908         boolean_t feature_enabled = spa_feature_is_enabled(spa,
2909             SPA_FEATURE_SPACEMAP_HISTOGRAM);
2910
2911         if (!feature_enabled) {
2912                 msp->ms_fragmentation = ZFS_FRAG_INVALID;
2913                 return;
2914         }
2915
2916         /*
2917          * A null space map means that the entire metaslab is free
2918          * and thus is not fragmented.
2919          */
2920         if (msp->ms_sm == NULL) {
2921                 msp->ms_fragmentation = 0;
2922                 return;
2923         }
2924
2925         /*
2926          * If this metaslab's space map has not been upgraded, flag it
2927          * so that we upgrade next time we encounter it.
2928          */
2929         if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
2930                 uint64_t txg = spa_syncing_txg(spa);
2931                 vdev_t *vd = msp->ms_group->mg_vd;
2932
2933                 /*
2934                  * If we've reached the final dirty txg, then we must
2935                  * be shutting down the pool. We don't want to dirty
2936                  * any data past this point so skip setting the condense
2937                  * flag. We can retry this action the next time the pool
2938                  * is imported. We also skip marking this metaslab for
2939                  * condensing if the caller has explicitly set nodirty.
2940                  */
2941                 if (!nodirty &&
2942                     spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
2943                         msp->ms_condense_wanted = B_TRUE;
2944                         vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2945                         zfs_dbgmsg("txg %llu, requesting force condense: "
2946                             "ms_id %llu, vdev_id %llu", (u_longlong_t)txg,
2947                             (u_longlong_t)msp->ms_id,
2948                             (u_longlong_t)vd->vdev_id);
2949                 }
2950                 msp->ms_fragmentation = ZFS_FRAG_INVALID;
2951                 return;
2952         }
2953
2954         for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
2955                 uint64_t space = 0;
2956                 uint8_t shift = msp->ms_sm->sm_shift;
2957
2958                 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
2959                     FRAGMENTATION_TABLE_SIZE - 1);
2960
2961                 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
2962                         continue;
2963
2964                 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
2965                 total += space;
2966
2967                 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
2968                 fragmentation += space * zfs_frag_table[idx];
2969         }
2970
2971         if (total > 0)
2972                 fragmentation /= total;
2973         ASSERT3U(fragmentation, <=, 100);
2974
2975         msp->ms_fragmentation = fragmentation;
2976 }
2977
2978 /*
2979  * Compute a weight -- a selection preference value -- for the given metaslab.
2980  * This is based on the amount of free space, the level of fragmentation,
2981  * the LBA range, and whether the metaslab is loaded.
2982  */
2983 static uint64_t
2984 metaslab_space_weight(metaslab_t *msp)
2985 {
2986         metaslab_group_t *mg = msp->ms_group;
2987         vdev_t *vd = mg->mg_vd;
2988         uint64_t weight, space;
2989
2990         ASSERT(MUTEX_HELD(&msp->ms_lock));
2991
2992         /*
2993          * The baseline weight is the metaslab's free space.
2994          */
2995         space = msp->ms_size - metaslab_allocated_space(msp);
2996
2997         if (metaslab_fragmentation_factor_enabled &&
2998             msp->ms_fragmentation != ZFS_FRAG_INVALID) {
2999                 /*
3000                  * Use the fragmentation information to inversely scale
3001                  * down the baseline weight. We need to ensure that we
3002                  * don't exclude this metaslab completely when it's 100%
3003                  * fragmented. To avoid this we reduce the fragmented value
3004                  * by 1.
3005                  */
3006                 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
3007
3008                 /*
3009                  * If space < SPA_MINBLOCKSIZE, then we will not allocate from
3010                  * this metaslab again. The fragmentation metric may have
3011                  * decreased the space to something smaller than
3012                  * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
3013                  * so that we can consume any remaining space.
3014                  */
3015                 if (space > 0 && space < SPA_MINBLOCKSIZE)
3016                         space = SPA_MINBLOCKSIZE;
3017         }
3018         weight = space;
3019
3020         /*
3021          * Modern disks have uniform bit density and constant angular velocity.
3022          * Therefore, the outer recording zones are faster (higher bandwidth)
3023          * than the inner zones by the ratio of outer to inner track diameter,
3024          * which is typically around 2:1.  We account for this by assigning
3025          * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
3026          * In effect, this means that we'll select the metaslab with the most
3027          * free bandwidth rather than simply the one with the most free space.
3028          */
3029         if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
3030                 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
3031                 ASSERT(weight >= space && weight <= 2 * space);
3032         }
3033
3034         /*
3035          * If this metaslab is one we're actively using, adjust its
3036          * weight to make it preferable to any inactive metaslab so
3037          * we'll polish it off. If the fragmentation on this metaslab
3038          * has exceed our threshold, then don't mark it active.
3039          */
3040         if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
3041             msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
3042                 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
3043         }
3044
3045         WEIGHT_SET_SPACEBASED(weight);
3046         return (weight);
3047 }
3048
3049 /*
3050  * Return the weight of the specified metaslab, according to the segment-based
3051  * weighting algorithm. The metaslab must be loaded. This function can
3052  * be called within a sync pass since it relies only on the metaslab's
3053  * range tree which is always accurate when the metaslab is loaded.
3054  */
3055 static uint64_t
3056 metaslab_weight_from_range_tree(metaslab_t *msp)
3057 {
3058         uint64_t weight = 0;
3059         uint32_t segments = 0;
3060
3061         ASSERT(msp->ms_loaded);
3062
3063         for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
3064             i--) {
3065                 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
3066                 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
3067
3068                 segments <<= 1;
3069                 segments += msp->ms_allocatable->rt_histogram[i];
3070
3071                 /*
3072                  * The range tree provides more precision than the space map
3073                  * and must be downgraded so that all values fit within the
3074                  * space map's histogram. This allows us to compare loaded
3075                  * vs. unloaded metaslabs to determine which metaslab is
3076                  * considered "best".
3077                  */
3078                 if (i > max_idx)
3079                         continue;
3080
3081                 if (segments != 0) {
3082                         WEIGHT_SET_COUNT(weight, segments);
3083                         WEIGHT_SET_INDEX(weight, i);
3084                         WEIGHT_SET_ACTIVE(weight, 0);
3085                         break;
3086                 }
3087         }
3088         return (weight);
3089 }
3090
3091 /*
3092  * Calculate the weight based on the on-disk histogram. Should be applied
3093  * only to unloaded metaslabs  (i.e no incoming allocations) in-order to
3094  * give results consistent with the on-disk state
3095  */
3096 static uint64_t
3097 metaslab_weight_from_spacemap(metaslab_t *msp)
3098 {
3099         space_map_t *sm = msp->ms_sm;
3100         ASSERT(!msp->ms_loaded);
3101         ASSERT(sm != NULL);
3102         ASSERT3U(space_map_object(sm), !=, 0);
3103         ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
3104
3105         /*
3106          * Create a joint histogram from all the segments that have made
3107          * it to the metaslab's space map histogram, that are not yet
3108          * available for allocation because they are still in the freeing
3109          * pipeline (e.g. freeing, freed, and defer trees). Then subtract
3110          * these segments from the space map's histogram to get a more
3111          * accurate weight.
3112          */
3113         uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
3114         for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
3115                 deferspace_histogram[i] += msp->ms_synchist[i];
3116         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
3117                 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
3118                         deferspace_histogram[i] += msp->ms_deferhist[t][i];
3119                 }
3120         }
3121
3122         uint64_t weight = 0;
3123         for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
3124                 ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
3125                     deferspace_histogram[i]);
3126                 uint64_t count =
3127                     sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
3128                 if (count != 0) {
3129                         WEIGHT_SET_COUNT(weight, count);
3130                         WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
3131                         WEIGHT_SET_ACTIVE(weight, 0);
3132                         break;
3133                 }
3134         }
3135         return (weight);
3136 }
3137
3138 /*
3139  * Compute a segment-based weight for the specified metaslab. The weight
3140  * is determined by highest bucket in the histogram. The information
3141  * for the highest bucket is encoded into the weight value.
3142  */
3143 static uint64_t
3144 metaslab_segment_weight(metaslab_t *msp)
3145 {
3146         metaslab_group_t *mg = msp->ms_group;
3147         uint64_t weight = 0;
3148         uint8_t shift = mg->mg_vd->vdev_ashift;
3149
3150         ASSERT(MUTEX_HELD(&msp->ms_lock));
3151
3152         /*
3153          * The metaslab is completely free.
3154          */
3155         if (metaslab_allocated_space(msp) == 0) {
3156                 int idx = highbit64(msp->ms_size) - 1;
3157                 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
3158
3159                 if (idx < max_idx) {
3160                         WEIGHT_SET_COUNT(weight, 1ULL);
3161                         WEIGHT_SET_INDEX(weight, idx);
3162                 } else {
3163                         WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
3164                         WEIGHT_SET_INDEX(weight, max_idx);
3165                 }
3166                 WEIGHT_SET_ACTIVE(weight, 0);
3167                 ASSERT(!WEIGHT_IS_SPACEBASED(weight));
3168                 return (weight);
3169         }
3170
3171         ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
3172
3173         /*
3174          * If the metaslab is fully allocated then just make the weight 0.
3175          */
3176         if (metaslab_allocated_space(msp) == msp->ms_size)
3177                 return (0);
3178         /*
3179          * If the metaslab is already loaded, then use the range tree to
3180          * determine the weight. Otherwise, we rely on the space map information
3181          * to generate the weight.
3182          */
3183         if (msp->ms_loaded) {
3184                 weight = metaslab_weight_from_range_tree(msp);
3185         } else {
3186                 weight = metaslab_weight_from_spacemap(msp);
3187         }
3188
3189         /*
3190          * If the metaslab was active the last time we calculated its weight
3191          * then keep it active. We want to consume the entire region that
3192          * is associated with this weight.
3193          */
3194         if (msp->ms_activation_weight != 0 && weight != 0)
3195                 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
3196         return (weight);
3197 }
3198
3199 /*
3200  * Determine if we should attempt to allocate from this metaslab. If the
3201  * metaslab is loaded, then we can determine if the desired allocation
3202  * can be satisfied by looking at the size of the maximum free segment
3203  * on that metaslab. Otherwise, we make our decision based on the metaslab's
3204  * weight. For segment-based weighting we can determine the maximum
3205  * allocation based on the index encoded in its value. For space-based
3206  * weights we rely on the entire weight (excluding the weight-type bit).
3207  */
3208 static boolean_t
3209 metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard)
3210 {
3211         /*
3212          * If the metaslab is loaded, ms_max_size is definitive and we can use
3213          * the fast check. If it's not, the ms_max_size is a lower bound (once
3214          * set), and we should use the fast check as long as we're not in
3215          * try_hard and it's been less than zfs_metaslab_max_size_cache_sec
3216          * seconds since the metaslab was unloaded.
3217          */
3218         if (msp->ms_loaded ||
3219             (msp->ms_max_size != 0 && !try_hard && gethrtime() <
3220             msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec)))
3221                 return (msp->ms_max_size >= asize);
3222
3223         boolean_t should_allocate;
3224         if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
3225                 /*
3226                  * The metaslab segment weight indicates segments in the
3227                  * range [2^i, 2^(i+1)), where i is the index in the weight.
3228                  * Since the asize might be in the middle of the range, we
3229                  * should attempt the allocation if asize < 2^(i+1).
3230                  */
3231                 should_allocate = (asize <
3232                     1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
3233         } else {
3234                 should_allocate = (asize <=
3235                     (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
3236         }
3237
3238         return (should_allocate);
3239 }
3240
3241 static uint64_t
3242 metaslab_weight(metaslab_t *msp, boolean_t nodirty)
3243 {
3244         vdev_t *vd = msp->ms_group->mg_vd;
3245         spa_t *spa = vd->vdev_spa;
3246         uint64_t weight;
3247
3248         ASSERT(MUTEX_HELD(&msp->ms_lock));
3249
3250         metaslab_set_fragmentation(msp, nodirty);
3251
3252         /*
3253          * Update the maximum size. If the metaslab is loaded, this will
3254          * ensure that we get an accurate maximum size if newly freed space
3255          * has been added back into the free tree. If the metaslab is
3256          * unloaded, we check if there's a larger free segment in the
3257          * unflushed frees. This is a lower bound on the largest allocatable
3258          * segment size. Coalescing of adjacent entries may reveal larger
3259          * allocatable segments, but we aren't aware of those until loading
3260          * the space map into a range tree.
3261          */
3262         if (msp->ms_loaded) {
3263                 msp->ms_max_size = metaslab_largest_allocatable(msp);
3264         } else {
3265                 msp->ms_max_size = MAX(msp->ms_max_size,
3266                     metaslab_largest_unflushed_free(msp));
3267         }
3268
3269         /*
3270          * Segment-based weighting requires space map histogram support.
3271          */
3272         if (zfs_metaslab_segment_weight_enabled &&
3273             spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
3274             (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
3275             sizeof (space_map_phys_t))) {
3276                 weight = metaslab_segment_weight(msp);
3277         } else {
3278                 weight = metaslab_space_weight(msp);
3279         }
3280         return (weight);
3281 }
3282
3283 void
3284 metaslab_recalculate_weight_and_sort(metaslab_t *msp)
3285 {
3286         ASSERT(MUTEX_HELD(&msp->ms_lock));
3287
3288         /* note: we preserve the mask (e.g. indication of primary, etc..) */
3289         uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
3290         metaslab_group_sort(msp->ms_group, msp,
3291             metaslab_weight(msp, B_FALSE) | was_active);
3292 }
3293
3294 static int
3295 metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
3296     int allocator, uint64_t activation_weight)
3297 {
3298         metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
3299         ASSERT(MUTEX_HELD(&msp->ms_lock));
3300
3301         /*
3302          * If we're activating for the claim code, we don't want to actually
3303          * set the metaslab up for a specific allocator.
3304          */
3305         if (activation_weight == METASLAB_WEIGHT_CLAIM) {
3306                 ASSERT0(msp->ms_activation_weight);
3307                 msp->ms_activation_weight = msp->ms_weight;
3308                 metaslab_group_sort(mg, msp, msp->ms_weight |
3309                     activation_weight);
3310                 return (0);
3311         }
3312
3313         metaslab_t **mspp = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
3314             &mga->mga_primary : &mga->mga_secondary);
3315
3316         mutex_enter(&mg->mg_lock);
3317         if (*mspp != NULL) {
3318                 mutex_exit(&mg->mg_lock);
3319                 return (EEXIST);
3320         }
3321
3322         *mspp = msp;
3323         ASSERT3S(msp->ms_allocator, ==, -1);
3324         msp->ms_allocator = allocator;
3325         msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
3326
3327         ASSERT0(msp->ms_activation_weight);
3328         msp->ms_activation_weight = msp->ms_weight;
3329         metaslab_group_sort_impl(mg, msp,
3330             msp->ms_weight | activation_weight);
3331         mutex_exit(&mg->mg_lock);
3332
3333         return (0);
3334 }
3335
3336 static int
3337 metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
3338 {
3339         ASSERT(MUTEX_HELD(&msp->ms_lock));
3340
3341         /*
3342          * The current metaslab is already activated for us so there
3343          * is nothing to do. Already activated though, doesn't mean
3344          * that this metaslab is activated for our allocator nor our
3345          * requested activation weight. The metaslab could have started
3346          * as an active one for our allocator but changed allocators
3347          * while we were waiting to grab its ms_lock or we stole it
3348          * [see find_valid_metaslab()]. This means that there is a
3349          * possibility of passivating a metaslab of another allocator
3350          * or from a different activation mask, from this thread.
3351          */
3352         if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
3353                 ASSERT(msp->ms_loaded);
3354                 return (0);
3355         }
3356
3357         int error = metaslab_load(msp);
3358         if (error != 0) {
3359                 metaslab_group_sort(msp->ms_group, msp, 0);
3360                 return (error);
3361         }
3362
3363         /*
3364          * When entering metaslab_load() we may have dropped the
3365          * ms_lock because we were loading this metaslab, or we
3366          * were waiting for another thread to load it for us. In
3367          * that scenario, we recheck the weight of the metaslab
3368          * to see if it was activated by another thread.
3369          *
3370          * If the metaslab was activated for another allocator or
3371          * it was activated with a different activation weight (e.g.
3372          * we wanted to make it a primary but it was activated as
3373          * secondary) we return error (EBUSY).
3374          *
3375          * If the metaslab was activated for the same allocator
3376          * and requested activation mask, skip activating it.
3377          */
3378         if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
3379                 if (msp->ms_allocator != allocator)
3380                         return (EBUSY);
3381
3382                 if ((msp->ms_weight & activation_weight) == 0)
3383                         return (SET_ERROR(EBUSY));
3384
3385                 EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY),
3386                     msp->ms_primary);
3387                 return (0);
3388         }
3389
3390         /*
3391          * If the metaslab has literally 0 space, it will have weight 0. In
3392          * that case, don't bother activating it. This can happen if the
3393          * metaslab had space during find_valid_metaslab, but another thread
3394          * loaded it and used all that space while we were waiting to grab the
3395          * lock.
3396          */
3397         if (msp->ms_weight == 0) {
3398                 ASSERT0(range_tree_space(msp->ms_allocatable));
3399                 return (SET_ERROR(ENOSPC));
3400         }
3401
3402         if ((error = metaslab_activate_allocator(msp->ms_group, msp,
3403             allocator, activation_weight)) != 0) {
3404                 return (error);
3405         }
3406
3407         ASSERT(msp->ms_loaded);
3408         ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
3409
3410         return (0);
3411 }
3412
3413 static void
3414 metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
3415     uint64_t weight)
3416 {
3417         ASSERT(MUTEX_HELD(&msp->ms_lock));
3418         ASSERT(msp->ms_loaded);
3419
3420         if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
3421                 metaslab_group_sort(mg, msp, weight);
3422                 return;
3423         }
3424
3425         mutex_enter(&mg->mg_lock);
3426         ASSERT3P(msp->ms_group, ==, mg);
3427         ASSERT3S(0, <=, msp->ms_allocator);
3428         ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
3429
3430         metaslab_group_allocator_t *mga = &mg->mg_allocator[msp->ms_allocator];
3431         if (msp->ms_primary) {
3432                 ASSERT3P(mga->mga_primary, ==, msp);
3433                 ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
3434                 mga->mga_primary = NULL;
3435         } else {
3436                 ASSERT3P(mga->mga_secondary, ==, msp);
3437                 ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
3438                 mga->mga_secondary = NULL;
3439         }
3440         msp->ms_allocator = -1;
3441         metaslab_group_sort_impl(mg, msp, weight);
3442         mutex_exit(&mg->mg_lock);
3443 }
3444
3445 static void
3446 metaslab_passivate(metaslab_t *msp, uint64_t weight)
3447 {
3448         uint64_t size __maybe_unused = weight & ~METASLAB_WEIGHT_TYPE;
3449
3450         /*
3451          * If size < SPA_MINBLOCKSIZE, then we will not allocate from
3452          * this metaslab again.  In that case, it had better be empty,
3453          * or we would be leaving space on the table.
3454          */
3455         ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) ||
3456             size >= SPA_MINBLOCKSIZE ||
3457             range_tree_space(msp->ms_allocatable) == 0);
3458         ASSERT0(weight & METASLAB_ACTIVE_MASK);
3459
3460         ASSERT(msp->ms_activation_weight != 0);
3461         msp->ms_activation_weight = 0;
3462         metaslab_passivate_allocator(msp->ms_group, msp, weight);
3463         ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK);
3464 }
3465
3466 /*
3467  * Segment-based metaslabs are activated once and remain active until
3468  * we either fail an allocation attempt (similar to space-based metaslabs)
3469  * or have exhausted the free space in zfs_metaslab_switch_threshold
3470  * buckets since the metaslab was activated. This function checks to see
3471  * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
3472  * metaslab and passivates it proactively. This will allow us to select a
3473  * metaslab with a larger contiguous region, if any, remaining within this
3474  * metaslab group. If we're in sync pass > 1, then we continue using this
3475  * metaslab so that we don't dirty more block and cause more sync passes.
3476  */
3477 static void
3478 metaslab_segment_may_passivate(metaslab_t *msp)
3479 {
3480         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3481
3482         if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
3483                 return;
3484
3485         /*
3486          * Since we are in the middle of a sync pass, the most accurate
3487          * information that is accessible to us is the in-core range tree
3488          * histogram; calculate the new weight based on that information.
3489          */
3490         uint64_t weight = metaslab_weight_from_range_tree(msp);
3491         int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
3492         int current_idx = WEIGHT_GET_INDEX(weight);
3493
3494         if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
3495                 metaslab_passivate(msp, weight);
3496 }
3497
3498 static void
3499 metaslab_preload(void *arg)
3500 {
3501         metaslab_t *msp = arg;
3502         metaslab_class_t *mc = msp->ms_group->mg_class;
3503         spa_t *spa = mc->mc_spa;
3504         fstrans_cookie_t cookie = spl_fstrans_mark();
3505
3506         ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
3507
3508         mutex_enter(&msp->ms_lock);
3509         (void) metaslab_load(msp);
3510         metaslab_set_selected_txg(msp, spa_syncing_txg(spa));
3511         mutex_exit(&msp->ms_lock);
3512         spl_fstrans_unmark(cookie);
3513 }
3514
3515 static void
3516 metaslab_group_preload(metaslab_group_t *mg)
3517 {
3518         spa_t *spa = mg->mg_vd->vdev_spa;
3519         metaslab_t *msp;
3520         avl_tree_t *t = &mg->mg_metaslab_tree;
3521         int m = 0;
3522
3523         if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
3524                 taskq_wait_outstanding(mg->mg_taskq, 0);
3525                 return;
3526         }
3527
3528         mutex_enter(&mg->mg_lock);
3529
3530         /*
3531          * Load the next potential metaslabs
3532          */
3533         for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
3534                 ASSERT3P(msp->ms_group, ==, mg);
3535
3536                 /*
3537                  * We preload only the maximum number of metaslabs specified
3538                  * by metaslab_preload_limit. If a metaslab is being forced
3539                  * to condense then we preload it too. This will ensure
3540                  * that force condensing happens in the next txg.
3541                  */
3542                 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
3543                         continue;
3544                 }
3545
3546                 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
3547                     msp, TQ_SLEEP) != TASKQID_INVALID);
3548         }
3549         mutex_exit(&mg->mg_lock);
3550 }
3551
3552 /*
3553  * Determine if the space map's on-disk footprint is past our tolerance for
3554  * inefficiency. We would like to use the following criteria to make our
3555  * decision:
3556  *
3557  * 1. Do not condense if the size of the space map object would dramatically
3558  *    increase as a result of writing out the free space range tree.
3559  *
3560  * 2. Condense if the on on-disk space map representation is at least
3561  *    zfs_condense_pct/100 times the size of the optimal representation
3562  *    (i.e. zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB).
3563  *
3564  * 3. Do not condense if the on-disk size of the space map does not actually
3565  *    decrease.
3566  *
3567  * Unfortunately, we cannot compute the on-disk size of the space map in this
3568  * context because we cannot accurately compute the effects of compression, etc.
3569  * Instead, we apply the heuristic described in the block comment for
3570  * zfs_metaslab_condense_block_threshold - we only condense if the space used
3571  * is greater than a threshold number of blocks.
3572  */
3573 static boolean_t
3574 metaslab_should_condense(metaslab_t *msp)
3575 {
3576         space_map_t *sm = msp->ms_sm;
3577         vdev_t *vd = msp->ms_group->mg_vd;
3578         uint64_t vdev_blocksize = 1ULL << vd->vdev_ashift;
3579
3580         ASSERT(MUTEX_HELD(&msp->ms_lock));
3581         ASSERT(msp->ms_loaded);
3582         ASSERT(sm != NULL);
3583         ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1);
3584
3585         /*
3586          * We always condense metaslabs that are empty and metaslabs for
3587          * which a condense request has been made.
3588          */
3589         if (range_tree_numsegs(msp->ms_allocatable) == 0 ||
3590             msp->ms_condense_wanted)
3591                 return (B_TRUE);
3592
3593         uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize);
3594         uint64_t object_size = space_map_length(sm);
3595         uint64_t optimal_size = space_map_estimate_optimal_size(sm,
3596             msp->ms_allocatable, SM_NO_VDEVID);
3597
3598         return (object_size >= (optimal_size * zfs_condense_pct / 100) &&
3599             object_size > zfs_metaslab_condense_block_threshold * record_size);
3600 }
3601
/*
 * Condense the on-disk space map representation to its minimized form.
 * The minimized form consists of a small number of allocations followed
 * by the entries of the free range tree (ms_allocatable). The condensed
 * spacemap contains all the entries of previous TXGs (including those in
 * the pool-wide log spacemaps; thus this is effectively a superset of
 * metaslab_flush()), but this TXG's entries still need to be written.
 *
 * The caller must hold ms_lock, the metaslab must be loaded and have a
 * space map, and this must be sync pass 1 (all asserted below). ms_lock
 * is dropped while the space map is rewritten and re-acquired before
 * returning; ms_condensing is set for the duration of that window.
 */
static void
metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
{
        range_tree_t *condense_tree;
        space_map_t *sm = msp->ms_sm;
        uint64_t txg = dmu_tx_get_txg(tx);
        spa_t *spa = msp->ms_group->mg_vd->vdev_spa;

        ASSERT(MUTEX_HELD(&msp->ms_lock));
        ASSERT(msp->ms_loaded);
        ASSERT(msp->ms_sm != NULL);

        /*
         * In order to condense the space map, we need to change it so it
         * only describes which segments are currently allocated and free.
         *
         * All the current free space resides in the ms_allocatable, all
         * the ms_defer trees, and all the ms_allocating trees. We ignore
         * ms_freed because it is empty because we're in sync pass 1. We
         * ignore ms_freeing because these changes are not yet reflected
         * in the spacemap (they will be written later this txg).
         *
         * So to truncate the space map to represent all the entries of
         * previous TXGs we do the following:
         *
         * 1] We create a range tree (condense tree) that is 100% empty.
         * 2] We add to it all segments found in the ms_defer trees
         *    as those segments are marked as free in the original space
         *    map. We do the same with the ms_allocating trees for the same
         *    reason. Adding these segments should be a relatively
         *    inexpensive operation since we expect these trees to have a
         *    small number of nodes.
         * 3] We vacate any unflushed allocs, since they are not frees we
         *    need to add to the condense tree. Then we vacate any
         *    unflushed frees as they should already be part of ms_allocatable.
         * 4] At this point, we would ideally like to add all segments
         *    in the ms_allocatable tree to the condense tree. This way
         *    we would write all the entries of the condense tree as the
         *    condensed space map, which would only contain freed
         *    segments with everything else assumed to be allocated.
         *
         *    Doing so can be prohibitively expensive as ms_allocatable can
         *    be large, and therefore computationally expensive to add to
         *    the condense_tree. Instead we first sync out an entry marking
         *    everything as allocated, then the ms_allocatable and then the
         *    condense_tree, in the condensed space map. While this is not
         *    optimal, it is typically close to optimal and more importantly
         *    much cheaper to compute.
         *
         * 5] Finally, as both of the unflushed trees were written to our
         *    new and condensed metaslab space map, we basically flushed
         *    all the unflushed changes to disk, thus we call
         *    metaslab_flush_update().
         */
        ASSERT3U(spa_sync_pass(spa), ==, 1);
        ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */

        zfs_dbgmsg("condensing: txg %llu, msp[%llu] %px, vdev id %llu, "
            "spa %s, smp size %llu, segments %llu, forcing condense=%s",
            (u_longlong_t)txg, (u_longlong_t)msp->ms_id, msp,
            (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
            spa->spa_name, (u_longlong_t)space_map_length(msp->ms_sm),
            (u_longlong_t)range_tree_numsegs(msp->ms_allocatable),
            msp->ms_condense_wanted ? "TRUE" : "FALSE");

        /* The request, if there was one, is being honored right now. */
        msp->ms_condense_wanted = B_FALSE;

        range_seg_type_t type;
        uint64_t shift, start;
        type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp,
            &start, &shift);

        /* Step 1: the (initially empty) condense tree. */
        condense_tree = range_tree_create(NULL, type, NULL, start, shift);

        /* Step 2: copy in the segments of the defer and allocating trees. */
        for (int t = 0; t < TXG_DEFER_SIZE; t++) {
                range_tree_walk(msp->ms_defer[t],
                    range_tree_add, condense_tree);
        }

        for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
                range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
                    range_tree_add, condense_tree);
        }

        /*
         * Step 3: drop the unflushed changes, first subtracting their
         * memory usage from the pool-wide unflushed statistics.
         */
        ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
            metaslab_unflushed_changes_memused(msp));
        spa->spa_unflushed_stats.sus_memused -=
            metaslab_unflushed_changes_memused(msp);
        range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
        range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);

        /*
         * We're about to drop the metaslab's lock thus allowing other
         * consumers to change its contents. Set the metaslab's ms_condensing
         * flag to ensure that allocations on this metaslab do not occur
         * while we're in the middle of committing it to disk. This is only
         * critical for ms_allocatable as all other range trees use per TXG
         * views of their content.
         */
        msp->ms_condensing = B_TRUE;

        mutex_exit(&msp->ms_lock);
        /* Remember the current object number to detect a reallocation. */
        uint64_t object = space_map_object(msp->ms_sm);
        space_map_truncate(sm,
            spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
            zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx);

        /*
         * space_map_truncate() may have reallocated the spacemap object.
         * If so, update the vdev_ms_array.
         */
        if (space_map_object(msp->ms_sm) != object) {
                object = space_map_object(msp->ms_sm);
                dmu_write(spa->spa_meta_objset,
                    msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) *
                    msp->ms_id, sizeof (uint64_t), &object, tx);
        }

        /*
         * Note:
         * When the log space map feature is enabled, each space map will
         * always have ALLOCS followed by FREES for each sync pass. This is
         * typically true even when the log space map feature is disabled,
         * except from the case where a metaslab goes through metaslab_sync()
         * and gets condensed. In that case the metaslab's space map will have
         * ALLOCS followed by FREES (due to condensing) followed by ALLOCS
         * followed by FREES (due to space_map_write() in metaslab_sync()) for
         * sync pass 1.
         */
        /*
         * Step 4: tmp_tree holds a single segment covering the whole
         * metaslab. It is written as one ALLOC, after which the free
         * segments (ms_allocatable, then the condense tree) are written
         * as FREEs.
         */
        range_tree_t *tmp_tree = range_tree_create(NULL, type, NULL, start,
            shift);
        range_tree_add(tmp_tree, msp->ms_start, msp->ms_size);
        space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx);
        space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
        space_map_write(sm, condense_tree, SM_FREE, SM_NO_VDEVID, tx);

        /* Both scratch trees are emptied and then destroyed. */
        range_tree_vacate(condense_tree, NULL, NULL);
        range_tree_destroy(condense_tree);
        range_tree_vacate(tmp_tree, NULL, NULL);
        range_tree_destroy(tmp_tree);
        mutex_enter(&msp->ms_lock);

        /* Step 5: allocations may resume; record the flush. */
        msp->ms_condensing = B_FALSE;
        metaslab_flush_update(msp, tx);
}
3755
3756 static void
3757 metaslab_unflushed_add(metaslab_t *msp, dmu_tx_t *tx)
3758 {
3759         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3760         ASSERT(spa_syncing_log_sm(spa) != NULL);
3761         ASSERT(msp->ms_sm != NULL);
3762         ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
3763         ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
3764
3765         mutex_enter(&spa->spa_flushed_ms_lock);
3766         metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
3767         metaslab_set_unflushed_dirty(msp, B_TRUE);
3768         avl_add(&spa->spa_metaslabs_by_flushed, msp);
3769         mutex_exit(&spa->spa_flushed_ms_lock);
3770
3771         spa_log_sm_increment_current_mscount(spa);
3772         spa_log_summary_add_flushed_metaslab(spa, B_TRUE);
3773 }
3774
3775 void
3776 metaslab_unflushed_bump(metaslab_t *msp, dmu_tx_t *tx, boolean_t dirty)
3777 {
3778         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3779         ASSERT(spa_syncing_log_sm(spa) != NULL);
3780         ASSERT(msp->ms_sm != NULL);
3781         ASSERT(metaslab_unflushed_txg(msp) != 0);
3782         ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp);
3783         ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
3784         ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
3785
3786         VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa));
3787
3788         /* update metaslab's position in our flushing tree */
3789         uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp);
3790         boolean_t ms_prev_flushed_dirty = metaslab_unflushed_dirty(msp);
3791         mutex_enter(&spa->spa_flushed_ms_lock);
3792         avl_remove(&spa->spa_metaslabs_by_flushed, msp);
3793         metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
3794         metaslab_set_unflushed_dirty(msp, dirty);
3795         avl_add(&spa->spa_metaslabs_by_flushed, msp);
3796         mutex_exit(&spa->spa_flushed_ms_lock);
3797
3798         /* update metaslab counts of spa_log_sm_t nodes */
3799         spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg);
3800         spa_log_sm_increment_current_mscount(spa);
3801
3802         /* update log space map summary */
3803         spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg,
3804             ms_prev_flushed_dirty);
3805         spa_log_summary_add_flushed_metaslab(spa, dirty);
3806
3807         /* cleanup obsolete logs if any */
3808         spa_cleanup_old_sm_logs(spa, tx);
3809 }
3810
3811 /*
3812  * Called when the metaslab has been flushed (its own spacemap now reflects
3813  * all the contents of the pool-wide spacemap log). Updates the metaslab's
3814  * metadata and any pool-wide related log space map data (e.g. summary,
3815  * obsolete logs, etc..) to reflect that.
3816  */
3817 static void
3818 metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
3819 {
3820         metaslab_group_t *mg = msp->ms_group;
3821         spa_t *spa = mg->mg_vd->vdev_spa;
3822
3823         ASSERT(MUTEX_HELD(&msp->ms_lock));
3824
3825         ASSERT3U(spa_sync_pass(spa), ==, 1);
3826
3827         /*
3828          * Just because a metaslab got flushed, that doesn't mean that
3829          * it will pass through metaslab_sync_done(). Thus, make sure to
3830          * update ms_synced_length here in case it doesn't.
3831          */
3832         msp->ms_synced_length = space_map_length(msp->ms_sm);
3833
3834         /*
3835          * We may end up here from metaslab_condense() without the
3836          * feature being active. In that case this is a no-op.
3837          */
3838         if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP) ||
3839             metaslab_unflushed_txg(msp) == 0)
3840                 return;
3841
3842         metaslab_unflushed_bump(msp, tx, B_FALSE);
3843 }
3844
/*
 * Flush a metaslab's unflushed changes into its own space map.
 *
 * The caller must hold ms_lock and must be in sync pass 1 of a pool that
 * has the log space map feature active. The metaslab must have a space map
 * and must already be tracked in spa_metaslabs_by_flushed with a non-zero
 * unflushed txg that is older than the txg of tx (all asserted below).
 *
 * Returns B_FALSE, without changing anything, if the metaslab is currently
 * being loaded. Returns B_TRUE once the metaslab has been flushed, either
 * by condensing it or by appending ms_unflushed_allocs and
 * ms_unflushed_frees to ms_sm.
 *
 * ms_lock is dropped and reacquired while the space map is being written.
 */
boolean_t
metaslab_flush(metaslab_t *msp, dmu_tx_t *tx)
{
        spa_t *spa = msp->ms_group->mg_vd->vdev_spa;

        ASSERT(MUTEX_HELD(&msp->ms_lock));
        ASSERT3U(spa_sync_pass(spa), ==, 1);
        ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));

        ASSERT(msp->ms_sm != NULL);
        ASSERT(metaslab_unflushed_txg(msp) != 0);
        ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL);

        /*
         * There is nothing wrong with flushing the same metaslab twice, as
         * this codepath should work on that case. However, the current
         * flushing scheme makes sure to avoid this situation as we would be
         * making all these calls without having anything meaningful to write
         * to disk. We assert this behavior here.
         */
        ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx));

        /*
         * We can not flush while loading, because then we would
         * not load the ms_unflushed_{allocs,frees}.
         */
        if (msp->ms_loading)
                return (B_FALSE);

        metaslab_verify_space(msp, dmu_tx_get_txg(tx));
        metaslab_verify_weight_and_frag(msp);

        /*
         * Metaslab condensing is effectively flushing. Therefore if the
         * metaslab can be condensed we can just condense it instead of
         * flushing it.
         *
         * Note that metaslab_condense() does call metaslab_flush_update()
         * so we can just return immediately after condensing. We also
         * don't need to care about setting ms_flushing or broadcasting
         * ms_flush_cv, even if we temporarily drop the ms_lock in
         * metaslab_condense(), as the metaslab is already loaded.
         */
        if (msp->ms_loaded && metaslab_should_condense(msp)) {
                metaslab_group_t *mg = msp->ms_group;

                /*
                 * For all histogram operations below refer to the
                 * comments of metaslab_sync() where we follow a
                 * similar procedure.
                 */
                metaslab_group_histogram_verify(mg);
                metaslab_class_histogram_verify(mg->mg_class);
                metaslab_group_histogram_remove(mg, msp);

                metaslab_condense(msp, tx);

                /*
                 * Rebuild the space map histogram from the in-core
                 * allocatable tree plus every deferred-free tree.
                 */
                space_map_histogram_clear(msp->ms_sm);
                space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
                ASSERT(range_tree_is_empty(msp->ms_freed));
                for (int t = 0; t < TXG_DEFER_SIZE; t++) {
                        space_map_histogram_add(msp->ms_sm,
                            msp->ms_defer[t], tx);
                }
                metaslab_aux_histograms_update(msp);

                metaslab_group_histogram_add(mg, msp);
                metaslab_group_histogram_verify(mg);
                metaslab_class_histogram_verify(mg->mg_class);

                metaslab_verify_space(msp, dmu_tx_get_txg(tx));

                /*
                 * Since we recreated the histogram (and potentially
                 * the ms_sm too while condensing) ensure that the
                 * weight is updated too because we are not guaranteed
                 * that this metaslab is dirty and will go through
                 * metaslab_sync_done().
                 */
                metaslab_recalculate_weight_and_sort(msp);
                return (B_TRUE);
        }

        /*
         * Mark the flush as in progress before ms_lock is dropped below.
         * The flag is cleared, and ms_flush_cv broadcast, at the end of
         * this function.
         */
        msp->ms_flushing = B_TRUE;
        uint64_t sm_len_before = space_map_length(msp->ms_sm);

        /* Append the unflushed changes to ms_sm without holding ms_lock. */
        mutex_exit(&msp->ms_lock);
        space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC,
            SM_NO_VDEVID, tx);
        space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE,
            SM_NO_VDEVID, tx);
        mutex_enter(&msp->ms_lock);

        uint64_t sm_len_after = space_map_length(msp->ms_sm);
        if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) {
                zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, "
                    "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, "
                    "appended %llu bytes", (u_longlong_t)dmu_tx_get_txg(tx),
                    spa_name(spa),
                    (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
                    (u_longlong_t)msp->ms_id,
                    (u_longlong_t)range_tree_space(msp->ms_unflushed_allocs),
                    (u_longlong_t)range_tree_space(msp->ms_unflushed_frees),
                    (u_longlong_t)(sm_len_after - sm_len_before));
        }

        /*
         * The unflushed trees are emptied below, so take their memory
         * usage out of the pool-wide accounting while it can still be
         * computed from them.
         */
        ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
            metaslab_unflushed_changes_memused(msp));
        spa->spa_unflushed_stats.sus_memused -=
            metaslab_unflushed_changes_memused(msp);
        range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
        range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);

        metaslab_verify_space(msp, dmu_tx_get_txg(tx));
        metaslab_verify_weight_and_frag(msp);

        metaslab_flush_update(msp, tx);

        metaslab_verify_space(msp, dmu_tx_get_txg(tx));
        metaslab_verify_weight_and_frag(msp);

        /* The flush is complete; wake up anyone waiting on ms_flush_cv. */
        msp->ms_flushing = B_FALSE;
        cv_broadcast(&msp->ms_flush_cv);
        return (B_TRUE);
}
3970
/*
 * Write a metaslab to disk in the context of the specified transaction group.
 *
 * msp is the metaslab being synced and txg is the syncing transaction
 * group; this txg's allocations come from ms_allocating[txg & TXG_MASK]
 * and its frees from ms_freeing.
 *
 * Nothing is written for a metaslab that has just been added (ms_new) or
 * that has no allocations, frees or checkpointed frees pending, unless a
 * loaded metaslab wants to be condensed and txg is not beyond the final
 * dirty txg.
 *
 * The function takes ms_sync_lock and ms_lock itself; ms_lock is dropped
 * around the space map writes. The transaction created here is committed
 * before returning.
 */
void
metaslab_sync(metaslab_t *msp, uint64_t txg)
{
        metaslab_group_t *mg = msp->ms_group;
        vdev_t *vd = mg->mg_vd;
        spa_t *spa = vd->vdev_spa;
        objset_t *mos = spa_meta_objset(spa);
        range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
        dmu_tx_t *tx;

        ASSERT(!vd->vdev_ishole);

        /*
         * This metaslab has just been added so there's no work to do now.
         */
        if (msp->ms_new) {
                ASSERT0(range_tree_space(alloctree));
                ASSERT0(range_tree_space(msp->ms_freeing));
                ASSERT0(range_tree_space(msp->ms_freed));
                ASSERT0(range_tree_space(msp->ms_checkpointing));
                ASSERT0(range_tree_space(msp->ms_trim));
                return;
        }

        /*
         * Normally, we don't want to process a metaslab if there are no
         * allocations or frees to perform. However, if the metaslab is being
         * forced to condense, it's loaded and we're not beyond the final
         * dirty txg, we need to let it through. Not condensing beyond the
         * final dirty txg prevents an issue where metaslabs that need to be
         * condensed but were loaded for other reasons could cause a panic
         * here. By only checking the txg in that branch of the conditional,
         * we preserve the utility of the VERIFY statements in all other
         * cases.
         */
        if (range_tree_is_empty(alloctree) &&
            range_tree_is_empty(msp->ms_freeing) &&
            range_tree_is_empty(msp->ms_checkpointing) &&
            !(msp->ms_loaded && msp->ms_condense_wanted &&
            txg <= spa_final_dirty_txg(spa)))
                return;


        VERIFY3U(txg, <=, spa_final_dirty_txg(spa));

        /*
         * The only state that can actually be changing concurrently
         * with metaslab_sync() is the metaslab's ms_allocatable. No
         * other thread can be modifying this txg's alloc, freeing,
         * freed, or space_map_phys_t.  We drop ms_lock whenever we
         * could call into the DMU, because the DMU can call down to
         * us (e.g. via zio_free()) at any time.
         *
         * The spa_vdev_remove_thread() can be reading metaslab state
         * concurrently, and it is locked out by the ms_sync_lock.
         * Note that the ms_lock is insufficient for this, because it
         * is dropped by space_map_write().
         */
        tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

        /*
         * Generate a log space map if one doesn't exist already.
         */
        spa_generate_syncing_log_sm(spa, tx);

        /*
         * First write to this metaslab: allocate its space map object,
         * record the object number in the vdev's vdev_ms_array at the
         * slot for ms_id, and open the space map.
         */
        if (msp->ms_sm == NULL) {
                uint64_t new_object = space_map_alloc(mos,
                    spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
                    zfs_metaslab_sm_blksz_with_log :
                    zfs_metaslab_sm_blksz_no_log, tx);
                VERIFY3U(new_object, !=, 0);

                dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
                    msp->ms_id, sizeof (uint64_t), &new_object, tx);

                VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
                    msp->ms_start, msp->ms_size, vd->vdev_ashift));
                ASSERT(msp->ms_sm != NULL);

                ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
                ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
                ASSERT0(metaslab_allocated_space(msp));
        }

        /*
         * There are checkpointed frees to record but the vdev has no
         * checkpoint space map yet, so create and open one.
         */
        if (!range_tree_is_empty(msp->ms_checkpointing) &&
            vd->vdev_checkpoint_sm == NULL) {
                ASSERT(spa_has_checkpoint(spa));

                uint64_t new_object = space_map_alloc(mos,
                    zfs_vdev_standard_sm_blksz, tx);
                VERIFY3U(new_object, !=, 0);

                VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
                    mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
                ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);

                /*
                 * We save the space map object as an entry in vdev_top_zap
                 * so it can be retrieved when the pool is reopened after an
                 * export or through zdb.
                 */
                VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
                    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
                    sizeof (new_object), 1, &new_object, tx));
        }

        /* ms_sync_lock is taken before ms_lock and is held until the end. */
        mutex_enter(&msp->ms_sync_lock);
        mutex_enter(&msp->ms_lock);

        /*
         * Note: metaslab_condense() clears the space map's histogram.
         * Therefore we must verify and remove this histogram before
         * condensing.
         */
        metaslab_group_histogram_verify(mg);
        metaslab_class_histogram_verify(mg->mg_class);
        metaslab_group_histogram_remove(mg, msp);

        /* Condensing is only attempted in sync pass 1 on a loaded metaslab. */
        if (spa->spa_sync_pass == 1 && msp->ms_loaded &&
            metaslab_should_condense(msp))
                metaslab_condense(msp, tx);

        /*
         * We'll be going to disk to sync our space accounting, thus we
         * drop the ms_lock during that time so allocations coming from
         * open-context (ZIL) for future TXGs do not block.
         */
        mutex_exit(&msp->ms_lock);
        space_map_t *log_sm = spa_syncing_log_sm(spa);
        if (log_sm != NULL) {
                ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
                /*
                 * Make sure the metaslab is tracked in the flushing tree
                 * and is marked dirty there for the syncing txg.
                 */
                if (metaslab_unflushed_txg(msp) == 0)
                        metaslab_unflushed_add(msp, tx);
                else if (!metaslab_unflushed_dirty(msp))
                        metaslab_unflushed_bump(msp, tx, B_TRUE);

                /*
                 * This txg's changes are written to the pool-wide log
                 * space map, tagged with the vdev id, rather than to
                 * the metaslab's own space map.
                 */
                space_map_write(log_sm, alloctree, SM_ALLOC,
                    vd->vdev_id, tx);
                space_map_write(log_sm, msp->ms_freeing, SM_FREE,
                    vd->vdev_id, tx);
                mutex_enter(&msp->ms_lock);

                /*
                 * Fold this txg's changes into the unflushed trees. The
                 * trees' memory usage is subtracted from the pool-wide
                 * total before they change and added back afterwards.
                 */
                ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
                    metaslab_unflushed_changes_memused(msp));
                spa->spa_unflushed_stats.sus_memused -=
                    metaslab_unflushed_changes_memused(msp);
                range_tree_remove_xor_add(alloctree,
                    msp->ms_unflushed_frees, msp->ms_unflushed_allocs);
                range_tree_remove_xor_add(msp->ms_freeing,
                    msp->ms_unflushed_allocs, msp->ms_unflushed_frees);
                spa->spa_unflushed_stats.sus_memused +=
                    metaslab_unflushed_changes_memused(msp);
        } else {
                ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));

                /* No log space map: write straight to the metaslab's own. */
                space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
                    SM_NO_VDEVID, tx);
                space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
                    SM_NO_VDEVID, tx);
                mutex_enter(&msp->ms_lock);
        }

        /* Apply this txg's allocations and frees to the running total. */
        msp->ms_allocated_space += range_tree_space(alloctree);
        ASSERT3U(msp->ms_allocated_space, >=,
            range_tree_space(msp->ms_freeing));
        msp->ms_allocated_space -= range_tree_space(msp->ms_freeing);

        if (!range_tree_is_empty(msp->ms_checkpointing)) {
                ASSERT(spa_has_checkpoint(spa));
                ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);

                /*
                 * Since we are doing writes to disk and the ms_checkpointing
                 * tree won't be changing during that time, we drop the
                 * ms_lock while writing to the checkpoint space map, for the
                 * same reason mentioned above.
                 */
                mutex_exit(&msp->ms_lock);
                space_map_write(vd->vdev_checkpoint_sm,
                    msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
                mutex_enter(&msp->ms_lock);

                /*
                 * NOTE(review): the checkpoint space map is only written
                 * with SM_FREE entries here, which is presumably why the
                 * assertion below negates space_map_allocated() -- confirm.
                 */
                spa->spa_checkpoint_info.sci_dspace +=
                    range_tree_space(msp->ms_checkpointing);
                vd->vdev_stat.vs_checkpoint_space +=
                    range_tree_space(msp->ms_checkpointing);
                ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
                    -space_map_allocated(vd->vdev_checkpoint_sm));

                range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
        }

        if (msp->ms_loaded) {
                /*
                 * When the space map is loaded, we have an accurate
                 * histogram in the range tree. This gives us an opportunity
                 * to bring the space map's histogram up-to-date so we clear
                 * it first before updating it.
                 */
                space_map_histogram_clear(msp->ms_sm);
                space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);

                /*
                 * Since we've cleared the histogram we need to add back
                 * any free space that has already been processed, plus
                 * any deferred space. This allows the on-disk histogram
                 * to accurately reflect all free space even if some space
                 * is not yet available for allocation (i.e. deferred).
                 */
                space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);

                /*
                 * Add back any deferred free space that has not been
                 * added back into the in-core free tree yet. This will
                 * ensure that we don't end up with a space map histogram
                 * that is completely empty unless the metaslab is fully
                 * allocated.
                 */
                for (int t = 0; t < TXG_DEFER_SIZE; t++) {
                        space_map_histogram_add(msp->ms_sm,
                            msp->ms_defer[t], tx);
                }
        }

        /*
         * Always add the free space from this sync pass to the space
         * map histogram. We want to make sure that the on-disk histogram
         * accounts for all free space. If the space map is not loaded,
         * then we will lose some accuracy but will correct it the next
         * time we load the space map.
         */
        space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
        metaslab_aux_histograms_update(msp);

        /* Re-add the histogram that was removed before condensing. */
        metaslab_group_histogram_add(mg, msp);
        metaslab_group_histogram_verify(mg);
        metaslab_class_histogram_verify(mg->mg_class);

        /*
         * For sync pass 1, we avoid traversing this txg's free range tree
         * and instead will just swap the pointers for freeing and freed.
         * We can safely do this since the freed_tree is guaranteed to be
         * empty on the initial pass.
         *
         * Keep in mind that even if we are currently using a log spacemap
         * we want current frees to end up in the ms_allocatable (but not
         * get appended to the ms_sm) so their ranges can be reused as usual.
         */
        if (spa_sync_pass(spa) == 1) {
                range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
                ASSERT0(msp->ms_allocated_this_txg);
        } else {
                range_tree_vacate(msp->ms_freeing,
                    range_tree_add, msp->ms_freed);
        }
        /*
         * Record how much was allocated in this pass before the tree that
         * holds that information is emptied.
         */
        msp->ms_allocated_this_txg += range_tree_space(alloctree);
        range_tree_vacate(alloctree, NULL, NULL);

        ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
        ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
            & TXG_MASK]));
        ASSERT0(range_tree_space(msp->ms_freeing));
        ASSERT0(range_tree_space(msp->ms_checkpointing));

        mutex_exit(&msp->ms_lock);

        /*
         * Verify that the space map object ID has been recorded in the
         * vdev_ms_array.
         */
        uint64_t object;
        VERIFY0(dmu_read(mos, vd->vdev_ms_array,
            msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0));
        VERIFY3U(object, ==, space_map_object(msp->ms_sm));

        mutex_exit(&msp->ms_sync_lock);
        dmu_tx_commit(tx);
}
4252
4253 static void
4254 metaslab_evict(metaslab_t *msp, uint64_t txg)
4255 {
4256         if (!msp->ms_loaded || msp->ms_disabled != 0)
4257                 return;
4258
4259         for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
4260                 VERIFY0(range_tree_space(
4261                     msp->ms_allocating[(txg + t) & TXG_MASK]));
4262         }
4263         if (msp->ms_allocator != -1)
4264                 metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK);
4265
4266         if (!metaslab_debug_unload)
4267                 metaslab_unload(msp);
4268 }
4269
/*
 * Called after a transaction group has completely synced to mark
 * all of the metaslab's free space as usable.
 *
 * msp is the metaslab and txg is the transaction group that has just
 * synced. ms_lock is held for the whole function. The frees that were
 * deferred in this txg's ms_defer slot are released; this txg's own frees
 * (ms_freed) either take their place in that slot or, when deferring is
 * not allowed, are released right away. Space accounting is updated and
 * the metaslab is re-weighted and re-sorted within its group.
 */
void
metaslab_sync_done(metaslab_t *msp, uint64_t txg)
{
        metaslab_group_t *mg = msp->ms_group;
        vdev_t *vd = mg->mg_vd;
        spa_t *spa = vd->vdev_spa;
        range_tree_t **defer_tree;
        int64_t alloc_delta, defer_delta;
        boolean_t defer_allowed = B_TRUE;

        ASSERT(!vd->vdev_ishole);

        mutex_enter(&msp->ms_lock);

        if (msp->ms_new) {
                /* this is a new metaslab, add its capacity to the vdev */
                metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);

                /* there should be no allocations nor frees at this point */
                VERIFY0(msp->ms_allocated_this_txg);
                VERIFY0(range_tree_space(msp->ms_freed));
        }

        ASSERT0(range_tree_space(msp->ms_freeing));
        ASSERT0(range_tree_space(msp->ms_checkpointing));

        /* The defer slot used for this txg. */
        defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];

        /*
         * Frees are not deferred when the normal class's free space is at
         * or below the pool's slop space, or when the vdev is being removed.
         */
        uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
            metaslab_class_get_alloc(spa_normal_class(spa));
        if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
                defer_allowed = B_FALSE;
        }

        /*
         * alloc_delta is this txg's net change in allocated space.
         * defer_delta is the net change in deferred space: the frees
         * entering the defer slot (only if deferring is allowed) minus
         * the ones that are leaving it now.
         */
        defer_delta = 0;
        alloc_delta = msp->ms_allocated_this_txg -
            range_tree_space(msp->ms_freed);

        if (defer_allowed) {
                defer_delta = range_tree_space(msp->ms_freed) -
                    range_tree_space(*defer_tree);
        } else {
                defer_delta -= range_tree_space(*defer_tree);
        }
        metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
            defer_delta, 0);

        if (spa_syncing_log_sm(spa) == NULL) {
                /*
                 * If there's a metaslab_load() in progress and we don't have
                 * a log space map, it means that we probably wrote to the
                 * metaslab's space map. If this is the case, we need to
                 * make sure that we wait for the load to complete so that we
                 * have a consistent view at the in-core side of the metaslab.
                 */
                metaslab_load_wait(msp);
        } else {
                ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
        }

        /*
         * When auto-trimming is enabled, free ranges which are added to
         * ms_allocatable are also added to ms_trim.  The ms_trim tree is
         * periodically consumed by the vdev_autotrim_thread() which issues
         * trims for all ranges and then vacates the tree.  The ms_trim tree
         * can be discarded at any time with the sole consequence of recent
         * frees not being trimmed.
         */
        if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) {
                range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim);
                if (!defer_allowed) {
                        range_tree_walk(msp->ms_freed, range_tree_add,
                            msp->ms_trim);
                }
        } else {
                range_tree_vacate(msp->ms_trim, NULL, NULL);
        }

        /*
         * Move the frees from the defer_tree back to the free
         * range tree (if it's loaded). Swap the freed_tree and
         * the defer_tree -- this is safe to do because we've
         * just emptied out the defer_tree.
         */
        range_tree_vacate(*defer_tree,
            msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
        if (defer_allowed) {
                range_tree_swap(&msp->ms_freed, defer_tree);
        } else {
                /*
                 * Deferring is not allowed, so this txg's frees are emptied
                 * out of ms_freed now as well (and added to ms_allocatable
                 * if the metaslab is loaded).
                 */
                range_tree_vacate(msp->ms_freed,
                    msp->ms_loaded ? range_tree_add : NULL,
                    msp->ms_allocatable);
        }

        msp->ms_synced_length = space_map_length(msp->ms_sm);

        msp->ms_deferspace += defer_delta;
        ASSERT3S(msp->ms_deferspace, >=, 0);
        ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
        if (msp->ms_deferspace != 0) {
                /*
                 * Keep syncing this metaslab until all deferred frees
                 * are back in circulation.
                 */
                vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
        }
        metaslab_aux_histograms_update_done(msp, defer_allowed);

        /*
         * The metaslab is no longer new; count it as ready in its group
         * (mg_ms_ready is updated under mg_lock).
         */
        if (msp->ms_new) {
                msp->ms_new = B_FALSE;
                mutex_enter(&mg->mg_lock);
                mg->mg_ms_ready++;
                mutex_exit(&mg->mg_lock);
        }

        /*
         * Re-sort metaslab within its group now that we've adjusted
         * its allocatable space.
         */
        metaslab_recalculate_weight_and_sort(msp);

        ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
        ASSERT0(range_tree_space(msp->ms_freeing));
        ASSERT0(range_tree_space(msp->ms_freed));
        ASSERT0(range_tree_space(msp->ms_checkpointing));
        /* This txg's allocations are fully accounted for; reset the counter. */
        msp->ms_allocating_total -= msp->ms_allocated_this_txg;
        msp->ms_allocated_this_txg = 0;
        mutex_exit(&msp->ms_lock);
}
4403
4404 void
4405 metaslab_sync_reassess(metaslab_group_t *mg)
4406 {
4407         spa_t *spa = mg->mg_class->mc_spa;
4408
4409         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
4410         metaslab_group_alloc_update(mg);
4411         mg->mg_fragmentation = metaslab_group_fragmentation(mg);
4412
4413         /*
4414          * Preload the next potential metaslabs but only on active
4415          * metaslab groups. We can get into a state where the metaslab
4416          * is no longer active since we dirty metaslabs as we remove a
4417          * a device, thus potentially making the metaslab group eligible
4418          * for preloading.
4419          */
4420         if (mg->mg_activation_count > 0) {
4421                 metaslab_group_preload(mg);
4422         }
4423         spa_config_exit(spa, SCL_ALLOC, FTAG);
4424 }
4425
4426 /*
4427  * When writing a ditto block (i.e. more than one DVA for a given BP) on
4428  * the same vdev as an existing DVA of this BP, then try to allocate it
4429  * on a different metaslab than existing DVAs (i.e. a unique metaslab).
4430  */
4431 static boolean_t
4432 metaslab_is_unique(metaslab_t *msp, dva_t *dva)
4433 {
4434         uint64_t dva_ms_id;
4435
4436         if (DVA_GET_ASIZE(dva) == 0)
4437                 return (B_TRUE);
4438
4439         if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
4440                 return (B_TRUE);
4441
4442         dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift;
4443
4444         return (msp->ms_id != dva_ms_id);
4445 }
4446
4447 /*
4448  * ==========================================================================
4449  * Metaslab allocation tracing facility
4450  * ==========================================================================
4451  */
4452
4453 /*
4454  * Add an allocation trace element to the allocation tracing list.
4455  */
4456 static void
4457 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
4458     metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
4459     int allocator)
4460 {
4461         metaslab_alloc_trace_t *mat;
4462
4463         if (!metaslab_trace_enabled)
4464                 return;
4465
4466         /*
4467          * When the tracing list reaches its maximum we remove
4468          * the second element in the list before adding a new one.
4469          * By removing the second element we preserve the original
4470          * entry as a clue to what allocations steps have already been
4471          * performed.
4472          */
4473         if (zal->zal_size == metaslab_trace_max_entries) {
4474                 metaslab_alloc_trace_t *mat_next;
4475 #ifdef ZFS_DEBUG
4476                 panic("too many entries in allocation list");
4477 #endif
4478                 METASLABSTAT_BUMP(metaslabstat_trace_over_limit);
4479                 zal->zal_size--;
4480                 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
4481                 list_remove(&zal->zal_list, mat_next);
4482                 kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
4483         }
4484
4485         mat = kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
4486         list_link_init(&mat->mat_list_node);
4487         mat->mat_mg = mg;
4488         mat->mat_msp = msp;
4489         mat->mat_size = psize;
4490         mat->mat_dva_id = dva_id;
4491         mat->mat_offset = offset;
4492         mat->mat_weight = 0;
4493         mat->mat_allocator = allocator;
4494
4495         if (msp != NULL)
4496                 mat->mat_weight = msp->ms_weight;
4497
4498         /*
4499          * The list is part of the zio so locking is not required. Only
4500          * a single thread will perform allocations for a given zio.
4501          */
4502         list_insert_tail(&zal->zal_list, mat);
4503         zal->zal_size++;
4504
4505         ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
4506 }
4507
4508 void
4509 metaslab_trace_init(zio_alloc_list_t *zal)
4510 {
4511         list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
4512             offsetof(metaslab_alloc_trace_t, mat_list_node));
4513         zal->zal_size = 0;
4514 }
4515
4516 void
4517 metaslab_trace_fini(zio_alloc_list_t *zal)
4518 {
4519         metaslab_alloc_trace_t *mat;
4520
4521         while ((mat = list_remove_head(&zal->zal_list)) != NULL)
4522                 kmem_cache_free(metaslab_alloc_trace_cache, mat);
4523         list_destroy(&zal->zal_list);
4524         zal->zal_size = 0;
4525 }
4526
4527 /*
4528  * ==========================================================================
4529  * Metaslab block operations
4530  * ==========================================================================
4531  */
4532
4533 static void
4534 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, const void *tag,
4535     int flags, int allocator)
4536 {
4537         if (!(flags & METASLAB_ASYNC_ALLOC) ||
4538             (flags & METASLAB_DONT_THROTTLE))
4539                 return;
4540
4541         metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
4542         if (!mg->mg_class->mc_alloc_throttle_enabled)
4543                 return;
4544
4545         metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
4546         (void) zfs_refcount_add(&mga->mga_alloc_queue_depth, tag);
4547 }
4548
4549 static void
4550 metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
4551 {
4552         metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
4553         metaslab_class_allocator_t *mca =
4554             &mg->mg_class->mc_allocator[allocator];
4555         uint64_t max = mg->mg_max_alloc_queue_depth;
4556         uint64_t cur = mga->mga_cur_max_alloc_queue_depth;
4557         while (cur < max) {
4558                 if (atomic_cas_64(&mga->mga_cur_max_alloc_queue_depth,
4559                     cur, cur + 1) == cur) {
4560                         atomic_inc_64(&mca->mca_alloc_max_slots);
4561                         return;
4562                 }
4563                 cur = mga->mga_cur_max_alloc_queue_depth;
4564         }
4565 }
4566
4567 void
4568 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, const void *tag,
4569     int flags, int allocator, boolean_t io_complete)
4570 {
4571         if (!(flags & METASLAB_ASYNC_ALLOC) ||
4572             (flags & METASLAB_DONT_THROTTLE))
4573                 return;
4574
4575         metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
4576         if (!mg->mg_class->mc_alloc_throttle_enabled)
4577                 return;
4578
4579         metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
4580         (void) zfs_refcount_remove(&mga->mga_alloc_queue_depth, tag);
4581         if (io_complete)
4582                 metaslab_group_increment_qdepth(mg, allocator);
4583 }
4584
4585 void
4586 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, const void *tag,
4587     int allocator)
4588 {
4589 #ifdef ZFS_DEBUG
4590         const dva_t *dva = bp->blk_dva;
4591         int ndvas = BP_GET_NDVAS(bp);
4592
4593         for (int d = 0; d < ndvas; d++) {
4594                 uint64_t vdev = DVA_GET_VDEV(&dva[d]);
4595                 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
4596                 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
4597                 VERIFY(zfs_refcount_not_held(&mga->mga_alloc_queue_depth, tag));
4598         }
4599 #endif
4600 }
4601
4602 static uint64_t
4603 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
4604 {
4605         uint64_t start;
4606         range_tree_t *rt = msp->ms_allocatable;
4607         metaslab_class_t *mc = msp->ms_group->mg_class;
4608
4609         ASSERT(MUTEX_HELD(&msp->ms_lock));
4610         VERIFY(!msp->ms_condensing);
4611         VERIFY0(msp->ms_disabled);
4612
4613         start = mc->mc_ops->msop_alloc(msp, size);
4614         if (start != -1ULL) {
4615                 metaslab_group_t *mg = msp->ms_group;
4616                 vdev_t *vd = mg->mg_vd;
4617
4618                 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
4619                 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
4620                 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
4621                 range_tree_remove(rt, start, size);
4622                 range_tree_clear(msp->ms_trim, start, size);
4623
4624                 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
4625                         vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
4626
4627                 range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
4628                 msp->ms_allocating_total += size;
4629
4630                 /* Track the last successful allocation */
4631                 msp->ms_alloc_txg = txg;
4632                 metaslab_verify_space(msp, txg);
4633         }
4634
4635         /*
4636          * Now that we've attempted the allocation we need to update the
4637          * metaslab's maximum block size since it may have changed.
4638          */
4639         msp->ms_max_size = metaslab_largest_allocatable(msp);
4640         return (start);
4641 }
4642
4643 /*
4644  * Find the metaslab with the highest weight that is less than what we've
4645  * already tried.  In the common case, this means that we will examine each
4646  * metaslab at most once. Note that concurrent callers could reorder metaslabs
4647  * by activation/passivation once we have dropped the mg_lock. If a metaslab is
4648  * activated by another thread, and we fail to allocate from the metaslab we
4649  * have selected, we may not try the newly-activated metaslab, and instead
4650  * activate another metaslab.  This is not optimal, but generally does not cause
4651  * any problems (a possible exception being if every metaslab is completely full
4652  * except for the newly-activated metaslab which we fail to examine).
4653  */
4654 static metaslab_t *
4655 find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
4656     dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
4657     boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search,
4658     boolean_t *was_active)
4659 {
4660         avl_index_t idx;
4661         avl_tree_t *t = &mg->mg_metaslab_tree;
4662         metaslab_t *msp = avl_find(t, search, &idx);
4663         if (msp == NULL)
4664                 msp = avl_nearest(t, idx, AVL_AFTER);
4665
4666         uint_t tries = 0;
4667         for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
4668                 int i;
4669
4670                 if (!try_hard && tries > zfs_metaslab_find_max_tries) {
4671                         METASLABSTAT_BUMP(metaslabstat_too_many_tries);
4672                         return (NULL);
4673                 }
4674                 tries++;
4675
4676                 if (!metaslab_should_allocate(msp, asize, try_hard)) {
4677                         metaslab_trace_add(zal, mg, msp, asize, d,
4678                             TRACE_TOO_SMALL, allocator);
4679                         continue;
4680                 }
4681
4682                 /*
4683                  * If the selected metaslab is condensing or disabled,
4684                  * skip it.
4685                  */
4686                 if (msp->ms_condensing || msp->ms_disabled > 0)
4687                         continue;
4688
4689                 *was_active = msp->ms_allocator != -1;
4690                 /*
4691                  * If we're activating as primary, this is our first allocation
4692                  * from this disk, so we don't need to check how close we are.
4693                  * If the metaslab under consideration was already active,
4694                  * we're getting desperate enough to steal another allocator's
4695                  * metaslab, so we still don't care about distances.
4696                  */
4697                 if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
4698                         break;
4699
4700                 for (i = 0; i < d; i++) {
4701                         if (want_unique &&
4702                             !metaslab_is_unique(msp, &dva[i]))
4703                                 break;  /* try another metaslab */
4704                 }
4705                 if (i == d)
4706                         break;
4707         }
4708
4709         if (msp != NULL) {
4710                 search->ms_weight = msp->ms_weight;
4711                 search->ms_start = msp->ms_start + 1;
4712                 search->ms_allocator = msp->ms_allocator;
4713                 search->ms_primary = msp->ms_primary;
4714         }
4715         return (msp);
4716 }
4717
4718 static void
4719 metaslab_active_mask_verify(metaslab_t *msp)
4720 {
4721         ASSERT(MUTEX_HELD(&msp->ms_lock));
4722
4723         if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
4724                 return;
4725
4726         if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0)
4727                 return;
4728
4729         if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) {
4730                 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
4731                 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
4732                 VERIFY3S(msp->ms_allocator, !=, -1);
4733                 VERIFY(msp->ms_primary);
4734                 return;
4735         }
4736
4737         if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) {
4738                 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
4739                 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
4740                 VERIFY3S(msp->ms_allocator, !=, -1);
4741                 VERIFY(!msp->ms_primary);
4742                 return;
4743         }
4744
4745         if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
4746                 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
4747                 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
4748                 VERIFY3S(msp->ms_allocator, ==, -1);
4749                 return;
4750         }
4751 }
4752
/*
 * Try to allocate 'asize' bytes in 'txg' from one metaslab of group 'mg'.
 *
 * Each pass of the main loop selects a candidate metaslab under mg_lock
 * (the allocator's current primary or secondary if there is one, otherwise
 * via find_valid_metaslab()), drops mg_lock, takes the metaslab's ms_lock,
 * re-validates the choice, activates the metaslab and attempts the
 * allocation with metaslab_block_alloc().  A failed attempt recomputes the
 * metaslab's weight and either passivates it (if this call activated it)
 * or re-sorts it, then the loop picks another candidate.
 *
 * 'dva' / 'd': the DVAs already allocated for this block are dva[0 .. d-1];
 * they determine the activation weight and, with 'want_unique', which
 * metaslabs find_valid_metaslab() will accept.
 * 'zal': allocation trace list that receives a record for each attempt.
 * 'try_hard': forwarded to metaslab_should_allocate() and
 * find_valid_metaslab().
 *
 * Returns the allocated offset, or -1ULL once no further candidate
 * metaslab can be found.
 */
static uint64_t
metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
    uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
    int allocator, boolean_t try_hard)
{
	metaslab_t *msp = NULL;
	uint64_t offset = -1ULL;

	/*
	 * Derive the activation weight from how many of the earlier DVAs
	 * already live on this vdev: none -> PRIMARY, one -> SECONDARY,
	 * two or more -> CLAIM.
	 */
	uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY;
	for (int i = 0; i < d; i++) {
		if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
		    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
			activation_weight = METASLAB_WEIGHT_SECONDARY;
		} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
		    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
			activation_weight = METASLAB_WEIGHT_CLAIM;
			break;
		}
	}

	/*
	 * If we don't have enough metaslabs active to fill the entire array, we
	 * just use the 0th slot.
	 */
	if (mg->mg_ms_ready < mg->mg_allocators * 3)
		allocator = 0;
	metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];

	ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);

	/*
	 * 'search' is not a real metaslab; it is a lookup key for the
	 * group's metaslab tree.  Only the four fields assigned below (and
	 * later updated by find_valid_metaslab()) are set here.  It is freed
	 * on both exit paths of this function.
	 */
	metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
	search->ms_weight = UINT64_MAX;
	search->ms_start = 0;
	/*
	 * At the end of the metaslab tree are the already-active metaslabs,
	 * first the primaries, then the secondaries. When we resume searching
	 * through the tree, we need to consider ms_allocator and ms_primary so
	 * we start in the location right after where we left off, and don't
	 * accidentally loop forever considering the same metaslabs.
	 */
	search->ms_allocator = -1;
	search->ms_primary = B_TRUE;
	for (;;) {
		boolean_t was_active = B_FALSE;

		mutex_enter(&mg->mg_lock);

		if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
		    mga->mga_primary != NULL) {
			msp = mga->mga_primary;

			/*
			 * Even though we don't hold the ms_lock for the
			 * primary metaslab, those fields should not
			 * change while we hold the mg_lock. Thus it is
			 * safe to make assertions on them.
			 */
			ASSERT(msp->ms_primary);
			ASSERT3S(msp->ms_allocator, ==, allocator);
			ASSERT(msp->ms_loaded);

			was_active = B_TRUE;
			ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
		} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
		    mga->mga_secondary != NULL) {
			msp = mga->mga_secondary;

			/*
			 * See comment above about the similar assertions
			 * for the primary metaslab.
			 */
			ASSERT(!msp->ms_primary);
			ASSERT3S(msp->ms_allocator, ==, allocator);
			ASSERT(msp->ms_loaded);

			was_active = B_TRUE;
			ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
		} else {
			msp = find_valid_metaslab(mg, activation_weight, dva, d,
			    want_unique, asize, allocator, try_hard, zal,
			    search, &was_active);
		}

		mutex_exit(&mg->mg_lock);
		/*
		 * Only find_valid_metaslab() can yield NULL here; that means
		 * there is nothing left to try in this group.
		 */
		if (msp == NULL) {
			kmem_free(search, sizeof (*search));
			return (-1ULL);
		}
		mutex_enter(&msp->ms_lock);

		metaslab_active_mask_verify(msp);

		/*
		 * This code is compiled out because of issues with
		 * tracepoints in non-gpl kernel modules.
		 */
#if 0
		DTRACE_PROBE3(ms__activation__attempt,
		    metaslab_t *, msp, uint64_t, activation_weight,
		    boolean_t, was_active);
#endif

		/*
		 * Ensure that the metaslab we have selected is still
		 * capable of handling our request. It's possible that
		 * another thread may have changed the weight while we
		 * were blocked on the metaslab lock. We check the
		 * active status first to see if we need to select
		 * a new metaslab.
		 */
		if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
			ASSERT3S(msp->ms_allocator, ==, -1);
			mutex_exit(&msp->ms_lock);
			continue;
		}

		/*
		 * If the metaslab was activated for another allocator
		 * while we were waiting in the ms_lock above, or it's
		 * a primary and we're seeking a secondary (or vice versa),
		 * we go back and select a new metaslab.
		 */
		if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
		    (msp->ms_allocator != -1) &&
		    (msp->ms_allocator != allocator || ((activation_weight ==
		    METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
			ASSERT(msp->ms_loaded);
			ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) ||
			    msp->ms_allocator != -1);
			mutex_exit(&msp->ms_lock);
			continue;
		}

		/*
		 * This metaslab was used for claiming regions allocated
		 * by the ZIL during pool import. Once these regions are
		 * claimed we don't need to keep the CLAIM bit set
		 * anymore. Passivate this metaslab to zero its activation
		 * mask.
		 */
		if (msp->ms_weight & METASLAB_WEIGHT_CLAIM &&
		    activation_weight != METASLAB_WEIGHT_CLAIM) {
			ASSERT(msp->ms_loaded);
			ASSERT3S(msp->ms_allocator, ==, -1);
			metaslab_passivate(msp, msp->ms_weight &
			    ~METASLAB_WEIGHT_CLAIM);
			mutex_exit(&msp->ms_lock);
			continue;
		}

		metaslab_set_selected_txg(msp, txg);

		int activation_error =
		    metaslab_activate(msp, allocator, activation_weight);
		metaslab_active_mask_verify(msp);

		/*
		 * If the metaslab was activated by another thread for
		 * another allocator or activation_weight (EBUSY), or it
		 * failed because another metaslab was assigned as primary
		 * for this allocator (EEXIST) we continue using this
		 * metaslab for our allocation, rather than going on to a
		 * worse metaslab (we waited for that metaslab to be loaded
		 * after all).
		 *
		 * If the activation failed due to an I/O error or ENOSPC we
		 * skip to the next metaslab.
		 */
		/*
		 * 'activated' records whether this call performed the
		 * activation itself; only then do the paths below passivate
		 * the metaslab.
		 */
		boolean_t activated;
		if (activation_error == 0) {
			activated = B_TRUE;
		} else if (activation_error == EBUSY ||
		    activation_error == EEXIST) {
			activated = B_FALSE;
		} else {
			mutex_exit(&msp->ms_lock);
			continue;
		}
		ASSERT(msp->ms_loaded);

		/*
		 * Now that we have the lock, recheck to see if we should
		 * continue to use this metaslab for this allocation. The
		 * metaslab is now loaded so metaslab_should_allocate()
		 * can accurately determine if the allocation attempt should
		 * proceed.
		 */
		if (!metaslab_should_allocate(msp, asize, try_hard)) {
			/* Passivate this metaslab and select a new one. */
			metaslab_trace_add(zal, mg, msp, asize, d,
			    TRACE_TOO_SMALL, allocator);
			goto next;
		}

		/*
		 * If this metaslab is currently condensing then pick again
		 * as we can't manipulate this metaslab until it's committed
		 * to disk. If this metaslab is being initialized, we shouldn't
		 * allocate from it since the allocated region might be
		 * overwritten after allocation.
		 */
		if (msp->ms_condensing) {
			metaslab_trace_add(zal, mg, msp, asize, d,
			    TRACE_CONDENSING, allocator);
			if (activated) {
				metaslab_passivate(msp, msp->ms_weight &
				    ~METASLAB_ACTIVE_MASK);
			}
			mutex_exit(&msp->ms_lock);
			continue;
		} else if (msp->ms_disabled > 0) {
			metaslab_trace_add(zal, mg, msp, asize, d,
			    TRACE_DISABLED, allocator);
			if (activated) {
				metaslab_passivate(msp, msp->ms_weight &
				    ~METASLAB_ACTIVE_MASK);
			}
			mutex_exit(&msp->ms_lock);
			continue;
		}

		offset = metaslab_block_alloc(msp, asize, txg);
		metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);

		/*
		 * Success: leave the loop with ms_lock still held; it is
		 * released right after the loop.
		 */
		if (offset != -1ULL) {
			/* Proactively passivate the metaslab, if needed */
			if (activated)
				metaslab_segment_may_passivate(msp);
			break;
		}
next:
		ASSERT(msp->ms_loaded);

		/*
		 * This code is compiled out because of issues with
		 * tracepoints in non-gpl kernel modules.
		 */
#if 0
		DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp,
		    uint64_t, asize);
#endif

		/*
		 * We were unable to allocate from this metaslab so determine
		 * a new weight for this metaslab. Now that we have loaded
		 * the metaslab we can provide a better hint to the metaslab
		 * selector.
		 *
		 * For space-based metaslabs, we use the maximum block size.
		 * This information is only available when the metaslab
		 * is loaded and is more accurate than the generic free
		 * space weight that was calculated by metaslab_weight().
		 * This information allows us to quickly compare the maximum
		 * available allocation in the metaslab to the allocation
		 * size being requested.
		 *
		 * For segment-based metaslabs, determine the new weight
		 * based on the highest bucket in the range tree. We
		 * explicitly use the loaded segment weight (i.e. the range
		 * tree histogram) since it contains the space that is
		 * currently available for allocation and is accurate
		 * even within a sync pass.
		 */
		uint64_t weight;
		if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
			weight = metaslab_largest_allocatable(msp);
			WEIGHT_SET_SPACEBASED(weight);
		} else {
			weight = metaslab_weight_from_range_tree(msp);
		}

		if (activated) {
			metaslab_passivate(msp, weight);
		} else {
			/*
			 * For the case where we use the metaslab that is
			 * active for another allocator we want to make
			 * sure that we retain the activation mask.
			 *
			 * Note that we could attempt to use something like
			 * metaslab_recalculate_weight_and_sort() that
			 * retains the activation mask here. That function
			 * uses metaslab_weight() to set the weight though
			 * which is not as accurate as the calculations
			 * above.
			 */
			weight |= msp->ms_weight & METASLAB_ACTIVE_MASK;
			metaslab_group_sort(mg, msp, weight);
		}
		metaslab_active_mask_verify(msp);

		/*
		 * We have just failed an allocation attempt, check
		 * that metaslab_should_allocate() agrees. Otherwise,
		 * we may end up in an infinite loop retrying the same
		 * metaslab.
		 */
		ASSERT(!metaslab_should_allocate(msp, asize, try_hard));

		mutex_exit(&msp->ms_lock);
	}
	/* Reached only via the success 'break' above, with ms_lock held. */
	mutex_exit(&msp->ms_lock);
	kmem_free(search, sizeof (*search));
	return (offset);
}
5058
5059 static uint64_t
5060 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
5061     uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
5062     int allocator, boolean_t try_hard)
5063 {
5064         uint64_t offset;
5065         ASSERT(mg->mg_initialized);
5066
5067         offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique,
5068             dva, d, allocator, try_hard);
5069
5070         mutex_enter(&mg->mg_lock);
5071         if (offset == -1ULL) {
5072                 mg->mg_failed_allocations++;
5073                 metaslab_trace_add(zal, mg, NULL, asize, d,
5074                     TRACE_GROUP_FAILURE, allocator);
5075                 if (asize == SPA_GANGBLOCKSIZE) {
5076                         /*
5077                          * This metaslab group was unable to allocate
5078                          * the minimum gang block size so it must be out of
5079                          * space. We must notify the allocation throttle
5080                          * to start skipping allocation attempts to this
5081                          * metaslab group until more space becomes available.
5082                          * Note: this failure cannot be caused by the
5083                          * allocation throttle since the allocation throttle
5084                          * is only responsible for skipping devices and
5085                          * not failing block allocations.
5086                          */
5087                         mg->mg_no_free_space = B_TRUE;
5088                 }
5089         }
5090         mg->mg_allocations++;
5091         mutex_exit(&mg->mg_lock);
5092         return (offset);
5093 }
5094
5095 /*
5096  * Allocate a block for the specified i/o.
5097  */
5098 int
5099 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
5100     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
5101     zio_alloc_list_t *zal, int allocator)
5102 {
5103         metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
5104         metaslab_group_t *mg, *rotor;
5105         vdev_t *vd;
5106         boolean_t try_hard = B_FALSE;
5107
5108         ASSERT(!DVA_IS_VALID(&dva[d]));
5109
5110         /*
5111          * For testing, make some blocks above a certain size be gang blocks.
5112          * This will result in more split blocks when using device removal,
5113          * and a large number of split blocks coupled with ztest-induced
5114          * damage can result in extremely long reconstruction times.  This
5115          * will also test spilling from special to normal.
5116          */
5117         if (psize >= metaslab_force_ganging &&
5118             metaslab_force_ganging_pct > 0 &&
5119             (random_in_range(100) < MIN(metaslab_force_ganging_pct, 100))) {
5120                 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
5121                     allocator);
5122                 return (SET_ERROR(ENOSPC));
5123         }
5124
5125         /*
5126          * Start at the rotor and loop through all mgs until we find something.
5127          * Note that there's no locking on mca_rotor or mca_aliquot because
5128          * nothing actually breaks if we miss a few updates -- we just won't
5129          * allocate quite as evenly.  It all balances out over time.
5130          *
5131          * If we are doing ditto or log blocks, try to spread them across
5132          * consecutive vdevs.  If we're forced to reuse a vdev before we've
5133          * allocated all of our ditto blocks, then try and spread them out on
5134          * that vdev as much as possible.  If it turns out to not be possible,
5135          * gradually lower our standards until anything becomes acceptable.
5136          * Also, allocating on consecutive vdevs (as opposed to random vdevs)
5137          * gives us hope of containing our fault domains to something we're
5138          * able to reason about.  Otherwise, any two top-level vdev failures
5139          * will guarantee the loss of data.  With consecutive allocation,
5140          * only two adjacent top-level vdev failures will result in data loss.
5141          *
5142          * If we are doing gang blocks (hintdva is non-NULL), try to keep
5143          * ourselves on the same vdev as our gang block header.  That
5144          * way, we can hope for locality in vdev_cache, plus it makes our
5145          * fault domains something tractable.
5146          */
5147         if (hintdva) {
5148                 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
5149
5150                 /*
5151                  * It's possible the vdev we're using as the hint no
5152                  * longer exists or its mg has been closed (e.g. by
5153                  * device removal).  Consult the rotor when
5154                  * all else fails.
5155                  */
5156                 if (vd != NULL && vd->vdev_mg != NULL) {
5157                         mg = vdev_get_mg(vd, mc);
5158
5159                         if (flags & METASLAB_HINTBP_AVOID)
5160                                 mg = mg->mg_next;
5161                 } else {
5162                         mg = mca->mca_rotor;
5163                 }
5164         } else if (d != 0) {
5165                 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
5166                 mg = vd->vdev_mg->mg_next;
5167         } else {
5168                 ASSERT(mca->mca_rotor != NULL);
5169                 mg = mca->mca_rotor;
5170         }
5171
5172         /*
5173          * If the hint put us into the wrong metaslab class, or into a
5174          * metaslab group that has been passivated, just follow the rotor.
5175          */
5176         if (mg->mg_class != mc || mg->mg_activation_count <= 0)
5177                 mg = mca->mca_rotor;
5178
5179         rotor = mg;
5180 top:
5181         do {
5182                 boolean_t allocatable;
5183
5184                 ASSERT(mg->mg_activation_count == 1);
5185                 vd = mg->mg_vd;
5186
5187                 /*
5188                  * Don't allocate from faulted devices.
5189                  */
5190                 if (try_hard) {
5191                         spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
5192                         allocatable = vdev_allocatable(vd);
5193                         spa_config_exit(spa, SCL_ZIO, FTAG);
5194                 } else {
5195                         allocatable = vdev_allocatable(vd);
5196                 }
5197
5198                 /*
5199                  * Determine if the selected metaslab group is eligible
5200                  * for allocations. If we're ganging then don't allow
5201                  * this metaslab group to skip allocations since that would
5202                  * inadvertently return ENOSPC and suspend the pool
5203                  * even though space is still available.
5204                  */
5205                 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
5206                         allocatable = metaslab_group_allocatable(mg, rotor,
5207                             flags, psize, allocator, d);
5208                 }
5209
5210                 if (!allocatable) {
5211                         metaslab_trace_add(zal, mg, NULL, psize, d,
5212                             TRACE_NOT_ALLOCATABLE, allocator);
5213                         goto next;
5214                 }
5215
5216                 ASSERT(mg->mg_initialized);
5217
5218                 /*
5219                  * Avoid writing single-copy data to an unhealthy,
5220                  * non-redundant vdev, unless we've already tried all
5221                  * other vdevs.
5222                  */
5223                 if (vd->vdev_state < VDEV_STATE_HEALTHY &&
5224                     d == 0 && !try_hard && vd->vdev_children == 0) {
5225                         metaslab_trace_add(zal, mg, NULL, psize, d,
5226                             TRACE_VDEV_ERROR, allocator);
5227                         goto next;
5228                 }
5229
5230                 ASSERT(mg->mg_class == mc);
5231
5232                 uint64_t asize = vdev_psize_to_asize(vd, psize);
5233                 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
5234
5235                 /*
5236                  * If we don't need to try hard, then require that the
5237                  * block be on a different metaslab from any other DVAs
5238                  * in this BP (unique=true).  If we are trying hard, then
5239                  * allow any metaslab to be used (unique=false).
5240                  */
5241                 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
5242                     !try_hard, dva, d, allocator, try_hard);
5243
5244                 if (offset != -1ULL) {
5245                         /*
5246                          * If we've just selected this metaslab group,
5247                          * figure out whether the corresponding vdev is
5248                          * over- or under-used relative to the pool,
5249                          * and set an allocation bias to even it out.
5250                          *
5251                          * Bias is also used to compensate for unequally
5252                          * sized vdevs so that space is allocated fairly.
5253                          */
5254                         if (mca->mca_aliquot == 0 && metaslab_bias_enabled) {
5255                                 vdev_stat_t *vs = &vd->vdev_stat;
5256                                 int64_t vs_free = vs->vs_space - vs->vs_alloc;
5257                                 int64_t mc_free = mc->mc_space - mc->mc_alloc;
5258                                 int64_t ratio;
5259
5260                                 /*
5261                                  * Calculate how much more or less we should
5262                                  * try to allocate from this device during
5263                                  * this iteration around the rotor.
5264                                  *
5265                                  * This basically introduces a zero-centered
5266                                  * bias towards the devices with the most
5267                                  * free space, while compensating for vdev
5268                                  * size differences.
5269                                  *
5270                                  * Examples:
5271                                  *  vdev V1 = 16M/128M
5272                                  *  vdev V2 = 16M/128M
5273                                  *  ratio(V1) = 100% ratio(V2) = 100%
5274                                  *
5275                                  *  vdev V1 = 16M/128M
5276                                  *  vdev V2 = 64M/128M
5277                                  *  ratio(V1) = 127% ratio(V2) =  72%
5278                                  *
5279                                  *  vdev V1 = 16M/128M
5280                                  *  vdev V2 = 64M/512M
5281                                  *  ratio(V1) =  40% ratio(V2) = 160%
5282                                  */
5283                                 ratio = (vs_free * mc->mc_alloc_groups * 100) /
5284                                     (mc_free + 1);
5285                                 mg->mg_bias = ((ratio - 100) *
5286                                     (int64_t)mg->mg_aliquot) / 100;
5287                         } else if (!metaslab_bias_enabled) {
5288                                 mg->mg_bias = 0;
5289                         }
5290
5291                         if ((flags & METASLAB_ZIL) ||
5292                             atomic_add_64_nv(&mca->mca_aliquot, asize) >=
5293                             mg->mg_aliquot + mg->mg_bias) {
5294                                 mca->mca_rotor = mg->mg_next;
5295                                 mca->mca_aliquot = 0;
5296                         }
5297
5298                         DVA_SET_VDEV(&dva[d], vd->vdev_id);
5299                         DVA_SET_OFFSET(&dva[d], offset);
5300                         DVA_SET_GANG(&dva[d],
5301                             ((flags & METASLAB_GANG_HEADER) ? 1 : 0));
5302                         DVA_SET_ASIZE(&dva[d], asize);
5303
5304                         return (0);
5305                 }
5306 next:
5307                 mca->mca_rotor = mg->mg_next;
5308                 mca->mca_aliquot = 0;
5309         } while ((mg = mg->mg_next) != rotor);
5310
5311         /*
5312          * If we haven't tried hard, perhaps do so now.
5313          */
5314         if (!try_hard && (zfs_metaslab_try_hard_before_gang ||
5315             GANG_ALLOCATION(flags) || (flags & METASLAB_ZIL) != 0 ||
5316             psize <= 1 << spa->spa_min_ashift)) {
5317                 METASLABSTAT_BUMP(metaslabstat_try_hard);
5318                 try_hard = B_TRUE;
5319                 goto top;
5320         }
5321
5322         memset(&dva[d], 0, sizeof (dva_t));
5323
5324         metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
5325         return (SET_ERROR(ENOSPC));
5326 }
5327
5328 void
5329 metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
5330     boolean_t checkpoint)
5331 {
5332         metaslab_t *msp;
5333         spa_t *spa = vd->vdev_spa;
5334
5335         ASSERT(vdev_is_concrete(vd));
5336         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5337         ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
5338
5339         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5340
5341         VERIFY(!msp->ms_condensing);
5342         VERIFY3U(offset, >=, msp->ms_start);
5343         VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
5344         VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
5345         VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
5346
5347         metaslab_check_free_impl(vd, offset, asize);
5348
5349         mutex_enter(&msp->ms_lock);
5350         if (range_tree_is_empty(msp->ms_freeing) &&
5351             range_tree_is_empty(msp->ms_checkpointing)) {
5352                 vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
5353         }
5354
5355         if (checkpoint) {
5356                 ASSERT(spa_has_checkpoint(spa));
5357                 range_tree_add(msp->ms_checkpointing, offset, asize);
5358         } else {
5359                 range_tree_add(msp->ms_freeing, offset, asize);
5360         }
5361         mutex_exit(&msp->ms_lock);
5362 }
5363
5364 void
5365 metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
5366     uint64_t size, void *arg)
5367 {
5368         (void) inner_offset;
5369         boolean_t *checkpoint = arg;
5370
5371         ASSERT3P(checkpoint, !=, NULL);
5372
5373         if (vd->vdev_ops->vdev_op_remap != NULL)
5374                 vdev_indirect_mark_obsolete(vd, offset, size);
5375         else
5376                 metaslab_free_impl(vd, offset, size, *checkpoint);
5377 }
5378
5379 static void
5380 metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
5381     boolean_t checkpoint)
5382 {
5383         spa_t *spa = vd->vdev_spa;
5384
5385         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5386
5387         if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
5388                 return;
5389
5390         if (spa->spa_vdev_removal != NULL &&
5391             spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
5392             vdev_is_concrete(vd)) {
5393                 /*
5394                  * Note: we check if the vdev is concrete because when
5395                  * we complete the removal, we first change the vdev to be
5396                  * an indirect vdev (in open context), and then (in syncing
5397                  * context) clear spa_vdev_removal.
5398                  */
5399                 free_from_removing_vdev(vd, offset, size);
5400         } else if (vd->vdev_ops->vdev_op_remap != NULL) {
5401                 vdev_indirect_mark_obsolete(vd, offset, size);
5402                 vd->vdev_ops->vdev_op_remap(vd, offset, size,
5403                     metaslab_free_impl_cb, &checkpoint);
5404         } else {
5405                 metaslab_free_concrete(vd, offset, size, checkpoint);
5406         }
5407 }
5408
5409 typedef struct remap_blkptr_cb_arg {
5410         blkptr_t *rbca_bp;
5411         spa_remap_cb_t rbca_cb;
5412         vdev_t *rbca_remap_vd;
5413         uint64_t rbca_remap_offset;
5414         void *rbca_cb_arg;
5415 } remap_blkptr_cb_arg_t;
5416
5417 static void
5418 remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
5419     uint64_t size, void *arg)
5420 {
5421         remap_blkptr_cb_arg_t *rbca = arg;
5422         blkptr_t *bp = rbca->rbca_bp;
5423
5424         /* We can not remap split blocks. */
5425         if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
5426                 return;
5427         ASSERT0(inner_offset);
5428
5429         if (rbca->rbca_cb != NULL) {
5430                 /*
5431                  * At this point we know that we are not handling split
5432                  * blocks and we invoke the callback on the previous
5433                  * vdev which must be indirect.
5434                  */
5435                 ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);
5436
5437                 rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
5438                     rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);
5439
5440                 /* set up remap_blkptr_cb_arg for the next call */
5441                 rbca->rbca_remap_vd = vd;
5442                 rbca->rbca_remap_offset = offset;
5443         }
5444
5445         /*
5446          * The phys birth time is that of dva[0].  This ensures that we know
5447          * when each dva was written, so that resilver can determine which
5448          * blocks need to be scrubbed (i.e. those written during the time
5449          * the vdev was offline).  It also ensures that the key used in
5450          * the ARC hash table is unique (i.e. dva[0] + phys_birth).  If
5451          * we didn't change the phys_birth, a lookup in the ARC for a
5452          * remapped BP could find the data that was previously stored at
5453          * this vdev + offset.
5454          */
5455         vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
5456             DVA_GET_VDEV(&bp->blk_dva[0]));
5457         vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
5458         bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
5459             DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
5460
5461         DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
5462         DVA_SET_OFFSET(&bp->blk_dva[0], offset);
5463 }
5464
5465 /*
5466  * If the block pointer contains any indirect DVAs, modify them to refer to
5467  * concrete DVAs.  Note that this will sometimes not be possible, leaving
5468  * the indirect DVA in place.  This happens if the indirect DVA spans multiple
5469  * segments in the mapping (i.e. it is a "split block").
5470  *
5471  * If the BP was remapped, calls the callback on the original dva (note the
5472  * callback can be called multiple times if the original indirect DVA refers
5473  * to another indirect DVA, etc).
5474  *
5475  * Returns TRUE if the BP was remapped.
5476  */
5477 boolean_t
5478 spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
5479 {
5480         remap_blkptr_cb_arg_t rbca;
5481
5482         if (!zfs_remap_blkptr_enable)
5483                 return (B_FALSE);
5484
5485         if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
5486                 return (B_FALSE);
5487
5488         /*
5489          * Dedup BP's can not be remapped, because ddt_phys_select() depends
5490          * on DVA[0] being the same in the BP as in the DDT (dedup table).
5491          */
5492         if (BP_GET_DEDUP(bp))
5493                 return (B_FALSE);
5494
5495         /*
5496          * Gang blocks can not be remapped, because
5497          * zio_checksum_gang_verifier() depends on the DVA[0] that's in
5498          * the BP used to read the gang block header (GBH) being the same
5499          * as the DVA[0] that we allocated for the GBH.
5500          */
5501         if (BP_IS_GANG(bp))
5502                 return (B_FALSE);
5503
5504         /*
5505          * Embedded BP's have no DVA to remap.
5506          */
5507         if (BP_GET_NDVAS(bp) < 1)
5508                 return (B_FALSE);
5509
5510         /*
5511          * Note: we only remap dva[0].  If we remapped other dvas, we
5512          * would no longer know what their phys birth txg is.
5513          */
5514         dva_t *dva = &bp->blk_dva[0];
5515
5516         uint64_t offset = DVA_GET_OFFSET(dva);
5517         uint64_t size = DVA_GET_ASIZE(dva);
5518         vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
5519
5520         if (vd->vdev_ops->vdev_op_remap == NULL)
5521                 return (B_FALSE);
5522
5523         rbca.rbca_bp = bp;
5524         rbca.rbca_cb = callback;
5525         rbca.rbca_remap_vd = vd;
5526         rbca.rbca_remap_offset = offset;
5527         rbca.rbca_cb_arg = arg;
5528
5529         /*
5530          * remap_blkptr_cb() will be called in order for each level of
5531          * indirection, until a concrete vdev is reached or a split block is
5532          * encountered. old_vd and old_offset are updated within the callback
5533          * as we go from the one indirect vdev to the next one (either concrete
5534          * or indirect again) in that order.
5535          */
5536         vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);
5537
5538         /* Check if the DVA wasn't remapped because it is a split block */
5539         if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
5540                 return (B_FALSE);
5541
5542         return (B_TRUE);
5543 }
5544
5545 /*
5546  * Undo the allocation of a DVA which happened in the given transaction group.
5547  */
5548 void
5549 metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
5550 {
5551         metaslab_t *msp;
5552         vdev_t *vd;
5553         uint64_t vdev = DVA_GET_VDEV(dva);
5554         uint64_t offset = DVA_GET_OFFSET(dva);
5555         uint64_t size = DVA_GET_ASIZE(dva);
5556
5557         ASSERT(DVA_IS_VALID(dva));
5558         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5559
5560         if (txg > spa_freeze_txg(spa))
5561                 return;
5562
5563         if ((vd = vdev_lookup_top(spa, vdev)) == NULL || !DVA_IS_VALID(dva) ||
5564             (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
5565                 zfs_panic_recover("metaslab_free_dva(): bad DVA %llu:%llu:%llu",
5566                     (u_longlong_t)vdev, (u_longlong_t)offset,
5567                     (u_longlong_t)size);
5568                 return;
5569         }
5570
5571         ASSERT(!vd->vdev_removing);
5572         ASSERT(vdev_is_concrete(vd));
5573         ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
5574         ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
5575
5576         if (DVA_GET_GANG(dva))
5577                 size = vdev_gang_header_asize(vd);
5578
5579         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5580
5581         mutex_enter(&msp->ms_lock);
5582         range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
5583             offset, size);
5584         msp->ms_allocating_total -= size;
5585
5586         VERIFY(!msp->ms_condensing);
5587         VERIFY3U(offset, >=, msp->ms_start);
5588         VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
5589         VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=,
5590             msp->ms_size);
5591         VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
5592         VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
5593         range_tree_add(msp->ms_allocatable, offset, size);
5594         mutex_exit(&msp->ms_lock);
5595 }
5596
5597 /*
5598  * Free the block represented by the given DVA.
5599  */
5600 void
5601 metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
5602 {
5603         uint64_t vdev = DVA_GET_VDEV(dva);
5604         uint64_t offset = DVA_GET_OFFSET(dva);
5605         uint64_t size = DVA_GET_ASIZE(dva);
5606         vdev_t *vd = vdev_lookup_top(spa, vdev);
5607
5608         ASSERT(DVA_IS_VALID(dva));
5609         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5610
5611         if (DVA_GET_GANG(dva)) {
5612                 size = vdev_gang_header_asize(vd);
5613         }
5614
5615         metaslab_free_impl(vd, offset, size, checkpoint);
5616 }
5617
5618 /*
5619  * Reserve some allocation slots. The reservation system must be called
5620  * before we call into the allocator. If there aren't any available slots
5621  * then the I/O will be throttled until an I/O completes and its slots are
5622  * freed up. The function returns true if it was successful in placing
5623  * the reservation.
5624  */
5625 boolean_t
5626 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
5627     zio_t *zio, int flags)
5628 {
5629         metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
5630         uint64_t max = mca->mca_alloc_max_slots;
5631
5632         ASSERT(mc->mc_alloc_throttle_enabled);
5633         if (GANG_ALLOCATION(flags) || (flags & METASLAB_MUST_RESERVE) ||
5634             zfs_refcount_count(&mca->mca_alloc_slots) + slots <= max) {
5635                 /*
5636                  * The potential race between _count() and _add() is covered
5637                  * by the allocator lock in most cases, or irrelevant due to
5638                  * GANG_ALLOCATION() or METASLAB_MUST_RESERVE set in others.
5639                  * But even if we assume some other non-existing scenario, the
5640                  * worst that can happen is few more I/Os get to allocation
5641                  * earlier, that is not a problem.
5642                  *
5643                  * We reserve the slots individually so that we can unreserve
5644                  * them individually when an I/O completes.
5645                  */
5646                 zfs_refcount_add_few(&mca->mca_alloc_slots, slots, zio);
5647                 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
5648                 return (B_TRUE);
5649         }
5650         return (B_FALSE);
5651 }
5652
5653 void
5654 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
5655     int allocator, zio_t *zio)
5656 {
5657         metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
5658
5659         ASSERT(mc->mc_alloc_throttle_enabled);
5660         zfs_refcount_remove_few(&mca->mca_alloc_slots, slots, zio);
5661 }
5662
5663 static int
5664 metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
5665     uint64_t txg)
5666 {
5667         metaslab_t *msp;
5668         spa_t *spa = vd->vdev_spa;
5669         int error = 0;
5670
5671         if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
5672                 return (SET_ERROR(ENXIO));
5673
5674         ASSERT3P(vd->vdev_ms, !=, NULL);
5675         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5676
5677         mutex_enter(&msp->ms_lock);
5678
5679         if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) {
5680                 error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
5681                 if (error == EBUSY) {
5682                         ASSERT(msp->ms_loaded);
5683                         ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
5684                         error = 0;
5685                 }
5686         }
5687
5688         if (error == 0 &&
5689             !range_tree_contains(msp->ms_allocatable, offset, size))
5690                 error = SET_ERROR(ENOENT);
5691
5692         if (error || txg == 0) {        /* txg == 0 indicates dry run */
5693                 mutex_exit(&msp->ms_lock);
5694                 return (error);
5695         }
5696
5697         VERIFY(!msp->ms_condensing);
5698         VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
5699         VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
5700         VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
5701             msp->ms_size);
5702         range_tree_remove(msp->ms_allocatable, offset, size);
5703         range_tree_clear(msp->ms_trim, offset, size);
5704
5705         if (spa_writeable(spa)) {       /* don't dirty if we're zdb(8) */
5706                 metaslab_class_t *mc = msp->ms_group->mg_class;
5707                 multilist_sublist_t *mls =
5708                     multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
5709                 if (!multilist_link_active(&msp->ms_class_txg_node)) {
5710                         msp->ms_selected_txg = txg;
5711                         multilist_sublist_insert_head(mls, msp);
5712                 }
5713                 multilist_sublist_unlock(mls);
5714
5715                 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
5716                         vdev_dirty(vd, VDD_METASLAB, msp, txg);
5717                 range_tree_add(msp->ms_allocating[txg & TXG_MASK],
5718                     offset, size);
5719                 msp->ms_allocating_total += size;
5720         }
5721
5722         mutex_exit(&msp->ms_lock);
5723
5724         return (0);
5725 }
5726
/*
 * Argument block that metaslab_claim_impl() passes to
 * metaslab_claim_impl_cb() through vdev_op_remap().
 */
typedef struct metaslab_claim_cb_arg_t {
        /* txg forwarded to metaslab_claim_concrete() */
        uint64_t        mcca_txg;
        /* first error seen; 0 while every claim has succeeded */
        int             mcca_error;
} metaslab_claim_cb_arg_t;
5731
5732 static void
5733 metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
5734     uint64_t size, void *arg)
5735 {
5736         (void) inner_offset;
5737         metaslab_claim_cb_arg_t *mcca_arg = arg;
5738
5739         if (mcca_arg->mcca_error == 0) {
5740                 mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
5741                     size, mcca_arg->mcca_txg);
5742         }
5743 }
5744
5745 int
5746 metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
5747 {
5748         if (vd->vdev_ops->vdev_op_remap != NULL) {
5749                 metaslab_claim_cb_arg_t arg;
5750
5751                 /*
5752                  * Only zdb(8) can claim on indirect vdevs.  This is used
5753                  * to detect leaks of mapped space (that are not accounted
5754                  * for in the obsolete counts, spacemap, or bpobj).
5755                  */
5756                 ASSERT(!spa_writeable(vd->vdev_spa));
5757                 arg.mcca_error = 0;
5758                 arg.mcca_txg = txg;
5759
5760                 vd->vdev_ops->vdev_op_remap(vd, offset, size,
5761                     metaslab_claim_impl_cb, &arg);
5762
5763                 if (arg.mcca_error == 0) {
5764                         arg.mcca_error = metaslab_claim_concrete(vd,
5765                             offset, size, txg);
5766                 }
5767                 return (arg.mcca_error);
5768         } else {
5769                 return (metaslab_claim_concrete(vd, offset, size, txg));
5770         }
5771 }
5772
5773 /*
5774  * Intent log support: upon opening the pool after a crash, notify the SPA
5775  * of blocks that the intent log has allocated for immediate write, but
5776  * which are still considered free by the SPA because the last transaction
5777  * group didn't commit yet.
5778  */
5779 static int
5780 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
5781 {
5782         uint64_t vdev = DVA_GET_VDEV(dva);
5783         uint64_t offset = DVA_GET_OFFSET(dva);
5784         uint64_t size = DVA_GET_ASIZE(dva);
5785         vdev_t *vd;
5786
5787         if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
5788                 return (SET_ERROR(ENXIO));
5789         }
5790
5791         ASSERT(DVA_IS_VALID(dva));
5792
5793         if (DVA_GET_GANG(dva))
5794                 size = vdev_gang_header_asize(vd);
5795
5796         return (metaslab_claim_impl(vd, offset, size, txg));
5797 }
5798
5799 int
5800 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
5801     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
5802     zio_alloc_list_t *zal, zio_t *zio, int allocator)
5803 {
5804         dva_t *dva = bp->blk_dva;
5805         dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
5806         int error = 0;
5807
5808         ASSERT(bp->blk_birth == 0);
5809         ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
5810
5811         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
5812
5813         if (mc->mc_allocator[allocator].mca_rotor == NULL) {
5814                 /* no vdevs in this class */
5815                 spa_config_exit(spa, SCL_ALLOC, FTAG);
5816                 return (SET_ERROR(ENOSPC));
5817         }
5818
5819         ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
5820         ASSERT(BP_GET_NDVAS(bp) == 0);
5821         ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
5822         ASSERT3P(zal, !=, NULL);
5823
5824         for (int d = 0; d < ndvas; d++) {
5825                 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
5826                     txg, flags, zal, allocator);
5827                 if (error != 0) {
5828                         for (d--; d >= 0; d--) {
5829                                 metaslab_unalloc_dva(spa, &dva[d], txg);
5830                                 metaslab_group_alloc_decrement(spa,
5831                                     DVA_GET_VDEV(&dva[d]), zio, flags,
5832                                     allocator, B_FALSE);
5833                                 memset(&dva[d], 0, sizeof (dva_t));
5834                         }
5835                         spa_config_exit(spa, SCL_ALLOC, FTAG);
5836                         return (error);
5837                 } else {
5838                         /*
5839                          * Update the metaslab group's queue depth
5840                          * based on the newly allocated dva.
5841                          */
5842                         metaslab_group_alloc_increment(spa,
5843                             DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
5844                 }
5845         }
5846         ASSERT(error == 0);
5847         ASSERT(BP_GET_NDVAS(bp) == ndvas);
5848
5849         spa_config_exit(spa, SCL_ALLOC, FTAG);
5850
5851         BP_SET_BIRTH(bp, txg, 0);
5852
5853         return (0);
5854 }
5855
5856 void
5857 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
5858 {
5859         const dva_t *dva = bp->blk_dva;
5860         int ndvas = BP_GET_NDVAS(bp);
5861
5862         ASSERT(!BP_IS_HOLE(bp));
5863         ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
5864
5865         /*
5866          * If we have a checkpoint for the pool we need to make sure that
5867          * the blocks that we free that are part of the checkpoint won't be
5868          * reused until the checkpoint is discarded or we revert to it.
5869          *
5870          * The checkpoint flag is passed down the metaslab_free code path
5871          * and is set whenever we want to add a block to the checkpoint's
5872          * accounting. That is, we "checkpoint" blocks that existed at the
5873          * time the checkpoint was created and are therefore referenced by
5874          * the checkpointed uberblock.
5875          *
5876          * Note that, we don't checkpoint any blocks if the current
5877          * syncing txg <= spa_checkpoint_txg. We want these frees to sync
5878          * normally as they will be referenced by the checkpointed uberblock.
5879          */
5880         boolean_t checkpoint = B_FALSE;
5881         if (bp->blk_birth <= spa->spa_checkpoint_txg &&
5882             spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
5883                 /*
5884                  * At this point, if the block is part of the checkpoint
5885                  * there is no way it was created in the current txg.
5886                  */
5887                 ASSERT(!now);
5888                 ASSERT3U(spa_syncing_txg(spa), ==, txg);
5889                 checkpoint = B_TRUE;
5890         }
5891
5892         spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
5893
5894         for (int d = 0; d < ndvas; d++) {
5895                 if (now) {
5896                         metaslab_unalloc_dva(spa, &dva[d], txg);
5897                 } else {
5898                         ASSERT3U(txg, ==, spa_syncing_txg(spa));
5899                         metaslab_free_dva(spa, &dva[d], checkpoint);
5900                 }
5901         }
5902
5903         spa_config_exit(spa, SCL_FREE, FTAG);
5904 }
5905
5906 int
5907 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
5908 {
5909         const dva_t *dva = bp->blk_dva;
5910         int ndvas = BP_GET_NDVAS(bp);
5911         int error = 0;
5912
5913         ASSERT(!BP_IS_HOLE(bp));
5914
5915         if (txg != 0) {
5916                 /*
5917                  * First do a dry run to make sure all DVAs are claimable,
5918                  * so we don't have to unwind from partial failures below.
5919                  */
5920                 if ((error = metaslab_claim(spa, bp, 0)) != 0)
5921                         return (error);
5922         }
5923
5924         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
5925
5926         for (int d = 0; d < ndvas; d++) {
5927                 error = metaslab_claim_dva(spa, &dva[d], txg);
5928                 if (error != 0)
5929                         break;
5930         }
5931
5932         spa_config_exit(spa, SCL_ALLOC, FTAG);
5933
5934         ASSERT(error == 0 || txg == 0);
5935
5936         return (error);
5937 }
5938
5939 static void
5940 metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
5941     uint64_t size, void *arg)
5942 {
5943         (void) inner, (void) arg;
5944
5945         if (vd->vdev_ops == &vdev_indirect_ops)
5946                 return;
5947
5948         metaslab_check_free_impl(vd, offset, size);
5949 }
5950
5951 static void
5952 metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
5953 {
5954         metaslab_t *msp;
5955         spa_t *spa __maybe_unused = vd->vdev_spa;
5956
5957         if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
5958                 return;
5959
5960         if (vd->vdev_ops->vdev_op_remap != NULL) {
5961                 vd->vdev_ops->vdev_op_remap(vd, offset, size,
5962                     metaslab_check_free_impl_cb, NULL);
5963                 return;
5964         }
5965
5966         ASSERT(vdev_is_concrete(vd));
5967         ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
5968         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5969
5970         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5971
5972         mutex_enter(&msp->ms_lock);
5973         if (msp->ms_loaded) {
5974                 range_tree_verify_not_present(msp->ms_allocatable,
5975                     offset, size);
5976         }
5977
5978         /*
5979          * Check all segments that currently exist in the freeing pipeline.
5980          *
5981          * It would intuitively make sense to also check the current allocating
5982          * tree since metaslab_unalloc_dva() exists for extents that are
5983          * allocated and freed in the same sync pass within the same txg.
5984          * Unfortunately there are places (e.g. the ZIL) where we allocate a
5985          * segment but then we free part of it within the same txg
5986          * [see zil_sync()]. Thus, we don't call range_tree_verify() in the
5987          * current allocating tree.
5988          */
5989         range_tree_verify_not_present(msp->ms_freeing, offset, size);
5990         range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
5991         range_tree_verify_not_present(msp->ms_freed, offset, size);
5992         for (int j = 0; j < TXG_DEFER_SIZE; j++)
5993                 range_tree_verify_not_present(msp->ms_defer[j], offset, size);
5994         range_tree_verify_not_present(msp->ms_trim, offset, size);
5995         mutex_exit(&msp->ms_lock);
5996 }
5997
5998 void
5999 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
6000 {
6001         if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
6002                 return;
6003
6004         spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
6005         for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
6006                 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
6007                 vdev_t *vd = vdev_lookup_top(spa, vdev);
6008                 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
6009                 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
6010
6011                 if (DVA_GET_GANG(&bp->blk_dva[i]))
6012                         size = vdev_gang_header_asize(vd);
6013
6014                 ASSERT3P(vd, !=, NULL);
6015
6016                 metaslab_check_free_impl(vd, offset, size);
6017         }
6018         spa_config_exit(spa, SCL_VDEV, FTAG);
6019 }
6020
6021 static void
6022 metaslab_group_disable_wait(metaslab_group_t *mg)
6023 {
6024         ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
6025         while (mg->mg_disabled_updating) {
6026                 cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
6027         }
6028 }
6029
6030 static void
6031 metaslab_group_disabled_increment(metaslab_group_t *mg)
6032 {
6033         ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
6034         ASSERT(mg->mg_disabled_updating);
6035
6036         while (mg->mg_ms_disabled >= max_disabled_ms) {
6037                 cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
6038         }
6039         mg->mg_ms_disabled++;
6040         ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms);
6041 }
6042
6043 /*
6044  * Mark the metaslab as disabled to prevent any allocations on this metaslab.
6045  * We must also track how many metaslabs are currently disabled within a
6046  * metaslab group and limit them to prevent allocation failures from
6047  * occurring because all metaslabs are disabled.
6048  */
6049 void
6050 metaslab_disable(metaslab_t *msp)
6051 {
6052         ASSERT(!MUTEX_HELD(&msp->ms_lock));
6053         metaslab_group_t *mg = msp->ms_group;
6054
6055         mutex_enter(&mg->mg_ms_disabled_lock);
6056
6057         /*
6058          * To keep an accurate count of how many threads have disabled
6059          * a specific metaslab group, we only allow one thread to mark
6060          * the metaslab group at a time. This ensures that the value of
6061          * ms_disabled will be accurate when we decide to mark a metaslab
6062          * group as disabled. To do this we force all other threads
6063          * to wait till the metaslab's mg_disabled_updating flag is no
6064          * longer set.
6065          */
6066         metaslab_group_disable_wait(mg);
6067         mg->mg_disabled_updating = B_TRUE;
6068         if (msp->ms_disabled == 0) {
6069                 metaslab_group_disabled_increment(mg);
6070         }
6071         mutex_enter(&msp->ms_lock);
6072         msp->ms_disabled++;
6073         mutex_exit(&msp->ms_lock);
6074
6075         mg->mg_disabled_updating = B_FALSE;
6076         cv_broadcast(&mg->mg_ms_disabled_cv);
6077         mutex_exit(&mg->mg_ms_disabled_lock);
6078 }
6079
6080 void
6081 metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload)
6082 {
6083         metaslab_group_t *mg = msp->ms_group;
6084         spa_t *spa = mg->mg_vd->vdev_spa;
6085
6086         /*
6087          * Wait for the outstanding IO to be synced to prevent newly
6088          * allocated blocks from being overwritten.  This used by
6089          * initialize and TRIM which are modifying unallocated space.
6090          */
6091         if (sync)
6092                 txg_wait_synced(spa_get_dsl(spa), 0);
6093
6094         mutex_enter(&mg->mg_ms_disabled_lock);
6095         mutex_enter(&msp->ms_lock);
6096         if (--msp->ms_disabled == 0) {
6097                 mg->mg_ms_disabled--;
6098                 cv_broadcast(&mg->mg_ms_disabled_cv);
6099                 if (unload)
6100                         metaslab_unload(msp);
6101         }
6102         mutex_exit(&msp->ms_lock);
6103         mutex_exit(&mg->mg_ms_disabled_lock);
6104 }
6105
/*
 * Store 'dirty' in the metaslab's ms_unflushed_dirty field.
 * No locking is performed here.
 */
void
metaslab_set_unflushed_dirty(metaslab_t *ms, boolean_t dirty)
{
        ms->ms_unflushed_dirty = dirty;
}
6111
6112 static void
6113 metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx)
6114 {
6115         vdev_t *vd = ms->ms_group->mg_vd;
6116         spa_t *spa = vd->vdev_spa;
6117         objset_t *mos = spa_meta_objset(spa);
6118
6119         ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
6120
6121         metaslab_unflushed_phys_t entry = {
6122                 .msp_unflushed_txg = metaslab_unflushed_txg(ms),
6123         };
6124         uint64_t entry_size = sizeof (entry);
6125         uint64_t entry_offset = ms->ms_id * entry_size;
6126
6127         uint64_t object = 0;
6128         int err = zap_lookup(mos, vd->vdev_top_zap,
6129             VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
6130             &object);
6131         if (err == ENOENT) {
6132                 object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA,
6133                     SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
6134                 VERIFY0(zap_add(mos, vd->vdev_top_zap,
6135                     VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
6136                     &object, tx));
6137         } else {
6138                 VERIFY0(err);
6139         }
6140
6141         dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size,
6142             &entry, tx);
6143 }
6144
/*
 * Set the metaslab's in-core ms_unflushed_txg to 'txg' and then write it
 * out through metaslab_update_ondisk_flush_data() under 'tx'.
 */
void
metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx)
{
        /*
         * The in-core value must be updated first: the on-disk update
         * reads it back via metaslab_unflushed_txg().
         */
        ms->ms_unflushed_txg = txg;
        metaslab_update_ondisk_flush_data(ms, tx);
}
6151
/*
 * Return the metaslab's ms_unflushed_dirty field.
 */
boolean_t
metaslab_unflushed_dirty(metaslab_t *ms)
{
        return (ms->ms_unflushed_dirty);
}
6157
/*
 * Return the metaslab's in-core ms_unflushed_txg field.
 */
uint64_t
metaslab_unflushed_txg(metaslab_t *ms)
{
        return (ms->ms_unflushed_txg);
}
6163
/*
 * Module parameter declarations for the metaslab allocator tunables.
 * Every entry below is declared with ZMOD_RW and carries a one-line
 * description string. Entries are registered under either the
 * zfs_metaslab or the zfs_mg scope.
 * NOTE(review): ZMOD_RW presumably makes each tunable writable at
 * runtime -- confirm against the ZFS_MODULE_PARAM definition.
 */
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, aliquot, U64, ZMOD_RW,
        "Allocation granularity (a.k.a. stripe size)");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_load, INT, ZMOD_RW,
        "Load all metaslabs when pool is first opened");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_unload, INT, ZMOD_RW,
        "Prevent metaslabs from being unloaded");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_enabled, INT, ZMOD_RW,
        "Preload potential metaslabs during reassessment");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, UINT, ZMOD_RW,
        "Delay in txgs after metaslab was last used before unloading");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay_ms, UINT, ZMOD_RW,
        "Delay in milliseconds after metaslab was last used before unloading");

/* Metaslab group (zfs_mg) tunables, plus one long-form metaslab entry. */
/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, noalloc_threshold, UINT, ZMOD_RW,
        "Percentage of metaslab group size that should be free to make it "
        "eligible for allocation");

ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, fragmentation_threshold, UINT, ZMOD_RW,
        "Percentage of metaslab group size that should be considered eligible "
        "for allocations unless all metaslab groups within the metaslab class "
        "have also crossed this threshold");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, fragmentation_factor_enabled, INT,
        ZMOD_RW,
        "Use the fragmentation metric to prefer less fragmented metaslabs");
/* END CSTYLED */

ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, fragmentation_threshold, UINT,
        ZMOD_RW, "Fragmentation for metaslab to allow allocation");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, lba_weighting_enabled, INT, ZMOD_RW,
        "Prefer metaslabs with lower LBAs");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, bias_enabled, INT, ZMOD_RW,
        "Enable metaslab group biasing");

ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, segment_weight_enabled, INT,
        ZMOD_RW, "Enable segment-based metaslab selection");

ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW,
        "Segment-based metaslab selection maximum buckets before switching");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, U64, ZMOD_RW,
        "Blocks larger than this size are sometimes forced to be gang blocks");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging_pct, UINT, ZMOD_RW,
        "Percentage of large blocks that will be forced to be gang blocks");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW,
        "Max distance (bytes) to search forward before using size tree");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_use_largest_segment, INT, ZMOD_RW,
        "When looking in size tree, use largest segment instead of exact fit");

ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, U64,
        ZMOD_RW, "How long to trust the cached max chunk size of a metaslab");

ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, UINT, ZMOD_RW,
        "Percentage of memory that can be used to store metaslab range trees");

ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT,
        ZMOD_RW, "Try hard to allocate before ganging");

ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, UINT, ZMOD_RW,
        "Normally only consider this many of the best metaslabs in each vdev");