module/zfs/metaslab.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or https://opensource.org/licenses/CDDL-1.0.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
  24  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  25  * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
  26  * Copyright (c) 2017, Intel Corporation.
  27  */
  28
  29 #include <sys/zfs_context.h>
  30 #include <sys/dmu.h>
  31 #include <sys/dmu_tx.h>
  32 #include <sys/space_map.h>
  33 #include <sys/metaslab_impl.h>
  34 #include <sys/vdev_impl.h>
  35 #include <sys/vdev_draid.h>
  36 #include <sys/zio.h>
  37 #include <sys/spa_impl.h>
  38 #include <sys/zfeature.h>
  39 #include <sys/vdev_indirect_mapping.h>
  40 #include <sys/zap.h>
  41 #include <sys/btree.h>
  42
  43 #define WITH_DF_BLOCK_ALLOCATOR
  44
  45 #define GANG_ALLOCATION(flags) \
  46         ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
  47
  48 /*
  49  * Metaslab granularity, in bytes. This is roughly similar to what would be
  50  * referred to as the "stripe size" in traditional RAID arrays. In normal
  51  * operation, we will try to write this amount of data to each disk before
  52  * moving on to the next top-level vdev.
  53  */
  54 static uint64_t metaslab_aliquot = 1024 * 1024;
  55
  56 /*
  57  * For testing, make some blocks above a certain size be gang blocks.
  58  */
  59 uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;
  60
  61 /*
  62  * Of blocks of size >= metaslab_force_ganging, actually gang them this often.
  63  */
  64 uint_t metaslab_force_ganging_pct = 3;
  65
  66 /*
  67  * In pools where the log space map feature is not enabled we touch
  68  * multiple metaslabs (and their respective space maps) with each
  69  * transaction group. Thus, we benefit from having a small space map
  70  * block size since it allows us to issue more I/O operations scattered
  71  * around the disk. So a sane default for the space map block size
  72  * is 8~16K.
  73  */
  74 int zfs_metaslab_sm_blksz_no_log = (1 << 14);
  75
  76 /*
  77  * When the log space map feature is enabled, we accumulate a lot of
  78  * changes per metaslab that are flushed once in a while so we benefit
  79  * from a bigger block size like 128K for the metaslab space maps.
  80  */
  81 int zfs_metaslab_sm_blksz_with_log = (1 << 17);
  82
  83 /*
  84  * The in-core space map representation is more compact than its on-disk form.
  85  * The zfs_condense_pct determines how much more compact the in-core
  86  * space map representation must be before we compact it on-disk.
  87  * Values should be greater than or equal to 100.
  88  */
  89 uint_t zfs_condense_pct = 200;
  90
  91 /*
  92  * Condensing a metaslab is not guaranteed to actually reduce the amount of
  93  * space used on disk. In particular, a space map uses data in increments of
  94  * MAX(1 << ashift, space_map_blksz), so a metaslab might use the
  95  * same number of blocks after condensing. Since the goal of condensing is to
  96  * reduce the number of IOPs required to read the space map, we only want to
  97  * condense when we can be sure we will reduce the number of blocks used by the
  98  * space map. Unfortunately, we cannot precisely compute whether or not this is
  99  * the case in metaslab_should_condense since we are holding ms_lock. Instead,
 100  * we apply the following heuristic: do not condense a spacemap unless the
 101  * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
 102  * blocks.
 103  */
 104 static const int zfs_metaslab_condense_block_threshold = 4;
 105
 106 /*
 107  * The zfs_mg_noalloc_threshold defines which metaslab groups should
 108  * be eligible for allocation. The value is defined as a percentage of
 109  * free space. Metaslab groups that have more free space than
 110  * zfs_mg_noalloc_threshold are always eligible for allocations. Once
 111  * a metaslab group's free space is less than or equal to the
 112  * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
 113  * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
 114  * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
 115  * groups are allowed to accept allocations. Gang blocks are always
 116  * eligible to allocate on any metaslab group. The default value of 0 means
 117  * no metaslab group will be excluded based on this criterion.
 118  */
 119 static uint_t zfs_mg_noalloc_threshold = 0;
 120
 121 /*
 122  * Metaslab groups are considered eligible for allocations if their
 123  * fragmentation metric (measured as a percentage) is less than or
 124  * equal to zfs_mg_fragmentation_threshold. If a metaslab group
 125  * exceeds this threshold then it will be skipped unless all metaslab
 126  * groups within the metaslab class have also crossed this threshold.
 127  *
 128  * This tunable was introduced to avoid edge cases where we continue
 129  * allocating from very fragmented disks in our pool while other, less
 130  * fragmented disks exist. On the other hand, if all disks in the
 131  * pool are uniformly approaching the threshold, the threshold can
 132  * be a speed bump in performance, where we keep switching the disks
 133  * that we allocate from (e.g. we allocate some segments from disk A,
 134  * pushing it past the threshold, while freeing segments from disk
 135  * B, bringing its fragmentation below the threshold).
 136  *
 137  * Empirically, we've seen that our vdev selection for allocations is
 138  * good enough that fragmentation increases uniformly across all vdevs
 139  * the majority of the time. Thus we set the threshold percentage high
 140  * enough to avoid hitting the speed bump on pools that are being pushed
 141  * to the edge.
 142  */
 143 static uint_t zfs_mg_fragmentation_threshold = 95;
 144
 145 /*
 146  * Allow metaslabs to keep their active state as long as their fragmentation
 147  * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
 148  * active metaslab that exceeds this threshold will no longer keep its active
 149  * status allowing better metaslabs to be selected.
 150  */
 151 static uint_t zfs_metaslab_fragmentation_threshold = 70;
 152
 153 /*
 154  * When set will load all metaslabs when pool is first opened.
 155  */
 156 int metaslab_debug_load = B_FALSE;
 157
 158 /*
 159  * When set will prevent metaslabs from being unloaded.
 160  */
 161 static int metaslab_debug_unload = B_FALSE;
 162
 163 /*
 164  * Minimum size which forces the dynamic allocator to change
 165  * its allocation strategy.  Once the space map cannot satisfy
 166  * an allocation of this size then it switches to using a more
 167  * aggressive strategy (i.e. search by size rather than offset).
 168  */
 169 uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
 170
 171 /*
 172  * The minimum free space, in percent, which must be available
 173  * in a space map to continue allocations in a first-fit fashion.
 174  * Once the space map's free space drops below this level we dynamically
 175  * switch to using best-fit allocations.
 176  */
 177 uint_t metaslab_df_free_pct = 4;
 178
 179 /*
 180  * Maximum distance to search forward from the last offset. Without this
 181  * limit, fragmented pools can see >100,000 iterations and
 182  * metaslab_block_picker() becomes the performance limiting factor on
 183  * high-performance storage.
 184  *
 185  * With the default setting of 16MB, we typically see less than 500
 186  * iterations, even with very fragmented, ashift=9 pools. The maximum number
 187  * of iterations possible is:
 188  *     metaslab_df_max_search / (2 * (1<<ashift))
 189  * With the default setting of 16MB this is 16*1024 (with ashift=9) or
 190  * 2048 (with ashift=12).
 191  */
 192 static uint_t metaslab_df_max_search = 16 * 1024 * 1024;
 193
 194 /*
 195  * Forces the metaslab_block_picker function to search for at least this many
 196  * segments forwards until giving up on finding a segment that the allocation
 197  * will fit into.
 198  */
 199 static const uint32_t metaslab_min_search_count = 100;
 200
 201 /*
 202  * If we are not searching forward (due to metaslab_df_max_search,
 203  * metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable
 204  * controls what segment is used.  If it is set, we will use the largest free
 205  * segment.  If it is not set, we will use a segment of exactly the requested
 206  * size (or larger).
 207  */
 208 static int metaslab_df_use_largest_segment = B_FALSE;
 209
 210 /*
 211  * These tunables control how long a metaslab will remain loaded after the
 212  * last allocation from it.  A metaslab can't be unloaded until at least
 213  * metaslab_unload_delay TXG's and metaslab_unload_delay_ms milliseconds
 214  * have elapsed.  However, zfs_metaslab_mem_limit may cause it to be
 215  * unloaded sooner.  These settings are intended to be generous -- to keep
 216  * metaslabs loaded for a long time, reducing the rate of metaslab loading.
 217  */
 218 static uint_t metaslab_unload_delay = 32;
 219 static uint_t metaslab_unload_delay_ms = 10 * 60 * 1000; /* ten minutes */
 220
 221 /*
 222  * Max number of metaslabs per group to preload.
 223  */
 224 uint_t metaslab_preload_limit = 10;
 225
 226 /*
 227  * Enable/disable preloading of metaslab.
 228  */
 229 static int metaslab_preload_enabled = B_TRUE;
 230
 231 /*
 232  * Enable/disable fragmentation weighting on metaslabs.
 233  */
 234 static int metaslab_fragmentation_factor_enabled = B_TRUE;
 235
 236 /*
 237  * Enable/disable lba weighting (i.e. outer tracks are given preference).
 238  */
 239 static int metaslab_lba_weighting_enabled = B_TRUE;
 240
 241 /*
 242  * Enable/disable metaslab group biasing.
 243  */
 244 static int metaslab_bias_enabled = B_TRUE;
 245
 246 /*
 247  * Enable/disable remapping of indirect DVAs to their concrete vdevs.
 248  */
 249 static const boolean_t zfs_remap_blkptr_enable = B_TRUE;
 250
 251 /*
 252  * Enable/disable segment-based metaslab selection.
 253  */
 254 static int zfs_metaslab_segment_weight_enabled = B_TRUE;
 255
 256 /*
 257  * When using segment-based metaslab selection, we will continue
 258  * allocating from the active metaslab until we have exhausted
 259  * zfs_metaslab_switch_threshold of its buckets.
 260  */
 261 static int zfs_metaslab_switch_threshold = 2;
 262
 263 /*
 264  * Internal switch to enable/disable the metaslab allocation tracing
 265  * facility.
 266  */
 267 static const boolean_t metaslab_trace_enabled = B_FALSE;
 268
 269 /*
 270  * Maximum entries that the metaslab allocation tracing facility will keep
 271  * in a given list when running in non-debug mode. We limit the number
 272  * of entries in non-debug mode to prevent us from using up too much memory.
 273  * The limit should be sufficiently large that we don't expect any allocation
 274  * to ever exceed this value. In debug mode, the system will panic if this
 275  * limit is ever reached allowing for further investigation.
 276  */
 277 static const uint64_t metaslab_trace_max_entries = 5000;
 278
 279 /*
 280  * Maximum number of metaslabs per group that can be disabled
 281  * simultaneously.
 282  */
 283 static const int max_disabled_ms = 3;
 284
 285 /*
 286  * Time (in seconds) to respect ms_max_size when the metaslab is not loaded.
 287  * To avoid 64-bit overflow, don't set above UINT32_MAX.
 288  */
 289 static uint64_t zfs_metaslab_max_size_cache_sec = 1 * 60 * 60; /* 1 hour */
 290
 291 /*
 292  * Maximum percentage of memory to use on storing loaded metaslabs. If loading
 293  * a metaslab would take it over this percentage, the oldest selected metaslab
 294  * is automatically unloaded.
 295  */
 296 static uint_t zfs_metaslab_mem_limit = 25;
 297
 298 /*
 299  * Force the per-metaslab range trees to use 64-bit integers to store
 300  * segments. Used for debugging purposes.
 301  */
 302 static const boolean_t zfs_metaslab_force_large_segs = B_FALSE;
 303
 304 /*
 305  * By default we only store segments over a certain size in the size-sorted
 306  * metaslab trees (ms_allocatable_by_size and
 307  * ms_unflushed_frees_by_size). This dramatically reduces memory usage and
 308  * improves load and unload times at the cost of causing us to use slightly
 309  * larger segments than we would otherwise in some cases.
 310  */
 311 static const uint32_t metaslab_by_size_min_shift = 14;
 312
 313 /*
 314  * If not set, we will first try normal allocation.  If that fails then
 315  * we will do a gang allocation.  If that fails then we will do a "try hard"
 316  * gang allocation.  If that fails then we will have a multi-layer gang
 317  * block.
 318  *
 319  * If set, we will first try normal allocation.  If that fails then
 320  * we will do a "try hard" allocation.  If that fails we will do a gang
 321  * allocation.  If that fails we will do a "try hard" gang allocation.  If
 322  * that fails then we will have a multi-layer gang block.
 323  */
 324 static int zfs_metaslab_try_hard_before_gang = B_FALSE;
 325
 326 /*
 327  * When not trying hard, we only consider the best zfs_metaslab_find_max_tries
 328  * metaslabs.  This improves performance, especially when there are many
 329  * metaslabs per vdev and the allocation can't actually be satisfied (so we
 330  * would otherwise iterate all the metaslabs).  If there is a metaslab with a
 331  * worse weight but it can actually satisfy the allocation, we won't find it
 332  * until trying hard.  This may happen if the worse metaslab is not loaded
 333  * (and the true weight is better than we have calculated), or due to weight
 334  * bucketization.  E.g. we are looking for a 60K segment, and the best
 335  * metaslabs all have free segments in the 32-63K bucket, but the best
 336  * zfs_metaslab_find_max_tries metaslabs have ms_max_size <60KB, and a
 337  * subsequent metaslab has ms_max_size >60KB (but fewer segments in this
 338  * bucket, and therefore a lower weight).
 339  */
 340 static uint_t zfs_metaslab_find_max_tries = 100;
 341
 342 static uint64_t metaslab_weight(metaslab_t *, boolean_t);
 343 static void metaslab_set_fragmentation(metaslab_t *, boolean_t);
 344 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
 345 static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
 346
 347 static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
 348 static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
 349 static void metaslab_flush_update(metaslab_t *, dmu_tx_t *);
 350 static unsigned int metaslab_idx_func(multilist_t *, void *);
 351 static void metaslab_evict(metaslab_t *, uint64_t);
 352 static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg);
 353 kmem_cache_t *metaslab_alloc_trace_cache;
 354
 355 typedef struct metaslab_stats {
 356         kstat_named_t metaslabstat_trace_over_limit;
 357         kstat_named_t metaslabstat_reload_tree;
 358         kstat_named_t metaslabstat_too_many_tries;
 359         kstat_named_t metaslabstat_try_hard;
 360 } metaslab_stats_t;
 361
 362 static metaslab_stats_t metaslab_stats = {
 363         { "trace_over_limit",           KSTAT_DATA_UINT64 },
 364         { "reload_tree",                KSTAT_DATA_UINT64 },
 365         { "too_many_tries",             KSTAT_DATA_UINT64 },
 366         { "try_hard",                   KSTAT_DATA_UINT64 },
 367 };
 368
 369 #define METASLABSTAT_BUMP(stat) \
 370         atomic_inc_64(&metaslab_stats.stat.value.ui64);
 371
 372
 373 static kstat_t *metaslab_ksp;
 374
 375 void
 376 metaslab_stat_init(void)
 377 {
 378         ASSERT(metaslab_alloc_trace_cache == NULL);
 379         metaslab_alloc_trace_cache = kmem_cache_create(
 380             "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
 381             0, NULL, NULL, NULL, NULL, NULL, 0);
 382         metaslab_ksp = kstat_create("zfs", 0, "metaslab_stats",
 383             "misc", KSTAT_TYPE_NAMED, sizeof (metaslab_stats) /
 384             sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
 385         if (metaslab_ksp != NULL) {
 386                 metaslab_ksp->ks_data = &metaslab_stats;
 387                 kstat_install(metaslab_ksp);
 388         }
 389 }
 390
 391 void
 392 metaslab_stat_fini(void)
 393 {
 394         if (metaslab_ksp != NULL) {
 395                 kstat_delete(metaslab_ksp);
 396                 metaslab_ksp = NULL;
 397         }
 398
 399         kmem_cache_destroy(metaslab_alloc_trace_cache);
 400         metaslab_alloc_trace_cache = NULL;
 401 }
 402
 403 /*
 404  * ==========================================================================
 405  * Metaslab classes
 406  * ==========================================================================
 407  */
 408 metaslab_class_t *
 409 metaslab_class_create(spa_t *spa, const metaslab_ops_t *ops)
 410 {
 411         metaslab_class_t *mc;
 412
 413         mc = kmem_zalloc(offsetof(metaslab_class_t,
 414             mc_allocator[spa->spa_alloc_count]), KM_SLEEP);
 415
 416         mc->mc_spa = spa;
 417         mc->mc_ops = ops;
 418         mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
 419         multilist_create(&mc->mc_metaslab_txg_list, sizeof (metaslab_t),
 420             offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func);
 421         for (int i = 0; i < spa->spa_alloc_count; i++) {
 422                 metaslab_class_allocator_t *mca = &mc->mc_allocator[i];
 423                 mca->mca_rotor = NULL;
 424                 zfs_refcount_create_tracked(&mca->mca_alloc_slots);
 425         }
 426
 427         return (mc);
 428 }
 429
 430 void
 431 metaslab_class_destroy(metaslab_class_t *mc)
 432 {
 433         spa_t *spa = mc->mc_spa;
 434
 435         ASSERT(mc->mc_alloc == 0);
 436         ASSERT(mc->mc_deferred == 0);
 437         ASSERT(mc->mc_space == 0);
 438         ASSERT(mc->mc_dspace == 0);
 439
 440         for (int i = 0; i < spa->spa_alloc_count; i++) {
 441                 metaslab_class_allocator_t *mca = &mc->mc_allocator[i];
 442                 ASSERT(mca->mca_rotor == NULL);
 443                 zfs_refcount_destroy(&mca->mca_alloc_slots);
 444         }
 445         mutex_destroy(&mc->mc_lock);
 446         multilist_destroy(&mc->mc_metaslab_txg_list);
 447         kmem_free(mc, offsetof(metaslab_class_t,
 448             mc_allocator[spa->spa_alloc_count]));
 449 }
 450
 451 int
 452 metaslab_class_validate(metaslab_class_t *mc)
 453 {
 454         metaslab_group_t *mg;
 455         vdev_t *vd;
 456
 457         /*
 458          * Must hold one of the spa_config locks.
 459          */
 460         ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
 461             spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
 462
 463         if ((mg = mc->mc_allocator[0].mca_rotor) == NULL)
 464                 return (0);
 465
 466         do {
 467                 vd = mg->mg_vd;
 468                 ASSERT(vd->vdev_mg != NULL);
 469                 ASSERT3P(vd->vdev_top, ==, vd);
 470                 ASSERT3P(mg->mg_class, ==, mc);
 471                 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
 472         } while ((mg = mg->mg_next) != mc->mc_allocator[0].mca_rotor);
 473
 474         return (0);
 475 }
 476
 477 static void
 478 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
 479     int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
 480 {
 481         atomic_add_64(&mc->mc_alloc, alloc_delta);
 482         atomic_add_64(&mc->mc_deferred, defer_delta);
 483         atomic_add_64(&mc->mc_space, space_delta);
 484         atomic_add_64(&mc->mc_dspace, dspace_delta);
 485 }
 486
 487 uint64_t
 488 metaslab_class_get_alloc(metaslab_class_t *mc)
 489 {
 490         return (mc->mc_alloc);
 491 }
 492
 493 uint64_t
 494 metaslab_class_get_deferred(metaslab_class_t *mc)
 495 {
 496         return (mc->mc_deferred);
 497 }
 498
 499 uint64_t
 500 metaslab_class_get_space(metaslab_class_t *mc)
 501 {
 502         return (mc->mc_space);
 503 }
 504
 505 uint64_t
 506 metaslab_class_get_dspace(metaslab_class_t *mc)
 507 {
 508         return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
 509 }
 510
 511 void
 512 metaslab_class_histogram_verify(metaslab_class_t *mc)
 513 {
 514         spa_t *spa = mc->mc_spa;
 515         vdev_t *rvd = spa->spa_root_vdev;
 516         uint64_t *mc_hist;
 517         int i;
 518
 519         if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
 520                 return;
 521
 522         mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
 523             KM_SLEEP);
 524
 525         mutex_enter(&mc->mc_lock);
 526         for (int c = 0; c < rvd->vdev_children; c++) {
 527                 vdev_t *tvd = rvd->vdev_child[c];
 528                 metaslab_group_t *mg = vdev_get_mg(tvd, mc);
 529
 530                 /*
 531                  * Skip any holes, uninitialized top-levels, or
 532                  * vdevs that are not in this metalab class.
 533                  */
 534                 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 535                     mg->mg_class != mc) {
 536                         continue;
 537                 }
 538
 539                 IMPLY(mg == mg->mg_vd->vdev_log_mg,
 540                     mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
 541
 542                 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
 543                         mc_hist[i] += mg->mg_histogram[i];
 544         }
 545
 546         for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
 547                 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
 548         }
 549
 550         mutex_exit(&mc->mc_lock);
 551         kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
 552 }
 553
 554 /*
 555  * Calculate the metaslab class's fragmentation metric. The metric
 556  * is weighted based on the space contribution of each metaslab group.
 557  * The return value will be a number between 0 and 100 (inclusive), or
 558  * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
 559  * zfs_frag_table for more information about the metric.
 560  */
 561 uint64_t
 562 metaslab_class_fragmentation(metaslab_class_t *mc)
 563 {
 564         vdev_t *rvd = mc->mc_spa->spa_root_vdev;
 565         uint64_t fragmentation = 0;
 566
 567         spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
 568
 569         for (int c = 0; c < rvd->vdev_children; c++) {
 570                 vdev_t *tvd = rvd->vdev_child[c];
 571                 metaslab_group_t *mg = tvd->vdev_mg;
 572
 573                 /*
 574                  * Skip any holes, uninitialized top-levels,
 575                  * or vdevs that are not in this metalab class.
 576                  */
 577                 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 578                     mg->mg_class != mc) {
 579                         continue;
 580                 }
 581
 582                 /*
 583                  * If a metaslab group does not contain a fragmentation
 584                  * metric then just bail out.
 585                  */
 586                 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
 587                         spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 588                         return (ZFS_FRAG_INVALID);
 589                 }
 590
 591                 /*
 592                  * Determine how much this metaslab_group is contributing
 593                  * to the overall pool fragmentation metric.
 594                  */
 595                 fragmentation += mg->mg_fragmentation *
 596                     metaslab_group_get_space(mg);
 597         }
 598         fragmentation /= metaslab_class_get_space(mc);
 599
 600         ASSERT3U(fragmentation, <=, 100);
 601         spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 602         return (fragmentation);
 603 }
 604
 605 /*
 606  * Calculate the amount of expandable space that is available in
 607  * this metaslab class. If a device is expanded then its expandable
 608  * space will be the amount of allocatable space that is currently not
 609  * part of this metaslab class.
 610  */
 611 uint64_t
 612 metaslab_class_expandable_space(metaslab_class_t *mc)
 613 {
 614         vdev_t *rvd = mc->mc_spa->spa_root_vdev;
 615         uint64_t space = 0;
 616
 617         spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
 618         for (int c = 0; c < rvd->vdev_children; c++) {
 619                 vdev_t *tvd = rvd->vdev_child[c];
 620                 metaslab_group_t *mg = tvd->vdev_mg;
 621
 622                 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 623                     mg->mg_class != mc) {
 624                         continue;
 625                 }
 626
 627                 /*
 628                  * Calculate if we have enough space to add additional
 629                  * metaslabs. We report the expandable space in terms
 630                  * of the metaslab size since that's the unit of expansion.
 631                  */
 632                 space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
 633                     1ULL << tvd->vdev_ms_shift);
 634         }
 635         spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 636         return (space);
 637 }
 638
 639 void
 640 metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
 641 {
 642         multilist_t *ml = &mc->mc_metaslab_txg_list;
 643         for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
 644                 multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
 645                 metaslab_t *msp = multilist_sublist_head(mls);
 646                 multilist_sublist_unlock(mls);
 647                 while (msp != NULL) {
 648                         mutex_enter(&msp->ms_lock);
 649
 650                         /*
 651                          * If the metaslab has been removed from the list
 652                          * (which could happen if we were at the memory limit
 653                          * and it was evicted during this loop), then we can't
 654                          * proceed and we should restart the sublist.
 655                          */
 656                         if (!multilist_link_active(&msp->ms_class_txg_node)) {
 657                                 mutex_exit(&msp->ms_lock);
 658                                 i--;
 659                                 break;
 660                         }
 661                         mls = multilist_sublist_lock(ml, i);
 662                         metaslab_t *next_msp = multilist_sublist_next(mls, msp);
 663                         multilist_sublist_unlock(mls);
 664                         if (txg >
 665                             msp->ms_selected_txg + metaslab_unload_delay &&
 666                             gethrtime() > msp->ms_selected_time +
 667                             (uint64_t)MSEC2NSEC(metaslab_unload_delay_ms)) {
 668                                 metaslab_evict(msp, txg);
 669                         } else {
 670                                 /*
 671                                  * Once we've hit a metaslab selected too
 672                                  * recently to evict, we're done evicting for
 673                                  * now.
 674                                  */
 675                                 mutex_exit(&msp->ms_lock);
 676                                 break;
 677                         }
 678                         mutex_exit(&msp->ms_lock);
 679                         msp = next_msp;
 680                 }
 681         }
 682 }
 683
 684 static int
 685 metaslab_compare(const void *x1, const void *x2)
 686 {
 687         const metaslab_t *m1 = (const metaslab_t *)x1;
 688         const metaslab_t *m2 = (const metaslab_t *)x2;
 689
 690         int sort1 = 0;
 691         int sort2 = 0;
 692         if (m1->ms_allocator != -1 && m1->ms_primary)
 693                 sort1 = 1;
 694         else if (m1->ms_allocator != -1 && !m1->ms_primary)
 695                 sort1 = 2;
 696         if (m2->ms_allocator != -1 && m2->ms_primary)
 697                 sort2 = 1;
 698         else if (m2->ms_allocator != -1 && !m2->ms_primary)
 699                 sort2 = 2;
 700
 701         /*
 702          * Sort inactive metaslabs first, then primaries, then secondaries. When
 703          * selecting a metaslab to allocate from, an allocator first tries its
 704          * primary, then secondary active metaslab. If it doesn't have active
 705          * metaslabs, or can't allocate from them, it searches for an inactive
 706          * metaslab to activate. If it can't find a suitable one, it will steal
 707          * a primary or secondary metaslab from another allocator.
 708          */
 709         if (sort1 < sort2)
 710                 return (-1);
 711         if (sort1 > sort2)
 712                 return (1);
 713
 714         int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight);
 715         if (likely(cmp))
 716                 return (cmp);
 717
 718         IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
 719
 720         return (TREE_CMP(m1->ms_start, m2->ms_start));
 721 }
 722
 723 /*
 724  * ==========================================================================
 725  * Metaslab groups
 726  * ==========================================================================
 727  */
 728 /*
 729  * Update the allocatable flag and the metaslab group's capacity.
 730  * The allocatable flag is set to true if the free capacity is above
 731  * the zfs_mg_noalloc_threshold and the fragmentation value is not
 732  * greater than zfs_mg_fragmentation_threshold. If a metaslab group
 733  * transitions from allocatable to non-allocatable or vice versa then the
 734  * metaslab group's class is updated to reflect the transition.
 735  */
 736 static void
 737 metaslab_group_alloc_update(metaslab_group_t *mg)
 738 {
 739         vdev_t *vd = mg->mg_vd;
 740         metaslab_class_t *mc = mg->mg_class;
 741         vdev_stat_t *vs = &vd->vdev_stat;
 742         boolean_t was_allocatable;
 743         boolean_t was_initialized;
 744
 745         ASSERT(vd == vd->vdev_top);
 746         ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
 747             SCL_ALLOC);
 748
 749         mutex_enter(&mg->mg_lock);
 750         was_allocatable = mg->mg_allocatable;
 751         was_initialized = mg->mg_initialized;
 752
 753         mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
 754             (vs->vs_space + 1);
 755
 756         mutex_enter(&mc->mc_lock);
 757
 758         /*
 759          * If the metaslab group was just added then it won't
 760          * have any space until we finish syncing out this txg.
 761          * At that point we will consider it initialized and available
 762          * for allocations.  We also don't consider non-activated
 763          * metaslab groups (e.g. vdevs that are in the middle of being removed)
 764          * to be initialized, because they can't be used for allocation.
 765          */
 766         mg->mg_initialized = metaslab_group_initialized(mg);
 767         if (!was_initialized && mg->mg_initialized) {
 768                 mc->mc_groups++;
 769         } else if (was_initialized && !mg->mg_initialized) {
 770                 ASSERT3U(mc->mc_groups, >, 0);
 771                 mc->mc_groups--;
 772         }
 773         if (mg->mg_initialized)
 774                 mg->mg_no_free_space = B_FALSE;
 775
 776         /*
 777          * A metaslab group is considered allocatable if it has plenty
 778          * of free space or is not heavily fragmented. We only take
 779          * fragmentation into account if the metaslab group has a valid
 780          * fragmentation metric (i.e. a value between 0 and 100).
 781          */
 782         mg->mg_allocatable = (mg->mg_activation_count > 0 &&
 783             mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
 784             (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
 785             mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
 786
 787         /*
 788          * The mc_alloc_groups maintains a count of the number of
 789          * groups in this metaslab class that are still above the
 790          * zfs_mg_noalloc_threshold. This is used by the allocating
 791          * threads to determine if they should avoid allocations to
 792          * a given group. The allocator will avoid allocations to a group
 793          * if that group has reached or is below the zfs_mg_noalloc_threshold
 794          * and there are still other groups that are above the threshold.
 795          * When a group transitions from allocatable to non-allocatable or
 796          * vice versa we update the metaslab class to reflect that change.
 797          * When the mc_alloc_groups value drops to 0 that means that all
 798          * groups have reached the zfs_mg_noalloc_threshold making all groups
 799          * eligible for allocations. This effectively means that all devices
 800          * are balanced again.
 801          */
 802         if (was_allocatable && !mg->mg_allocatable)
 803                 mc->mc_alloc_groups--;
 804         else if (!was_allocatable && mg->mg_allocatable)
 805                 mc->mc_alloc_groups++;
 806         mutex_exit(&mc->mc_lock);
 807
 808         mutex_exit(&mg->mg_lock);
 809 }
 810
 811 int
 812 metaslab_sort_by_flushed(const void *va, const void *vb)
 813 {
 814         const metaslab_t *a = va;
 815         const metaslab_t *b = vb;
 816
 817         int cmp = TREE_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg);
 818         if (likely(cmp))
 819                 return (cmp);
 820
 821         uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id;
 822         uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id;
 823         cmp = TREE_CMP(a_vdev_id, b_vdev_id);
 824         if (cmp)
 825                 return (cmp);
 826
 827         return (TREE_CMP(a->ms_id, b->ms_id));
 828 }
 829
 830 metaslab_group_t *
 831 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
 832 {
 833         metaslab_group_t *mg;
 834
 835         mg = kmem_zalloc(offsetof(metaslab_group_t,
 836             mg_allocator[allocators]), KM_SLEEP);
 837         mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
 838         mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL);
 839         cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL);
 840         avl_create(&mg->mg_metaslab_tree, metaslab_compare,
 841             sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node));
 842         mg->mg_vd = vd;
 843         mg->mg_class = mc;
 844         mg->mg_activation_count = 0;
 845         mg->mg_initialized = B_FALSE;
 846         mg->mg_no_free_space = B_TRUE;
 847         mg->mg_allocators = allocators;
 848
 849         for (int i = 0; i < allocators; i++) {
 850                 metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
 851                 zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth);
 852         }
 853
 854         return (mg);
 855 }
 856
 857 void
 858 metaslab_group_destroy(metaslab_group_t *mg)
 859 {
 860         ASSERT(mg->mg_prev == NULL);
 861         ASSERT(mg->mg_next == NULL);
 862         /*
 863          * We may have gone below zero with the activation count
 864          * either because we never activated in the first place or
 865          * because we're done, and possibly removing the vdev.
 866          */
 867         ASSERT(mg->mg_activation_count <= 0);
 868
 869         avl_destroy(&mg->mg_metaslab_tree);
 870         mutex_destroy(&mg->mg_lock);
 871         mutex_destroy(&mg->mg_ms_disabled_lock);
 872         cv_destroy(&mg->mg_ms_disabled_cv);
 873
 874         for (int i = 0; i < mg->mg_allocators; i++) {
 875                 metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
 876                 zfs_refcount_destroy(&mga->mga_alloc_queue_depth);
 877         }
 878         kmem_free(mg, offsetof(metaslab_group_t,
 879             mg_allocator[mg->mg_allocators]));
 880 }
 881
 882 void
 883 metaslab_group_activate(metaslab_group_t *mg)
 884 {
 885         metaslab_class_t *mc = mg->mg_class;
 886         spa_t *spa = mc->mc_spa;
 887         metaslab_group_t *mgprev, *mgnext;
 888
 889         ASSERT3U(spa_config_held(spa, SCL_ALLOC, RW_WRITER), !=, 0);
 890
 891         ASSERT(mg->mg_prev == NULL);
 892         ASSERT(mg->mg_next == NULL);
 893         ASSERT(mg->mg_activation_count <= 0);
 894
 895         if (++mg->mg_activation_count <= 0)
 896                 return;
 897
 898         mg->mg_aliquot = metaslab_aliquot * MAX(1,
 899             vdev_get_ndisks(mg->mg_vd) - vdev_get_nparity(mg->mg_vd));
 900         metaslab_group_alloc_update(mg);
 901
 902         if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) {
 903                 mg->mg_prev = mg;
 904                 mg->mg_next = mg;
 905         } else {
 906                 mgnext = mgprev->mg_next;
 907                 mg->mg_prev = mgprev;
 908                 mg->mg_next = mgnext;
 909                 mgprev->mg_next = mg;
 910                 mgnext->mg_prev = mg;
 911         }
 912         for (int i = 0; i < spa->spa_alloc_count; i++) {
 913                 mc->mc_allocator[i].mca_rotor = mg;
 914                 mg = mg->mg_next;
 915         }
 916 }
 917
 918 /*
 919  * Passivate a metaslab group and remove it from the allocation rotor.
 920  * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
 921  * a metaslab group. This function will momentarily drop spa_config_locks
 922  * that are lower than the SCL_ALLOC lock (see comment below).
 923  */
 924 void
 925 metaslab_group_passivate(metaslab_group_t *mg)
 926 {
 927         metaslab_class_t *mc = mg->mg_class;
 928         spa_t *spa = mc->mc_spa;
 929         metaslab_group_t *mgprev, *mgnext;
 930         int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);
 931
 932         ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
 933             (SCL_ALLOC | SCL_ZIO));
 934
 935         if (--mg->mg_activation_count != 0) {
 936                 for (int i = 0; i < spa->spa_alloc_count; i++)
 937                         ASSERT(mc->mc_allocator[i].mca_rotor != mg);
 938                 ASSERT(mg->mg_prev == NULL);
 939                 ASSERT(mg->mg_next == NULL);
 940                 ASSERT(mg->mg_activation_count < 0);
 941                 return;
 942         }
 943
 944         /*
 945          * The spa_config_lock is an array of rwlocks, ordered as
 946          * follows (from highest to lowest):
 947          *      SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
 948          *      SCL_ZIO > SCL_FREE > SCL_VDEV
 949          * (For more information about the spa_config_lock see spa_misc.c)
 950          * The higher the lock, the broader its coverage. When we passivate
 951          * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
 952          * config locks. However, the metaslab group's taskq might be trying
 953          * to preload metaslabs so we must drop the SCL_ZIO lock and any
 954          * lower locks to allow the I/O to complete. At a minimum,
 955          * we continue to hold the SCL_ALLOC lock, which prevents any future
 956          * allocations from taking place and any changes to the vdev tree.
 957          */
 958         spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
 959         taskq_wait_outstanding(spa->spa_metaslab_taskq, 0);
 960         spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
 961         metaslab_group_alloc_update(mg);
 962         for (int i = 0; i < mg->mg_allocators; i++) {
 963                 metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
 964                 metaslab_t *msp = mga->mga_primary;
 965                 if (msp != NULL) {
 966                         mutex_enter(&msp->ms_lock);
 967                         metaslab_passivate(msp,
 968                             metaslab_weight_from_range_tree(msp));
 969                         mutex_exit(&msp->ms_lock);
 970                 }
 971                 msp = mga->mga_secondary;
 972                 if (msp != NULL) {
 973                         mutex_enter(&msp->ms_lock);
 974                         metaslab_passivate(msp,
 975                             metaslab_weight_from_range_tree(msp));
 976                         mutex_exit(&msp->ms_lock);
 977                 }
 978         }
 979
 980         mgprev = mg->mg_prev;
 981         mgnext = mg->mg_next;
 982
 983         if (mg == mgnext) {
 984                 mgnext = NULL;
 985         } else {
 986                 mgprev->mg_next = mgnext;
 987                 mgnext->mg_prev = mgprev;
 988         }
 989         for (int i = 0; i < spa->spa_alloc_count; i++) {
 990                 if (mc->mc_allocator[i].mca_rotor == mg)
 991                         mc->mc_allocator[i].mca_rotor = mgnext;
 992         }
 993
 994         mg->mg_prev = NULL;
 995         mg->mg_next = NULL;
 996 }
 997
 998 boolean_t
 999 metaslab_group_initialized(metaslab_group_t *mg)
1000 {
1001         vdev_t *vd = mg->mg_vd;
1002         vdev_stat_t *vs = &vd->vdev_stat;
1003
1004         return (vs->vs_space != 0 && mg->mg_activation_count > 0);
1005 }
1006
1007 uint64_t
1008 metaslab_group_get_space(metaslab_group_t *mg)
1009 {
1010         /*
1011          * Note that the number of nodes in mg_metaslab_tree may be one less
1012          * than vdev_ms_count, due to the embedded log metaslab.
1013          */
1014         mutex_enter(&mg->mg_lock);
1015         uint64_t ms_count = avl_numnodes(&mg->mg_metaslab_tree);
1016         mutex_exit(&mg->mg_lock);
1017         return ((1ULL << mg->mg_vd->vdev_ms_shift) * ms_count);
1018 }
1019
1020 void
1021 metaslab_group_histogram_verify(metaslab_group_t *mg)
1022 {
1023         uint64_t *mg_hist;
1024         avl_tree_t *t = &mg->mg_metaslab_tree;
1025         uint64_t ashift = mg->mg_vd->vdev_ashift;
1026
1027         if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
1028                 return;
1029
1030         mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
1031             KM_SLEEP);
1032
1033         ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
1034             SPACE_MAP_HISTOGRAM_SIZE + ashift);
1035
1036         mutex_enter(&mg->mg_lock);
1037         for (metaslab_t *msp = avl_first(t);
1038             msp != NULL; msp = AVL_NEXT(t, msp)) {
1039                 VERIFY3P(msp->ms_group, ==, mg);
1040                 /* skip if not active */
1041                 if (msp->ms_sm == NULL)
1042                         continue;
1043
1044                 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
1045                         mg_hist[i + ashift] +=
1046                             msp->ms_sm->sm_phys->smp_histogram[i];
1047                 }
1048         }
1049
1050         for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
1051                 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
1052
1053         mutex_exit(&mg->mg_lock);
1054
1055         kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
1056 }
1057
1058 static void
1059 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
1060 {
1061         metaslab_class_t *mc = mg->mg_class;
1062         uint64_t ashift = mg->mg_vd->vdev_ashift;
1063
1064         ASSERT(MUTEX_HELD(&msp->ms_lock));
1065         if (msp->ms_sm == NULL)
1066                 return;
1067
1068         mutex_enter(&mg->mg_lock);
1069         mutex_enter(&mc->mc_lock);
1070         for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
1071                 IMPLY(mg == mg->mg_vd->vdev_log_mg,
1072                     mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
1073                 mg->mg_histogram[i + ashift] +=
1074                     msp->ms_sm->sm_phys->smp_histogram[i];
1075                 mc->mc_histogram[i + ashift] +=
1076                     msp->ms_sm->sm_phys->smp_histogram[i];
1077         }
1078         mutex_exit(&mc->mc_lock);
1079         mutex_exit(&mg->mg_lock);
1080 }
1081
1082 void
1083 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
1084 {
1085         metaslab_class_t *mc = mg->mg_class;
1086         uint64_t ashift = mg->mg_vd->vdev_ashift;
1087
1088         ASSERT(MUTEX_HELD(&msp->ms_lock));
1089         if (msp->ms_sm == NULL)
1090                 return;
1091
1092         mutex_enter(&mg->mg_lock);
1093         mutex_enter(&mc->mc_lock);
1094         for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
1095                 ASSERT3U(mg->mg_histogram[i + ashift], >=,
1096                     msp->ms_sm->sm_phys->smp_histogram[i]);
1097                 ASSERT3U(mc->mc_histogram[i + ashift], >=,
1098                     msp->ms_sm->sm_phys->smp_histogram[i]);
1099                 IMPLY(mg == mg->mg_vd->vdev_log_mg,
1100                     mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
1101
1102                 mg->mg_histogram[i + ashift] -=
1103                     msp->ms_sm->sm_phys->smp_histogram[i];
1104                 mc->mc_histogram[i + ashift] -=
1105                     msp->ms_sm->sm_phys->smp_histogram[i];
1106         }
1107         mutex_exit(&mc->mc_lock);
1108         mutex_exit(&mg->mg_lock);
1109 }
1110
1111 static void
1112 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
1113 {
1114         ASSERT(msp->ms_group == NULL);
1115         mutex_enter(&mg->mg_lock);
1116         msp->ms_group = mg;
1117         msp->ms_weight = 0;
1118         avl_add(&mg->mg_metaslab_tree, msp);
1119         mutex_exit(&mg->mg_lock);
1120
1121         mutex_enter(&msp->ms_lock);
1122         metaslab_group_histogram_add(mg, msp);
1123         mutex_exit(&msp->ms_lock);
1124 }
1125
1126 static void
1127 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
1128 {
1129         mutex_enter(&msp->ms_lock);
1130         metaslab_group_histogram_remove(mg, msp);
1131         mutex_exit(&msp->ms_lock);
1132
1133         mutex_enter(&mg->mg_lock);
1134         ASSERT(msp->ms_group == mg);
1135         avl_remove(&mg->mg_metaslab_tree, msp);
1136
1137         metaslab_class_t *mc = msp->ms_group->mg_class;
1138         multilist_sublist_t *mls =
1139             multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
1140         if (multilist_link_active(&msp->ms_class_txg_node))
1141                 multilist_sublist_remove(mls, msp);
1142         multilist_sublist_unlock(mls);
1143
1144         msp->ms_group = NULL;
1145         mutex_exit(&mg->mg_lock);
1146 }
1147
1148 static void
1149 metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
1150 {
1151         ASSERT(MUTEX_HELD(&msp->ms_lock));
1152         ASSERT(MUTEX_HELD(&mg->mg_lock));
1153         ASSERT(msp->ms_group == mg);
1154
1155         avl_remove(&mg->mg_metaslab_tree, msp);
1156         msp->ms_weight = weight;
1157         avl_add(&mg->mg_metaslab_tree, msp);
1158
1159 }
1160
1161 static void
1162 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
1163 {
1164         /*
1165          * Although in principle the weight can be any value, in
1166          * practice we do not use values in the range [1, 511].
1167          */
1168         ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
1169         ASSERT(MUTEX_HELD(&msp->ms_lock));
1170
1171         mutex_enter(&mg->mg_lock);
1172         metaslab_group_sort_impl(mg, msp, weight);
1173         mutex_exit(&mg->mg_lock);
1174 }
1175
1176 /*
1177  * Calculate the fragmentation for a given metaslab group. We can use
1178  * a simple average here since all metaslabs within the group must have
1179  * the same size. The return value will be a value between 0 and 100
1180  * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this
1181  * group have a fragmentation metric.
1182  */
1183 uint64_t
1184 metaslab_group_fragmentation(metaslab_group_t *mg)
1185 {
1186         vdev_t *vd = mg->mg_vd;
1187         uint64_t fragmentation = 0;
1188         uint64_t valid_ms = 0;
1189
1190         for (int m = 0; m < vd->vdev_ms_count; m++) {
1191                 metaslab_t *msp = vd->vdev_ms[m];
1192
1193                 if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
1194                         continue;
1195                 if (msp->ms_group != mg)
1196                         continue;
1197
1198                 valid_ms++;
1199                 fragmentation += msp->ms_fragmentation;
1200         }
1201
1202         if (valid_ms <= mg->mg_vd->vdev_ms_count / 2)
1203                 return (ZFS_FRAG_INVALID);
1204
1205         fragmentation /= valid_ms;
1206         ASSERT3U(fragmentation, <=, 100);
1207         return (fragmentation);
1208 }
1209
1210 /*
1211  * Determine if a given metaslab group should skip allocations. A metaslab
1212  * group should avoid allocations if its free capacity is less than the
1213  * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
1214  * zfs_mg_fragmentation_threshold and there is at least one metaslab group
1215  * that can still handle allocations. If the allocation throttle is enabled
1216  * then we skip allocations to devices that have reached their maximum
1217  * allocation queue depth unless the selected metaslab group is the only
1218  * eligible group remaining.
1219  */
1220 static boolean_t
1221 metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
1222     int flags, uint64_t psize, int allocator, int d)
1223 {
1224         spa_t *spa = mg->mg_vd->vdev_spa;
1225         metaslab_class_t *mc = mg->mg_class;
1226
1227         /*
1228          * We can only consider skipping this metaslab group if it's
1229          * in the normal metaslab class and there are other metaslab
1230          * groups to select from. Otherwise, we always consider it eligible
1231          * for allocations.
1232          */
1233         if ((mc != spa_normal_class(spa) &&
1234             mc != spa_special_class(spa) &&
1235             mc != spa_dedup_class(spa)) ||
1236             mc->mc_groups <= 1)
1237                 return (B_TRUE);
1238
1239         /*
1240          * If the metaslab group's mg_allocatable flag is set (see comments
1241          * in metaslab_group_alloc_update() for more information) and
1242          * the allocation throttle is disabled then allow allocations to this
1243          * device. However, if the allocation throttle is enabled then
1244          * check if we have reached our allocation limit (mga_alloc_queue_depth)
1245          * to determine if we should allow allocations to this metaslab group.
1246          * If all metaslab groups are no longer considered allocatable
1247          * (mc_alloc_groups == 0) or we're trying to allocate the smallest
1248          * gang block size then we allow allocations on this metaslab group
1249          * regardless of the mg_allocatable or throttle settings.
1250          */
1251         if (mg->mg_allocatable) {
1252                 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
1253                 int64_t qdepth;
1254                 uint64_t qmax = mga->mga_cur_max_alloc_queue_depth;
1255
1256                 if (!mc->mc_alloc_throttle_enabled)
1257                         return (B_TRUE);
1258
1259                 /*
1260                  * If this metaslab group does not have any free space, then
1261                  * there is no point in looking further.
1262                  */
1263                 if (mg->mg_no_free_space)
1264                         return (B_FALSE);
1265
1266                 /*
1267                  * Some allocations (e.g., those coming from device removal
1268                  * where the * allocations are not even counted in the
1269                  * metaslab * allocation queues) are allowed to bypass
1270                  * the throttle.
1271                  */
1272                 if (flags & METASLAB_DONT_THROTTLE)
1273                         return (B_TRUE);
1274
1275                 /*
1276                  * Relax allocation throttling for ditto blocks.  Due to
1277                  * random imbalances in allocation it tends to push copies
1278                  * to one vdev, that looks a bit better at the moment.
1279                  */
1280                 qmax = qmax * (4 + d) / 4;
1281
1282                 qdepth = zfs_refcount_count(&mga->mga_alloc_queue_depth);
1283
1284                 /*
1285                  * If this metaslab group is below its qmax or it's
1286                  * the only allocatable metaslab group, then attempt
1287                  * to allocate from it.
1288                  */
1289                 if (qdepth < qmax || mc->mc_alloc_groups == 1)
1290                         return (B_TRUE);
1291                 ASSERT3U(mc->mc_alloc_groups, >, 1);
1292
1293                 /*
1294                  * Since this metaslab group is at or over its qmax, we
1295                  * need to determine if there are metaslab groups after this
1296                  * one that might be able to handle this allocation. This is
1297                  * racy since we can't hold the locks for all metaslab
1298                  * groups at the same time when we make this check.
1299                  */
1300                 for (metaslab_group_t *mgp = mg->mg_next;
1301                     mgp != rotor; mgp = mgp->mg_next) {
1302                         metaslab_group_allocator_t *mgap =
1303                             &mgp->mg_allocator[allocator];
1304                         qmax = mgap->mga_cur_max_alloc_queue_depth;
1305                         qmax = qmax * (4 + d) / 4;
1306                         qdepth =
1307                             zfs_refcount_count(&mgap->mga_alloc_queue_depth);
1308
1309                         /*
1310                          * If there is another metaslab group that
1311                          * might be able to handle the allocation, then
1312                          * we return false so that we skip this group.
1313                          */
1314                         if (qdepth < qmax && !mgp->mg_no_free_space)
1315                                 return (B_FALSE);
1316                 }
1317
1318                 /*
1319                  * We didn't find another group to handle the allocation
1320                  * so we can't skip this metaslab group even though
1321                  * we are at or over our qmax.
1322                  */
1323                 return (B_TRUE);
1324
1325         } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
1326                 return (B_TRUE);
1327         }
1328         return (B_FALSE);
1329 }
1330
1331 /*
1332  * ==========================================================================
1333  * Range tree callbacks
1334  * ==========================================================================
1335  */
1336
1337 /*
1338  * Comparison function for the private size-ordered tree using 32-bit
1339  * ranges. Tree is sorted by size, larger sizes at the end of the tree.
1340  */
1341 __attribute__((always_inline)) inline
1342 static int
1343 metaslab_rangesize32_compare(const void *x1, const void *x2)
1344 {
1345         const range_seg32_t *r1 = x1;
1346         const range_seg32_t *r2 = x2;
1347
1348         uint64_t rs_size1 = r1->rs_end - r1->rs_start;
1349         uint64_t rs_size2 = r2->rs_end - r2->rs_start;
1350
1351         int cmp = TREE_CMP(rs_size1, rs_size2);
1352
1353         return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
1354 }
1355
1356 /*
1357  * Comparison function for the private size-ordered tree using 64-bit
1358  * ranges. Tree is sorted by size, larger sizes at the end of the tree.
1359  */
1360 __attribute__((always_inline)) inline
1361 static int
1362 metaslab_rangesize64_compare(const void *x1, const void *x2)
1363 {
1364         const range_seg64_t *r1 = x1;
1365         const range_seg64_t *r2 = x2;
1366
1367         uint64_t rs_size1 = r1->rs_end - r1->rs_start;
1368         uint64_t rs_size2 = r2->rs_end - r2->rs_start;
1369
1370         int cmp = TREE_CMP(rs_size1, rs_size2);
1371
1372         return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
1373 }
1374
1375 typedef struct metaslab_rt_arg {
1376         zfs_btree_t *mra_bt;
1377         uint32_t mra_floor_shift;
1378 } metaslab_rt_arg_t;
1379
1380 struct mssa_arg {
1381         range_tree_t *rt;
1382         metaslab_rt_arg_t *mra;
1383 };
1384
1385 static void
1386 metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size)
1387 {
1388         struct mssa_arg *mssap = arg;
1389         range_tree_t *rt = mssap->rt;
1390         metaslab_rt_arg_t *mrap = mssap->mra;
1391         range_seg_max_t seg = {0};
1392         rs_set_start(&seg, rt, start);
1393         rs_set_end(&seg, rt, start + size);
1394         metaslab_rt_add(rt, &seg, mrap);
1395 }
1396
1397 static void
1398 metaslab_size_tree_full_load(range_tree_t *rt)
1399 {
1400         metaslab_rt_arg_t *mrap = rt->rt_arg;
1401         METASLABSTAT_BUMP(metaslabstat_reload_tree);
1402         ASSERT0(zfs_btree_numnodes(mrap->mra_bt));
1403         mrap->mra_floor_shift = 0;
1404         struct mssa_arg arg = {0};
1405         arg.rt = rt;
1406         arg.mra = mrap;
1407         range_tree_walk(rt, metaslab_size_sorted_add, &arg);
1408 }
1409
1410
1411 ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize32_in_buf,
1412     range_seg32_t, metaslab_rangesize32_compare)
1413
1414 ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize64_in_buf,
1415     range_seg64_t, metaslab_rangesize64_compare)
1416
1417 /*
1418  * Create any block allocator specific components. The current allocators
1419  * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
1420  */
1421 static void
1422 metaslab_rt_create(range_tree_t *rt, void *arg)
1423 {
1424         metaslab_rt_arg_t *mrap = arg;
1425         zfs_btree_t *size_tree = mrap->mra_bt;
1426
1427         size_t size;
1428         int (*compare) (const void *, const void *);
1429         bt_find_in_buf_f bt_find;
1430         switch (rt->rt_type) {
1431         case RANGE_SEG32:
1432                 size = sizeof (range_seg32_t);
1433                 compare = metaslab_rangesize32_compare;
1434                 bt_find = metaslab_rt_find_rangesize32_in_buf;
1435                 break;
1436         case RANGE_SEG64:
1437                 size = sizeof (range_seg64_t);
1438                 compare = metaslab_rangesize64_compare;
1439                 bt_find = metaslab_rt_find_rangesize64_in_buf;
1440                 break;
1441         default:
1442                 panic("Invalid range seg type %d", rt->rt_type);
1443         }
1444         zfs_btree_create(size_tree, compare, bt_find, size);
1445         mrap->mra_floor_shift = metaslab_by_size_min_shift;
1446 }
1447
1448 static void
1449 metaslab_rt_destroy(range_tree_t *rt, void *arg)
1450 {
1451         (void) rt;
1452         metaslab_rt_arg_t *mrap = arg;
1453         zfs_btree_t *size_tree = mrap->mra_bt;
1454
1455         zfs_btree_destroy(size_tree);
1456         kmem_free(mrap, sizeof (*mrap));
1457 }
1458
1459 static void
1460 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
1461 {
1462         metaslab_rt_arg_t *mrap = arg;
1463         zfs_btree_t *size_tree = mrap->mra_bt;
1464
1465         if (rs_get_end(rs, rt) - rs_get_start(rs, rt) <
1466             (1ULL << mrap->mra_floor_shift))
1467                 return;
1468
1469         zfs_btree_add(size_tree, rs);
1470 }
1471
1472 static void
1473 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
1474 {
1475         metaslab_rt_arg_t *mrap = arg;
1476         zfs_btree_t *size_tree = mrap->mra_bt;
1477
1478         if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1ULL <<
1479             mrap->mra_floor_shift))
1480                 return;
1481
1482         zfs_btree_remove(size_tree, rs);
1483 }
1484
1485 static void
1486 metaslab_rt_vacate(range_tree_t *rt, void *arg)
1487 {
1488         metaslab_rt_arg_t *mrap = arg;
1489         zfs_btree_t *size_tree = mrap->mra_bt;
1490         zfs_btree_clear(size_tree);
1491         zfs_btree_destroy(size_tree);
1492
1493         metaslab_rt_create(rt, arg);
1494 }
1495
1496 static const range_tree_ops_t metaslab_rt_ops = {
1497         .rtop_create = metaslab_rt_create,
1498         .rtop_destroy = metaslab_rt_destroy,
1499         .rtop_add = metaslab_rt_add,
1500         .rtop_remove = metaslab_rt_remove,
1501         .rtop_vacate = metaslab_rt_vacate
1502 };
1503
1504 /*
1505  * ==========================================================================
1506  * Common allocator routines
1507  * ==========================================================================
1508  */
1509
1510 /*
1511  * Return the maximum contiguous segment within the metaslab.
1512  */
1513 uint64_t
1514 metaslab_largest_allocatable(metaslab_t *msp)
1515 {
1516         zfs_btree_t *t = &msp->ms_allocatable_by_size;
1517         range_seg_t *rs;
1518
1519         if (t == NULL)
1520                 return (0);
1521         if (zfs_btree_numnodes(t) == 0)
1522                 metaslab_size_tree_full_load(msp->ms_allocatable);
1523
1524         rs = zfs_btree_last(t, NULL);
1525         if (rs == NULL)
1526                 return (0);
1527
1528         return (rs_get_end(rs, msp->ms_allocatable) - rs_get_start(rs,
1529             msp->ms_allocatable));
1530 }
1531
1532 /*
1533  * Return the maximum contiguous segment within the unflushed frees of this
1534  * metaslab.
1535  */
1536 static uint64_t
1537 metaslab_largest_unflushed_free(metaslab_t *msp)
1538 {
1539         ASSERT(MUTEX_HELD(&msp->ms_lock));
1540
1541         if (msp->ms_unflushed_frees == NULL)
1542                 return (0);
1543
1544         if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0)
1545                 metaslab_size_tree_full_load(msp->ms_unflushed_frees);
1546         range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size,
1547             NULL);
1548         if (rs == NULL)
1549                 return (0);
1550
1551         /*
1552          * When a range is freed from the metaslab, that range is added to
1553          * both the unflushed frees and the deferred frees. While the block
1554          * will eventually be usable, if the metaslab were loaded the range
1555          * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE
1556          * txgs had passed.  As a result, when attempting to estimate an upper
1557          * bound for the largest currently-usable free segment in the
1558          * metaslab, we need to not consider any ranges currently in the defer
1559          * trees. This algorithm approximates the largest available chunk in
1560          * the largest range in the unflushed_frees tree by taking the first
1561          * chunk.  While this may be a poor estimate, it should only remain so
1562          * briefly and should eventually self-correct as frees are no longer
1563          * deferred. Similar logic applies to the ms_freed tree. See
1564          * metaslab_load() for more details.
1565          *
1566          * There are two primary sources of inaccuracy in this estimate. Both
1567          * are tolerated for performance reasons. The first source is that we
1568          * only check the largest segment for overlaps. Smaller segments may
1569          * have more favorable overlaps with the other trees, resulting in
1570          * larger usable chunks.  Second, we only look at the first chunk in
1571          * the largest segment; there may be other usable chunks in the
1572          * largest segment, but we ignore them.
1573          */
1574         uint64_t rstart = rs_get_start(rs, msp->ms_unflushed_frees);
1575         uint64_t rsize = rs_get_end(rs, msp->ms_unflushed_frees) - rstart;
1576         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1577                 uint64_t start = 0;
1578                 uint64_t size = 0;
1579                 boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart,
1580                     rsize, &start, &size);
1581                 if (found) {
1582                         if (rstart == start)
1583                                 return (0);
1584                         rsize = start - rstart;
1585                 }
1586         }
1587
1588         uint64_t start = 0;
1589         uint64_t size = 0;
1590         boolean_t found = range_tree_find_in(msp->ms_freed, rstart,
1591             rsize, &start, &size);
1592         if (found)
1593                 rsize = start - rstart;
1594
1595         return (rsize);
1596 }
1597
1598 static range_seg_t *
1599 metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start,
1600     uint64_t size, zfs_btree_index_t *where)
1601 {
1602         range_seg_t *rs;
1603         range_seg_max_t rsearch;
1604
1605         rs_set_start(&rsearch, rt, start);
1606         rs_set_end(&rsearch, rt, start + size);
1607
1608         rs = zfs_btree_find(t, &rsearch, where);
1609         if (rs == NULL) {
1610                 rs = zfs_btree_next(t, where, where);
1611         }
1612
1613         return (rs);
1614 }
1615
1616 #if defined(WITH_DF_BLOCK_ALLOCATOR) || \
1617     defined(WITH_CF_BLOCK_ALLOCATOR)
1618
1619 /*
1620  * This is a helper function that can be used by the allocator to find a
1621  * suitable block to allocate. This will search the specified B-tree looking
1622  * for a block that matches the specified criteria.
1623  */
1624 static uint64_t
1625 metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size,
1626     uint64_t max_search)
1627 {
1628         if (*cursor == 0)
1629                 *cursor = rt->rt_start;
1630         zfs_btree_t *bt = &rt->rt_root;
1631         zfs_btree_index_t where;
1632         range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size, &where);
1633         uint64_t first_found;
1634         int count_searched = 0;
1635
1636         if (rs != NULL)
1637                 first_found = rs_get_start(rs, rt);
1638
1639         while (rs != NULL && (rs_get_start(rs, rt) - first_found <=
1640             max_search || count_searched < metaslab_min_search_count)) {
1641                 uint64_t offset = rs_get_start(rs, rt);
1642                 if (offset + size <= rs_get_end(rs, rt)) {
1643                         *cursor = offset + size;
1644                         return (offset);
1645                 }
1646                 rs = zfs_btree_next(bt, &where, &where);
1647                 count_searched++;
1648         }
1649
1650         *cursor = 0;
1651         return (-1ULL);
1652 }
1653 #endif /* WITH_DF/CF_BLOCK_ALLOCATOR */
1654
1655 #if defined(WITH_DF_BLOCK_ALLOCATOR)
1656 /*
1657  * ==========================================================================
1658  * Dynamic Fit (df) block allocator
1659  *
1660  * Search for a free chunk of at least this size, starting from the last
1661  * offset (for this alignment of block) looking for up to
1662  * metaslab_df_max_search bytes (16MB).  If a large enough free chunk is not
1663  * found within 16MB, then return a free chunk of exactly the requested size (or
1664  * larger).
1665  *
1666  * If it seems like searching from the last offset will be unproductive, skip
1667  * that and just return a free chunk of exactly the requested size (or larger).
1668  * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct.  This
1669  * mechanism is probably not very useful and may be removed in the future.
1670  *
1671  * The behavior when not searching can be changed to return the largest free
1672  * chunk, instead of a free chunk of exactly the requested size, by setting
1673  * metaslab_df_use_largest_segment.
1674  * ==========================================================================
1675  */
1676 static uint64_t
1677 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
1678 {
1679         /*
1680          * Find the largest power of 2 block size that evenly divides the
1681          * requested size. This is used to try to allocate blocks with similar
1682          * alignment from the same area of the metaslab (i.e. same cursor
1683          * bucket) but it does not guarantee that other allocations sizes
1684          * may exist in the same region.
1685          */
1686         uint64_t align = size & -size;
1687         uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1688         range_tree_t *rt = msp->ms_allocatable;
1689         uint_t free_pct = range_tree_space(rt) * 100 / msp->ms_size;
1690         uint64_t offset;
1691
1692         ASSERT(MUTEX_HELD(&msp->ms_lock));
1693
1694         /*
1695          * If we're running low on space, find a segment based on size,
1696          * rather than iterating based on offset.
1697          */
1698         if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold ||
1699             free_pct < metaslab_df_free_pct) {
1700                 offset = -1;
1701         } else {
1702                 offset = metaslab_block_picker(rt,
1703                     cursor, size, metaslab_df_max_search);
1704         }
1705
1706         if (offset == -1) {
1707                 range_seg_t *rs;
1708                 if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0)
1709                         metaslab_size_tree_full_load(msp->ms_allocatable);
1710
1711                 if (metaslab_df_use_largest_segment) {
1712                         /* use largest free segment */
1713                         rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL);
1714                 } else {
1715                         zfs_btree_index_t where;
1716                         /* use segment of this size, or next largest */
1717                         rs = metaslab_block_find(&msp->ms_allocatable_by_size,
1718                             rt, msp->ms_start, size, &where);
1719                 }
1720                 if (rs != NULL && rs_get_start(rs, rt) + size <= rs_get_end(rs,
1721                     rt)) {
1722                         offset = rs_get_start(rs, rt);
1723                         *cursor = offset + size;
1724                 }
1725         }
1726
1727         return (offset);
1728 }
1729
1730 const metaslab_ops_t zfs_metaslab_ops = {
1731         metaslab_df_alloc
1732 };
1733 #endif /* WITH_DF_BLOCK_ALLOCATOR */
1734
1735 #if defined(WITH_CF_BLOCK_ALLOCATOR)
1736 /*
1737  * ==========================================================================
1738  * Cursor fit block allocator -
1739  * Select the largest region in the metaslab, set the cursor to the beginning
1740  * of the range and the cursor_end to the end of the range. As allocations
1741  * are made advance the cursor. Continue allocating from the cursor until
1742  * the range is exhausted and then find a new range.
1743  * ==========================================================================
1744  */
1745 static uint64_t
1746 metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
1747 {
1748         range_tree_t *rt = msp->ms_allocatable;
1749         zfs_btree_t *t = &msp->ms_allocatable_by_size;
1750         uint64_t *cursor = &msp->ms_lbas[0];
1751         uint64_t *cursor_end = &msp->ms_lbas[1];
1752         uint64_t offset = 0;
1753
1754         ASSERT(MUTEX_HELD(&msp->ms_lock));
1755
1756         ASSERT3U(*cursor_end, >=, *cursor);
1757
1758         if ((*cursor + size) > *cursor_end) {
1759                 range_seg_t *rs;
1760
1761                 if (zfs_btree_numnodes(t) == 0)
1762                         metaslab_size_tree_full_load(msp->ms_allocatable);
1763                 rs = zfs_btree_last(t, NULL);
1764                 if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) <
1765                     size)
1766                         return (-1ULL);
1767
1768                 *cursor = rs_get_start(rs, rt);
1769                 *cursor_end = rs_get_end(rs, rt);
1770         }
1771
1772         offset = *cursor;
1773         *cursor += size;
1774
1775         return (offset);
1776 }
1777
1778 const metaslab_ops_t zfs_metaslab_ops = {
1779         metaslab_cf_alloc
1780 };
1781 #endif /* WITH_CF_BLOCK_ALLOCATOR */
1782
1783 #if defined(WITH_NDF_BLOCK_ALLOCATOR)
1784 /*
1785  * ==========================================================================
1786  * New dynamic fit allocator -
1787  * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
1788  * contiguous blocks. If no region is found then just use the largest segment
1789  * that remains.
1790  * ==========================================================================
1791  */
1792
1793 /*
1794  * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
1795  * to request from the allocator.
1796  */
1797 uint64_t metaslab_ndf_clump_shift = 4;
1798
1799 static uint64_t
1800 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
1801 {
1802         zfs_btree_t *t = &msp->ms_allocatable->rt_root;
1803         range_tree_t *rt = msp->ms_allocatable;
1804         zfs_btree_index_t where;
1805         range_seg_t *rs;
1806         range_seg_max_t rsearch;
1807         uint64_t hbit = highbit64(size);
1808         uint64_t *cursor = &msp->ms_lbas[hbit - 1];
1809         uint64_t max_size = metaslab_largest_allocatable(msp);
1810
1811         ASSERT(MUTEX_HELD(&msp->ms_lock));
1812
1813         if (max_size < size)
1814                 return (-1ULL);
1815
1816         rs_set_start(&rsearch, rt, *cursor);
1817         rs_set_end(&rsearch, rt, *cursor + size);
1818
1819         rs = zfs_btree_find(t, &rsearch, &where);
1820         if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) {
1821                 t = &msp->ms_allocatable_by_size;
1822
1823                 rs_set_start(&rsearch, rt, 0);
1824                 rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit +
1825                     metaslab_ndf_clump_shift)));
1826
1827                 rs = zfs_btree_find(t, &rsearch, &where);
1828                 if (rs == NULL)
1829                         rs = zfs_btree_next(t, &where, &where);
1830                 ASSERT(rs != NULL);
1831         }
1832
1833         if ((rs_get_end(rs, rt) - rs_get_start(rs, rt)) >= size) {
1834                 *cursor = rs_get_start(rs, rt) + size;
1835                 return (rs_get_start(rs, rt));
1836         }
1837         return (-1ULL);
1838 }
1839
1840 const metaslab_ops_t zfs_metaslab_ops = {
1841         metaslab_ndf_alloc
1842 };
1843 #endif /* WITH_NDF_BLOCK_ALLOCATOR */
1844
1845
1846 /*
1847  * ==========================================================================
1848  * Metaslabs
1849  * ==========================================================================
1850  */
1851
1852 /*
1853  * Wait for any in-progress metaslab loads to complete.
1854  */
1855 static void
1856 metaslab_load_wait(metaslab_t *msp)
1857 {
1858         ASSERT(MUTEX_HELD(&msp->ms_lock));
1859
1860         while (msp->ms_loading) {
1861                 ASSERT(!msp->ms_loaded);
1862                 cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1863         }
1864 }
1865
1866 /*
1867  * Wait for any in-progress flushing to complete.
1868  */
1869 static void
1870 metaslab_flush_wait(metaslab_t *msp)
1871 {
1872         ASSERT(MUTEX_HELD(&msp->ms_lock));
1873
1874         while (msp->ms_flushing)
1875                 cv_wait(&msp->ms_flush_cv, &msp->ms_lock);
1876 }
1877
1878 static unsigned int
1879 metaslab_idx_func(multilist_t *ml, void *arg)
1880 {
1881         metaslab_t *msp = arg;
1882
1883         /*
1884          * ms_id values are allocated sequentially, so full 64bit
1885          * division would be a waste of time, so limit it to 32 bits.
1886          */
1887         return ((unsigned int)msp->ms_id % multilist_get_num_sublists(ml));
1888 }
1889
1890 uint64_t
1891 metaslab_allocated_space(metaslab_t *msp)
1892 {
1893         return (msp->ms_allocated_space);
1894 }
1895
1896 /*
1897  * Verify that the space accounting on disk matches the in-core range_trees.
1898  */
1899 static void
1900 metaslab_verify_space(metaslab_t *msp, uint64_t txg)
1901 {
1902         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1903         uint64_t allocating = 0;
1904         uint64_t sm_free_space, msp_free_space;
1905
1906         ASSERT(MUTEX_HELD(&msp->ms_lock));
1907         ASSERT(!msp->ms_condensing);
1908
1909         if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
1910                 return;
1911
1912         /*
1913          * We can only verify the metaslab space when we're called
1914          * from syncing context with a loaded metaslab that has an
1915          * allocated space map. Calling this in non-syncing context
1916          * does not provide a consistent view of the metaslab since
1917          * we're performing allocations in the future.
1918          */
1919         if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
1920             !msp->ms_loaded)
1921                 return;
1922
1923         /*
1924          * Even though the smp_alloc field can get negative,
1925          * when it comes to a metaslab's space map, that should
1926          * never be the case.
1927          */
1928         ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
1929
1930         ASSERT3U(space_map_allocated(msp->ms_sm), >=,
1931             range_tree_space(msp->ms_unflushed_frees));
1932
1933         ASSERT3U(metaslab_allocated_space(msp), ==,
1934             space_map_allocated(msp->ms_sm) +
1935             range_tree_space(msp->ms_unflushed_allocs) -
1936             range_tree_space(msp->ms_unflushed_frees));
1937
1938         sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
1939
1940         /*
1941          * Account for future allocations since we would have
1942          * already deducted that space from the ms_allocatable.
1943          */
1944         for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
1945                 allocating +=
1946                     range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
1947         }
1948         ASSERT3U(allocating + msp->ms_allocated_this_txg, ==,
1949             msp->ms_allocating_total);
1950
1951         ASSERT3U(msp->ms_deferspace, ==,
1952             range_tree_space(msp->ms_defer[0]) +
1953             range_tree_space(msp->ms_defer[1]));
1954
1955         msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
1956             msp->ms_deferspace + range_tree_space(msp->ms_freed);
1957
1958         VERIFY3U(sm_free_space, ==, msp_free_space);
1959 }
1960
1961 static void
1962 metaslab_aux_histograms_clear(metaslab_t *msp)
1963 {
1964         /*
1965          * Auxiliary histograms are only cleared when resetting them,
1966          * which can only happen while the metaslab is loaded.
1967          */
1968         ASSERT(msp->ms_loaded);
1969
1970         memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist));
1971         for (int t = 0; t < TXG_DEFER_SIZE; t++)
1972                 memset(msp->ms_deferhist[t], 0, sizeof (msp->ms_deferhist[t]));
1973 }
1974
1975 static void
1976 metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
1977     range_tree_t *rt)
1978 {
1979         /*
1980          * This is modeled after space_map_histogram_add(), so refer to that
1981          * function for implementation details. We want this to work like
1982          * the space map histogram, and not the range tree histogram, as we
1983          * are essentially constructing a delta that will be later subtracted
1984          * from the space map histogram.
1985          */
1986         int idx = 0;
1987         for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
1988                 ASSERT3U(i, >=, idx + shift);
1989                 histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
1990
1991                 if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
1992                         ASSERT3U(idx + shift, ==, i);
1993                         idx++;
1994                         ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
1995                 }
1996         }
1997 }
1998
1999 /*
2000  * Called at every sync pass that the metaslab gets synced.
2001  *
2002  * The reason is that we want our auxiliary histograms to be updated
2003  * wherever the metaslab's space map histogram is updated. This way
2004  * we stay consistent on which parts of the metaslab space map's
2005  * histogram are currently not available for allocations (e.g because
2006  * they are in the defer, freed, and freeing trees).
2007  */
2008 static void
2009 metaslab_aux_histograms_update(metaslab_t *msp)
2010 {
2011         space_map_t *sm = msp->ms_sm;
2012         ASSERT(sm != NULL);
2013
2014         /*
2015          * This is similar to the metaslab's space map histogram updates
2016          * that take place in metaslab_sync(). The only difference is that
2017          * we only care about segments that haven't made it into the
2018          * ms_allocatable tree yet.
2019          */
2020         if (msp->ms_loaded) {
2021                 metaslab_aux_histograms_clear(msp);
2022
2023                 metaslab_aux_histogram_add(msp->ms_synchist,
2024                     sm->sm_shift, msp->ms_freed);
2025
2026                 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2027                         metaslab_aux_histogram_add(msp->ms_deferhist[t],
2028                             sm->sm_shift, msp->ms_defer[t]);
2029                 }
2030         }
2031
2032         metaslab_aux_histogram_add(msp->ms_synchist,
2033             sm->sm_shift, msp->ms_freeing);
2034 }
2035
2036 /*
2037  * Called every time we are done syncing (writing to) the metaslab,
2038  * i.e. at the end of each sync pass.
2039  * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
2040  */
2041 static void
2042 metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
2043 {
2044         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2045         space_map_t *sm = msp->ms_sm;
2046
2047         if (sm == NULL) {
2048                 /*
2049                  * We came here from metaslab_init() when creating/opening a
2050                  * pool, looking at a metaslab that hasn't had any allocations
2051                  * yet.
2052                  */
2053                 return;
2054         }
2055
2056         /*
2057          * This is similar to the actions that we take for the ms_freed
2058          * and ms_defer trees in metaslab_sync_done().
2059          */
2060         uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
2061         if (defer_allowed) {
2062                 memcpy(msp->ms_deferhist[hist_index], msp->ms_synchist,
2063                     sizeof (msp->ms_synchist));
2064         } else {
2065                 memset(msp->ms_deferhist[hist_index], 0,
2066                     sizeof (msp->ms_deferhist[hist_index]));
2067         }
2068         memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist));
2069 }
2070
2071 /*
2072  * Ensure that the metaslab's weight and fragmentation are consistent
2073  * with the contents of the histogram (either the range tree's histogram
2074  * or the space map's depending whether the metaslab is loaded).
2075  */
2076 static void
2077 metaslab_verify_weight_and_frag(metaslab_t *msp)
2078 {
2079         ASSERT(MUTEX_HELD(&msp->ms_lock));
2080
2081         if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
2082                 return;
2083
2084         /*
2085          * We can end up here from vdev_remove_complete(), in which case we
2086          * cannot do these assertions because we hold spa config locks and
2087          * thus we are not allowed to read from the DMU.
2088          *
2089          * We check if the metaslab group has been removed and if that's
2090          * the case we return immediately as that would mean that we are
2091          * here from the aforementioned code path.
2092          */
2093         if (msp->ms_group == NULL)
2094                 return;
2095
2096         /*
2097          * Devices being removed always return a weight of 0 and leave
2098          * fragmentation and ms_max_size as is - there is nothing for
2099          * us to verify here.
2100          */
2101         vdev_t *vd = msp->ms_group->mg_vd;
2102         if (vd->vdev_removing)
2103                 return;
2104
2105         /*
2106          * If the metaslab is dirty it probably means that we've done
2107          * some allocations or frees that have changed our histograms
2108          * and thus the weight.
2109          */
2110         for (int t = 0; t < TXG_SIZE; t++) {
2111                 if (txg_list_member(&vd->vdev_ms_list, msp, t))
2112                         return;
2113         }
2114
2115         /*
2116          * This verification checks that our in-memory state is consistent
2117          * with what's on disk. If the pool is read-only then there aren't
2118          * any changes and we just have the initially-loaded state.
2119          */
2120         if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
2121                 return;
2122
2123         /* some extra verification for in-core tree if you can */
2124         if (msp->ms_loaded) {
2125                 range_tree_stat_verify(msp->ms_allocatable);
2126                 VERIFY(space_map_histogram_verify(msp->ms_sm,
2127                     msp->ms_allocatable));
2128         }
2129
2130         uint64_t weight = msp->ms_weight;
2131         uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2132         boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
2133         uint64_t frag = msp->ms_fragmentation;
2134         uint64_t max_segsize = msp->ms_max_size;
2135
2136         msp->ms_weight = 0;
2137         msp->ms_fragmentation = 0;
2138
2139         /*
2140          * This function is used for verification purposes and thus should
2141          * not introduce any side-effects/mutations on the system's state.
2142          *
2143          * Regardless of whether metaslab_weight() thinks this metaslab
2144          * should be active or not, we want to ensure that the actual weight
2145          * (and therefore the value of ms_weight) would be the same if it
2146          * was to be recalculated at this point.
2147          *
2148          * In addition we set the nodirty flag so metaslab_weight() does
2149          * not dirty the metaslab for future TXGs (e.g. when trying to
2150          * force condensing to upgrade the metaslab spacemaps).
2151          */
2152         msp->ms_weight = metaslab_weight(msp, B_TRUE) | was_active;
2153
2154         VERIFY3U(max_segsize, ==, msp->ms_max_size);
2155
2156         /*
2157          * If the weight type changed then there is no point in doing
2158          * verification. Revert fields to their original values.
2159          */
2160         if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
2161             (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
2162                 msp->ms_fragmentation = frag;
2163                 msp->ms_weight = weight;
2164                 return;
2165         }
2166
2167         VERIFY3U(msp->ms_fragmentation, ==, frag);
2168         VERIFY3U(msp->ms_weight, ==, weight);
2169 }
2170
2171 /*
2172  * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from
2173  * this class that was used longest ago, and attempt to unload it.  We don't
2174  * want to spend too much time in this loop to prevent performance
2175  * degradation, and we expect that most of the time this operation will
2176  * succeed. Between that and the normal unloading processing during txg sync,
2177  * we expect this to keep the metaslab memory usage under control.
2178  */
2179 static void
2180 metaslab_potentially_evict(metaslab_class_t *mc)
2181 {
2182 #ifdef _KERNEL
2183         uint64_t allmem = arc_all_memory();
2184         uint64_t inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache);
2185         uint64_t size = spl_kmem_cache_entry_size(zfs_btree_leaf_cache);
2186         uint_t tries = 0;
2187         for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size &&
2188             tries < multilist_get_num_sublists(&mc->mc_metaslab_txg_list) * 2;
2189             tries++) {
2190                 unsigned int idx = multilist_get_random_index(
2191                     &mc->mc_metaslab_txg_list);
2192                 multilist_sublist_t *mls =
2193                     multilist_sublist_lock(&mc->mc_metaslab_txg_list, idx);
2194                 metaslab_t *msp = multilist_sublist_head(mls);
2195                 multilist_sublist_unlock(mls);
2196                 while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 <
2197                     inuse * size) {
2198                         VERIFY3P(mls, ==, multilist_sublist_lock(
2199                             &mc->mc_metaslab_txg_list, idx));
2200                         ASSERT3U(idx, ==,
2201                             metaslab_idx_func(&mc->mc_metaslab_txg_list, msp));
2202
2203                         if (!multilist_link_active(&msp->ms_class_txg_node)) {
2204                                 multilist_sublist_unlock(mls);
2205                                 break;
2206                         }
2207                         metaslab_t *next_msp = multilist_sublist_next(mls, msp);
2208                         multilist_sublist_unlock(mls);
2209                         /*
2210                          * If the metaslab is currently loading there are two
2211                          * cases. If it's the metaslab we're evicting, we
2212                          * can't continue on or we'll panic when we attempt to
2213                          * recursively lock the mutex. If it's another
2214                          * metaslab that's loading, it can be safely skipped,
2215                          * since we know it's very new and therefore not a
2216                          * good eviction candidate. We check later once the
2217                          * lock is held that the metaslab is fully loaded
2218                          * before actually unloading it.
2219                          */
2220                         if (msp->ms_loading) {
2221                                 msp = next_msp;
2222                                 inuse =
2223                                     spl_kmem_cache_inuse(zfs_btree_leaf_cache);
2224                                 continue;
2225                         }
2226                         /*
2227                          * We can't unload metaslabs with no spacemap because
2228                          * they're not ready to be unloaded yet. We can't
2229                          * unload metaslabs with outstanding allocations
2230                          * because doing so could cause the metaslab's weight
2231                          * to decrease while it's unloaded, which violates an
2232                          * invariant that we use to prevent unnecessary
2233                          * loading. We also don't unload metaslabs that are
2234                          * currently active because they are high-weight
2235                          * metaslabs that are likely to be used in the near
2236                          * future.
2237                          */
2238                         mutex_enter(&msp->ms_lock);
2239                         if (msp->ms_allocator == -1 && msp->ms_sm != NULL &&
2240                             msp->ms_allocating_total == 0) {
2241                                 metaslab_unload(msp);
2242                         }
2243                         mutex_exit(&msp->ms_lock);
2244                         msp = next_msp;
2245                         inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache);
2246                 }
2247         }
2248 #else
2249         (void) mc, (void) zfs_metaslab_mem_limit;
2250 #endif
2251 }
2252
2253 static int
2254 metaslab_load_impl(metaslab_t *msp)
2255 {
2256         int error = 0;
2257
2258         ASSERT(MUTEX_HELD(&msp->ms_lock));
2259         ASSERT(msp->ms_loading);
2260         ASSERT(!msp->ms_condensing);
2261
2262         /*
2263          * We temporarily drop the lock to unblock other operations while we
2264          * are reading the space map. Therefore, metaslab_sync() and
2265          * metaslab_sync_done() can run at the same time as we do.
2266          *
2267          * If we are using the log space maps, metaslab_sync() can't write to
2268          * the metaslab's space map while we are loading as we only write to
2269          * it when we are flushing the metaslab, and that can't happen while
2270          * we are loading it.
2271          *
2272          * If we are not using log space maps though, metaslab_sync() can
2273          * append to the space map while we are loading. Therefore we load
2274          * only entries that existed when we started the load. Additionally,
2275          * metaslab_sync_done() has to wait for the load to complete because
2276          * there are potential races like metaslab_load() loading parts of the
2277          * space map that are currently being appended by metaslab_sync(). If
2278          * we didn't, the ms_allocatable would have entries that
2279          * metaslab_sync_done() would try to re-add later.
2280          *
2281          * That's why before dropping the lock we remember the synced length
2282          * of the metaslab and read up to that point of the space map,
2283          * ignoring entries appended by metaslab_sync() that happen after we
2284          * drop the lock.
2285          */
2286         uint64_t length = msp->ms_synced_length;
2287         mutex_exit(&msp->ms_lock);
2288
2289         hrtime_t load_start = gethrtime();
2290         metaslab_rt_arg_t *mrap;
2291         if (msp->ms_allocatable->rt_arg == NULL) {
2292                 mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
2293         } else {
2294                 mrap = msp->ms_allocatable->rt_arg;
2295                 msp->ms_allocatable->rt_ops = NULL;
2296                 msp->ms_allocatable->rt_arg = NULL;
2297         }
2298         mrap->mra_bt = &msp->ms_allocatable_by_size;
2299         mrap->mra_floor_shift = metaslab_by_size_min_shift;
2300
2301         if (msp->ms_sm != NULL) {
2302                 error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
2303                     SM_FREE, length);
2304
2305                 /* Now, populate the size-sorted tree. */
2306                 metaslab_rt_create(msp->ms_allocatable, mrap);
2307                 msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
2308                 msp->ms_allocatable->rt_arg = mrap;
2309
2310                 struct mssa_arg arg = {0};
2311                 arg.rt = msp->ms_allocatable;
2312                 arg.mra = mrap;
2313                 range_tree_walk(msp->ms_allocatable, metaslab_size_sorted_add,
2314                     &arg);
2315         } else {
2316                 /*
2317                  * Add the size-sorted tree first, since we don't need to load
2318                  * the metaslab from the spacemap.
2319                  */
2320                 metaslab_rt_create(msp->ms_allocatable, mrap);
2321                 msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
2322                 msp->ms_allocatable->rt_arg = mrap;
2323                 /*
2324                  * The space map has not been allocated yet, so treat
2325                  * all the space in the metaslab as free and add it to the
2326                  * ms_allocatable tree.
2327                  */
2328                 range_tree_add(msp->ms_allocatable,
2329                     msp->ms_start, msp->ms_size);
2330
2331                 if (msp->ms_new) {
2332                         /*
2333                          * If the ms_sm doesn't exist, this means that this
2334                          * metaslab hasn't gone through metaslab_sync() and
2335                          * thus has never been dirtied. So we shouldn't
2336                          * expect any unflushed allocs or frees from previous
2337                          * TXGs.
2338                          */
2339                         ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
2340                         ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
2341                 }
2342         }
2343
2344         /*
2345          * We need to grab the ms_sync_lock to prevent metaslab_sync() from
2346          * changing the ms_sm (or log_sm) and the metaslab's range trees
2347          * while we are about to use them and populate the ms_allocatable.
2348          * The ms_lock is insufficient for this because metaslab_sync() doesn't
2349          * hold the ms_lock while writing the ms_checkpointing tree to disk.
2350          */
2351         mutex_enter(&msp->ms_sync_lock);
2352         mutex_enter(&msp->ms_lock);
2353
2354         ASSERT(!msp->ms_condensing);
2355         ASSERT(!msp->ms_flushing);
2356
2357         if (error != 0) {
2358                 mutex_exit(&msp->ms_sync_lock);
2359                 return (error);
2360         }
2361
2362         ASSERT3P(msp->ms_group, !=, NULL);
2363         msp->ms_loaded = B_TRUE;
2364
2365         /*
2366          * Apply all the unflushed changes to ms_allocatable right
2367          * away so any manipulations we do below have a clear view
2368          * of what is allocated and what is free.
2369          */
2370         range_tree_walk(msp->ms_unflushed_allocs,
2371             range_tree_remove, msp->ms_allocatable);
2372         range_tree_walk(msp->ms_unflushed_frees,
2373             range_tree_add, msp->ms_allocatable);
2374
2375         ASSERT3P(msp->ms_group, !=, NULL);
2376         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2377         if (spa_syncing_log_sm(spa) != NULL) {
2378                 ASSERT(spa_feature_is_enabled(spa,
2379                     SPA_FEATURE_LOG_SPACEMAP));
2380
2381                 /*
2382                  * If we use a log space map we add all the segments
2383                  * that are in ms_unflushed_frees so they are available
2384                  * for allocation.
2385                  *
2386                  * ms_allocatable needs to contain all free segments
2387                  * that are ready for allocations (thus not segments
2388                  * from ms_freeing, ms_freed, and the ms_defer trees).
2389                  * But if we grab the lock in this code path at a sync
                 * pass later than 1, then it also contains the
2391                  * segments of ms_freed (they were added to it earlier
2392                  * in this path through ms_unflushed_frees). So we
2393                  * need to remove all the segments that exist in
2394                  * ms_freed from ms_allocatable as they will be added
2395                  * later in metaslab_sync_done().
2396                  *
2397                  * When there's no log space map, the ms_allocatable
2398                  * correctly doesn't contain any segments that exist
2399                  * in ms_freed [see ms_synced_length].
2400                  */
2401                 range_tree_walk(msp->ms_freed,
2402                     range_tree_remove, msp->ms_allocatable);
2403         }
2404
2405         /*
2406          * If we are not using the log space map, ms_allocatable
2407          * contains the segments that exist in the ms_defer trees
2408          * [see ms_synced_length]. Thus we need to remove them
2409          * from ms_allocatable as they will be added again in
2410          * metaslab_sync_done().
2411          *
2412          * If we are using the log space map, ms_allocatable still
2413          * contains the segments that exist in the ms_defer trees.
2414          * Not because it read them through the ms_sm though. But
2415          * because these segments are part of ms_unflushed_frees
2416          * whose segments we add to ms_allocatable earlier in this
2417          * code path.
2418          */
2419         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2420                 range_tree_walk(msp->ms_defer[t],
2421                     range_tree_remove, msp->ms_allocatable);
2422         }
2423
2424         /*
2425          * Call metaslab_recalculate_weight_and_sort() now that the
2426          * metaslab is loaded so we get the metaslab's real weight.
2427          *
2428          * Unless this metaslab was created with older software and
2429          * has not yet been converted to use segment-based weight, we
2430          * expect the new weight to be better or equal to the weight
2431          * that the metaslab had while it was not loaded. This is
2432          * because the old weight does not take into account the
2433          * consolidation of adjacent segments between TXGs. [see
2434          * comment for ms_synchist and ms_deferhist[] for more info]
2435          */
2436         uint64_t weight = msp->ms_weight;
2437         uint64_t max_size = msp->ms_max_size;
2438         metaslab_recalculate_weight_and_sort(msp);
2439         if (!WEIGHT_IS_SPACEBASED(weight))
2440                 ASSERT3U(weight, <=, msp->ms_weight);
2441         msp->ms_max_size = metaslab_largest_allocatable(msp);
2442         ASSERT3U(max_size, <=, msp->ms_max_size);
2443         hrtime_t load_end = gethrtime();
2444         msp->ms_load_time = load_end;
2445         zfs_dbgmsg("metaslab_load: txg %llu, spa %s, vdev_id %llu, "
2446             "ms_id %llu, smp_length %llu, "
2447             "unflushed_allocs %llu, unflushed_frees %llu, "
2448             "freed %llu, defer %llu + %llu, unloaded time %llu ms, "
2449             "loading_time %lld ms, ms_max_size %llu, "
2450             "max size error %lld, "
2451             "old_weight %llx, new_weight %llx",
2452             (u_longlong_t)spa_syncing_txg(spa), spa_name(spa),
2453             (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
2454             (u_longlong_t)msp->ms_id,
2455             (u_longlong_t)space_map_length(msp->ms_sm),
2456             (u_longlong_t)range_tree_space(msp->ms_unflushed_allocs),
2457             (u_longlong_t)range_tree_space(msp->ms_unflushed_frees),
2458             (u_longlong_t)range_tree_space(msp->ms_freed),
2459             (u_longlong_t)range_tree_space(msp->ms_defer[0]),
2460             (u_longlong_t)range_tree_space(msp->ms_defer[1]),
2461             (longlong_t)((load_start - msp->ms_unload_time) / 1000000),
2462             (longlong_t)((load_end - load_start) / 1000000),
2463             (u_longlong_t)msp->ms_max_size,
2464             (u_longlong_t)msp->ms_max_size - max_size,
2465             (u_longlong_t)weight, (u_longlong_t)msp->ms_weight);
2466
2467         metaslab_verify_space(msp, spa_syncing_txg(spa));
2468         mutex_exit(&msp->ms_sync_lock);
2469         return (0);
2470 }
2471
2472 int
2473 metaslab_load(metaslab_t *msp)
2474 {
2475         ASSERT(MUTEX_HELD(&msp->ms_lock));
2476
2477         /*
2478          * There may be another thread loading the same metaslab, if that's
2479          * the case just wait until the other thread is done and return.
2480          */
2481         metaslab_load_wait(msp);
2482         if (msp->ms_loaded)
2483                 return (0);
2484         VERIFY(!msp->ms_loading);
2485         ASSERT(!msp->ms_condensing);
2486
2487         /*
2488          * We set the loading flag BEFORE potentially dropping the lock to
2489          * wait for an ongoing flush (see ms_flushing below). This way other
2490          * threads know that there is already a thread that is loading this
2491          * metaslab.
2492          */
2493         msp->ms_loading = B_TRUE;
2494
2495         /*
2496          * Wait for any in-progress flushing to finish as we drop the ms_lock
2497          * both here (during space_map_load()) and in metaslab_flush() (when
2498          * we flush our changes to the ms_sm).
2499          */
2500         if (msp->ms_flushing)
2501                 metaslab_flush_wait(msp);
2502
2503         /*
2504          * In the possibility that we were waiting for the metaslab to be
2505          * flushed (where we temporarily dropped the ms_lock), ensure that
2506          * no one else loaded the metaslab somehow.
2507          */
2508         ASSERT(!msp->ms_loaded);
2509
2510         /*
2511          * If we're loading a metaslab in the normal class, consider evicting
2512          * another one to keep our memory usage under the limit defined by the
2513          * zfs_metaslab_mem_limit tunable.
2514          */
2515         if (spa_normal_class(msp->ms_group->mg_class->mc_spa) ==
2516             msp->ms_group->mg_class) {
2517                 metaslab_potentially_evict(msp->ms_group->mg_class);
2518         }
2519
2520         int error = metaslab_load_impl(msp);
2521
2522         ASSERT(MUTEX_HELD(&msp->ms_lock));
2523         msp->ms_loading = B_FALSE;
2524         cv_broadcast(&msp->ms_load_cv);
2525
2526         return (error);
2527 }
2528
2529 void
2530 metaslab_unload(metaslab_t *msp)
2531 {
2532         ASSERT(MUTEX_HELD(&msp->ms_lock));
2533
2534         /*
2535          * This can happen if a metaslab is selected for eviction (in
2536          * metaslab_potentially_evict) and then unloaded during spa_sync (via
2537          * metaslab_class_evict_old).
2538          */
2539         if (!msp->ms_loaded)
2540                 return;
2541
2542         range_tree_vacate(msp->ms_allocatable, NULL, NULL);
2543         msp->ms_loaded = B_FALSE;
2544         msp->ms_unload_time = gethrtime();
2545
2546         msp->ms_activation_weight = 0;
2547         msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
2548
2549         if (msp->ms_group != NULL) {
2550                 metaslab_class_t *mc = msp->ms_group->mg_class;
2551                 multilist_sublist_t *mls =
2552                     multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
2553                 if (multilist_link_active(&msp->ms_class_txg_node))
2554                         multilist_sublist_remove(mls, msp);
2555                 multilist_sublist_unlock(mls);
2556
2557                 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2558                 zfs_dbgmsg("metaslab_unload: txg %llu, spa %s, vdev_id %llu, "
2559                     "ms_id %llu, weight %llx, "
2560                     "selected txg %llu (%llu ms ago), alloc_txg %llu, "
2561                     "loaded %llu ms ago, max_size %llu",
2562                     (u_longlong_t)spa_syncing_txg(spa), spa_name(spa),
2563                     (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
2564                     (u_longlong_t)msp->ms_id,
2565                     (u_longlong_t)msp->ms_weight,
2566                     (u_longlong_t)msp->ms_selected_txg,
2567                     (u_longlong_t)(msp->ms_unload_time -
2568                     msp->ms_selected_time) / 1000 / 1000,
2569                     (u_longlong_t)msp->ms_alloc_txg,
2570                     (u_longlong_t)(msp->ms_unload_time -
2571                     msp->ms_load_time) / 1000 / 1000,
2572                     (u_longlong_t)msp->ms_max_size);
2573         }
2574
2575         /*
2576          * We explicitly recalculate the metaslab's weight based on its space
2577          * map (as it is now not loaded). We want unload metaslabs to always
2578          * have their weights calculated from the space map histograms, while
2579          * loaded ones have it calculated from their in-core range tree
2580          * [see metaslab_load()]. This way, the weight reflects the information
2581          * available in-core, whether it is loaded or not.
2582          *
2583          * If ms_group == NULL means that we came here from metaslab_fini(),
2584          * at which point it doesn't make sense for us to do the recalculation
2585          * and the sorting.
2586          */
2587         if (msp->ms_group != NULL)
2588                 metaslab_recalculate_weight_and_sort(msp);
2589 }
2590
2591 /*
2592  * We want to optimize the memory use of the per-metaslab range
2593  * trees. To do this, we store the segments in the range trees in
2594  * units of sectors, zero-indexing from the start of the metaslab. If
2595  * the vdev_ms_shift - the vdev_ashift is less than 32, we can store
2596  * the ranges using two uint32_ts, rather than two uint64_ts.
2597  */
2598 range_seg_type_t
2599 metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp,
2600     uint64_t *start, uint64_t *shift)
2601 {
2602         if (vdev->vdev_ms_shift - vdev->vdev_ashift < 32 &&
2603             !zfs_metaslab_force_large_segs) {
2604                 *shift = vdev->vdev_ashift;
2605                 *start = msp->ms_start;
2606                 return (RANGE_SEG32);
2607         } else {
2608                 *shift = 0;
2609                 *start = 0;
2610                 return (RANGE_SEG64);
2611         }
2612 }
2613
2614 void
2615 metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg)
2616 {
2617         ASSERT(MUTEX_HELD(&msp->ms_lock));
2618         metaslab_class_t *mc = msp->ms_group->mg_class;
2619         multilist_sublist_t *mls =
2620             multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
2621         if (multilist_link_active(&msp->ms_class_txg_node))
2622                 multilist_sublist_remove(mls, msp);
2623         msp->ms_selected_txg = txg;
2624         msp->ms_selected_time = gethrtime();
2625         multilist_sublist_insert_tail(mls, msp);
2626         multilist_sublist_unlock(mls);
2627 }
2628
2629 void
2630 metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
2631     int64_t defer_delta, int64_t space_delta)
2632 {
2633         vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
2634
2635         ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
2636         ASSERT(vd->vdev_ms_count != 0);
2637
2638         metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
2639             vdev_deflated_space(vd, space_delta));
2640 }
2641
2642 int
2643 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
2644     uint64_t txg, metaslab_t **msp)
2645 {
2646         vdev_t *vd = mg->mg_vd;
2647         spa_t *spa = vd->vdev_spa;
2648         objset_t *mos = spa->spa_meta_objset;
2649         metaslab_t *ms;
2650         int error;
2651
2652         ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
2653         mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
2654         mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
2655         cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
2656         cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL);
2657         multilist_link_init(&ms->ms_class_txg_node);
2658
2659         ms->ms_id = id;
2660         ms->ms_start = id << vd->vdev_ms_shift;
2661         ms->ms_size = 1ULL << vd->vdev_ms_shift;
2662         ms->ms_allocator = -1;
2663         ms->ms_new = B_TRUE;
2664
2665         vdev_ops_t *ops = vd->vdev_ops;
2666         if (ops->vdev_op_metaslab_init != NULL)
2667                 ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size);
2668
2669         /*
2670          * We only open space map objects that already exist. All others
2671          * will be opened when we finally allocate an object for it. For
2672          * readonly pools there is no need to open the space map object.
2673          *
2674          * Note:
2675          * When called from vdev_expand(), we can't call into the DMU as
2676          * we are holding the spa_config_lock as a writer and we would
2677          * deadlock [see relevant comment in vdev_metaslab_init()]. in
2678          * that case, the object parameter is zero though, so we won't
2679          * call into the DMU.
2680          */
2681         if (object != 0 && !(spa->spa_mode == SPA_MODE_READ &&
2682             !spa->spa_read_spacemaps)) {
2683                 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
2684                     ms->ms_size, vd->vdev_ashift);
2685
2686                 if (error != 0) {
2687                         kmem_free(ms, sizeof (metaslab_t));
2688                         return (error);
2689                 }
2690
2691                 ASSERT(ms->ms_sm != NULL);
2692                 ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
2693         }
2694
2695         uint64_t shift, start;
2696         range_seg_type_t type =
2697             metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
2698
2699         ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift);
2700         for (int t = 0; t < TXG_SIZE; t++) {
2701                 ms->ms_allocating[t] = range_tree_create(NULL, type,
2702                     NULL, start, shift);
2703         }
2704         ms->ms_freeing = range_tree_create(NULL, type, NULL, start, shift);
2705         ms->ms_freed = range_tree_create(NULL, type, NULL, start, shift);
2706         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2707                 ms->ms_defer[t] = range_tree_create(NULL, type, NULL,
2708                     start, shift);
2709         }
2710         ms->ms_checkpointing =
2711             range_tree_create(NULL, type, NULL, start, shift);
2712         ms->ms_unflushed_allocs =
2713             range_tree_create(NULL, type, NULL, start, shift);
2714
2715         metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
2716         mrap->mra_bt = &ms->ms_unflushed_frees_by_size;
2717         mrap->mra_floor_shift = metaslab_by_size_min_shift;
2718         ms->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops,
2719             type, mrap, start, shift);
2720
2721         ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift);
2722
2723         metaslab_group_add(mg, ms);
2724         metaslab_set_fragmentation(ms, B_FALSE);
2725
2726         /*
2727          * If we're opening an existing pool (txg == 0) or creating
2728          * a new one (txg == TXG_INITIAL), all space is available now.
2729          * If we're adding space to an existing pool, the new space
2730          * does not become available until after this txg has synced.
2731          * The metaslab's weight will also be initialized when we sync
2732          * out this txg. This ensures that we don't attempt to allocate
2733          * from it before we have initialized it completely.
2734          */
2735         if (txg <= TXG_INITIAL) {
2736                 metaslab_sync_done(ms, 0);
2737                 metaslab_space_update(vd, mg->mg_class,
2738                     metaslab_allocated_space(ms), 0, 0);
2739         }
2740
2741         if (txg != 0) {
2742                 vdev_dirty(vd, 0, NULL, txg);
2743                 vdev_dirty(vd, VDD_METASLAB, ms, txg);
2744         }
2745
2746         *msp = ms;
2747
2748         return (0);
2749 }
2750
2751 static void
2752 metaslab_fini_flush_data(metaslab_t *msp)
2753 {
2754         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2755
2756         if (metaslab_unflushed_txg(msp) == 0) {
2757                 ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL),
2758                     ==, NULL);
2759                 return;
2760         }
2761         ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
2762
2763         mutex_enter(&spa->spa_flushed_ms_lock);
2764         avl_remove(&spa->spa_metaslabs_by_flushed, msp);
2765         mutex_exit(&spa->spa_flushed_ms_lock);
2766
2767         spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp));
2768         spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp),
2769             metaslab_unflushed_dirty(msp));
2770 }
2771
2772 uint64_t
2773 metaslab_unflushed_changes_memused(metaslab_t *ms)
2774 {
2775         return ((range_tree_numsegs(ms->ms_unflushed_allocs) +
2776             range_tree_numsegs(ms->ms_unflushed_frees)) *
2777             ms->ms_unflushed_allocs->rt_root.bt_elem_size);
2778 }
2779
2780 void
2781 metaslab_fini(metaslab_t *msp)
2782 {
2783         metaslab_group_t *mg = msp->ms_group;
2784         vdev_t *vd = mg->mg_vd;
2785         spa_t *spa = vd->vdev_spa;
2786
2787         metaslab_fini_flush_data(msp);
2788
2789         metaslab_group_remove(mg, msp);
2790
2791         mutex_enter(&msp->ms_lock);
2792         VERIFY(msp->ms_group == NULL);
2793
2794         /*
2795          * If this metaslab hasn't been through metaslab_sync_done() yet its
2796          * space hasn't been accounted for in its vdev and doesn't need to be
2797          * subtracted.
2798          */
2799         if (!msp->ms_new) {
2800                 metaslab_space_update(vd, mg->mg_class,
2801                     -metaslab_allocated_space(msp), 0, -msp->ms_size);
2802
2803         }
2804         space_map_close(msp->ms_sm);
2805         msp->ms_sm = NULL;
2806
2807         metaslab_unload(msp);
2808
2809         range_tree_destroy(msp->ms_allocatable);
2810         range_tree_destroy(msp->ms_freeing);
2811         range_tree_destroy(msp->ms_freed);
2812
2813         ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
2814             metaslab_unflushed_changes_memused(msp));
2815         spa->spa_unflushed_stats.sus_memused -=
2816             metaslab_unflushed_changes_memused(msp);
2817         range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
2818         range_tree_destroy(msp->ms_unflushed_allocs);
2819         range_tree_destroy(msp->ms_checkpointing);
2820         range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
2821         range_tree_destroy(msp->ms_unflushed_frees);
2822
2823         for (int t = 0; t < TXG_SIZE; t++) {
2824                 range_tree_destroy(msp->ms_allocating[t]);
2825         }
2826         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2827                 range_tree_destroy(msp->ms_defer[t]);
2828         }
2829         ASSERT0(msp->ms_deferspace);
2830
2831         for (int t = 0; t < TXG_SIZE; t++)
2832                 ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
2833
2834         range_tree_vacate(msp->ms_trim, NULL, NULL);
2835         range_tree_destroy(msp->ms_trim);
2836
2837         mutex_exit(&msp->ms_lock);
2838         cv_destroy(&msp->ms_load_cv);
2839         cv_destroy(&msp->ms_flush_cv);
2840         mutex_destroy(&msp->ms_lock);
2841         mutex_destroy(&msp->ms_sync_lock);
2842         ASSERT3U(msp->ms_allocator, ==, -1);
2843
2844         kmem_free(msp, sizeof (metaslab_t));
2845 }
2846
#define FRAGMENTATION_TABLE_SIZE        17

/*
 * Segment-size based fragmentation metric, one entry per power-of-two
 * segment size starting at 512B. Each metaslab derives its own
 * fragmentation value from this table: the space in each bucket of the
 * spacemap histogram is multiplied by the metric of the matching entry,
 * the products are summed over all buckets, and the sum is divided by
 * the total amount of free space in the metaslab (i.e. the total free
 * space in all buckets). A high result therefore means that most of the
 * free space is made up of small segments, and a low result means that
 * most of it sits in large segments. A 10% change in fragmentation
 * equates to approximately double the number of segments.
 *
 * The table treats 16MB segments as 0% fragmented. Testing has shown
 * that segments greater than or equal to 16MB do not suffer from drastic
 * performance problems, and the rest of the table is derived from that
 * value. Lookups are clamped to the last slot, which therefore stands
 * for every segment size of 32MB and up. Since the fragmentation value
 * is never stored on disk, these calculations can be changed in the
 * future.
 */
static const int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
        100,    /* 512B */
        100,    /* 1K   */
        98,     /* 2K   */
        95,     /* 4K   */
        90,     /* 8K   */
        80,     /* 16K  */
        70,     /* 32K  */
        60,     /* 64K  */
        50,     /* 128K */
        40,     /* 256K */
        30,     /* 512K */
        20,     /* 1M   */
        15,     /* 2M   */
        10,     /* 4M   */
        5,      /* 8M   */
        0,      /* 16M  */
        0       /* 32M and larger */
};
2886
2887 /*
2888  * Calculate the metaslab's fragmentation metric and set ms_fragmentation.
2889  * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not
2890  * been upgraded and does not support this metric. Otherwise, the return
2891  * value should be in the range [0, 100].
2892  */
2893 static void
2894 metaslab_set_fragmentation(metaslab_t *msp, boolean_t nodirty)
2895 {
2896         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2897         uint64_t fragmentation = 0;
2898         uint64_t total = 0;
2899         boolean_t feature_enabled = spa_feature_is_enabled(spa,
2900             SPA_FEATURE_SPACEMAP_HISTOGRAM);
2901
2902         if (!feature_enabled) {
2903                 msp->ms_fragmentation = ZFS_FRAG_INVALID;
2904                 return;
2905         }
2906
2907         /*
2908          * A null space map means that the entire metaslab is free
2909          * and thus is not fragmented.
2910          */
2911         if (msp->ms_sm == NULL) {
2912                 msp->ms_fragmentation = 0;
2913                 return;
2914         }
2915
2916         /*
2917          * If this metaslab's space map has not been upgraded, flag it
2918          * so that we upgrade next time we encounter it.
2919          */
2920         if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
2921                 uint64_t txg = spa_syncing_txg(spa);
2922                 vdev_t *vd = msp->ms_group->mg_vd;
2923
2924                 /*
2925                  * If we've reached the final dirty txg, then we must
2926                  * be shutting down the pool. We don't want to dirty
2927                  * any data past this point so skip setting the condense
2928                  * flag. We can retry this action the next time the pool
2929                  * is imported. We also skip marking this metaslab for
2930                  * condensing if the caller has explicitly set nodirty.
2931                  */
2932                 if (!nodirty &&
2933                     spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
2934                         msp->ms_condense_wanted = B_TRUE;
2935                         vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2936                         zfs_dbgmsg("txg %llu, requesting force condense: "
2937                             "ms_id %llu, vdev_id %llu", (u_longlong_t)txg,
2938                             (u_longlong_t)msp->ms_id,
2939                             (u_longlong_t)vd->vdev_id);
2940                 }
2941                 msp->ms_fragmentation = ZFS_FRAG_INVALID;
2942                 return;
2943         }
2944
2945         for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
2946                 uint64_t space = 0;
2947                 uint8_t shift = msp->ms_sm->sm_shift;
2948
2949                 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
2950                     FRAGMENTATION_TABLE_SIZE - 1);
2951
2952                 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
2953                         continue;
2954
2955                 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
2956                 total += space;
2957
2958                 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
2959                 fragmentation += space * zfs_frag_table[idx];
2960         }
2961
2962         if (total > 0)
2963                 fragmentation /= total;
2964         ASSERT3U(fragmentation, <=, 100);
2965
2966         msp->ms_fragmentation = fragmentation;
2967 }
2968
2969 /*
2970  * Compute a weight -- a selection preference value -- for the given metaslab.
2971  * This is based on the amount of free space, the level of fragmentation,
2972  * the LBA range, and whether the metaslab is loaded.
2973  */
2974 static uint64_t
2975 metaslab_space_weight(metaslab_t *msp)
2976 {
2977         metaslab_group_t *mg = msp->ms_group;
2978         vdev_t *vd = mg->mg_vd;
2979         uint64_t weight, space;
2980
2981         ASSERT(MUTEX_HELD(&msp->ms_lock));
2982
2983         /*
2984          * The baseline weight is the metaslab's free space.
2985          */
2986         space = msp->ms_size - metaslab_allocated_space(msp);
2987
2988         if (metaslab_fragmentation_factor_enabled &&
2989             msp->ms_fragmentation != ZFS_FRAG_INVALID) {
2990                 /*
2991                  * Use the fragmentation information to inversely scale
2992                  * down the baseline weight. We need to ensure that we
2993                  * don't exclude this metaslab completely when it's 100%
2994                  * fragmented. To avoid this we reduce the fragmented value
2995                  * by 1.
2996                  */
2997                 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
2998
2999                 /*
3000                  * If space < SPA_MINBLOCKSIZE, then we will not allocate from
3001                  * this metaslab again. The fragmentation metric may have
3002                  * decreased the space to something smaller than
3003                  * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
3004                  * so that we can consume any remaining space.
3005                  */
3006                 if (space > 0 && space < SPA_MINBLOCKSIZE)
3007                         space = SPA_MINBLOCKSIZE;
3008         }
3009         weight = space;
3010
3011         /*
3012          * Modern disks have uniform bit density and constant angular velocity.
3013          * Therefore, the outer recording zones are faster (higher bandwidth)
3014          * than the inner zones by the ratio of outer to inner track diameter,
3015          * which is typically around 2:1.  We account for this by assigning
3016          * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
3017          * In effect, this means that we'll select the metaslab with the most
3018          * free bandwidth rather than simply the one with the most free space.
3019          */
3020         if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
3021                 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
3022                 ASSERT(weight >= space && weight <= 2 * space);
3023         }
3024
3025         /*
3026          * If this metaslab is one we're actively using, adjust its
3027          * weight to make it preferable to any inactive metaslab so
3028          * we'll polish it off. If the fragmentation on this metaslab
3029          * has exceed our threshold, then don't mark it active.
3030          */
3031         if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
3032             msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
3033                 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
3034         }
3035
3036         WEIGHT_SET_SPACEBASED(weight);
3037         return (weight);
3038 }
3039
3040 /*
3041  * Return the weight of the specified metaslab, according to the segment-based
3042  * weighting algorithm. The metaslab must be loaded. This function can
3043  * be called within a sync pass since it relies only on the metaslab's
3044  * range tree which is always accurate when the metaslab is loaded.
3045  */
3046 static uint64_t
3047 metaslab_weight_from_range_tree(metaslab_t *msp)
3048 {
3049         uint64_t weight = 0;
3050         uint32_t segments = 0;
3051
3052         ASSERT(msp->ms_loaded);
3053
3054         for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
3055             i--) {
3056                 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
3057                 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
3058
3059                 segments <<= 1;
3060                 segments += msp->ms_allocatable->rt_histogram[i];
3061
3062                 /*
3063                  * The range tree provides more precision than the space map
3064                  * and must be downgraded so that all values fit within the
3065                  * space map's histogram. This allows us to compare loaded
3066                  * vs. unloaded metaslabs to determine which metaslab is
3067                  * considered "best".
3068                  */
3069                 if (i > max_idx)
3070                         continue;
3071
3072                 if (segments != 0) {
3073                         WEIGHT_SET_COUNT(weight, segments);
3074                         WEIGHT_SET_INDEX(weight, i);
3075                         WEIGHT_SET_ACTIVE(weight, 0);
3076                         break;
3077                 }
3078         }
3079         return (weight);
3080 }
3081
3082 /*
3083  * Calculate the weight based on the on-disk histogram. Should be applied
3084  * only to unloaded metaslabs  (i.e no incoming allocations) in-order to
3085  * give results consistent with the on-disk state
3086  */
3087 static uint64_t
3088 metaslab_weight_from_spacemap(metaslab_t *msp)
3089 {
3090         space_map_t *sm = msp->ms_sm;
3091         ASSERT(!msp->ms_loaded);
3092         ASSERT(sm != NULL);
3093         ASSERT3U(space_map_object(sm), !=, 0);
3094         ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
3095
3096         /*
3097          * Create a joint histogram from all the segments that have made
3098          * it to the metaslab's space map histogram, that are not yet
3099          * available for allocation because they are still in the freeing
3100          * pipeline (e.g. freeing, freed, and defer trees). Then subtract
3101          * these segments from the space map's histogram to get a more
3102          * accurate weight.
3103          */
3104         uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
3105         for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
3106                 deferspace_histogram[i] += msp->ms_synchist[i];
3107         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
3108                 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
3109                         deferspace_histogram[i] += msp->ms_deferhist[t][i];
3110                 }
3111         }
3112
3113         uint64_t weight = 0;
3114         for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
3115                 ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
3116                     deferspace_histogram[i]);
3117                 uint64_t count =
3118                     sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
3119                 if (count != 0) {
3120                         WEIGHT_SET_COUNT(weight, count);
3121                         WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
3122                         WEIGHT_SET_ACTIVE(weight, 0);
3123                         break;
3124                 }
3125         }
3126         return (weight);
3127 }
3128
3129 /*
3130  * Compute a segment-based weight for the specified metaslab. The weight
3131  * is determined by highest bucket in the histogram. The information
3132  * for the highest bucket is encoded into the weight value.
3133  */
3134 static uint64_t
3135 metaslab_segment_weight(metaslab_t *msp)
3136 {
3137         metaslab_group_t *mg = msp->ms_group;
3138         uint64_t weight = 0;
3139         uint8_t shift = mg->mg_vd->vdev_ashift;
3140
3141         ASSERT(MUTEX_HELD(&msp->ms_lock));
3142
3143         /*
3144          * The metaslab is completely free.
3145          */
3146         if (metaslab_allocated_space(msp) == 0) {
3147                 int idx = highbit64(msp->ms_size) - 1;
3148                 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
3149
3150                 if (idx < max_idx) {
3151                         WEIGHT_SET_COUNT(weight, 1ULL);
3152                         WEIGHT_SET_INDEX(weight, idx);
3153                 } else {
3154                         WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
3155                         WEIGHT_SET_INDEX(weight, max_idx);
3156                 }
3157                 WEIGHT_SET_ACTIVE(weight, 0);
3158                 ASSERT(!WEIGHT_IS_SPACEBASED(weight));
3159                 return (weight);
3160         }
3161
3162         ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
3163
3164         /*
3165          * If the metaslab is fully allocated then just make the weight 0.
3166          */
3167         if (metaslab_allocated_space(msp) == msp->ms_size)
3168                 return (0);
3169         /*
3170          * If the metaslab is already loaded, then use the range tree to
3171          * determine the weight. Otherwise, we rely on the space map information
3172          * to generate the weight.
3173          */
3174         if (msp->ms_loaded) {
3175                 weight = metaslab_weight_from_range_tree(msp);
3176         } else {
3177                 weight = metaslab_weight_from_spacemap(msp);
3178         }
3179
3180         /*
3181          * If the metaslab was active the last time we calculated its weight
3182          * then keep it active. We want to consume the entire region that
3183          * is associated with this weight.
3184          */
3185         if (msp->ms_activation_weight != 0 && weight != 0)
3186                 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
3187         return (weight);
3188 }
3189
3190 /*
3191  * Determine if we should attempt to allocate from this metaslab. If the
3192  * metaslab is loaded, then we can determine if the desired allocation
3193  * can be satisfied by looking at the size of the maximum free segment
3194  * on that metaslab. Otherwise, we make our decision based on the metaslab's
3195  * weight. For segment-based weighting we can determine the maximum
3196  * allocation based on the index encoded in its value. For space-based
3197  * weights we rely on the entire weight (excluding the weight-type bit).
3198  */
3199 static boolean_t
3200 metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard)
3201 {
3202         /*
3203          * This case will usually but not always get caught by the checks below;
3204          * metaslabs can be loaded by various means, including the trim and
3205          * initialize code. Once that happens, without this check they are
3206          * allocatable even before they finish their first txg sync.
3207          */
3208         if (unlikely(msp->ms_new))
3209                 return (B_FALSE);
3210
3211         /*
3212          * If the metaslab is loaded, ms_max_size is definitive and we can use
3213          * the fast check. If it's not, the ms_max_size is a lower bound (once
3214          * set), and we should use the fast check as long as we're not in
3215          * try_hard and it's been less than zfs_metaslab_max_size_cache_sec
3216          * seconds since the metaslab was unloaded.
3217          */
3218         if (msp->ms_loaded ||
3219             (msp->ms_max_size != 0 && !try_hard && gethrtime() <
3220             msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec)))
3221                 return (msp->ms_max_size >= asize);
3222
3223         boolean_t should_allocate;
3224         if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
3225                 /*
3226                  * The metaslab segment weight indicates segments in the
3227                  * range [2^i, 2^(i+1)), where i is the index in the weight.
3228                  * Since the asize might be in the middle of the range, we
3229                  * should attempt the allocation if asize < 2^(i+1).
3230                  */
3231                 should_allocate = (asize <
3232                     1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
3233         } else {
3234                 should_allocate = (asize <=
3235                     (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
3236         }
3237
3238         return (should_allocate);
3239 }
3240
3241 static uint64_t
3242 metaslab_weight(metaslab_t *msp, boolean_t nodirty)
3243 {
3244         vdev_t *vd = msp->ms_group->mg_vd;
3245         spa_t *spa = vd->vdev_spa;
3246         uint64_t weight;
3247
3248         ASSERT(MUTEX_HELD(&msp->ms_lock));
3249
3250         metaslab_set_fragmentation(msp, nodirty);
3251
3252         /*
3253          * Update the maximum size. If the metaslab is loaded, this will
3254          * ensure that we get an accurate maximum size if newly freed space
3255          * has been added back into the free tree. If the metaslab is
3256          * unloaded, we check if there's a larger free segment in the
3257          * unflushed frees. This is a lower bound on the largest allocatable
3258          * segment size. Coalescing of adjacent entries may reveal larger
3259          * allocatable segments, but we aren't aware of those until loading
3260          * the space map into a range tree.
3261          */
3262         if (msp->ms_loaded) {
3263                 msp->ms_max_size = metaslab_largest_allocatable(msp);
3264         } else {
3265                 msp->ms_max_size = MAX(msp->ms_max_size,
3266                     metaslab_largest_unflushed_free(msp));
3267         }
3268
3269         /*
3270          * Segment-based weighting requires space map histogram support.
3271          */
3272         if (zfs_metaslab_segment_weight_enabled &&
3273             spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
3274             (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
3275             sizeof (space_map_phys_t))) {
3276                 weight = metaslab_segment_weight(msp);
3277         } else {
3278                 weight = metaslab_space_weight(msp);
3279         }
3280         return (weight);
3281 }
3282
3283 void
3284 metaslab_recalculate_weight_and_sort(metaslab_t *msp)
3285 {
3286         ASSERT(MUTEX_HELD(&msp->ms_lock));
3287
3288         /* note: we preserve the mask (e.g. indication of primary, etc..) */
3289         uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
3290         metaslab_group_sort(msp->ms_group, msp,
3291             metaslab_weight(msp, B_FALSE) | was_active);
3292 }
3293
3294 static int
3295 metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
3296     int allocator, uint64_t activation_weight)
3297 {
3298         metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
3299         ASSERT(MUTEX_HELD(&msp->ms_lock));
3300
3301         /*
3302          * If we're activating for the claim code, we don't want to actually
3303          * set the metaslab up for a specific allocator.
3304          */
3305         if (activation_weight == METASLAB_WEIGHT_CLAIM) {
3306                 ASSERT0(msp->ms_activation_weight);
3307                 msp->ms_activation_weight = msp->ms_weight;
3308                 metaslab_group_sort(mg, msp, msp->ms_weight |
3309                     activation_weight);
3310                 return (0);
3311         }
3312
3313         metaslab_t **mspp = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
3314             &mga->mga_primary : &mga->mga_secondary);
3315
3316         mutex_enter(&mg->mg_lock);
3317         if (*mspp != NULL) {
3318                 mutex_exit(&mg->mg_lock);
3319                 return (EEXIST);
3320         }
3321
3322         *mspp = msp;
3323         ASSERT3S(msp->ms_allocator, ==, -1);
3324         msp->ms_allocator = allocator;
3325         msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
3326
3327         ASSERT0(msp->ms_activation_weight);
3328         msp->ms_activation_weight = msp->ms_weight;
3329         metaslab_group_sort_impl(mg, msp,
3330             msp->ms_weight | activation_weight);
3331         mutex_exit(&mg->mg_lock);
3332
3333         return (0);
3334 }
3335
3336 static int
3337 metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
3338 {
3339         ASSERT(MUTEX_HELD(&msp->ms_lock));
3340
3341         /*
3342          * The current metaslab is already activated for us so there
3343          * is nothing to do. Already activated though, doesn't mean
3344          * that this metaslab is activated for our allocator nor our
3345          * requested activation weight. The metaslab could have started
3346          * as an active one for our allocator but changed allocators
3347          * while we were waiting to grab its ms_lock or we stole it
3348          * [see find_valid_metaslab()]. This means that there is a
3349          * possibility of passivating a metaslab of another allocator
3350          * or from a different activation mask, from this thread.
3351          */
3352         if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
3353                 ASSERT(msp->ms_loaded);
3354                 return (0);
3355         }
3356
3357         int error = metaslab_load(msp);
3358         if (error != 0) {
3359                 metaslab_group_sort(msp->ms_group, msp, 0);
3360                 return (error);
3361         }
3362
3363         /*
3364          * When entering metaslab_load() we may have dropped the
3365          * ms_lock because we were loading this metaslab, or we
3366          * were waiting for another thread to load it for us. In
3367          * that scenario, we recheck the weight of the metaslab
3368          * to see if it was activated by another thread.
3369          *
3370          * If the metaslab was activated for another allocator or
3371          * it was activated with a different activation weight (e.g.
3372          * we wanted to make it a primary but it was activated as
3373          * secondary) we return error (EBUSY).
3374          *
3375          * If the metaslab was activated for the same allocator
3376          * and requested activation mask, skip activating it.
3377          */
3378         if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
3379                 if (msp->ms_allocator != allocator)
3380                         return (EBUSY);
3381
3382                 if ((msp->ms_weight & activation_weight) == 0)
3383                         return (SET_ERROR(EBUSY));
3384
3385                 EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY),
3386                     msp->ms_primary);
3387                 return (0);
3388         }
3389
3390         /*
3391          * If the metaslab has literally 0 space, it will have weight 0. In
3392          * that case, don't bother activating it. This can happen if the
3393          * metaslab had space during find_valid_metaslab, but another thread
3394          * loaded it and used all that space while we were waiting to grab the
3395          * lock.
3396          */
3397         if (msp->ms_weight == 0) {
3398                 ASSERT0(range_tree_space(msp->ms_allocatable));
3399                 return (SET_ERROR(ENOSPC));
3400         }
3401
3402         if ((error = metaslab_activate_allocator(msp->ms_group, msp,
3403             allocator, activation_weight)) != 0) {
3404                 return (error);
3405         }
3406
3407         ASSERT(msp->ms_loaded);
3408         ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
3409
3410         return (0);
3411 }
3412
3413 static void
3414 metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
3415     uint64_t weight)
3416 {
3417         ASSERT(MUTEX_HELD(&msp->ms_lock));
3418         ASSERT(msp->ms_loaded);
3419
3420         if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
3421                 metaslab_group_sort(mg, msp, weight);
3422                 return;
3423         }
3424
3425         mutex_enter(&mg->mg_lock);
3426         ASSERT3P(msp->ms_group, ==, mg);
3427         ASSERT3S(0, <=, msp->ms_allocator);
3428         ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
3429
3430         metaslab_group_allocator_t *mga = &mg->mg_allocator[msp->ms_allocator];
3431         if (msp->ms_primary) {
3432                 ASSERT3P(mga->mga_primary, ==, msp);
3433                 ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
3434                 mga->mga_primary = NULL;
3435         } else {
3436                 ASSERT3P(mga->mga_secondary, ==, msp);
3437                 ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
3438                 mga->mga_secondary = NULL;
3439         }
3440         msp->ms_allocator = -1;
3441         metaslab_group_sort_impl(mg, msp, weight);
3442         mutex_exit(&mg->mg_lock);
3443 }
3444
3445 static void
3446 metaslab_passivate(metaslab_t *msp, uint64_t weight)
3447 {
3448         uint64_t size __maybe_unused = weight & ~METASLAB_WEIGHT_TYPE;
3449
3450         /*
3451          * If size < SPA_MINBLOCKSIZE, then we will not allocate from
3452          * this metaslab again.  In that case, it had better be empty,
3453          * or we would be leaving space on the table.
3454          */
3455         ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) ||
3456             size >= SPA_MINBLOCKSIZE ||
3457             range_tree_space(msp->ms_allocatable) == 0);
3458         ASSERT0(weight & METASLAB_ACTIVE_MASK);
3459
3460         ASSERT(msp->ms_activation_weight != 0);
3461         msp->ms_activation_weight = 0;
3462         metaslab_passivate_allocator(msp->ms_group, msp, weight);
3463         ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK);
3464 }
3465
3466 /*
3467  * Segment-based metaslabs are activated once and remain active until
3468  * we either fail an allocation attempt (similar to space-based metaslabs)
3469  * or have exhausted the free space in zfs_metaslab_switch_threshold
3470  * buckets since the metaslab was activated. This function checks to see
3471  * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
3472  * metaslab and passivates it proactively. This will allow us to select a
3473  * metaslab with a larger contiguous region, if any, remaining within this
3474  * metaslab group. If we're in sync pass > 1, then we continue using this
3475  * metaslab so that we don't dirty more block and cause more sync passes.
3476  */
3477 static void
3478 metaslab_segment_may_passivate(metaslab_t *msp)
3479 {
3480         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3481
3482         if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
3483                 return;
3484
3485         /*
3486          * Since we are in the middle of a sync pass, the most accurate
3487          * information that is accessible to us is the in-core range tree
3488          * histogram; calculate the new weight based on that information.
3489          */
3490         uint64_t weight = metaslab_weight_from_range_tree(msp);
3491         int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
3492         int current_idx = WEIGHT_GET_INDEX(weight);
3493
3494         if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
3495                 metaslab_passivate(msp, weight);
3496 }
3497
3498 static void
3499 metaslab_preload(void *arg)
3500 {
3501         metaslab_t *msp = arg;
3502         metaslab_class_t *mc = msp->ms_group->mg_class;
3503         spa_t *spa = mc->mc_spa;
3504         fstrans_cookie_t cookie = spl_fstrans_mark();
3505
3506         ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
3507
3508         mutex_enter(&msp->ms_lock);
3509         (void) metaslab_load(msp);
3510         metaslab_set_selected_txg(msp, spa_syncing_txg(spa));
3511         mutex_exit(&msp->ms_lock);
3512         spl_fstrans_unmark(cookie);
3513 }
3514
3515 static void
3516 metaslab_group_preload(metaslab_group_t *mg)
3517 {
3518         spa_t *spa = mg->mg_vd->vdev_spa;
3519         metaslab_t *msp;
3520         avl_tree_t *t = &mg->mg_metaslab_tree;
3521         int m = 0;
3522
3523         if (spa_shutting_down(spa) || !metaslab_preload_enabled)
3524                 return;
3525
3526         mutex_enter(&mg->mg_lock);
3527
3528         /*
3529          * Load the next potential metaslabs
3530          */
3531         for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
3532                 ASSERT3P(msp->ms_group, ==, mg);
3533
3534                 /*
3535                  * We preload only the maximum number of metaslabs specified
3536                  * by metaslab_preload_limit. If a metaslab is being forced
3537                  * to condense then we preload it too. This will ensure
3538                  * that force condensing happens in the next txg.
3539                  */
3540                 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
3541                         continue;
3542                 }
3543
3544                 VERIFY(taskq_dispatch(spa->spa_metaslab_taskq, metaslab_preload,
3545                     msp, TQ_SLEEP | (m <= mg->mg_allocators ? TQ_FRONT : 0))
3546                     != TASKQID_INVALID);
3547         }
3548         mutex_exit(&mg->mg_lock);
3549 }
3550
3551 /*
3552  * Determine if the space map's on-disk footprint is past our tolerance for
3553  * inefficiency. We would like to use the following criteria to make our
3554  * decision:
3555  *
3556  * 1. Do not condense if the size of the space map object would dramatically
3557  *    increase as a result of writing out the free space range tree.
3558  *
3559  * 2. Condense if the on on-disk space map representation is at least
3560  *    zfs_condense_pct/100 times the size of the optimal representation
3561  *    (i.e. zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB).
3562  *
3563  * 3. Do not condense if the on-disk size of the space map does not actually
3564  *    decrease.
3565  *
3566  * Unfortunately, we cannot compute the on-disk size of the space map in this
3567  * context because we cannot accurately compute the effects of compression, etc.
3568  * Instead, we apply the heuristic described in the block comment for
3569  * zfs_metaslab_condense_block_threshold - we only condense if the space used
3570  * is greater than a threshold number of blocks.
3571  */
3572 static boolean_t
3573 metaslab_should_condense(metaslab_t *msp)
3574 {
3575         space_map_t *sm = msp->ms_sm;
3576         vdev_t *vd = msp->ms_group->mg_vd;
3577         uint64_t vdev_blocksize = 1ULL << vd->vdev_ashift;
3578
3579         ASSERT(MUTEX_HELD(&msp->ms_lock));
3580         ASSERT(msp->ms_loaded);
3581         ASSERT(sm != NULL);
3582         ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1);
3583
3584         /*
3585          * We always condense metaslabs that are empty and metaslabs for
3586          * which a condense request has been made.
3587          */
3588         if (range_tree_numsegs(msp->ms_allocatable) == 0 ||
3589             msp->ms_condense_wanted)
3590                 return (B_TRUE);
3591
3592         uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize);
3593         uint64_t object_size = space_map_length(sm);
3594         uint64_t optimal_size = space_map_estimate_optimal_size(sm,
3595             msp->ms_allocatable, SM_NO_VDEVID);
3596
3597         return (object_size >= (optimal_size * zfs_condense_pct / 100) &&
3598             object_size > zfs_metaslab_condense_block_threshold * record_size);
3599 }
3600
3601 /*
3602  * Condense the on-disk space map representation to its minimized form.
3603  * The minimized form consists of a small number of allocations followed
3604  * by the entries of the free range tree (ms_allocatable). The condensed
3605  * spacemap contains all the entries of previous TXGs (including those in
3606  * the pool-wide log spacemaps; thus this is effectively a superset of
3607  * metaslab_flush()), but this TXG's entries still need to be written.
3608  */
3609 static void
3610 metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
3611 {
3612         range_tree_t *condense_tree;
3613         space_map_t *sm = msp->ms_sm;
3614         uint64_t txg = dmu_tx_get_txg(tx);
3615         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3616
3617         ASSERT(MUTEX_HELD(&msp->ms_lock));
3618         ASSERT(msp->ms_loaded);
3619         ASSERT(msp->ms_sm != NULL);
3620
3621         /*
3622          * In order to condense the space map, we need to change it so it
3623          * only describes which segments are currently allocated and free.
3624          *
3625          * All the current free space resides in the ms_allocatable, all
3626          * the ms_defer trees, and all the ms_allocating trees. We ignore
3627          * ms_freed because it is empty because we're in sync pass 1. We
3628          * ignore ms_freeing because these changes are not yet reflected
3629          * in the spacemap (they will be written later this txg).
3630          *
3631          * So to truncate the space map to represent all the entries of
3632          * previous TXGs we do the following:
3633          *
3634          * 1] We create a range tree (condense tree) that is 100% empty.
3635          * 2] We add to it all segments found in the ms_defer trees
3636          *    as those segments are marked as free in the original space
3637          *    map. We do the same with the ms_allocating trees for the same
3638          *    reason. Adding these segments should be a relatively
3639          *    inexpensive operation since we expect these trees to have a
3640          *    small number of nodes.
3641          * 3] We vacate any unflushed allocs, since they are not frees we
3642          *    need to add to the condense tree. Then we vacate any
3643          *    unflushed frees as they should already be part of ms_allocatable.
3644          * 4] At this point, we would ideally like to add all segments
3645          *    in the ms_allocatable tree from the condense tree. This way
3646          *    we would write all the entries of the condense tree as the
3647          *    condensed space map, which would only contain freed
3648          *    segments with everything else assumed to be allocated.
3649          *
3650          *    Doing so can be prohibitively expensive as ms_allocatable can
3651          *    be large, and therefore computationally expensive to add to
3652          *    the condense_tree. Instead we first sync out an entry marking
3653          *    everything as allocated, then the condense_tree and then the
3654          *    ms_allocatable, in the condensed space map. While this is not
3655          *    optimal, it is typically close to optimal and more importantly
3656          *    much cheaper to compute.
3657          *
3658          * 5] Finally, as both of the unflushed trees were written to our
3659          *    new and condensed metaslab space map, we basically flushed
3660          *    all the unflushed changes to disk, thus we call
3661          *    metaslab_flush_update().
3662          */
3663         ASSERT3U(spa_sync_pass(spa), ==, 1);
3664         ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */
3665
3666         zfs_dbgmsg("condensing: txg %llu, msp[%llu] %px, vdev id %llu, "
3667             "spa %s, smp size %llu, segments %llu, forcing condense=%s",
3668             (u_longlong_t)txg, (u_longlong_t)msp->ms_id, msp,
3669             (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
3670             spa->spa_name, (u_longlong_t)space_map_length(msp->ms_sm),
3671             (u_longlong_t)range_tree_numsegs(msp->ms_allocatable),
3672             msp->ms_condense_wanted ? "TRUE" : "FALSE");
3673
3674         msp->ms_condense_wanted = B_FALSE;
3675
3676         range_seg_type_t type;
3677         uint64_t shift, start;
3678         type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp,
3679             &start, &shift);
3680
3681         condense_tree = range_tree_create(NULL, type, NULL, start, shift);
3682
3683         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
3684                 range_tree_walk(msp->ms_defer[t],
3685                     range_tree_add, condense_tree);
3686         }
3687
3688         for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
3689                 range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
3690                     range_tree_add, condense_tree);
3691         }
3692
3693         ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
3694             metaslab_unflushed_changes_memused(msp));
3695         spa->spa_unflushed_stats.sus_memused -=
3696             metaslab_unflushed_changes_memused(msp);
3697         range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
3698         range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
3699
3700         /*
3701          * We're about to drop the metaslab's lock thus allowing other
3702          * consumers to change it's content. Set the metaslab's ms_condensing
3703          * flag to ensure that allocations on this metaslab do not occur
3704          * while we're in the middle of committing it to disk. This is only
3705          * critical for ms_allocatable as all other range trees use per TXG
3706          * views of their content.
3707          */
3708         msp->ms_condensing = B_TRUE;
3709
3710         mutex_exit(&msp->ms_lock);
3711         uint64_t object = space_map_object(msp->ms_sm);
3712         space_map_truncate(sm,
3713             spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
3714             zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx);
3715
3716         /*
3717          * space_map_truncate() may have reallocated the spacemap object.
3718          * If so, update the vdev_ms_array.
3719          */
3720         if (space_map_object(msp->ms_sm) != object) {
3721                 object = space_map_object(msp->ms_sm);
3722                 dmu_write(spa->spa_meta_objset,
3723                     msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) *
3724                     msp->ms_id, sizeof (uint64_t), &object, tx);
3725         }
3726
3727         /*
3728          * Note:
3729          * When the log space map feature is enabled, each space map will
3730          * always have ALLOCS followed by FREES for each sync pass. This is
3731          * typically true even when the log space map feature is disabled,
3732          * except from the case where a metaslab goes through metaslab_sync()
3733          * and gets condensed. In that case the metaslab's space map will have
3734          * ALLOCS followed by FREES (due to condensing) followed by ALLOCS
3735          * followed by FREES (due to space_map_write() in metaslab_sync()) for
3736          * sync pass 1.
3737          */
3738         range_tree_t *tmp_tree = range_tree_create(NULL, type, NULL, start,
3739             shift);
3740         range_tree_add(tmp_tree, msp->ms_start, msp->ms_size);
3741         space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx);
3742         space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
3743         space_map_write(sm, condense_tree, SM_FREE, SM_NO_VDEVID, tx);
3744
3745         range_tree_vacate(condense_tree, NULL, NULL);
3746         range_tree_destroy(condense_tree);
3747         range_tree_vacate(tmp_tree, NULL, NULL);
3748         range_tree_destroy(tmp_tree);
3749         mutex_enter(&msp->ms_lock);
3750
3751         msp->ms_condensing = B_FALSE;
3752         metaslab_flush_update(msp, tx);
3753 }
3754
3755 static void
3756 metaslab_unflushed_add(metaslab_t *msp, dmu_tx_t *tx)
3757 {
3758         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3759         ASSERT(spa_syncing_log_sm(spa) != NULL);
3760         ASSERT(msp->ms_sm != NULL);
3761         ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
3762         ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
3763
3764         mutex_enter(&spa->spa_flushed_ms_lock);
3765         metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
3766         metaslab_set_unflushed_dirty(msp, B_TRUE);
3767         avl_add(&spa->spa_metaslabs_by_flushed, msp);
3768         mutex_exit(&spa->spa_flushed_ms_lock);
3769
3770         spa_log_sm_increment_current_mscount(spa);
3771         spa_log_summary_add_flushed_metaslab(spa, B_TRUE);
3772 }
3773
3774 void
3775 metaslab_unflushed_bump(metaslab_t *msp, dmu_tx_t *tx, boolean_t dirty)
3776 {
3777         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3778         ASSERT(spa_syncing_log_sm(spa) != NULL);
3779         ASSERT(msp->ms_sm != NULL);
3780         ASSERT(metaslab_unflushed_txg(msp) != 0);
3781         ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp);
3782         ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
3783         ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
3784
3785         VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa));
3786
3787         /* update metaslab's position in our flushing tree */
3788         uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp);
3789         boolean_t ms_prev_flushed_dirty = metaslab_unflushed_dirty(msp);
3790         mutex_enter(&spa->spa_flushed_ms_lock);
3791         avl_remove(&spa->spa_metaslabs_by_flushed, msp);
3792         metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
3793         metaslab_set_unflushed_dirty(msp, dirty);
3794         avl_add(&spa->spa_metaslabs_by_flushed, msp);
3795         mutex_exit(&spa->spa_flushed_ms_lock);
3796
3797         /* update metaslab counts of spa_log_sm_t nodes */
3798         spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg);
3799         spa_log_sm_increment_current_mscount(spa);
3800
3801         /* update log space map summary */
3802         spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg,
3803             ms_prev_flushed_dirty);
3804         spa_log_summary_add_flushed_metaslab(spa, dirty);
3805
3806         /* cleanup obsolete logs if any */
3807         spa_cleanup_old_sm_logs(spa, tx);
3808 }
3809
3810 /*
3811  * Called when the metaslab has been flushed (its own spacemap now reflects
3812  * all the contents of the pool-wide spacemap log). Updates the metaslab's
3813  * metadata and any pool-wide related log space map data (e.g. summary,
3814  * obsolete logs, etc..) to reflect that.
3815  */
3816 static void
3817 metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
3818 {
3819         metaslab_group_t *mg = msp->ms_group;
3820         spa_t *spa = mg->mg_vd->vdev_spa;
3821
3822         ASSERT(MUTEX_HELD(&msp->ms_lock));
3823
3824         ASSERT3U(spa_sync_pass(spa), ==, 1);
3825
3826         /*
3827          * Just because a metaslab got flushed, that doesn't mean that
3828          * it will pass through metaslab_sync_done(). Thus, make sure to
3829          * update ms_synced_length here in case it doesn't.
3830          */
3831         msp->ms_synced_length = space_map_length(msp->ms_sm);
3832
3833         /*
3834          * We may end up here from metaslab_condense() without the
3835          * feature being active. In that case this is a no-op.
3836          */
3837         if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP) ||
3838             metaslab_unflushed_txg(msp) == 0)
3839                 return;
3840
3841         metaslab_unflushed_bump(msp, tx, B_FALSE);
3842 }
3843
3844 boolean_t
3845 metaslab_flush(metaslab_t *msp, dmu_tx_t *tx)
3846 {
3847         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3848
3849         ASSERT(MUTEX_HELD(&msp->ms_lock));
3850         ASSERT3U(spa_sync_pass(spa), ==, 1);
3851         ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
3852
3853         ASSERT(msp->ms_sm != NULL);
3854         ASSERT(metaslab_unflushed_txg(msp) != 0);
3855         ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL);
3856
3857         /*
3858          * There is nothing wrong with flushing the same metaslab twice, as
3859          * this codepath should work on that case. However, the current
3860          * flushing scheme makes sure to avoid this situation as we would be
3861          * making all these calls without having anything meaningful to write
3862          * to disk. We assert this behavior here.
3863          */
3864         ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx));
3865
3866         /*
3867          * We can not flush while loading, because then we would
3868          * not load the ms_unflushed_{allocs,frees}.
3869          */
3870         if (msp->ms_loading)
3871                 return (B_FALSE);
3872
3873         metaslab_verify_space(msp, dmu_tx_get_txg(tx));
3874         metaslab_verify_weight_and_frag(msp);
3875
3876         /*
3877          * Metaslab condensing is effectively flushing. Therefore if the
3878          * metaslab can be condensed we can just condense it instead of
3879          * flushing it.
3880          *
3881          * Note that metaslab_condense() does call metaslab_flush_update()
3882          * so we can just return immediately after condensing. We also
3883          * don't need to care about setting ms_flushing or broadcasting
3884          * ms_flush_cv, even if we temporarily drop the ms_lock in
3885          * metaslab_condense(), as the metaslab is already loaded.
3886          */
3887         if (msp->ms_loaded && metaslab_should_condense(msp)) {
3888                 metaslab_group_t *mg = msp->ms_group;
3889
3890                 /*
3891                  * For all histogram operations below refer to the
3892                  * comments of metaslab_sync() where we follow a
3893                  * similar procedure.
3894                  */
3895                 metaslab_group_histogram_verify(mg);
3896                 metaslab_class_histogram_verify(mg->mg_class);
3897                 metaslab_group_histogram_remove(mg, msp);
3898
3899                 metaslab_condense(msp, tx);
3900
3901                 space_map_histogram_clear(msp->ms_sm);
3902                 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
3903                 ASSERT(range_tree_is_empty(msp->ms_freed));
3904                 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
3905                         space_map_histogram_add(msp->ms_sm,
3906                             msp->ms_defer[t], tx);
3907                 }
3908                 metaslab_aux_histograms_update(msp);
3909
3910                 metaslab_group_histogram_add(mg, msp);
3911                 metaslab_group_histogram_verify(mg);
3912                 metaslab_class_histogram_verify(mg->mg_class);
3913
3914                 metaslab_verify_space(msp, dmu_tx_get_txg(tx));
3915
3916                 /*
3917                  * Since we recreated the histogram (and potentially
3918                  * the ms_sm too while condensing) ensure that the
3919                  * weight is updated too because we are not guaranteed
3920                  * that this metaslab is dirty and will go through
3921                  * metaslab_sync_done().
3922                  */
3923                 metaslab_recalculate_weight_and_sort(msp);
3924                 return (B_TRUE);
3925         }
3926
3927         msp->ms_flushing = B_TRUE;
3928         uint64_t sm_len_before = space_map_length(msp->ms_sm);
3929
3930         mutex_exit(&msp->ms_lock);
3931         space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC,
3932             SM_NO_VDEVID, tx);
3933         space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE,
3934             SM_NO_VDEVID, tx);
3935         mutex_enter(&msp->ms_lock);
3936
3937         uint64_t sm_len_after = space_map_length(msp->ms_sm);
3938         if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) {
3939                 zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, "
3940                     "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, "
3941                     "appended %llu bytes", (u_longlong_t)dmu_tx_get_txg(tx),
3942                     spa_name(spa),
3943                     (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
3944                     (u_longlong_t)msp->ms_id,
3945                     (u_longlong_t)range_tree_space(msp->ms_unflushed_allocs),
3946                     (u_longlong_t)range_tree_space(msp->ms_unflushed_frees),
3947                     (u_longlong_t)(sm_len_after - sm_len_before));
3948         }
3949
3950         ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
3951             metaslab_unflushed_changes_memused(msp));
3952         spa->spa_unflushed_stats.sus_memused -=
3953             metaslab_unflushed_changes_memused(msp);
3954         range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
3955         range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
3956
3957         metaslab_verify_space(msp, dmu_tx_get_txg(tx));
3958         metaslab_verify_weight_and_frag(msp);
3959
3960         metaslab_flush_update(msp, tx);
3961
3962         metaslab_verify_space(msp, dmu_tx_get_txg(tx));
3963         metaslab_verify_weight_and_frag(msp);
3964
3965         msp->ms_flushing = B_FALSE;
3966         cv_broadcast(&msp->ms_flush_cv);
3967         return (B_TRUE);
3968 }
3969
/*
 * Write a metaslab to disk in the context of the specified transaction group.
 *
 * msp - metaslab to sync
 * txg - transaction group being synced; it selects which ms_allocating
 *       tree (txg & TXG_MASK) is written out
 *
 * Nothing is written for a brand new metaslab (ms_new), nor for a metaslab
 * that has no pending allocations, frees or checkpointed frees, unless a
 * wanted condense can still run. Otherwise the pending allocations and
 * frees are appended to the syncing log space map when one exists, or to
 * the metaslab's own space map when it does not; checkpointed frees go to
 * the vdev's checkpoint space map; the space map histogram is refreshed;
 * and ms_freeing is moved into ms_freed.
 *
 * The work is done inside a transaction assigned to txg, which is
 * committed before returning.
 */
void
metaslab_sync(metaslab_t *msp, uint64_t txg)
{
        metaslab_group_t *mg = msp->ms_group;
        vdev_t *vd = mg->mg_vd;
        spa_t *spa = vd->vdev_spa;
        objset_t *mos = spa_meta_objset(spa);
        range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
        dmu_tx_t *tx;

        ASSERT(!vd->vdev_ishole);

        /*
         * This metaslab has just been added so there's no work to do now.
         */
        if (msp->ms_new) {
                ASSERT0(range_tree_space(alloctree));
                ASSERT0(range_tree_space(msp->ms_freeing));
                ASSERT0(range_tree_space(msp->ms_freed));
                ASSERT0(range_tree_space(msp->ms_checkpointing));
                ASSERT0(range_tree_space(msp->ms_trim));
                return;
        }

        /*
         * Normally, we don't want to process a metaslab if there are no
         * allocations or frees to perform. However, if the metaslab is being
         * forced to condense, it's loaded and we're not beyond the final
         * dirty txg, we need to let it through. Not condensing beyond the
         * final dirty txg prevents an issue where metaslabs that need to be
         * condensed but were loaded for other reasons could cause a panic
         * here. By only checking the txg in that branch of the conditional,
         * we preserve the utility of the VERIFY statements in all other
         * cases.
         */
        if (range_tree_is_empty(alloctree) &&
            range_tree_is_empty(msp->ms_freeing) &&
            range_tree_is_empty(msp->ms_checkpointing) &&
            !(msp->ms_loaded && msp->ms_condense_wanted &&
            txg <= spa_final_dirty_txg(spa)))
                return;


        VERIFY3U(txg, <=, spa_final_dirty_txg(spa));

        /*
         * The only state that can actually be changing concurrently
         * with metaslab_sync() is the metaslab's ms_allocatable. No
         * other thread can be modifying this txg's alloc, freeing,
         * freed, or space_map_phys_t.  We drop ms_lock whenever we
         * could call into the DMU, because the DMU can call down to
         * us (e.g. via zio_free()) at any time.
         *
         * The spa_vdev_remove_thread() can be reading metaslab state
         * concurrently, and it is locked out by the ms_sync_lock.
         * Note that the ms_lock is insufficient for this, because it
         * is dropped by space_map_write().
         */
        tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

        /*
         * Generate a log space map if one doesn't exist already.
         */
        spa_generate_syncing_log_sm(spa, tx);

        /*
         * First write to this metaslab: allocate its space map object,
         * record the object number in the vdev's metaslab array and open
         * the space map.
         */
        if (msp->ms_sm == NULL) {
                uint64_t new_object = space_map_alloc(mos,
                    spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
                    zfs_metaslab_sm_blksz_with_log :
                    zfs_metaslab_sm_blksz_no_log, tx);
                VERIFY3U(new_object, !=, 0);

                dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
                    msp->ms_id, sizeof (uint64_t), &new_object, tx);

                VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
                    msp->ms_start, msp->ms_size, vd->vdev_ashift));
                ASSERT(msp->ms_sm != NULL);

                ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
                ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
                ASSERT0(metaslab_allocated_space(msp));
        }

        /*
         * There are checkpointed frees to write but the vdev has no
         * checkpoint space map yet, so create and open one.
         */
        if (!range_tree_is_empty(msp->ms_checkpointing) &&
            vd->vdev_checkpoint_sm == NULL) {
                ASSERT(spa_has_checkpoint(spa));

                uint64_t new_object = space_map_alloc(mos,
                    zfs_vdev_standard_sm_blksz, tx);
                VERIFY3U(new_object, !=, 0);

                VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
                    mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
                ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);

                /*
                 * We save the space map object as an entry in vdev_top_zap
                 * so it can be retrieved when the pool is reopened after an
                 * export or through zdb.
                 */
                VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
                    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
                    sizeof (new_object), 1, &new_object, tx));
        }

        /* ms_sync_lock is taken first and held until the very end. */
        mutex_enter(&msp->ms_sync_lock);
        mutex_enter(&msp->ms_lock);

        /*
         * Note: metaslab_condense() clears the space map's histogram.
         * Therefore we must verify and remove this histogram before
         * condensing.
         */
        metaslab_group_histogram_verify(mg);
        metaslab_class_histogram_verify(mg->mg_class);
        metaslab_group_histogram_remove(mg, msp);

        /*
         * Condensing is only attempted on the first sync pass and only
         * for a loaded metaslab.
         */
        if (spa->spa_sync_pass == 1 && msp->ms_loaded &&
            metaslab_should_condense(msp))
                metaslab_condense(msp, tx);

        /*
         * We'll be going to disk to sync our space accounting, thus we
         * drop the ms_lock during that time so allocations coming from
         * open-context (ZIL) for future TXGs do not block.
         */
        mutex_exit(&msp->ms_lock);
        space_map_t *log_sm = spa_syncing_log_sm(spa);
        if (log_sm != NULL) {
                ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
                /*
                 * Make sure the metaslab is recorded as dirty for this
                 * txg: add it if its unflushed txg is still 0, or bump
                 * it if it is currently recorded as clean.
                 */
                if (metaslab_unflushed_txg(msp) == 0)
                        metaslab_unflushed_add(msp, tx);
                else if (!metaslab_unflushed_dirty(msp))
                        metaslab_unflushed_bump(msp, tx, B_TRUE);

                /* The changes go to the log, tagged with the vdev id. */
                space_map_write(log_sm, alloctree, SM_ALLOC,
                    vd->vdev_id, tx);
                space_map_write(log_sm, msp->ms_freeing, SM_FREE,
                    vd->vdev_id, tx);
                mutex_enter(&msp->ms_lock);

                /*
                 * Fold this txg's changes into the unflushed trees. The
                 * memory charged to sus_memused is removed before the
                 * trees change and added back afterwards.
                 *
                 * NOTE(review): range_tree_remove_xor_add() presumably
                 * cancels ranges against the opposite unflushed tree and
                 * adds the remainder to the matching one -- confirm
                 * against its definition.
                 */
                ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
                    metaslab_unflushed_changes_memused(msp));
                spa->spa_unflushed_stats.sus_memused -=
                    metaslab_unflushed_changes_memused(msp);
                range_tree_remove_xor_add(alloctree,
                    msp->ms_unflushed_frees, msp->ms_unflushed_allocs);
                range_tree_remove_xor_add(msp->ms_freeing,
                    msp->ms_unflushed_allocs, msp->ms_unflushed_frees);
                spa->spa_unflushed_stats.sus_memused +=
                    metaslab_unflushed_changes_memused(msp);
        } else {
                ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));

                /* No log space map: write to the metaslab's own map. */
                space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
                    SM_NO_VDEVID, tx);
                space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
                    SM_NO_VDEVID, tx);
                mutex_enter(&msp->ms_lock);
        }

        /* Account for this txg's allocations and frees. */
        msp->ms_allocated_space += range_tree_space(alloctree);
        ASSERT3U(msp->ms_allocated_space, >=,
            range_tree_space(msp->ms_freeing));
        msp->ms_allocated_space -= range_tree_space(msp->ms_freeing);

        if (!range_tree_is_empty(msp->ms_checkpointing)) {
                ASSERT(spa_has_checkpoint(spa));
                ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);

                /*
                 * Since we are doing writes to disk and the ms_checkpointing
                 * tree won't be changing during that time, we drop the
                 * ms_lock while writing to the checkpoint space map, for the
                 * same reason mentioned above.
                 */
                mutex_exit(&msp->ms_lock);
                space_map_write(vd->vdev_checkpoint_sm,
                    msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
                mutex_enter(&msp->ms_lock);

                spa->spa_checkpoint_info.sci_dspace +=
                    range_tree_space(msp->ms_checkpointing);
                vd->vdev_stat.vs_checkpoint_space +=
                    range_tree_space(msp->ms_checkpointing);
                /*
                 * Only SM_FREE entries are written to the checkpoint
                 * space map here, hence the comparison against the
                 * negated allocated count.
                 */
                ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
                    -space_map_allocated(vd->vdev_checkpoint_sm));

                range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
        }

        if (msp->ms_loaded) {
                /*
                 * When the space map is loaded, we have an accurate
                 * histogram in the range tree. This gives us an opportunity
                 * to bring the space map's histogram up-to-date so we clear
                 * it first before updating it.
                 */
                space_map_histogram_clear(msp->ms_sm);
                space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);

                /*
                 * Since we've cleared the histogram we need to add back
                 * any free space that has already been processed, plus
                 * any deferred space. This allows the on-disk histogram
                 * to accurately reflect all free space even if some space
                 * is not yet available for allocation (i.e. deferred).
                 */
                space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);

                /*
                 * Add back any deferred free space that has not been
                 * added back into the in-core free tree yet. This will
                 * ensure that we don't end up with a space map histogram
                 * that is completely empty unless the metaslab is fully
                 * allocated.
                 */
                for (int t = 0; t < TXG_DEFER_SIZE; t++) {
                        space_map_histogram_add(msp->ms_sm,
                            msp->ms_defer[t], tx);
                }
        }

        /*
         * Always add the free space from this sync pass to the space
         * map histogram. We want to make sure that the on-disk histogram
         * accounts for all free space. If the space map is not loaded,
         * then we will lose some accuracy but will correct it the next
         * time we load the space map.
         */
        space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
        metaslab_aux_histograms_update(msp);

        /* Put the refreshed histogram back into the group's histogram. */
        metaslab_group_histogram_add(mg, msp);
        metaslab_group_histogram_verify(mg);
        metaslab_class_histogram_verify(mg->mg_class);

        /*
         * For sync pass 1, we avoid traversing this txg's free range tree
         * and instead will just swap the pointers for freeing and freed.
         * We can safely do this since the freed_tree is guaranteed to be
         * empty on the initial pass.
         *
         * Keep in mind that even if we are currently using a log spacemap
         * we want current frees to end up in the ms_allocatable (but not
         * get appended to the ms_sm) so their ranges can be reused as usual.
         */
        if (spa_sync_pass(spa) == 1) {
                range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
                ASSERT0(msp->ms_allocated_this_txg);
        } else {
                range_tree_vacate(msp->ms_freeing,
                    range_tree_add, msp->ms_freed);
        }
        /* This txg's allocations are on disk now; count and drop them. */
        msp->ms_allocated_this_txg += range_tree_space(alloctree);
        range_tree_vacate(alloctree, NULL, NULL);

        ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
        ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
            & TXG_MASK]));
        ASSERT0(range_tree_space(msp->ms_freeing));
        ASSERT0(range_tree_space(msp->ms_checkpointing));

        mutex_exit(&msp->ms_lock);

        /*
         * Verify that the space map object ID has been recorded in the
         * vdev_ms_array.
         */
        uint64_t object;
        VERIFY0(dmu_read(mos, vd->vdev_ms_array,
            msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0));
        VERIFY3U(object, ==, space_map_object(msp->ms_sm));

        mutex_exit(&msp->ms_sync_lock);
        dmu_tx_commit(tx);
}
4251
4252 static void
4253 metaslab_evict(metaslab_t *msp, uint64_t txg)
4254 {
4255         if (!msp->ms_loaded || msp->ms_disabled != 0)
4256                 return;
4257
4258         for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
4259                 VERIFY0(range_tree_space(
4260                     msp->ms_allocating[(txg + t) & TXG_MASK]));
4261         }
4262         if (msp->ms_allocator != -1)
4263                 metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK);
4264
4265         if (!metaslab_debug_unload)
4266                 metaslab_unload(msp);
4267 }
4268
/*
 * Called after a transaction group has completely synced to mark
 * all of the metaslab's free space as usable.
 *
 * msp - metaslab to finish syncing
 * txg - transaction group that has just synced; it selects the defer
 *       tree (txg % TXG_DEFER_SIZE) that is retired in this call
 *
 * The space freed in this txg (ms_freed) either replaces the retired
 * defer tree or, when deferring is not allowed, goes straight back to
 * ms_allocatable. Space accounting for the vdev and class is updated,
 * the metaslab's weight is recalculated, and a new metaslab (ms_new) is
 * marked ready. ms_lock is held for the whole call.
 */
void
metaslab_sync_done(metaslab_t *msp, uint64_t txg)
{
        metaslab_group_t *mg = msp->ms_group;
        vdev_t *vd = mg->mg_vd;
        spa_t *spa = vd->vdev_spa;
        range_tree_t **defer_tree;
        int64_t alloc_delta, defer_delta;
        boolean_t defer_allowed = B_TRUE;

        ASSERT(!vd->vdev_ishole);

        mutex_enter(&msp->ms_lock);

        if (msp->ms_new) {
                /* this is a new metaslab, add its capacity to the vdev */
                metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);

                /* there should be no allocations nor frees at this point */
                VERIFY0(msp->ms_allocated_this_txg);
                VERIFY0(range_tree_space(msp->ms_freed));
        }

        ASSERT0(range_tree_space(msp->ms_freeing));
        ASSERT0(range_tree_space(msp->ms_checkpointing));

        /* The defer tree slot that is retired for this txg. */
        defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];

        /*
         * Frees are not deferred when the normal class has no more free
         * space than the slop space, or when the vdev is being removed.
         */
        uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
            metaslab_class_get_alloc(spa_normal_class(spa));
        if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
                defer_allowed = B_FALSE;
        }

        defer_delta = 0;
        alloc_delta = msp->ms_allocated_this_txg -
            range_tree_space(msp->ms_freed);

        /*
         * The deferred space grows by what was freed in this txg (only
         * when deferring is allowed) and shrinks by the contents of the
         * defer tree that is being retired.
         */
        if (defer_allowed) {
                defer_delta = range_tree_space(msp->ms_freed) -
                    range_tree_space(*defer_tree);
        } else {
                defer_delta -= range_tree_space(*defer_tree);
        }
        metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
            defer_delta, 0);

        if (spa_syncing_log_sm(spa) == NULL) {
                /*
                 * If there's a metaslab_load() in progress and we don't have
                 * a log space map, it means that we probably wrote to the
                 * metaslab's space map. If this is the case, we need to
                 * make sure that we wait for the load to complete so that we
                 * have a consistent view at the in-core side of the metaslab.
                 */
                metaslab_load_wait(msp);
        } else {
                ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
        }

        /*
         * When auto-trimming is enabled, free ranges which are added to
         * ms_allocatable are also added to ms_trim.  The ms_trim tree is
         * periodically consumed by the vdev_autotrim_thread() which issues
         * trims for all ranges and then vacates the tree.  The ms_trim tree
         * can be discarded at any time with the sole consequence of recent
         * frees not being trimmed.
         */
        if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) {
                range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim);
                if (!defer_allowed) {
                        range_tree_walk(msp->ms_freed, range_tree_add,
                            msp->ms_trim);
                }
        } else {
                range_tree_vacate(msp->ms_trim, NULL, NULL);
        }

        /*
         * Move the frees from the defer_tree back to the free
         * range tree (if it's loaded). Swap the freed_tree and
         * the defer_tree -- this is safe to do because we've
         * just emptied out the defer_tree.
         */
        range_tree_vacate(*defer_tree,
            msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
        if (defer_allowed) {
                range_tree_swap(&msp->ms_freed, defer_tree);
        } else {
                /* Not deferring: this txg's frees become usable now. */
                range_tree_vacate(msp->ms_freed,
                    msp->ms_loaded ? range_tree_add : NULL,
                    msp->ms_allocatable);
        }

        msp->ms_synced_length = space_map_length(msp->ms_sm);

        msp->ms_deferspace += defer_delta;
        ASSERT3S(msp->ms_deferspace, >=, 0);
        ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
        if (msp->ms_deferspace != 0) {
                /*
                 * Keep syncing this metaslab until all deferred frees
                 * are back in circulation.
                 */
                vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
        }
        metaslab_aux_histograms_update_done(msp, defer_allowed);

        /* A new metaslab is now ready; count it in its group. */
        if (msp->ms_new) {
                msp->ms_new = B_FALSE;
                mutex_enter(&mg->mg_lock);
                mg->mg_ms_ready++;
                mutex_exit(&mg->mg_lock);
        }

        /*
         * Re-sort metaslab within its group now that we've adjusted
         * its allocatable space.
         */
        metaslab_recalculate_weight_and_sort(msp);

        ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
        ASSERT0(range_tree_space(msp->ms_freeing));
        ASSERT0(range_tree_space(msp->ms_freed));
        ASSERT0(range_tree_space(msp->ms_checkpointing));
        /* Reset the per-txg allocation counter for the next sync. */
        msp->ms_allocating_total -= msp->ms_allocated_this_txg;
        msp->ms_allocated_this_txg = 0;
        mutex_exit(&msp->ms_lock);
}
4402
4403 void
4404 metaslab_sync_reassess(metaslab_group_t *mg)
4405 {
4406         spa_t *spa = mg->mg_class->mc_spa;
4407
4408         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
4409         metaslab_group_alloc_update(mg);
4410         mg->mg_fragmentation = metaslab_group_fragmentation(mg);
4411
4412         /*
4413          * Preload the next potential metaslabs but only on active
4414          * metaslab groups. We can get into a state where the metaslab
4415          * is no longer active since we dirty metaslabs as we remove a
4416          * a device, thus potentially making the metaslab group eligible
4417          * for preloading.
4418          */
4419         if (mg->mg_activation_count > 0) {
4420                 metaslab_group_preload(mg);
4421         }
4422         spa_config_exit(spa, SCL_ALLOC, FTAG);
4423 }
4424
4425 /*
4426  * When writing a ditto block (i.e. more than one DVA for a given BP) on
4427  * the same vdev as an existing DVA of this BP, then try to allocate it
4428  * on a different metaslab than existing DVAs (i.e. a unique metaslab).
4429  */
4430 static boolean_t
4431 metaslab_is_unique(metaslab_t *msp, dva_t *dva)
4432 {
4433         uint64_t dva_ms_id;
4434
4435         if (DVA_GET_ASIZE(dva) == 0)
4436                 return (B_TRUE);
4437
4438         if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
4439                 return (B_TRUE);
4440
4441         dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift;
4442
4443         return (msp->ms_id != dva_ms_id);
4444 }
4445
4446 /*
4447  * ==========================================================================
4448  * Metaslab allocation tracing facility
4449  * ==========================================================================
4450  */
4451
4452 /*
4453  * Add an allocation trace element to the allocation tracing list.
4454  */
4455 static void
4456 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
4457     metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
4458     int allocator)
4459 {
4460         metaslab_alloc_trace_t *mat;
4461
4462         if (!metaslab_trace_enabled)
4463                 return;
4464
4465         /*
4466          * When the tracing list reaches its maximum we remove
4467          * the second element in the list before adding a new one.
4468          * By removing the second element we preserve the original
4469          * entry as a clue to what allocations steps have already been
4470          * performed.
4471          */
4472         if (zal->zal_size == metaslab_trace_max_entries) {
4473                 metaslab_alloc_trace_t *mat_next;
4474 #ifdef ZFS_DEBUG
4475                 panic("too many entries in allocation list");
4476 #endif
4477                 METASLABSTAT_BUMP(metaslabstat_trace_over_limit);
4478                 zal->zal_size--;
4479                 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
4480                 list_remove(&zal->zal_list, mat_next);
4481                 kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
4482         }
4483
4484         mat = kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
4485         list_link_init(&mat->mat_list_node);
4486         mat->mat_mg = mg;
4487         mat->mat_msp = msp;
4488         mat->mat_size = psize;
4489         mat->mat_dva_id = dva_id;
4490         mat->mat_offset = offset;
4491         mat->mat_weight = 0;
4492         mat->mat_allocator = allocator;
4493
4494         if (msp != NULL)
4495                 mat->mat_weight = msp->ms_weight;
4496
4497         /*
4498          * The list is part of the zio so locking is not required. Only
4499          * a single thread will perform allocations for a given zio.
4500          */
4501         list_insert_tail(&zal->zal_list, mat);
4502         zal->zal_size++;
4503
4504         ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
4505 }
4506
4507 void
4508 metaslab_trace_init(zio_alloc_list_t *zal)
4509 {
4510         list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
4511             offsetof(metaslab_alloc_trace_t, mat_list_node));
4512         zal->zal_size = 0;
4513 }
4514
4515 void
4516 metaslab_trace_fini(zio_alloc_list_t *zal)
4517 {
4518         metaslab_alloc_trace_t *mat;
4519
4520         while ((mat = list_remove_head(&zal->zal_list)) != NULL)
4521                 kmem_cache_free(metaslab_alloc_trace_cache, mat);
4522         list_destroy(&zal->zal_list);
4523         zal->zal_size = 0;
4524 }
4525
4526 /*
4527  * ==========================================================================
4528  * Metaslab block operations
4529  * ==========================================================================
4530  */
4531
4532 static void
4533 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, const void *tag,
4534     int flags, int allocator)
4535 {
4536         if (!(flags & METASLAB_ASYNC_ALLOC) ||
4537             (flags & METASLAB_DONT_THROTTLE))
4538                 return;
4539
4540         metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
4541         if (!mg->mg_class->mc_alloc_throttle_enabled)
4542                 return;
4543
4544         metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
4545         (void) zfs_refcount_add(&mga->mga_alloc_queue_depth, tag);
4546 }
4547
4548 static void
4549 metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
4550 {
4551         metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
4552         metaslab_class_allocator_t *mca =
4553             &mg->mg_class->mc_allocator[allocator];
4554         uint64_t max = mg->mg_max_alloc_queue_depth;
4555         uint64_t cur = mga->mga_cur_max_alloc_queue_depth;
4556         while (cur < max) {
4557                 if (atomic_cas_64(&mga->mga_cur_max_alloc_queue_depth,
4558                     cur, cur + 1) == cur) {
4559                         atomic_inc_64(&mca->mca_alloc_max_slots);
4560                         return;
4561                 }
4562                 cur = mga->mga_cur_max_alloc_queue_depth;
4563         }
4564 }
4565
4566 void
4567 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, const void *tag,
4568     int flags, int allocator, boolean_t io_complete)
4569 {
4570         if (!(flags & METASLAB_ASYNC_ALLOC) ||
4571             (flags & METASLAB_DONT_THROTTLE))
4572                 return;
4573
4574         metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
4575         if (!mg->mg_class->mc_alloc_throttle_enabled)
4576                 return;
4577
4578         metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
4579         (void) zfs_refcount_remove(&mga->mga_alloc_queue_depth, tag);
4580         if (io_complete)
4581                 metaslab_group_increment_qdepth(mg, allocator);
4582 }
4583
4584 void
4585 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, const void *tag,
4586     int allocator)
4587 {
4588 #ifdef ZFS_DEBUG
4589         const dva_t *dva = bp->blk_dva;
4590         int ndvas = BP_GET_NDVAS(bp);
4591
4592         for (int d = 0; d < ndvas; d++) {
4593                 uint64_t vdev = DVA_GET_VDEV(&dva[d]);
4594                 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
4595                 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
4596                 VERIFY(zfs_refcount_not_held(&mga->mga_alloc_queue_depth, tag));
4597         }
4598 #endif
4599 }
4600
4601 static uint64_t
4602 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
4603 {
4604         uint64_t start;
4605         range_tree_t *rt = msp->ms_allocatable;
4606         metaslab_class_t *mc = msp->ms_group->mg_class;
4607
4608         ASSERT(MUTEX_HELD(&msp->ms_lock));
4609         VERIFY(!msp->ms_condensing);
4610         VERIFY0(msp->ms_disabled);
4611
4612         start = mc->mc_ops->msop_alloc(msp, size);
4613         if (start != -1ULL) {
4614                 metaslab_group_t *mg = msp->ms_group;
4615                 vdev_t *vd = mg->mg_vd;
4616
4617                 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
4618                 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
4619                 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
4620                 range_tree_remove(rt, start, size);
4621                 range_tree_clear(msp->ms_trim, start, size);
4622
4623                 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
4624                         vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
4625
4626                 range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
4627                 msp->ms_allocating_total += size;
4628
4629                 /* Track the last successful allocation */
4630                 msp->ms_alloc_txg = txg;
4631                 metaslab_verify_space(msp, txg);
4632         }
4633
4634         /*
4635          * Now that we've attempted the allocation we need to update the
4636          * metaslab's maximum block size since it may have changed.
4637          */
4638         msp->ms_max_size = metaslab_largest_allocatable(msp);
4639         return (start);
4640 }
4641
4642 /*
4643  * Find the metaslab with the highest weight that is less than what we've
4644  * already tried.  In the common case, this means that we will examine each
4645  * metaslab at most once. Note that concurrent callers could reorder metaslabs
4646  * by activation/passivation once we have dropped the mg_lock. If a metaslab is
4647  * activated by another thread, and we fail to allocate from the metaslab we
4648  * have selected, we may not try the newly-activated metaslab, and instead
4649  * activate another metaslab.  This is not optimal, but generally does not cause
4650  * any problems (a possible exception being if every metaslab is completely full
4651  * except for the newly-activated metaslab which we fail to examine).
4652  */
4653 static metaslab_t *
4654 find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
4655     dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
4656     boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search,
4657     boolean_t *was_active)
4658 {
4659         avl_index_t idx;
4660         avl_tree_t *t = &mg->mg_metaslab_tree;
4661         metaslab_t *msp = avl_find(t, search, &idx);
4662         if (msp == NULL)
4663                 msp = avl_nearest(t, idx, AVL_AFTER);
4664
4665         uint_t tries = 0;
4666         for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
4667                 int i;
4668
4669                 if (!try_hard && tries > zfs_metaslab_find_max_tries) {
4670                         METASLABSTAT_BUMP(metaslabstat_too_many_tries);
4671                         return (NULL);
4672                 }
4673                 tries++;
4674
4675                 if (!metaslab_should_allocate(msp, asize, try_hard)) {
4676                         metaslab_trace_add(zal, mg, msp, asize, d,
4677                             TRACE_TOO_SMALL, allocator);
4678                         continue;
4679                 }
4680
4681                 /*
4682                  * If the selected metaslab is condensing or disabled,
4683                  * skip it.
4684                  */
4685                 if (msp->ms_condensing || msp->ms_disabled > 0)
4686                         continue;
4687
4688                 *was_active = msp->ms_allocator != -1;
4689                 /*
4690                  * If we're activating as primary, this is our first allocation
4691                  * from this disk, so we don't need to check how close we are.
4692                  * If the metaslab under consideration was already active,
4693                  * we're getting desperate enough to steal another allocator's
4694                  * metaslab, so we still don't care about distances.
4695                  */
4696                 if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
4697                         break;
4698
4699                 for (i = 0; i < d; i++) {
4700                         if (want_unique &&
4701                             !metaslab_is_unique(msp, &dva[i]))
4702                                 break;  /* try another metaslab */
4703                 }
4704                 if (i == d)
4705                         break;
4706         }
4707
4708         if (msp != NULL) {
4709                 search->ms_weight = msp->ms_weight;
4710                 search->ms_start = msp->ms_start + 1;
4711                 search->ms_allocator = msp->ms_allocator;
4712                 search->ms_primary = msp->ms_primary;
4713         }
4714         return (msp);
4715 }
4716
4717 static void
4718 metaslab_active_mask_verify(metaslab_t *msp)
4719 {
4720         ASSERT(MUTEX_HELD(&msp->ms_lock));
4721
4722         if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
4723                 return;
4724
4725         if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0)
4726                 return;
4727
4728         if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) {
4729                 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
4730                 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
4731                 VERIFY3S(msp->ms_allocator, !=, -1);
4732                 VERIFY(msp->ms_primary);
4733                 return;
4734         }
4735
4736         if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) {
4737                 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
4738                 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
4739                 VERIFY3S(msp->ms_allocator, !=, -1);
4740                 VERIFY(!msp->ms_primary);
4741                 return;
4742         }
4743
4744         if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
4745                 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
4746                 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
4747                 VERIFY3S(msp->ms_allocator, ==, -1);
4748                 return;
4749         }
4750 }
4751
/*
 * Allocate asize bytes in txg from one of the metaslabs of group mg.
 *
 * The activation weight used for this attempt is derived from how many of
 * the first d entries of dva already reside on this group's vdev: none
 * selects METASLAB_WEIGHT_PRIMARY, one selects METASLAB_WEIGHT_SECONDARY,
 * and two or more select METASLAB_WEIGHT_CLAIM.
 *
 * Each pass of the main loop picks a candidate metaslab (the allocator's
 * current primary or secondary if one is set and matches the activation
 * weight, otherwise whatever find_valid_metaslab() returns), re-validates
 * it under ms_lock, activates it and calls metaslab_block_alloc().  When
 * the allocation fails the metaslab is given a new weight -- and is
 * passivated if this call activated it -- and the loop moves on.
 *
 * Returns the allocated offset, or -1ULL once no candidate metaslab is
 * left.  Allocation attempts and several of the reasons for skipping a
 * metaslab are recorded in zal via metaslab_trace_add().
 */
static uint64_t
metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
    uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
    int allocator, boolean_t try_hard)
{
        metaslab_t *msp = NULL;
        uint64_t offset = -1ULL;

        /*
         * Step the activation weight down once for each earlier DVA that
         * is already on this vdev: PRIMARY -> SECONDARY -> CLAIM.
         */
        uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY;
        for (int i = 0; i < d; i++) {
                if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
                    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
                        activation_weight = METASLAB_WEIGHT_SECONDARY;
                } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
                    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
                        activation_weight = METASLAB_WEIGHT_CLAIM;
                        break;
                }
        }

        /*
         * If we don't have enough metaslabs active to fill the entire array, we
         * just use the 0th slot.
         */
        if (mg->mg_ms_ready < mg->mg_allocators * 3)
                allocator = 0;
        metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];

        ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);

        /*
         * search is a scratch metaslab_t that serves only as the lookup
         * key for find_valid_metaslab(); it starts at the maximum weight.
         * It is freed on both exits from this function.
         */
        metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
        search->ms_weight = UINT64_MAX;
        search->ms_start = 0;
        /*
         * At the end of the metaslab tree are the already-active metaslabs,
         * first the primaries, then the secondaries. When we resume searching
         * through the tree, we need to consider ms_allocator and ms_primary so
         * we start in the location right after where we left off, and don't
         * accidentally loop forever considering the same metaslabs.
         */
        search->ms_allocator = -1;
        search->ms_primary = B_TRUE;
        for (;;) {
                boolean_t was_active = B_FALSE;

                mutex_enter(&mg->mg_lock);

                if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
                    mga->mga_primary != NULL) {
                        msp = mga->mga_primary;

                        /*
                         * Even though we don't hold the ms_lock for the
                         * primary metaslab, those fields should not
                         * change while we hold the mg_lock. Thus it is
                         * safe to make assertions on them.
                         */
                        ASSERT(msp->ms_primary);
                        ASSERT3S(msp->ms_allocator, ==, allocator);
                        ASSERT(msp->ms_loaded);

                        was_active = B_TRUE;
                        ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
                } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
                    mga->mga_secondary != NULL) {
                        msp = mga->mga_secondary;

                        /*
                         * See comment above about the similar assertions
                         * for the primary metaslab.
                         */
                        ASSERT(!msp->ms_primary);
                        ASSERT3S(msp->ms_allocator, ==, allocator);
                        ASSERT(msp->ms_loaded);

                        was_active = B_TRUE;
                        ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
                } else {
                        msp = find_valid_metaslab(mg, activation_weight, dva, d,
                            want_unique, asize, allocator, try_hard, zal,
                            search, &was_active);
                }

                mutex_exit(&mg->mg_lock);
                /* No candidate left in this group: give up. */
                if (msp == NULL) {
                        kmem_free(search, sizeof (*search));
                        return (-1ULL);
                }
                mutex_enter(&msp->ms_lock);

                metaslab_active_mask_verify(msp);

                /*
                 * This code is compiled out because of issues with
                 * tracepoints in non-gpl kernel modules.
                 */
#if 0
                DTRACE_PROBE3(ms__activation__attempt,
                    metaslab_t *, msp, uint64_t, activation_weight,
                    boolean_t, was_active);
#endif

                /*
                 * Ensure that the metaslab we have selected is still
                 * capable of handling our request. It's possible that
                 * another thread may have changed the weight while we
                 * were blocked on the metaslab lock. We check the
                 * active status first to see if we need to select
                 * a new metaslab.
                 */
                if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
                        ASSERT3S(msp->ms_allocator, ==, -1);
                        mutex_exit(&msp->ms_lock);
                        continue;
                }

                /*
                 * If the metaslab was activated for another allocator
                 * while we were waiting in the ms_lock above, or it's
                 * a primary and we're seeking a secondary (or vice versa),
                 * we go back and select a new metaslab.
                 */
                if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
                    (msp->ms_allocator != -1) &&
                    (msp->ms_allocator != allocator || ((activation_weight ==
                    METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
                        ASSERT(msp->ms_loaded);
                        ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) ||
                            msp->ms_allocator != -1);
                        mutex_exit(&msp->ms_lock);
                        continue;
                }

                /*
                 * This metaslab was used for claiming regions allocated
                 * by the ZIL during pool import. Once these regions are
                 * claimed we don't need to keep the CLAIM bit set
                 * anymore. Passivate this metaslab to zero its activation
                 * mask.
                 */
                if (msp->ms_weight & METASLAB_WEIGHT_CLAIM &&
                    activation_weight != METASLAB_WEIGHT_CLAIM) {
                        ASSERT(msp->ms_loaded);
                        ASSERT3S(msp->ms_allocator, ==, -1);
                        metaslab_passivate(msp, msp->ms_weight &
                            ~METASLAB_WEIGHT_CLAIM);
                        mutex_exit(&msp->ms_lock);
                        continue;
                }

                metaslab_set_selected_txg(msp, txg);

                int activation_error =
                    metaslab_activate(msp, allocator, activation_weight);
                metaslab_active_mask_verify(msp);

                /*
                 * If the metaslab was activated by another thread for
                 * another allocator or activation_weight (EBUSY), or it
                 * failed because another metaslab was assigned as primary
                 * for this allocator (EEXIST) we continue using this
                 * metaslab for our allocation, rather than going on to a
                 * worse metaslab (we waited for that metaslab to be loaded
                 * after all).
                 *
                 * If the activation failed due to an I/O error or ENOSPC we
                 * skip to the next metaslab.
                 *
                 * "activated" records whether this call performed the
                 * activation; only then do the paths below passivate the
                 * metaslab again.
                 */
                boolean_t activated;
                if (activation_error == 0) {
                        activated = B_TRUE;
                } else if (activation_error == EBUSY ||
                    activation_error == EEXIST) {
                        activated = B_FALSE;
                } else {
                        mutex_exit(&msp->ms_lock);
                        continue;
                }
                ASSERT(msp->ms_loaded);

                /*
                 * Now that we have the lock, recheck to see if we should
                 * continue to use this metaslab for this allocation. The
                 * metaslab is now loaded so metaslab_should_allocate()
                 * can accurately determine if the allocation attempt should
                 * proceed.
                 */
                if (!metaslab_should_allocate(msp, asize, try_hard)) {
                        /* Passivate this metaslab and select a new one. */
                        metaslab_trace_add(zal, mg, msp, asize, d,
                            TRACE_TOO_SMALL, allocator);
                        goto next;
                }

                /*
                 * If this metaslab is currently condensing then pick again
                 * as we can't manipulate this metaslab until it's committed
                 * to disk. If this metaslab is being initialized, we shouldn't
                 * allocate from it since the allocated region might be
                 * overwritten after allocation.
                 */
                if (msp->ms_condensing) {
                        metaslab_trace_add(zal, mg, msp, asize, d,
                            TRACE_CONDENSING, allocator);
                        if (activated) {
                                metaslab_passivate(msp, msp->ms_weight &
                                    ~METASLAB_ACTIVE_MASK);
                        }
                        mutex_exit(&msp->ms_lock);
                        continue;
                } else if (msp->ms_disabled > 0) {
                        metaslab_trace_add(zal, mg, msp, asize, d,
                            TRACE_DISABLED, allocator);
                        if (activated) {
                                metaslab_passivate(msp, msp->ms_weight &
                                    ~METASLAB_ACTIVE_MASK);
                        }
                        mutex_exit(&msp->ms_lock);
                        continue;
                }

                /* The trace entry records the offset, or -1ULL on failure. */
                offset = metaslab_block_alloc(msp, asize, txg);
                metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);

                if (offset != -1ULL) {
                        /* Proactively passivate the metaslab, if needed */
                        if (activated)
                                metaslab_segment_may_passivate(msp);
                        break;
                }
next:
                ASSERT(msp->ms_loaded);

                /*
                 * This code is compiled out because of issues with
                 * tracepoints in non-gpl kernel modules.
                 */
#if 0
                DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp,
                    uint64_t, asize);
#endif

                /*
                 * We were unable to allocate from this metaslab so determine
                 * a new weight for this metaslab. Now that we have loaded
                 * the metaslab we can provide a better hint to the metaslab
                 * selector.
                 *
                 * For space-based metaslabs, we use the maximum block size.
                 * This information is only available when the metaslab
                 * is loaded and is more accurate than the generic free
                 * space weight that was calculated by metaslab_weight().
                 * This information allows us to quickly compare the maximum
                 * available allocation in the metaslab to the allocation
                 * size being requested.
                 *
                 * For segment-based metaslabs, determine the new weight
                 * based on the highest bucket in the range tree. We
                 * explicitly use the loaded segment weight (i.e. the range
                 * tree histogram) since it contains the space that is
                 * currently available for allocation and is accurate
                 * even within a sync pass.
                 */
                uint64_t weight;
                if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
                        weight = metaslab_largest_allocatable(msp);
                        WEIGHT_SET_SPACEBASED(weight);
                } else {
                        weight = metaslab_weight_from_range_tree(msp);
                }

                if (activated) {
                        metaslab_passivate(msp, weight);
                } else {
                        /*
                         * For the case where we use the metaslab that is
                         * active for another allocator we want to make
                         * sure that we retain the activation mask.
                         *
                         * Note that we could attempt to use something like
                         * metaslab_recalculate_weight_and_sort() that
                         * retains the activation mask here. That function
                         * uses metaslab_weight() to set the weight though
                         * which is not as accurate as the calculations
                         * above.
                         */
                        weight |= msp->ms_weight & METASLAB_ACTIVE_MASK;
                        metaslab_group_sort(mg, msp, weight);
                }
                metaslab_active_mask_verify(msp);

                /*
                 * We have just failed an allocation attempt, check
                 * that metaslab_should_allocate() agrees. Otherwise,
                 * we may end up in an infinite loop retrying the same
                 * metaslab.
                 */
                ASSERT(!metaslab_should_allocate(msp, asize, try_hard));

                mutex_exit(&msp->ms_lock);
        }
        /*
         * The loop is left only through the break after a successful
         * allocation, with the ms_lock of that metaslab still held.
         */
        mutex_exit(&msp->ms_lock);
        kmem_free(search, sizeof (*search));
        return (offset);
}
5057
5058 static uint64_t
5059 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
5060     uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
5061     int allocator, boolean_t try_hard)
5062 {
5063         uint64_t offset;
5064         ASSERT(mg->mg_initialized);
5065
5066         offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique,
5067             dva, d, allocator, try_hard);
5068
5069         mutex_enter(&mg->mg_lock);
5070         if (offset == -1ULL) {
5071                 mg->mg_failed_allocations++;
5072                 metaslab_trace_add(zal, mg, NULL, asize, d,
5073                     TRACE_GROUP_FAILURE, allocator);
5074                 if (asize == SPA_GANGBLOCKSIZE) {
5075                         /*
5076                          * This metaslab group was unable to allocate
5077                          * the minimum gang block size so it must be out of
5078                          * space. We must notify the allocation throttle
5079                          * to start skipping allocation attempts to this
5080                          * metaslab group until more space becomes available.
5081                          * Note: this failure cannot be caused by the
5082                          * allocation throttle since the allocation throttle
5083                          * is only responsible for skipping devices and
5084                          * not failing block allocations.
5085                          */
5086                         mg->mg_no_free_space = B_TRUE;
5087                 }
5088         }
5089         mg->mg_allocations++;
5090         mutex_exit(&mg->mg_lock);
5091         return (offset);
5092 }
5093
5094 /*
5095  * Allocate a block for the specified i/o.
5096  */
5097 int
5098 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
5099     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
5100     zio_alloc_list_t *zal, int allocator)
5101 {
5102         metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
5103         metaslab_group_t *mg, *rotor;
5104         vdev_t *vd;
5105         boolean_t try_hard = B_FALSE;
5106
5107         ASSERT(!DVA_IS_VALID(&dva[d]));
5108
5109         /*
5110          * For testing, make some blocks above a certain size be gang blocks.
5111          * This will result in more split blocks when using device removal,
5112          * and a large number of split blocks coupled with ztest-induced
5113          * damage can result in extremely long reconstruction times.  This
5114          * will also test spilling from special to normal.
5115          */
5116         if (psize >= metaslab_force_ganging &&
5117             metaslab_force_ganging_pct > 0 &&
5118             (random_in_range(100) < MIN(metaslab_force_ganging_pct, 100))) {
5119                 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
5120                     allocator);
5121                 return (SET_ERROR(ENOSPC));
5122         }
5123
5124         /*
5125          * Start at the rotor and loop through all mgs until we find something.
5126          * Note that there's no locking on mca_rotor or mca_aliquot because
5127          * nothing actually breaks if we miss a few updates -- we just won't
5128          * allocate quite as evenly.  It all balances out over time.
5129          *
5130          * If we are doing ditto or log blocks, try to spread them across
5131          * consecutive vdevs.  If we're forced to reuse a vdev before we've
5132          * allocated all of our ditto blocks, then try and spread them out on
5133          * that vdev as much as possible.  If it turns out to not be possible,
5134          * gradually lower our standards until anything becomes acceptable.
5135          * Also, allocating on consecutive vdevs (as opposed to random vdevs)
5136          * gives us hope of containing our fault domains to something we're
5137          * able to reason about.  Otherwise, any two top-level vdev failures
5138          * will guarantee the loss of data.  With consecutive allocation,
5139          * only two adjacent top-level vdev failures will result in data loss.
5140          *
5141          * If we are doing gang blocks (hintdva is non-NULL), try to keep
5142          * ourselves on the same vdev as our gang block header.  That
5143          * way, we can hope for locality in vdev_cache, plus it makes our
5144          * fault domains something tractable.
5145          */
5146         if (hintdva) {
5147                 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
5148
5149                 /*
5150                  * It's possible the vdev we're using as the hint no
5151                  * longer exists or its mg has been closed (e.g. by
5152                  * device removal).  Consult the rotor when
5153                  * all else fails.
5154                  */
5155                 if (vd != NULL && vd->vdev_mg != NULL) {
5156                         mg = vdev_get_mg(vd, mc);
5157
5158                         if (flags & METASLAB_HINTBP_AVOID)
5159                                 mg = mg->mg_next;
5160                 } else {
5161                         mg = mca->mca_rotor;
5162                 }
5163         } else if (d != 0) {
5164                 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
5165                 mg = vd->vdev_mg->mg_next;
5166         } else {
5167                 ASSERT(mca->mca_rotor != NULL);
5168                 mg = mca->mca_rotor;
5169         }
5170
5171         /*
5172          * If the hint put us into the wrong metaslab class, or into a
5173          * metaslab group that has been passivated, just follow the rotor.
5174          */
5175         if (mg->mg_class != mc || mg->mg_activation_count <= 0)
5176                 mg = mca->mca_rotor;
5177
5178         rotor = mg;
5179 top:
5180         do {
5181                 boolean_t allocatable;
5182
5183                 ASSERT(mg->mg_activation_count == 1);
5184                 vd = mg->mg_vd;
5185
5186                 /*
5187                  * Don't allocate from faulted devices.
5188                  */
5189                 if (try_hard) {
5190                         spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
5191                         allocatable = vdev_allocatable(vd);
5192                         spa_config_exit(spa, SCL_ZIO, FTAG);
5193                 } else {
5194                         allocatable = vdev_allocatable(vd);
5195                 }
5196
5197                 /*
5198                  * Determine if the selected metaslab group is eligible
5199                  * for allocations. If we're ganging then don't allow
5200                  * this metaslab group to skip allocations since that would
5201                  * inadvertently return ENOSPC and suspend the pool
5202                  * even though space is still available.
5203                  */
5204                 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
5205                         allocatable = metaslab_group_allocatable(mg, rotor,
5206                             flags, psize, allocator, d);
5207                 }
5208
5209                 if (!allocatable) {
5210                         metaslab_trace_add(zal, mg, NULL, psize, d,
5211                             TRACE_NOT_ALLOCATABLE, allocator);
5212                         goto next;
5213                 }
5214
5215                 ASSERT(mg->mg_initialized);
5216
5217                 /*
5218                  * Avoid writing single-copy data to an unhealthy,
5219                  * non-redundant vdev, unless we've already tried all
5220                  * other vdevs.
5221                  */
5222                 if (vd->vdev_state < VDEV_STATE_HEALTHY &&
5223                     d == 0 && !try_hard && vd->vdev_children == 0) {
5224                         metaslab_trace_add(zal, mg, NULL, psize, d,
5225                             TRACE_VDEV_ERROR, allocator);
5226                         goto next;
5227                 }
5228
5229                 ASSERT(mg->mg_class == mc);
5230
5231                 uint64_t asize = vdev_psize_to_asize(vd, psize);
5232                 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
5233
5234                 /*
5235                  * If we don't need to try hard, then require that the
5236                  * block be on a different metaslab from any other DVAs
5237                  * in this BP (unique=true).  If we are trying hard, then
5238                  * allow any metaslab to be used (unique=false).
5239                  */
5240                 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
5241                     !try_hard, dva, d, allocator, try_hard);
5242
5243                 if (offset != -1ULL) {
5244                         /*
5245                          * If we've just selected this metaslab group,
5246                          * figure out whether the corresponding vdev is
5247                          * over- or under-used relative to the pool,
5248                          * and set an allocation bias to even it out.
5249                          *
5250                          * Bias is also used to compensate for unequally
5251                          * sized vdevs so that space is allocated fairly.
5252                          */
5253                         if (mca->mca_aliquot == 0 && metaslab_bias_enabled) {
5254                                 vdev_stat_t *vs = &vd->vdev_stat;
5255                                 int64_t vs_free = vs->vs_space - vs->vs_alloc;
5256                                 int64_t mc_free = mc->mc_space - mc->mc_alloc;
5257                                 int64_t ratio;
5258
5259                                 /*
5260                                  * Calculate how much more or less we should
5261                                  * try to allocate from this device during
5262                                  * this iteration around the rotor.
5263                                  *
5264                                  * This basically introduces a zero-centered
5265                                  * bias towards the devices with the most
5266                                  * free space, while compensating for vdev
5267                                  * size differences.
5268                                  *
5269                                  * Examples:
5270                                  *  vdev V1 = 16M/128M
5271                                  *  vdev V2 = 16M/128M
5272                                  *  ratio(V1) = 100% ratio(V2) = 100%
5273                                  *
5274                                  *  vdev V1 = 16M/128M
5275                                  *  vdev V2 = 64M/128M
5276                                  *  ratio(V1) = 127% ratio(V2) =  72%
5277                                  *
5278                                  *  vdev V1 = 16M/128M
5279                                  *  vdev V2 = 64M/512M
5280                                  *  ratio(V1) =  40% ratio(V2) = 160%
5281                                  */
5282                                 ratio = (vs_free * mc->mc_alloc_groups * 100) /
5283                                     (mc_free + 1);
5284                                 mg->mg_bias = ((ratio - 100) *
5285                                     (int64_t)mg->mg_aliquot) / 100;
5286                         } else if (!metaslab_bias_enabled) {
5287                                 mg->mg_bias = 0;
5288                         }
5289
5290                         if ((flags & METASLAB_ZIL) ||
5291                             atomic_add_64_nv(&mca->mca_aliquot, asize) >=
5292                             mg->mg_aliquot + mg->mg_bias) {
5293                                 mca->mca_rotor = mg->mg_next;
5294                                 mca->mca_aliquot = 0;
5295                         }
5296
5297                         DVA_SET_VDEV(&dva[d], vd->vdev_id);
5298                         DVA_SET_OFFSET(&dva[d], offset);
5299                         DVA_SET_GANG(&dva[d],
5300                             ((flags & METASLAB_GANG_HEADER) ? 1 : 0));
5301                         DVA_SET_ASIZE(&dva[d], asize);
5302
5303                         return (0);
5304                 }
5305 next:
5306                 mca->mca_rotor = mg->mg_next;
5307                 mca->mca_aliquot = 0;
5308         } while ((mg = mg->mg_next) != rotor);
5309
5310         /*
5311          * If we haven't tried hard, perhaps do so now.
5312          */
5313         if (!try_hard && (zfs_metaslab_try_hard_before_gang ||
5314             GANG_ALLOCATION(flags) || (flags & METASLAB_ZIL) != 0 ||
5315             psize <= 1 << spa->spa_min_ashift)) {
5316                 METASLABSTAT_BUMP(metaslabstat_try_hard);
5317                 try_hard = B_TRUE;
5318                 goto top;
5319         }
5320
5321         memset(&dva[d], 0, sizeof (dva_t));
5322
5323         metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
5324         return (SET_ERROR(ENOSPC));
5325 }
5326
5327 void
5328 metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
5329     boolean_t checkpoint)
5330 {
5331         metaslab_t *msp;
5332         spa_t *spa = vd->vdev_spa;
5333
5334         ASSERT(vdev_is_concrete(vd));
5335         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5336         ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
5337
5338         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5339
5340         VERIFY(!msp->ms_condensing);
5341         VERIFY3U(offset, >=, msp->ms_start);
5342         VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
5343         VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
5344         VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
5345
5346         metaslab_check_free_impl(vd, offset, asize);
5347
5348         mutex_enter(&msp->ms_lock);
5349         if (range_tree_is_empty(msp->ms_freeing) &&
5350             range_tree_is_empty(msp->ms_checkpointing)) {
5351                 vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
5352         }
5353
5354         if (checkpoint) {
5355                 ASSERT(spa_has_checkpoint(spa));
5356                 range_tree_add(msp->ms_checkpointing, offset, asize);
5357         } else {
5358                 range_tree_add(msp->ms_freeing, offset, asize);
5359         }
5360         mutex_exit(&msp->ms_lock);
5361 }
5362
5363 void
5364 metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
5365     uint64_t size, void *arg)
5366 {
5367         (void) inner_offset;
5368         boolean_t *checkpoint = arg;
5369
5370         ASSERT3P(checkpoint, !=, NULL);
5371
5372         if (vd->vdev_ops->vdev_op_remap != NULL)
5373                 vdev_indirect_mark_obsolete(vd, offset, size);
5374         else
5375                 metaslab_free_impl(vd, offset, size, *checkpoint);
5376 }
5377
5378 static void
5379 metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
5380     boolean_t checkpoint)
5381 {
5382         spa_t *spa = vd->vdev_spa;
5383
5384         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5385
5386         if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
5387                 return;
5388
5389         if (spa->spa_vdev_removal != NULL &&
5390             spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
5391             vdev_is_concrete(vd)) {
5392                 /*
5393                  * Note: we check if the vdev is concrete because when
5394                  * we complete the removal, we first change the vdev to be
5395                  * an indirect vdev (in open context), and then (in syncing
5396                  * context) clear spa_vdev_removal.
5397                  */
5398                 free_from_removing_vdev(vd, offset, size);
5399         } else if (vd->vdev_ops->vdev_op_remap != NULL) {
5400                 vdev_indirect_mark_obsolete(vd, offset, size);
5401                 vd->vdev_ops->vdev_op_remap(vd, offset, size,
5402                     metaslab_free_impl_cb, &checkpoint);
5403         } else {
5404                 metaslab_free_concrete(vd, offset, size, checkpoint);
5405         }
5406 }
5407
/*
 * State shared between spa_remap_blkptr() and remap_blkptr_cb() while a
 * block pointer's dva[0] is being remapped.
 */
typedef struct remap_blkptr_cb_arg {
        /* BP whose dva[0] and phys birth are rewritten by the callback */
        blkptr_t *rbca_bp;
        /* optional caller callback; skipped when NULL */
        spa_remap_cb_t rbca_cb;
        /* vdev of the previous remap level, reported to rbca_cb */
        vdev_t *rbca_remap_vd;
        /* offset on rbca_remap_vd, reported to rbca_cb */
        uint64_t rbca_remap_offset;
        /* opaque argument handed through to rbca_cb */
        void *rbca_cb_arg;
} remap_blkptr_cb_arg_t;
5415
5416 static void
5417 remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
5418     uint64_t size, void *arg)
5419 {
5420         remap_blkptr_cb_arg_t *rbca = arg;
5421         blkptr_t *bp = rbca->rbca_bp;
5422
5423         /* We can not remap split blocks. */
5424         if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
5425                 return;
5426         ASSERT0(inner_offset);
5427
5428         if (rbca->rbca_cb != NULL) {
5429                 /*
5430                  * At this point we know that we are not handling split
5431                  * blocks and we invoke the callback on the previous
5432                  * vdev which must be indirect.
5433                  */
5434                 ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);
5435
5436                 rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
5437                     rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);
5438
5439                 /* set up remap_blkptr_cb_arg for the next call */
5440                 rbca->rbca_remap_vd = vd;
5441                 rbca->rbca_remap_offset = offset;
5442         }
5443
5444         /*
5445          * The phys birth time is that of dva[0].  This ensures that we know
5446          * when each dva was written, so that resilver can determine which
5447          * blocks need to be scrubbed (i.e. those written during the time
5448          * the vdev was offline).  It also ensures that the key used in
5449          * the ARC hash table is unique (i.e. dva[0] + phys_birth).  If
5450          * we didn't change the phys_birth, a lookup in the ARC for a
5451          * remapped BP could find the data that was previously stored at
5452          * this vdev + offset.
5453          */
5454         vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
5455             DVA_GET_VDEV(&bp->blk_dva[0]));
5456         vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
5457         bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
5458             DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
5459
5460         DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
5461         DVA_SET_OFFSET(&bp->blk_dva[0], offset);
5462 }
5463
5464 /*
5465  * If the block pointer contains any indirect DVAs, modify them to refer to
5466  * concrete DVAs.  Note that this will sometimes not be possible, leaving
5467  * the indirect DVA in place.  This happens if the indirect DVA spans multiple
5468  * segments in the mapping (i.e. it is a "split block").
5469  *
5470  * If the BP was remapped, calls the callback on the original dva (note the
5471  * callback can be called multiple times if the original indirect DVA refers
5472  * to another indirect DVA, etc).
5473  *
5474  * Returns TRUE if the BP was remapped.
5475  */
5476 boolean_t
5477 spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
5478 {
5479         remap_blkptr_cb_arg_t rbca;
5480
5481         if (!zfs_remap_blkptr_enable)
5482                 return (B_FALSE);
5483
5484         if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
5485                 return (B_FALSE);
5486
5487         /*
5488          * Dedup BP's can not be remapped, because ddt_phys_select() depends
5489          * on DVA[0] being the same in the BP as in the DDT (dedup table).
5490          */
5491         if (BP_GET_DEDUP(bp))
5492                 return (B_FALSE);
5493
5494         /*
5495          * Gang blocks can not be remapped, because
5496          * zio_checksum_gang_verifier() depends on the DVA[0] that's in
5497          * the BP used to read the gang block header (GBH) being the same
5498          * as the DVA[0] that we allocated for the GBH.
5499          */
5500         if (BP_IS_GANG(bp))
5501                 return (B_FALSE);
5502
5503         /*
5504          * Embedded BP's have no DVA to remap.
5505          */
5506         if (BP_GET_NDVAS(bp) < 1)
5507                 return (B_FALSE);
5508
5509         /*
5510          * Note: we only remap dva[0].  If we remapped other dvas, we
5511          * would no longer know what their phys birth txg is.
5512          */
5513         dva_t *dva = &bp->blk_dva[0];
5514
5515         uint64_t offset = DVA_GET_OFFSET(dva);
5516         uint64_t size = DVA_GET_ASIZE(dva);
5517         vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
5518
5519         if (vd->vdev_ops->vdev_op_remap == NULL)
5520                 return (B_FALSE);
5521
5522         rbca.rbca_bp = bp;
5523         rbca.rbca_cb = callback;
5524         rbca.rbca_remap_vd = vd;
5525         rbca.rbca_remap_offset = offset;
5526         rbca.rbca_cb_arg = arg;
5527
5528         /*
5529          * remap_blkptr_cb() will be called in order for each level of
5530          * indirection, until a concrete vdev is reached or a split block is
5531          * encountered. old_vd and old_offset are updated within the callback
5532          * as we go from the one indirect vdev to the next one (either concrete
5533          * or indirect again) in that order.
5534          */
5535         vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);
5536
5537         /* Check if the DVA wasn't remapped because it is a split block */
5538         if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
5539                 return (B_FALSE);
5540
5541         return (B_TRUE);
5542 }
5543
5544 /*
5545  * Undo the allocation of a DVA which happened in the given transaction group.
5546  */
5547 void
5548 metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
5549 {
5550         metaslab_t *msp;
5551         vdev_t *vd;
5552         uint64_t vdev = DVA_GET_VDEV(dva);
5553         uint64_t offset = DVA_GET_OFFSET(dva);
5554         uint64_t size = DVA_GET_ASIZE(dva);
5555
5556         ASSERT(DVA_IS_VALID(dva));
5557         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5558
5559         if (txg > spa_freeze_txg(spa))
5560                 return;
5561
5562         if ((vd = vdev_lookup_top(spa, vdev)) == NULL || !DVA_IS_VALID(dva) ||
5563             (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
5564                 zfs_panic_recover("metaslab_free_dva(): bad DVA %llu:%llu:%llu",
5565                     (u_longlong_t)vdev, (u_longlong_t)offset,
5566                     (u_longlong_t)size);
5567                 return;
5568         }
5569
5570         ASSERT(!vd->vdev_removing);
5571         ASSERT(vdev_is_concrete(vd));
5572         ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
5573         ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
5574
5575         if (DVA_GET_GANG(dva))
5576                 size = vdev_gang_header_asize(vd);
5577
5578         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5579
5580         mutex_enter(&msp->ms_lock);
5581         range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
5582             offset, size);
5583         msp->ms_allocating_total -= size;
5584
5585         VERIFY(!msp->ms_condensing);
5586         VERIFY3U(offset, >=, msp->ms_start);
5587         VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
5588         VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=,
5589             msp->ms_size);
5590         VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
5591         VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
5592         range_tree_add(msp->ms_allocatable, offset, size);
5593         mutex_exit(&msp->ms_lock);
5594 }
5595
5596 /*
5597  * Free the block represented by the given DVA.
5598  */
5599 void
5600 metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
5601 {
5602         uint64_t vdev = DVA_GET_VDEV(dva);
5603         uint64_t offset = DVA_GET_OFFSET(dva);
5604         uint64_t size = DVA_GET_ASIZE(dva);
5605         vdev_t *vd = vdev_lookup_top(spa, vdev);
5606
5607         ASSERT(DVA_IS_VALID(dva));
5608         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5609
5610         if (DVA_GET_GANG(dva)) {
5611                 size = vdev_gang_header_asize(vd);
5612         }
5613
5614         metaslab_free_impl(vd, offset, size, checkpoint);
5615 }
5616
5617 /*
5618  * Reserve some allocation slots. The reservation system must be called
5619  * before we call into the allocator. If there aren't any available slots
5620  * then the I/O will be throttled until an I/O completes and its slots are
5621  * freed up. The function returns true if it was successful in placing
5622  * the reservation.
5623  */
5624 boolean_t
5625 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
5626     zio_t *zio, int flags)
5627 {
5628         metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
5629         uint64_t max = mca->mca_alloc_max_slots;
5630
5631         ASSERT(mc->mc_alloc_throttle_enabled);
5632         if (GANG_ALLOCATION(flags) || (flags & METASLAB_MUST_RESERVE) ||
5633             zfs_refcount_count(&mca->mca_alloc_slots) + slots <= max) {
5634                 /*
5635                  * The potential race between _count() and _add() is covered
5636                  * by the allocator lock in most cases, or irrelevant due to
5637                  * GANG_ALLOCATION() or METASLAB_MUST_RESERVE set in others.
5638                  * But even if we assume some other non-existing scenario, the
5639                  * worst that can happen is few more I/Os get to allocation
5640                  * earlier, that is not a problem.
5641                  *
5642                  * We reserve the slots individually so that we can unreserve
5643                  * them individually when an I/O completes.
5644                  */
5645                 zfs_refcount_add_few(&mca->mca_alloc_slots, slots, zio);
5646                 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
5647                 return (B_TRUE);
5648         }
5649         return (B_FALSE);
5650 }
5651
5652 void
5653 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
5654     int allocator, zio_t *zio)
5655 {
5656         metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
5657
5658         ASSERT(mc->mc_alloc_throttle_enabled);
5659         zfs_refcount_remove_few(&mca->mca_alloc_slots, slots, zio);
5660 }
5661
5662 static int
5663 metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
5664     uint64_t txg)
5665 {
5666         metaslab_t *msp;
5667         spa_t *spa = vd->vdev_spa;
5668         int error = 0;
5669
5670         if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
5671                 return (SET_ERROR(ENXIO));
5672
5673         ASSERT3P(vd->vdev_ms, !=, NULL);
5674         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5675
5676         mutex_enter(&msp->ms_lock);
5677
5678         if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) {
5679                 error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
5680                 if (error == EBUSY) {
5681                         ASSERT(msp->ms_loaded);
5682                         ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
5683                         error = 0;
5684                 }
5685         }
5686
5687         if (error == 0 &&
5688             !range_tree_contains(msp->ms_allocatable, offset, size))
5689                 error = SET_ERROR(ENOENT);
5690
5691         if (error || txg == 0) {        /* txg == 0 indicates dry run */
5692                 mutex_exit(&msp->ms_lock);
5693                 return (error);
5694         }
5695
5696         VERIFY(!msp->ms_condensing);
5697         VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
5698         VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
5699         VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
5700             msp->ms_size);
5701         range_tree_remove(msp->ms_allocatable, offset, size);
5702         range_tree_clear(msp->ms_trim, offset, size);
5703
5704         if (spa_writeable(spa)) {       /* don't dirty if we're zdb(8) */
5705                 metaslab_class_t *mc = msp->ms_group->mg_class;
5706                 multilist_sublist_t *mls =
5707                     multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
5708                 if (!multilist_link_active(&msp->ms_class_txg_node)) {
5709                         msp->ms_selected_txg = txg;
5710                         multilist_sublist_insert_head(mls, msp);
5711                 }
5712                 multilist_sublist_unlock(mls);
5713
5714                 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
5715                         vdev_dirty(vd, VDD_METASLAB, msp, txg);
5716                 range_tree_add(msp->ms_allocating[txg & TXG_MASK],
5717                     offset, size);
5718                 msp->ms_allocating_total += size;
5719         }
5720
5721         mutex_exit(&msp->ms_lock);
5722
5723         return (0);
5724 }
5725
/*
 * Argument block passed from metaslab_claim_impl() to
 * metaslab_claim_impl_cb() through the vdev remap op.
 */
typedef struct metaslab_claim_cb_arg_t {
        /* txg to claim in; handed to metaslab_claim_concrete() */
        uint64_t        mcca_txg;
        /* first error seen; later segments are skipped once non-zero */
        int             mcca_error;
} metaslab_claim_cb_arg_t;
5730
5731 static void
5732 metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
5733     uint64_t size, void *arg)
5734 {
5735         (void) inner_offset;
5736         metaslab_claim_cb_arg_t *mcca_arg = arg;
5737
5738         if (mcca_arg->mcca_error == 0) {
5739                 mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
5740                     size, mcca_arg->mcca_txg);
5741         }
5742 }
5743
5744 int
5745 metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
5746 {
5747         if (vd->vdev_ops->vdev_op_remap != NULL) {
5748                 metaslab_claim_cb_arg_t arg;
5749
5750                 /*
5751                  * Only zdb(8) can claim on indirect vdevs.  This is used
5752                  * to detect leaks of mapped space (that are not accounted
5753                  * for in the obsolete counts, spacemap, or bpobj).
5754                  */
5755                 ASSERT(!spa_writeable(vd->vdev_spa));
5756                 arg.mcca_error = 0;
5757                 arg.mcca_txg = txg;
5758
5759                 vd->vdev_ops->vdev_op_remap(vd, offset, size,
5760                     metaslab_claim_impl_cb, &arg);
5761
5762                 if (arg.mcca_error == 0) {
5763                         arg.mcca_error = metaslab_claim_concrete(vd,
5764                             offset, size, txg);
5765                 }
5766                 return (arg.mcca_error);
5767         } else {
5768                 return (metaslab_claim_concrete(vd, offset, size, txg));
5769         }
5770 }
5771
5772 /*
5773  * Intent log support: upon opening the pool after a crash, notify the SPA
5774  * of blocks that the intent log has allocated for immediate write, but
5775  * which are still considered free by the SPA because the last transaction
5776  * group didn't commit yet.
5777  */
5778 static int
5779 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
5780 {
5781         uint64_t vdev = DVA_GET_VDEV(dva);
5782         uint64_t offset = DVA_GET_OFFSET(dva);
5783         uint64_t size = DVA_GET_ASIZE(dva);
5784         vdev_t *vd;
5785
5786         if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
5787                 return (SET_ERROR(ENXIO));
5788         }
5789
5790         ASSERT(DVA_IS_VALID(dva));
5791
5792         if (DVA_GET_GANG(dva))
5793                 size = vdev_gang_header_asize(vd);
5794
5795         return (metaslab_claim_impl(vd, offset, size, txg));
5796 }
5797
5798 int
5799 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
5800     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
5801     zio_alloc_list_t *zal, zio_t *zio, int allocator)
5802 {
5803         dva_t *dva = bp->blk_dva;
5804         dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
5805         int error = 0;
5806
5807         ASSERT(bp->blk_birth == 0);
5808         ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
5809
5810         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
5811
5812         if (mc->mc_allocator[allocator].mca_rotor == NULL) {
5813                 /* no vdevs in this class */
5814                 spa_config_exit(spa, SCL_ALLOC, FTAG);
5815                 return (SET_ERROR(ENOSPC));
5816         }
5817
5818         ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
5819         ASSERT(BP_GET_NDVAS(bp) == 0);
5820         ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
5821         ASSERT3P(zal, !=, NULL);
5822
5823         for (int d = 0; d < ndvas; d++) {
5824                 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
5825                     txg, flags, zal, allocator);
5826                 if (error != 0) {
5827                         for (d--; d >= 0; d--) {
5828                                 metaslab_unalloc_dva(spa, &dva[d], txg);
5829                                 metaslab_group_alloc_decrement(spa,
5830                                     DVA_GET_VDEV(&dva[d]), zio, flags,
5831                                     allocator, B_FALSE);
5832                                 memset(&dva[d], 0, sizeof (dva_t));
5833                         }
5834                         spa_config_exit(spa, SCL_ALLOC, FTAG);
5835                         return (error);
5836                 } else {
5837                         /*
5838                          * Update the metaslab group's queue depth
5839                          * based on the newly allocated dva.
5840                          */
5841                         metaslab_group_alloc_increment(spa,
5842                             DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
5843                 }
5844         }
5845         ASSERT(error == 0);
5846         ASSERT(BP_GET_NDVAS(bp) == ndvas);
5847
5848         spa_config_exit(spa, SCL_ALLOC, FTAG);
5849
5850         BP_SET_BIRTH(bp, txg, 0);
5851
5852         return (0);
5853 }
5854
5855 void
5856 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
5857 {
5858         const dva_t *dva = bp->blk_dva;
5859         int ndvas = BP_GET_NDVAS(bp);
5860
5861         ASSERT(!BP_IS_HOLE(bp));
5862         ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
5863
5864         /*
5865          * If we have a checkpoint for the pool we need to make sure that
5866          * the blocks that we free that are part of the checkpoint won't be
5867          * reused until the checkpoint is discarded or we revert to it.
5868          *
5869          * The checkpoint flag is passed down the metaslab_free code path
5870          * and is set whenever we want to add a block to the checkpoint's
5871          * accounting. That is, we "checkpoint" blocks that existed at the
5872          * time the checkpoint was created and are therefore referenced by
5873          * the checkpointed uberblock.
5874          *
5875          * Note that, we don't checkpoint any blocks if the current
5876          * syncing txg <= spa_checkpoint_txg. We want these frees to sync
5877          * normally as they will be referenced by the checkpointed uberblock.
5878          */
5879         boolean_t checkpoint = B_FALSE;
5880         if (bp->blk_birth <= spa->spa_checkpoint_txg &&
5881             spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
5882                 /*
5883                  * At this point, if the block is part of the checkpoint
5884                  * there is no way it was created in the current txg.
5885                  */
5886                 ASSERT(!now);
5887                 ASSERT3U(spa_syncing_txg(spa), ==, txg);
5888                 checkpoint = B_TRUE;
5889         }
5890
5891         spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
5892
5893         for (int d = 0; d < ndvas; d++) {
5894                 if (now) {
5895                         metaslab_unalloc_dva(spa, &dva[d], txg);
5896                 } else {
5897                         ASSERT3U(txg, ==, spa_syncing_txg(spa));
5898                         metaslab_free_dva(spa, &dva[d], checkpoint);
5899                 }
5900         }
5901
5902         spa_config_exit(spa, SCL_FREE, FTAG);
5903 }
5904
5905 int
5906 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
5907 {
5908         const dva_t *dva = bp->blk_dva;
5909         int ndvas = BP_GET_NDVAS(bp);
5910         int error = 0;
5911
5912         ASSERT(!BP_IS_HOLE(bp));
5913
5914         if (txg != 0) {
5915                 /*
5916                  * First do a dry run to make sure all DVAs are claimable,
5917                  * so we don't have to unwind from partial failures below.
5918                  */
5919                 if ((error = metaslab_claim(spa, bp, 0)) != 0)
5920                         return (error);
5921         }
5922
5923         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
5924
5925         for (int d = 0; d < ndvas; d++) {
5926                 error = metaslab_claim_dva(spa, &dva[d], txg);
5927                 if (error != 0)
5928                         break;
5929         }
5930
5931         spa_config_exit(spa, SCL_ALLOC, FTAG);
5932
5933         ASSERT(error == 0 || txg == 0);
5934
5935         return (error);
5936 }
5937
5938 static void
5939 metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
5940     uint64_t size, void *arg)
5941 {
5942         (void) inner, (void) arg;
5943
5944         if (vd->vdev_ops == &vdev_indirect_ops)
5945                 return;
5946
5947         metaslab_check_free_impl(vd, offset, size);
5948 }
5949
5950 static void
5951 metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
5952 {
5953         metaslab_t *msp;
5954         spa_t *spa __maybe_unused = vd->vdev_spa;
5955
5956         if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
5957                 return;
5958
5959         if (vd->vdev_ops->vdev_op_remap != NULL) {
5960                 vd->vdev_ops->vdev_op_remap(vd, offset, size,
5961                     metaslab_check_free_impl_cb, NULL);
5962                 return;
5963         }
5964
5965         ASSERT(vdev_is_concrete(vd));
5966         ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
5967         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5968
5969         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5970
5971         mutex_enter(&msp->ms_lock);
5972         if (msp->ms_loaded) {
5973                 range_tree_verify_not_present(msp->ms_allocatable,
5974                     offset, size);
5975         }
5976
5977         /*
5978          * Check all segments that currently exist in the freeing pipeline.
5979          *
5980          * It would intuitively make sense to also check the current allocating
5981          * tree since metaslab_unalloc_dva() exists for extents that are
5982          * allocated and freed in the same sync pass within the same txg.
5983          * Unfortunately there are places (e.g. the ZIL) where we allocate a
5984          * segment but then we free part of it within the same txg
5985          * [see zil_sync()]. Thus, we don't call range_tree_verify() in the
5986          * current allocating tree.
5987          */
5988         range_tree_verify_not_present(msp->ms_freeing, offset, size);
5989         range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
5990         range_tree_verify_not_present(msp->ms_freed, offset, size);
5991         for (int j = 0; j < TXG_DEFER_SIZE; j++)
5992                 range_tree_verify_not_present(msp->ms_defer[j], offset, size);
5993         range_tree_verify_not_present(msp->ms_trim, offset, size);
5994         mutex_exit(&msp->ms_lock);
5995 }
5996
5997 void
5998 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
5999 {
6000         if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
6001                 return;
6002
6003         spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
6004         for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
6005                 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
6006                 vdev_t *vd = vdev_lookup_top(spa, vdev);
6007                 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
6008                 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
6009
6010                 if (DVA_GET_GANG(&bp->blk_dva[i]))
6011                         size = vdev_gang_header_asize(vd);
6012
6013                 ASSERT3P(vd, !=, NULL);
6014
6015                 metaslab_check_free_impl(vd, offset, size);
6016         }
6017         spa_config_exit(spa, SCL_VDEV, FTAG);
6018 }
6019
6020 static void
6021 metaslab_group_disable_wait(metaslab_group_t *mg)
6022 {
6023         ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
6024         while (mg->mg_disabled_updating) {
6025                 cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
6026         }
6027 }
6028
6029 static void
6030 metaslab_group_disabled_increment(metaslab_group_t *mg)
6031 {
6032         ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
6033         ASSERT(mg->mg_disabled_updating);
6034
6035         while (mg->mg_ms_disabled >= max_disabled_ms) {
6036                 cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
6037         }
6038         mg->mg_ms_disabled++;
6039         ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms);
6040 }
6041
6042 /*
6043  * Mark the metaslab as disabled to prevent any allocations on this metaslab.
6044  * We must also track how many metaslabs are currently disabled within a
6045  * metaslab group and limit them to prevent allocation failures from
6046  * occurring because all metaslabs are disabled.
6047  */
6048 void
6049 metaslab_disable(metaslab_t *msp)
6050 {
6051         ASSERT(!MUTEX_HELD(&msp->ms_lock));
6052         metaslab_group_t *mg = msp->ms_group;
6053
6054         mutex_enter(&mg->mg_ms_disabled_lock);
6055
6056         /*
6057          * To keep an accurate count of how many threads have disabled
6058          * a specific metaslab group, we only allow one thread to mark
6059          * the metaslab group at a time. This ensures that the value of
6060          * ms_disabled will be accurate when we decide to mark a metaslab
6061          * group as disabled. To do this we force all other threads
6062          * to wait till the metaslab's mg_disabled_updating flag is no
6063          * longer set.
6064          */
6065         metaslab_group_disable_wait(mg);
6066         mg->mg_disabled_updating = B_TRUE;
6067         if (msp->ms_disabled == 0) {
6068                 metaslab_group_disabled_increment(mg);
6069         }
6070         mutex_enter(&msp->ms_lock);
6071         msp->ms_disabled++;
6072         mutex_exit(&msp->ms_lock);
6073
6074         mg->mg_disabled_updating = B_FALSE;
6075         cv_broadcast(&mg->mg_ms_disabled_cv);
6076         mutex_exit(&mg->mg_ms_disabled_lock);
6077 }
6078
6079 void
6080 metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload)
6081 {
6082         metaslab_group_t *mg = msp->ms_group;
6083         spa_t *spa = mg->mg_vd->vdev_spa;
6084
6085         /*
6086          * Wait for the outstanding IO to be synced to prevent newly
6087          * allocated blocks from being overwritten.  This used by
6088          * initialize and TRIM which are modifying unallocated space.
6089          */
6090         if (sync)
6091                 txg_wait_synced(spa_get_dsl(spa), 0);
6092
6093         mutex_enter(&mg->mg_ms_disabled_lock);
6094         mutex_enter(&msp->ms_lock);
6095         if (--msp->ms_disabled == 0) {
6096                 mg->mg_ms_disabled--;
6097                 cv_broadcast(&mg->mg_ms_disabled_cv);
6098                 if (unload)
6099                         metaslab_unload(msp);
6100         }
6101         mutex_exit(&msp->ms_lock);
6102         mutex_exit(&mg->mg_ms_disabled_lock);
6103 }
6104
6105 void
6106 metaslab_set_unflushed_dirty(metaslab_t *ms, boolean_t dirty)
6107 {
6108         ms->ms_unflushed_dirty = dirty;
6109 }
6110
6111 static void
6112 metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx)
6113 {
6114         vdev_t *vd = ms->ms_group->mg_vd;
6115         spa_t *spa = vd->vdev_spa;
6116         objset_t *mos = spa_meta_objset(spa);
6117
6118         ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
6119
6120         metaslab_unflushed_phys_t entry = {
6121                 .msp_unflushed_txg = metaslab_unflushed_txg(ms),
6122         };
6123         uint64_t entry_size = sizeof (entry);
6124         uint64_t entry_offset = ms->ms_id * entry_size;
6125
6126         uint64_t object = 0;
6127         int err = zap_lookup(mos, vd->vdev_top_zap,
6128             VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
6129             &object);
6130         if (err == ENOENT) {
6131                 object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA,
6132                     SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
6133                 VERIFY0(zap_add(mos, vd->vdev_top_zap,
6134                     VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
6135                     &object, tx));
6136         } else {
6137                 VERIFY0(err);
6138         }
6139
6140         dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size,
6141             &entry, tx);
6142 }
6143
6144 void
6145 metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx)
6146 {
6147         ms->ms_unflushed_txg = txg;
6148         metaslab_update_ondisk_flush_data(ms, tx);
6149 }
6150
6151 boolean_t
6152 metaslab_unflushed_dirty(metaslab_t *ms)
6153 {
6154         return (ms->ms_unflushed_dirty);
6155 }
6156
6157 uint64_t
6158 metaslab_unflushed_txg(metaslab_t *ms)
6159 {
6160         return (ms->ms_unflushed_txg);
6161 }
6162
/*
 * Tunable declarations for the metaslab allocator.
 *
 * Each ZFS_MODULE_PARAM() invocation lists, in order: a scope prefix, a
 * name prefix, the parameter name, a type tag, a ZMOD_* flag (ZMOD_RW for
 * every entry here) and a human-readable description string.
 *
 * NOTE(review): the macro is defined outside this file; presumably the name
 * prefix and parameter name are pasted together to form the C variable that
 * backs the tunable -- confirm against the macro definition.
 */
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, aliquot, U64, ZMOD_RW,
        "Allocation granularity (a.k.a. stripe size)");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_load, INT, ZMOD_RW,
        "Load all metaslabs when pool is first opened");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_unload, INT, ZMOD_RW,
        "Prevent metaslabs from being unloaded");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_enabled, INT, ZMOD_RW,
        "Preload potential metaslabs during reassessment");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_limit, UINT, ZMOD_RW,
        "Max number of metaslabs per group to preload");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, UINT, ZMOD_RW,
        "Delay in txgs after metaslab was last used before unloading");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay_ms, UINT, ZMOD_RW,
        "Delay in milliseconds after metaslab was last used before unloading");

/*
 * The description strings in the next group are split across several
 * string literals, so the style checker is switched off around them.
 */
/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, noalloc_threshold, UINT, ZMOD_RW,
        "Percentage of metaslab group size that should be free to make it "
        "eligible for allocation");

ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, fragmentation_threshold, UINT, ZMOD_RW,
        "Percentage of metaslab group size that should be considered eligible "
        "for allocations unless all metaslab groups within the metaslab class "
        "have also crossed this threshold");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, fragmentation_factor_enabled, INT,
        ZMOD_RW,
        "Use the fragmentation metric to prefer less fragmented metaslabs");
/* END CSTYLED */

ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, fragmentation_threshold, UINT,
        ZMOD_RW, "Fragmentation for metaslab to allow allocation");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, lba_weighting_enabled, INT, ZMOD_RW,
        "Prefer metaslabs with lower LBAs");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, bias_enabled, INT, ZMOD_RW,
        "Enable metaslab group biasing");

ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, segment_weight_enabled, INT,
        ZMOD_RW, "Enable segment-based metaslab selection");

ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW,
        "Segment-based metaslab selection maximum buckets before switching");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, U64, ZMOD_RW,
        "Blocks larger than this size are sometimes forced to be gang blocks");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging_pct, UINT, ZMOD_RW,
        "Percentage of large blocks that will be forced to be gang blocks");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW,
        "Max distance (bytes) to search forward before using size tree");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_use_largest_segment, INT, ZMOD_RW,
        "When looking in size tree, use largest segment instead of exact fit");

ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, U64,
        ZMOD_RW, "How long to trust the cached max chunk size of a metaslab");

ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, UINT, ZMOD_RW,
        "Percentage of memory that can be used to store metaslab range trees");

ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT,
        ZMOD_RW, "Try hard to allocate before ganging");

ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, UINT, ZMOD_RW,
        "Normally only consider this many of the best metaslabs in each vdev");