/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright (c) 2018, 2019 by Delphix. All rights reserved.
 */

#include <sys/dmu_objset.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/spa_log_spacemap.h>
#include <sys/vdev_impl.h>
#include <sys/zap.h>

/*
 * Log space maps are an optimization in ZFS metadata allocations for pools
 * whose workloads are primarily random-writes. Random-write workloads are also
 * typically random-free, meaning that they are freeing from locations scattered
 * throughout the pool. This means that each TXG we will have to append some
 * FREE records to almost every metaslab. With log space maps, we hold their
 * changes in memory and log them altogether in one pool-wide space map on-disk
 * for persistence. As more blocks are accumulated in the log space maps and
 * more unflushed changes are accounted in memory, we flush a selected group
 * of metaslabs every TXG to relieve memory pressure and potential overheads
 * when loading the pool. Flushing a metaslab to disk relieves memory as we
 * flush any unflushed changes from memory to disk (i.e. the metaslab's space
 * map) and saves import time by making old log space maps obsolete and
 * eventually destroying them. [A log space map is said to be obsolete when all
 * its entries have made it to their corresponding metaslab space maps].
 *
 * == On disk data structures used ==
 *
 * - The pool has a new feature flag and a new entry in the MOS. The feature
 *   is activated when we create the first log space map and remains active
 *   for the lifetime of the pool. The new entry in the MOS Directory [refer
 *   to DMU_POOL_LOG_SPACEMAP_ZAP] is populated with a ZAP whose key-value
 *   pairs are of the form <key: txg, value: log space map object for that txg>.
 *   This entry is our on-disk reference of the log space maps that exist in
 *   the pool for each TXG and it is used during import to load all the
 *   metaslab unflushed changes in memory. To see how this structure is first
 *   created and later populated refer to spa_generate_syncing_log_sm(). To see
 *   how it is used during import time refer to spa_ld_log_sm_metadata().
 *
 * - Each vdev has a new entry in its vdev_top_zap (see field
 *   VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS) which holds the msp_unflushed_txg of
 *   each metaslab in this vdev. This field is the on-disk counterpart of the
 *   in-memory field ms_unflushed_txg which tells us from which TXG and onwards
 *   the metaslab hasn't had its changes flushed. During import, we use this
 *   to ignore any entries in the space map log that are for this metaslab but
 *   from a TXG before msp_unflushed_txg. At that point, we also populate its
 *   in-memory counterpart and from there both fields are updated every time
 *   we flush that metaslab.
 *
 * - A space map is created every TXG and, during that TXG, it is used to log
 *   all incoming changes (the log space map). When created, the log space map
 *   is referenced in memory by spa_syncing_log_sm and its object ID is inserted
 *   to the space map ZAP mentioned above. The log space map is closed at the
 *   end of the TXG and will be destroyed when it becomes fully obsolete. We
 *   know when a log space map has become obsolete by looking at the oldest
 *   (and smallest) ms_unflushed_txg in the pool. If that value is bigger than
 *   the log space map's TXG, then no metaslab is missing the changes from that
 *   log and we can therefore destroy it [see spa_cleanup_old_sm_logs()].
 *
 * == Important in-memory structures ==
 *
 * - The per-spa field spa_metaslabs_by_flushed sorts all the metaslabs in
 *   the pool by their ms_unflushed_txg field. It is primarily used for three
 *   reasons. First of all, it is used during flushing where we try to flush
 *   metaslabs in-order from the oldest-flushed to the most recently flushed
 *   every TXG. Secondly, it helps us to look up the ms_unflushed_txg of the
 *   oldest flushed metaslab to distinguish which log space maps have become
 *   obsolete and which ones are still relevant. Finally it tells us which
 *   metaslabs have unflushed changes in a pool where this feature was just
 *   enabled, as we don't immediately add all of the pool's metaslabs but we
 *   add them over time as they go through metaslab_sync(). The reason that
 *   we do that is to ease these pools into the behavior of the flushing
 *   algorithm (described later on).
 *
 * - The per-spa field spa_sm_logs_by_txg can be thought of as the in-memory
 *   counterpart of the space map ZAP mentioned above. It's an AVL tree whose
 *   nodes represent the log space maps in the pool. This in-memory
 *   representation of log space maps in the pool sorts the log space maps by
 *   the TXG that they were created (which is also the TXG of their unflushed
 *   changes). It also contains the following extra information for each of
 *   them:
 *
 *   [1] The number of metaslabs that were last flushed on that TXG. This is
 *       important because if that counter is zero and this is the oldest
 *       log then it means that it is also obsolete.
 *   [2] The number of blocks of that space map. This field is used by the
 *       block heuristic of our flushing algorithm (described later on).
 *       It represents how many blocks of metadata changes ZFS had to write
 *       to disk for that TXG.
 *
 * - The per-spa field spa_log_summary is a list of entries that summarizes
 *   the metaslab and block counts of all the nodes of the spa_sm_logs_by_txg
 *   AVL tree mentioned above. The reason this exists is that our flushing
 *   algorithm (described later) tries to estimate how many metaslabs to flush
 *   in each TXG by iterating over all the log space maps and looking at their
 *   block counts. Summarizing that information means that we don't have to
 *   iterate through each space map, minimizing the runtime overhead of the
 *   flushing algorithm which would be induced in syncing context. In terms of
 *   implementation the log summary is used as a queue:
 *   * we modify or pop entries from its head when we flush metaslabs
 *   * we modify or append entries to its tail when we sync changes.
 *
 * - Each metaslab has two new range trees that hold its unflushed changes,
 *   ms_unflushed_allocs and ms_unflushed_frees. These are always disjoint.
 *
 * == Flushing algorithm ==
 *
 * The decision of how many metaslabs to flush on a given TXG is guided by
 * two heuristics:
 *
 * [1] The memory heuristic -
 * We keep track of the memory used by the unflushed trees from all the
 * metaslabs [see sus_memused of spa_unflushed_stats] and we ensure that it
 * stays below a certain threshold which is determined by an arbitrary hard
 * limit and an arbitrary percentage of the system's memory [see
 * spa_log_exceeds_memlimit()]. When we see that the memory usage of the
 * unflushed changes is passing that threshold, we flush metaslabs, which
 * empties their unflushed range trees, reducing the memory used.
 *
 * [2] The block heuristic -
 * We try to keep the total number of blocks in the log space maps in check
 * so the log doesn't grow indefinitely and we don't induce a lot of overhead
 * when loading the pool. At the same time we don't want to flush a lot of
 * metaslabs too often as this would defeat the purpose of the log space map.
 * As a result we set a limit on the number of blocks that we think is
 * acceptable for the log space maps to have and try not to cross it
 * [see sus_blocklimit from spa_unflushed_stats].
 *
 * In order to stay below the block limit every TXG we have to estimate how
 * many metaslabs we need to flush based on the current rate of incoming blocks
 * and our history of log space map blocks. The main idea here is to answer
 * the question of how many metaslabs we need to flush in order to get rid of
 * at least an X amount of log space map blocks. We can answer this question
 * by iterating backwards from the oldest log space map to the newest one
 * and looking at their metaslab and block counts. At this point the log
 * summary mentioned above comes in handy as it reduces the amount of things
 * that we have to iterate over (even though it may reduce the preciseness of
 * our estimates due to its aggregation of data). So with that in mind, we
 * project the incoming rate of the current TXG into the future and attempt to
 * approximate how many metaslabs we would need to flush from now in order to
 * avoid exceeding our block limit at different points in the future (granted
 * that we would keep flushing the same number of metaslabs for every TXG).
 * Then we take the maximum number from all these estimates to be on the safe
 * side. For the exact implementation details of the algorithm refer to
 * spa_estimate_metaslabs_to_flush().
 */

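/*
 * Illustrative example of the block heuristic (the numbers below are
 * hypothetical, not the defaults): say the block limit is 1000 blocks, the
 * log currently holds 900 blocks, the estimated incoming rate is 100 blocks
 * per TXG, and the oldest summary entry accounts for 30 metaslabs and 300
 * blocks. Two TXGs from now the limit would be exceeded, and flushing
 * everything accounted by that first entry would reclaim its 300 blocks, so
 * the estimate derived from it is DIV_ROUND_UP(30 flushes, 2 TXGs) = 15
 * metaslab flushes per TXG. The walk then continues over the remaining
 * summary entries and the final answer is the maximum of the per-entry
 * estimates.
 */
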
/*
 * This is used as the block size for the space maps used for the
 * log space map feature. These space maps benefit from a bigger
 * block size as we expect to be writing a lot of data to them at
 * once.
 */
unsigned long zfs_log_sm_blksz = 1ULL << 17;

/*
 * Percentage of the overall system's memory that ZFS allows to be
 * used for unflushed changes (e.g. the sum of the sizes of all the
 * nodes in the unflushed trees).
 *
 * Note that this value is calculated over 1000000 for finer granularity
 * (thus the _ppm suffix; reads as "parts per million"). As an example,
 * the default of 1000 allows 0.1% of memory to be used.
 */
unsigned long zfs_unflushed_max_mem_ppm = 1000;

/*
 * Specific hard-limit in memory that ZFS allows to be used for
 * unflushed changes.
 */
unsigned long zfs_unflushed_max_mem_amt = 1ULL << 30;

/*
 * The following tunable determines the number of blocks that can be used for
 * the log space maps. It is expressed as a percentage of the total number of
 * metaslabs in the pool (i.e. the default of 400 means that the number of log
 * blocks is capped at 4 times the number of metaslabs).
 *
 * This value exists to tune our flushing algorithm, with higher values
 * flushing metaslabs less often (doing less I/Os) per TXG versus lower values
 * flushing metaslabs more aggressively with the upside of saving overheads
 * when loading the pool. Another factor in this tradeoff is that flushing
 * less often can potentially lead to better utilization of the metaslab space
 * map's block size as we accumulate more changes per flush.
 *
 * This tunable indirectly controls the flush rate (metaslabs flushed per
 * txg), which is why expressing it as a percentage of the number of metaslabs
 * in the pool makes sense here.
 *
 * As a rule of thumb we default this tunable to 400% based on the following:
 *
 * 1] Assuming a constant flush rate and a constant incoming rate of log blocks
 *    it is reasonable to expect that the amount of obsolete entries changes
 *    linearly from txg to txg (e.g. the oldest log should have the most
 *    obsolete entries, and the most recent one the least). With this we could
 *    say that, at any given time, about half of the entries in the whole space
 *    map log are obsolete. Thus for every two entries for a metaslab in the
 *    log space map, only one of them is valid and actually makes it to the
 *    metaslab's space map.
 *    [factor of 2]
 * 2] Each entry in the log space map is guaranteed to be two words while
 *    entries in metaslab space maps are generally single-word.
 *    [an extra factor of 2 - 400% overall]
 * 3] Even if [1] and [2] are slightly less than 2 each, we haven't taken into
 *    account any consolidation of segments from the log space map to the
 *    unflushed range trees nor their history (e.g. a segment being allocated,
 *    then freed, then allocated again means 3 log space map entries but 0
 *    metaslab space map entries). Depending on the workload, we've seen ~1.8
 *    non-obsolete log space map entries per metaslab entry, for a total of
 *    ~600%. Since most of these estimates though are workload dependent, we
 *    default on 400% to be conservative.
 *
 * Thus we could say that even in the worst case of [1] and [2], the factor
 * should end up being 4.
 *
 * That said, regardless of the number of metaslabs in the pool we need to
 * provide upper and lower bounds for the log block limit.
 * [see zfs_unflushed_log_block_{min,max}]
 */
unsigned long zfs_unflushed_log_block_pct = 400;

/*
 * If the number of metaslabs is small and our incoming rate is high, we could
 * get into a situation where we are flushing all our metaslabs every TXG. Thus
 * we always allow at least this many log blocks.
 */
unsigned long zfs_unflushed_log_block_min = 1000;

/*
 * If the log becomes too big, the import time of the pool can take a hit in
 * terms of performance. Thus we have a hard limit in the size of the log in
 * terms of blocks.
 */
unsigned long zfs_unflushed_log_block_max = (1ULL << 18);

/*
 * Max # of rows allowed for the log_summary. The tradeoff here is accuracy and
 * stability of the flushing algorithm (longer summary) vs its runtime overhead
 * (smaller summary is faster to traverse).
 */
unsigned long zfs_max_logsm_summary_length = 10;

/*
 * Tunable that sets the lower bound on the metaslabs to flush every TXG.
 *
 * Setting this to 0 has no effect since if the pool is idle we won't even be
 * creating log space maps and therefore we won't be flushing. On the other
 * hand if the pool has any incoming workload our block heuristic will start
 * flushing metaslabs anyway.
 *
 * The point of this tunable is to be used in extreme cases where we really
 * want to flush more metaslabs than our adaptable heuristic plans to flush.
 */
unsigned long zfs_min_metaslabs_to_flush = 1;

/*
 * Tunable that specifies how far in the past we want to look when trying to
 * estimate the incoming log blocks for the current TXG.
 *
 * Setting this too high may not only increase runtime but also minimize the
 * effect of the incoming rates from the most recent TXGs as we take the
 * average over all the blocks that we walk
 * [see spa_estimate_incoming_log_blocks].
 */
unsigned long zfs_max_log_walking = 5;

/*
 * This tunable exists solely for testing purposes. It ensures that the log
 * spacemaps are not flushed and destroyed during export in order for the
 * relevant log spacemap import code paths to be tested (effectively simulating
 * a crash).
 */
int zfs_keep_log_spacemaps_at_export = 0;

static uint64_t
spa_estimate_incoming_log_blocks(spa_t *spa)
{
    ASSERT3U(spa_sync_pass(spa), ==, 1);
    uint64_t steps = 0, sum = 0;
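    /*
     * Walk backwards from the most recent log space map, averaging the
     * block counts of up to zfs_max_log_walking logs (the log created in
     * the currently syncing TXG is skipped below as it is still being
     * appended to).
     */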
    for (spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg);
        sls != NULL && steps < zfs_max_log_walking;
        sls = AVL_PREV(&spa->spa_sm_logs_by_txg, sls)) {
        if (sls->sls_txg == spa_syncing_txg(spa)) {
            /*
             * Skip the log created in this TXG as this would
             * make our estimations inaccurate.
             */
            continue;
        }
        sum += sls->sls_nblocks;
        steps++;
    }
    return ((steps > 0) ? DIV_ROUND_UP(sum, steps) : 0);
}

uint64_t
spa_log_sm_blocklimit(spa_t *spa)
{
    return (spa->spa_unflushed_stats.sus_blocklimit);
}

void
spa_log_sm_set_blocklimit(spa_t *spa)
{
    if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
        ASSERT0(spa_log_sm_blocklimit(spa));
        return;
    }

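    /*
     * Illustrative example (the numbers are hypothetical): a pool with 500
     * metaslabs and the default zfs_unflushed_log_block_pct of 400 gets a
     * calculated limit of 500 * 400 / 100 = 2000 log blocks, which is then
     * clamped to the [zfs_unflushed_log_block_min,
     * zfs_unflushed_log_block_max] range.
     */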
    uint64_t calculated_limit =
        (spa_total_metaslabs(spa) * zfs_unflushed_log_block_pct) / 100;
    spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(calculated_limit,
        zfs_unflushed_log_block_min), zfs_unflushed_log_block_max);
}

uint64_t
spa_log_sm_nblocks(spa_t *spa)
{
    return (spa->spa_unflushed_stats.sus_nblocks);
}

/*
 * Ensure that the in-memory log space map structures and the summary
 * have the same block and metaslab counts.
 */
static void
spa_log_summary_verify_counts(spa_t *spa)
{
    ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));

    if ((zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) == 0)
        return;

    uint64_t ms_in_avl = avl_numnodes(&spa->spa_metaslabs_by_flushed);

    uint64_t ms_in_summary = 0, blk_in_summary = 0;
    for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
        e; e = list_next(&spa->spa_log_summary, e)) {
        ms_in_summary += e->lse_mscount;
        blk_in_summary += e->lse_blkcount;
    }

    uint64_t ms_in_logs = 0, blk_in_logs = 0;
    for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
        sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
        ms_in_logs += sls->sls_mscount;
        blk_in_logs += sls->sls_nblocks;
    }

    VERIFY3U(ms_in_logs, ==, ms_in_summary);
    VERIFY3U(ms_in_logs, ==, ms_in_avl);
    VERIFY3U(blk_in_logs, ==, blk_in_summary);
    VERIFY3U(blk_in_logs, ==, spa_log_sm_nblocks(spa));
}

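/*
 * A summary entry is considered full once it accounts for at least
 * blocklimit / zfs_max_logsm_summary_length blocks, which keeps the summary
 * bounded to roughly zfs_max_logsm_summary_length rows.
 */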
static boolean_t
summary_entry_is_full(spa_t *spa, log_summary_entry_t *e)
{
    uint64_t blocks_per_row = MAX(1,
        DIV_ROUND_UP(spa_log_sm_blocklimit(spa),
        zfs_max_logsm_summary_length));
    return (blocks_per_row <= e->lse_blkcount);
}

/*
 * Update the log summary information to reflect the fact that a metaslab
 * was flushed or destroyed (e.g. due to device removal or pool export/destroy).
 *
 * We typically flush the oldest flushed metaslab so the first (and oldest)
 * entry of the summary is updated. However if that metaslab is getting loaded
 * we may flush the second oldest one which may be part of an entry later in
 * the summary. Moreover, if we call into this function from metaslab_fini()
 * the metaslabs probably won't be ordered by ms_unflushed_txg. Thus we ask
 * for a txg as an argument so we can locate the appropriate summary entry for
 * the metaslab.
 */
void
spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg)
{
    /*
     * We don't track summary data for read-only pools and this function
     * can be called from metaslab_fini(). In that case return immediately.
     */
    if (!spa_writeable(spa))
        return;

    log_summary_entry_t *target = NULL;
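    /*
     * Find the summary entry that accounts for this txg: the last entry
     * whose lse_start does not exceed the given txg.
     */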
    for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
        e != NULL; e = list_next(&spa->spa_log_summary, e)) {
        if (e->lse_start > txg)
            break;
        target = e;
    }

    if (target == NULL || target->lse_mscount == 0) {
        /*
         * We didn't find a summary entry for this metaslab. We must be
         * at the teardown of a spa_load() attempt that got an error
         * while reading the log space maps.
         */
        VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR);
        return;
    }

    target->lse_mscount--;
}

/*
 * Update the log summary information to reflect the fact that we destroyed
 * old log space maps. Since we can only destroy the oldest log space maps,
 * we decrement the block count of the oldest summary entry and potentially
 * destroy it when that count hits 0.
 *
 * This function is called after a metaslab is flushed and typically that
 * metaslab is the oldest flushed, which means that this function will
 * typically decrement the block count of the first entry of the summary and
 * potentially free it if the block count gets to zero (its metaslab count
 * should be zero too at that point).
 *
 * There are certain scenarios though that don't work exactly like that so we
 * need to account for them:
 *
 * Scenario [1]: It is possible that after we flushed the oldest flushed
 * metaslab and we destroyed the oldest log space map, more recent logs had 0
 * metaslabs pointing to them so we got rid of them too. This can happen due
 * to metaslabs being destroyed through device removal, or because the oldest
 * flushed metaslab was loading but we kept flushing more recently flushed
 * metaslabs due to the memory pressure of unflushed changes. Because of that,
 * we always iterate from the beginning of the summary and if blocks_gone is
 * bigger than the block_count of the current entry we free that entry (we
 * expect its metaslab count to be zero), we decrement blocks_gone, and we
 * move on to the next entry, repeating this procedure until blocks_gone gets
 * decremented to 0. Doing this also works for the typical case mentioned
 * above.
 *
 * Scenario [2]: The oldest flushed metaslab isn't necessarily accounted by
 * the first (and oldest) entry in the summary. If the first few entries of
 * the summary were only accounting metaslabs from a device that was just
 * removed, then the current oldest flushed metaslab could be accounted by an
 * entry somewhere in the middle of the summary. Moreover flushing that
 * metaslab will destroy all the log space maps older than its ms_unflushed_txg
 * because they became obsolete after the removal. Thus, iterating as we did
 * for scenario [1] works out for this case too.
 *
 * Scenario [3]: At times we decide to flush all the metaslabs in the pool
 * in one TXG (either because we are exporting the pool or because our flushing
 * heuristics decided to do so). When that happens all the log space maps get
 * destroyed except the one created for the current TXG which doesn't have
 * any log blocks yet. As log space maps get destroyed with every metaslab that
 * we flush, entries in the summary are also destroyed. This brings a weird
 * corner-case when we flush the last metaslab and the log space map of the
 * current TXG is in the same summary entry with other log space maps that
 * are older. When that happens we are eventually left with this one last
 * summary entry whose blocks are gone (blocks_gone equals the entry's block
 * count) but its metaslab count is non-zero (because it accounts all the
 * metaslabs in the pool as they all got flushed). Under this scenario we can't
 * free this last summary entry as it's referencing all the metaslabs in the
 * pool and its block count will get incremented at the end of this sync (when
 * we close the syncing log space map). Thus we just decrement its current
 * block count and leave it alone. In the case that the pool gets exported,
 * its metaslab count will be decremented over time as we call metaslab_fini()
 * for all the metaslabs in the pool and the entry will be freed at
 * spa_unload_log_sm_metadata().
 */
static void
spa_log_summary_decrement_blkcount(spa_t *spa, uint64_t blocks_gone)
{
    for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
        e != NULL; e = list_head(&spa->spa_log_summary)) {
        if (e->lse_blkcount > blocks_gone) {
            /*
             * Assert that we stopped at an entry that is not
             * obsolete.
             */
            ASSERT(e->lse_mscount != 0);

            e->lse_blkcount -= blocks_gone;
            blocks_gone = 0;
            break;
        } else if (e->lse_mscount == 0) {
            /* remove obsolete entry */
            blocks_gone -= e->lse_blkcount;
            list_remove(&spa->spa_log_summary, e);
            kmem_free(e, sizeof (log_summary_entry_t));
        } else {
            /* Verify that this is scenario [3] mentioned above. */
            VERIFY3U(blocks_gone, ==, e->lse_blkcount);

            /*
             * Assert that this is scenario [3] further by ensuring
             * that this is the only entry in the summary.
             */
            VERIFY3P(e, ==, list_tail(&spa->spa_log_summary));
            ASSERT3P(e, ==, list_head(&spa->spa_log_summary));
            blocks_gone = e->lse_blkcount = 0;
            break;
        }
    }

    /*
     * Ensure that there is no way we are trying to remove more blocks
     * than the # of blocks in the summary.
     */
    ASSERT0(blocks_gone);
}

void
spa_log_sm_decrement_mscount(spa_t *spa, uint64_t txg)
{
    spa_log_sm_t target = { .sls_txg = txg };
    spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg,
        &target, NULL);
    if (sls == NULL) {
        /*
         * We must be at the teardown of a spa_load() attempt that
         * got an error while reading the log space maps.
         */
        VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR);
        return;
    }

    ASSERT(sls->sls_mscount > 0);
    sls->sls_mscount--;
}

void
spa_log_sm_increment_current_mscount(spa_t *spa)
{
    spa_log_sm_t *last_sls = avl_last(&spa->spa_sm_logs_by_txg);
    ASSERT3U(last_sls->sls_txg, ==, spa_syncing_txg(spa));
    last_sls->sls_mscount++;
}

static void
summary_add_data(spa_t *spa, uint64_t txg, uint64_t metaslabs_flushed,
    uint64_t nblocks)
{
    log_summary_entry_t *e = list_tail(&spa->spa_log_summary);

== NULL
|| summary_entry_is_full(spa
, e
)) {
568 e
= kmem_zalloc(sizeof (log_summary_entry_t
), KM_SLEEP
);
570 list_insert_tail(&spa
->spa_log_summary
, e
);
573 ASSERT3U(e
->lse_start
, <=, txg
);
574 e
->lse_mscount
+= metaslabs_flushed
;
575 e
->lse_blkcount
+= nblocks
;
void
spa_log_summary_add_incoming_blocks(spa_t *spa, uint64_t nblocks)
{
    summary_add_data(spa, spa_syncing_txg(spa), 0, nblocks);
}

void
spa_log_summary_add_flushed_metaslab(spa_t *spa)
{
    summary_add_data(spa, spa_syncing_txg(spa), 1, 0);
}

/*
 * This function attempts to estimate how many metaslabs we should
 * flush to satisfy our block heuristic for the log spacemap
 * for the upcoming TXGs.
 *
 * Specifically, it first tries to estimate the number of incoming
 * blocks in this TXG. Then by projecting that incoming rate to
 * future TXGs and using the log summary, it figures out how many
 * flushes we would need to do for future TXGs individually to
 * stay below our block limit and returns the maximum number of
 * flushes from those estimates.
 */
static uint64_t
spa_estimate_metaslabs_to_flush(spa_t *spa)
{
    ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
    ASSERT3U(spa_sync_pass(spa), ==, 1);
    ASSERT(spa_log_sm_blocklimit(spa) != 0);

    /*
     * This variable contains the incoming rate that will be projected
     * and used for our flushing estimates in the future.
     */
    uint64_t incoming = spa_estimate_incoming_log_blocks(spa);

    /*
     * At any point in time this variable tells us how many
     * TXGs in the future we are so we can make our estimations.
     */
    uint64_t txgs_in_future = 1;

    /*
     * This variable tells us how much room we have until we hit
     * our limit. When it goes negative, it means that we've exceeded
     * our limit and we need to flush.
     *
     * Note that since we start at the first TXG in the future (i.e.
     * txgs_in_future starts from 1) we already decrement this
     * variable by the incoming rate.
     */
    int64_t available_blocks =
        spa_log_sm_blocklimit(spa) - spa_log_sm_nblocks(spa) - incoming;

    /*
     * This variable tells us the total number of flushes needed to
     * keep the log size within the limit when we reach txgs_in_future.
     */
    uint64_t total_flushes = 0;

    /* Holds the current maximum of our estimates so far. */
    uint64_t max_flushes_pertxg =
        MIN(avl_numnodes(&spa->spa_metaslabs_by_flushed),
        zfs_min_metaslabs_to_flush);

    /*
     * For our estimations we only look as far in the future
     * as the summary allows us.
     */
    for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
        e; e = list_next(&spa->spa_log_summary, e)) {

        /*
         * If there is still room before we exceed our limit
         * then keep skipping TXGs accumulating more blocks
         * based on the incoming rate until we exceed it.
         */
        if (available_blocks >= 0) {
            uint64_t skip_txgs = (available_blocks / incoming) + 1;
            available_blocks -= (skip_txgs * incoming);
            txgs_in_future += skip_txgs;
            ASSERT3S(available_blocks, >=, -incoming);
        }

        /*
         * At this point we're far enough into the future where
         * the limit was just exceeded and we flush metaslabs
         * based on the current entry in the summary, updating
         * our available_blocks.
         */
        ASSERT3S(available_blocks, <, 0);
        available_blocks += e->lse_blkcount;
        total_flushes += e->lse_mscount;

        /*
         * Keep the running maximum of the total_flushes that
         * we've done so far over the number of TXGs in the
         * future that we are. The idea here is to estimate
         * the average number of flushes that we should do
         * every TXG so that when we are that many TXGs in the
         * future we stay under the limit.
         */
        max_flushes_pertxg = MAX(max_flushes_pertxg,
            DIV_ROUND_UP(total_flushes, txgs_in_future));
        ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=,
            max_flushes_pertxg);
    }
    return (max_flushes_pertxg);
}

uint64_t
spa_log_sm_memused(spa_t *spa)
{
    return (spa->spa_unflushed_stats.sus_memused);
}

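/*
 * Returns B_TRUE when the unflushed changes exceed either the absolute cap
 * (zfs_unflushed_max_mem_amt) or the ppm-based share of physical memory
 * (zfs_unflushed_max_mem_ppm); the effective threshold is therefore the
 * smaller of the two.
 */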
boolean_t
spa_log_exceeds_memlimit(spa_t *spa)
{
    if (spa_log_sm_memused(spa) > zfs_unflushed_max_mem_amt)
        return (B_TRUE);

    uint64_t system_mem_allowed = ((physmem * PAGESIZE) *
        zfs_unflushed_max_mem_ppm) / 1000000;
    if (spa_log_sm_memused(spa) > system_mem_allowed)
        return (B_TRUE);

    return (B_FALSE);
}

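/*
 * A nonzero spa_log_flushall_txg indicates that a flush of all metaslabs has
 * been requested before the pool is exported (see the export-related comments
 * in spa_flush_metaslabs() and spa_sync_close_syncing_log_sm()).
 */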
boolean_t
spa_flush_all_logs_requested(spa_t *spa)
{
    return (spa->spa_log_flushall_txg != 0);
}

void
spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx)
{
    uint64_t txg = dmu_tx_get_txg(tx);

    if (spa_sync_pass(spa) != 1)
        return;

    if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
        return;

    /*
     * If we don't have any metaslabs with unflushed changes
     * return immediately.
     */
    if (avl_numnodes(&spa->spa_metaslabs_by_flushed) == 0)
        return;

    /*
     * During SPA export we leave a few empty TXGs to go by [see
     * spa_final_dirty_txg() to understand why]. For this specific
     * case, it is important to not flush any metaslabs as that
     * would dirty this TXG.
     *
     * That said, during one of these dirty TXGs that is less or
     * equal to spa_final_dirty(), spa_unload() will request that
     * we try to flush all the metaslabs for that TXG before
     * exporting the pool, thus we ensure that we didn't get a
     * request of flushing everything before we attempt to return
     * immediately.
     */
    if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
        !dmu_objset_is_dirty(spa_meta_objset(spa), txg) &&
        !spa_flush_all_logs_requested(spa))
        return;

    /*
     * We need to generate a log space map before flushing because this
     * will set up the in-memory data (i.e. node in spa_sm_logs_by_txg)
     * for this TXG's flushed metaslab count (aka sls_mscount which is
     * manipulated in many ways down the metaslab_flush() codepath).
     *
     * That is not to say that we may generate a log space map when we
     * don't need it. If we are flushing metaslabs, that means that we
     * were going to write changes to disk anyway, so even if we were
     * not flushing, a log space map would have been created anyway in
     * metaslab_sync().
     */
    spa_generate_syncing_log_sm(spa, tx);

    /*
     * This variable tells us how many metaslabs we want to flush based
     * on the block-heuristic of our flushing algorithm (see block comment
     * of log space map feature). We also decrement this as we flush
     * metaslabs and attempt to destroy old log space maps.
     */
    uint64_t want_to_flush;
    if (spa_flush_all_logs_requested(spa)) {
        ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
        want_to_flush = avl_numnodes(&spa->spa_metaslabs_by_flushed);
    } else {
        want_to_flush = spa_estimate_metaslabs_to_flush(spa);
    }

    ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=,
        want_to_flush);

    /* Used purely for verification purposes */
    uint64_t visited = 0;

    /*
     * Ideally we would only iterate through spa_metaslabs_by_flushed
     * using only one variable (curr). We can't do that because
     * metaslab_flush() mutates the position of curr in the AVL when
     * it flushes that metaslab by moving it to the end of the tree.
     * Thus we always keep track of the original next node of the
     * current node (curr) in another variable (next).
     */
    metaslab_t *next = NULL;
    for (metaslab_t *curr = avl_first(&spa->spa_metaslabs_by_flushed);
        curr != NULL; curr = next) {
        next = AVL_NEXT(&spa->spa_metaslabs_by_flushed, curr);

        /*
         * If this metaslab has been flushed this txg then we've done
         * a full circle over the metaslabs.
         */
        if (metaslab_unflushed_txg(curr) == txg)
            break;

        /*
         * If we are done flushing for the block heuristic and the
         * unflushed changes don't exceed the memory limit just stop.
         */
        if (want_to_flush == 0 && !spa_log_exceeds_memlimit(spa))
            break;

        mutex_enter(&curr->ms_sync_lock);
        mutex_enter(&curr->ms_lock);
        boolean_t flushed = metaslab_flush(curr, tx);
        mutex_exit(&curr->ms_lock);
        mutex_exit(&curr->ms_sync_lock);

        /*
         * If we failed to flush a metaslab (because it was loading),
         * then we are done with the block heuristic as it's not
         * possible to destroy any log space maps once you've skipped
         * a metaslab. In that case we just set our counter to 0 but
         * we continue looping in case there is still memory pressure
         * due to unflushed changes. Note that flushing a metaslab
         * that is not the oldest flushed in the pool will never
         * destroy any log space maps [see spa_cleanup_old_sm_logs()].
         */
        if (!flushed) {
            want_to_flush = 0;
        } else if (want_to_flush > 0) {
            want_to_flush--;
        }

        visited++;
    }
    ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited);
}

/*
 * Close the log space map for this TXG and update the block counts
 * for the log's in-memory structure and the summary.
 */
void
spa_sync_close_syncing_log_sm(spa_t *spa)
{
    if (spa_syncing_log_sm(spa) == NULL)
        return;

    ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));

    spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg);
    ASSERT3U(sls->sls_txg, ==, spa_syncing_txg(spa));

    sls->sls_nblocks = space_map_nblocks(spa_syncing_log_sm(spa));
    spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;

    /*
     * Note that we can't assert that sls_mscount is not 0,
     * because there is the case where the first metaslab
     * in spa_metaslabs_by_flushed is loading and we were
     * not able to flush any metaslabs in the current TXG.
     */
    ASSERT(sls->sls_nblocks != 0);

    spa_log_summary_add_incoming_blocks(spa, sls->sls_nblocks);
    spa_log_summary_verify_counts(spa);

    space_map_close(spa->spa_syncing_log_sm);
    spa->spa_syncing_log_sm = NULL;

    /*
     * At this point we tried to flush as many metaslabs as we
     * can as the pool is getting exported. Reset the "flush all"
     * so the last few TXGs before closing the pool can be empty
     * (e.g. not dirty).
     */
    if (spa_flush_all_logs_requested(spa)) {
        ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
        spa->spa_log_flushall_txg = 0;
    }
}

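/*
 * Destroy any log space maps that have become obsolete, i.e. those whose TXG
 * is older than the oldest ms_unflushed_txg in the pool, removing them from
 * the spacemap ZAP and from spa_sm_logs_by_txg and releasing their block
 * accounting.
 */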
void
spa_cleanup_old_sm_logs(spa_t *spa, dmu_tx_t *tx)
{
    objset_t *mos = spa_meta_objset(spa);

    uint64_t spacemap_zap;
    int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
        DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
    if (error == ENOENT) {
        ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
        return;
    }
    VERIFY0(error);

    metaslab_t *oldest = avl_first(&spa->spa_metaslabs_by_flushed);
    uint64_t oldest_flushed_txg = metaslab_unflushed_txg(oldest);

    /* Free all log space maps older than the oldest_flushed_txg. */
    for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
        sls && sls->sls_txg < oldest_flushed_txg;
        sls = avl_first(&spa->spa_sm_logs_by_txg)) {
        ASSERT0(sls->sls_mscount);
        avl_remove(&spa->spa_sm_logs_by_txg, sls);
        space_map_free_obj(mos, sls->sls_sm_obj, tx);
        VERIFY0(zap_remove_int(mos, spacemap_zap, sls->sls_txg, tx));
        spa->spa_unflushed_stats.sus_nblocks -= sls->sls_nblocks;
        kmem_free(sls, sizeof (spa_log_sm_t));
    }
}

static spa_log_sm_t *
spa_log_sm_alloc(uint64_t sm_obj, uint64_t txg)
{
    spa_log_sm_t *sls = kmem_zalloc(sizeof (*sls), KM_SLEEP);
    sls->sls_sm_obj = sm_obj;
    sls->sls_txg = txg;
    return (sls);
}

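/*
 * Create (if it doesn't exist yet) and open the log space map for the
 * currently syncing TXG, registering it both in the on-disk spacemap ZAP
 * and in the in-memory spa_sm_logs_by_txg tree.
 */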
void
spa_generate_syncing_log_sm(spa_t *spa, dmu_tx_t *tx)
{
    uint64_t txg = dmu_tx_get_txg(tx);
    objset_t *mos = spa_meta_objset(spa);

    if (spa_syncing_log_sm(spa) != NULL)
        return;

    if (!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP))
        return;

    uint64_t spacemap_zap;
    int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
        DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
    if (error == ENOENT) {
        ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));

        error = 0;
        spacemap_zap = zap_create(mos,
            DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
        VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
            DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1,
            &spacemap_zap, tx));
        spa_feature_incr(spa, SPA_FEATURE_LOG_SPACEMAP, tx);
    }
    VERIFY0(error);

    uint64_t sm_obj;
    ASSERT3U(zap_lookup_int_key(mos, spacemap_zap, txg, &sm_obj),
        ==, ENOENT);
    sm_obj = space_map_alloc(mos, zfs_log_sm_blksz, tx);
    VERIFY0(zap_add_int_key(mos, spacemap_zap, txg, sm_obj, tx));
    avl_add(&spa->spa_sm_logs_by_txg, spa_log_sm_alloc(sm_obj, txg));

    /*
     * We pass UINT64_MAX as the space map's representation size
     * and SPA_MINBLOCKSHIFT as the shift, to make the space map
     * accept any sort of segments since there's no real advantage
     * to being more restrictive (given that we're already going
     * to be using 2-word entries).
     */
    VERIFY0(space_map_open(&spa->spa_syncing_log_sm, mos, sm_obj,
        0, UINT64_MAX, SPA_MINBLOCKSHIFT));

    /*
     * If the log space map feature was just enabled, the blocklimit
     * has not yet been set.
     */
    if (spa_log_sm_blocklimit(spa) == 0)
        spa_log_sm_set_blocklimit(spa);
}

/*
 * Find all the log space maps stored in the space map ZAP and sort
 * them by their TXG in spa_sm_logs_by_txg.
 */
static int
spa_ld_log_sm_metadata(spa_t *spa)
{
    int error;
    uint64_t spacemap_zap;

    ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));

    error = zap_lookup(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
        DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
    if (error == ENOENT) {
        /* the space map ZAP doesn't exist yet */
        return (0);
    } else if (error != 0) {
        spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at "
            "zap_lookup(DMU_POOL_DIRECTORY_OBJECT) [error %d]",
            error);
        return (error);
    }

    zap_cursor_t zc;
    zap_attribute_t za;
    for (zap_cursor_init(&zc, spa_meta_objset(spa), spacemap_zap);
        (error = zap_cursor_retrieve(&zc, &za)) == 0;
        zap_cursor_advance(&zc)) {
        uint64_t log_txg = zfs_strtonum(za.za_name, NULL);
        spa_log_sm_t *sls =
            spa_log_sm_alloc(za.za_first_integer, log_txg);
        avl_add(&spa->spa_sm_logs_by_txg, sls);
    }
    zap_cursor_fini(&zc);
    if (error != ENOENT) {
        spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at "
            "zap_cursor_retrieve(spacemap_zap) [error %d]",
            error);
        return (error);
    }

    for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed);
        m; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) {
        spa_log_sm_t target = { .sls_txg = metaslab_unflushed_txg(m) };
        spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg,
            &target, NULL);

        /*
         * At this point if sls is zero it means that a bug occurred
         * in ZFS the last time the pool was open or earlier in the
         * import code path. In general, we would have placed a
         * VERIFY() here or in this case just let the kernel panic
         * with NULL pointer dereference when incrementing sls_mscount,
         * but since this is the import code path we can be a bit more
         * lenient. Thus, for DEBUG bits we always cause a panic, while
         * in production we log the error and just fail the import.
         */
        ASSERT(sls != NULL);
        if (sls == NULL) {
            spa_load_failed(spa, "spa_ld_log_sm_metadata(): bug "
                "encountered: could not find log spacemap for "
                "TXG %llu [error %d]",
                (u_longlong_t)metaslab_unflushed_txg(m), ENOENT);
            return (ENOENT);
        }
        sls->sls_mscount++;
    }

    return (0);
}

typedef struct spa_ld_log_sm_arg {
    spa_t *slls_spa;
    uint64_t slls_txg;
} spa_ld_log_sm_arg_t;

static int
spa_ld_log_sm_cb(space_map_entry_t *sme, void *arg)
{
    uint64_t offset = sme->sme_offset;
    uint64_t size = sme->sme_run;
    uint32_t vdev_id = sme->sme_vdev;

    spa_ld_log_sm_arg_t *slls = arg;
    spa_t *spa = slls->slls_spa;

    vdev_t *vd = vdev_lookup_top(spa, vdev_id);

    /*
     * If the vdev has been removed (i.e. it is indirect or a hole)
     * skip this entry. The contents of this vdev have already moved
     * elsewhere.
     */
    if (!vdev_is_concrete(vd))
        return (0);

    metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
    ASSERT(!ms->ms_loaded);

    /*
     * If we have already flushed entries for this TXG to this
     * metaslab's space map, then ignore it. Note that we flush
     * before processing any allocations/frees for that TXG, so
     * the metaslab's space map only has entries from *before*
     * the unflushed TXG.
     */
    if (slls->slls_txg < metaslab_unflushed_txg(ms))
        return (0);

    switch (sme->sme_type) {
    case SM_ALLOC:
        range_tree_remove_xor_add_segment(offset, offset + size,
            ms->ms_unflushed_frees, ms->ms_unflushed_allocs);
        break;
    case SM_FREE:
        range_tree_remove_xor_add_segment(offset, offset + size,
            ms->ms_unflushed_allocs, ms->ms_unflushed_frees);
        break;
    default:
        panic("invalid maptype_t");
    }

    return (0);
}

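/*
 * Replay every log space map into the unflushed range trees of the metaslabs
 * that its entries refer to, then update each metaslab's accounting and the
 * pool-wide unflushed statistics.
 */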
static int
spa_ld_log_sm_data(spa_t *spa)
{
    int error = 0;

    /*
     * If we are not going to do any writes there is no need
     * to read the log space maps.
     */
    if (!spa_writeable(spa))
        return (0);

    ASSERT0(spa->spa_unflushed_stats.sus_nblocks);
    ASSERT0(spa->spa_unflushed_stats.sus_memused);

    hrtime_t read_logs_starttime = gethrtime();
    /* this is a no-op when we don't have space map logs */
    for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
        sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
        space_map_t *sm = NULL;
        error = space_map_open(&sm, spa_meta_objset(spa),
            sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT);
        if (error != 0) {
            spa_load_failed(spa, "spa_ld_log_sm_data(): failed at "
                "space_map_open(obj=%llu) [error %d]",
                (u_longlong_t)sls->sls_sm_obj, error);
            goto out;
        }

        struct spa_ld_log_sm_arg vla = {
            .slls_spa = spa,
            .slls_txg = sls->sls_txg
        };
        error = space_map_iterate(sm, space_map_length(sm),
            spa_ld_log_sm_cb, &vla);
        if (error != 0) {
            space_map_close(sm);
            spa_load_failed(spa, "spa_ld_log_sm_data(): failed "
                "at space_map_iterate(obj=%llu) [error %d]",
                (u_longlong_t)sls->sls_sm_obj, error);
            goto out;
        }

        ASSERT0(sls->sls_nblocks);
        sls->sls_nblocks = space_map_nblocks(sm);
        spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
        summary_add_data(spa, sls->sls_txg,
            sls->sls_mscount, sls->sls_nblocks);

        space_map_close(sm);
    }
    hrtime_t read_logs_endtime = gethrtime();
    spa_load_note(spa,
        "read %llu log space maps (%llu total blocks - blksz = %llu bytes) "
        "in %lld ms", (u_longlong_t)avl_numnodes(&spa->spa_sm_logs_by_txg),
        (u_longlong_t)spa_log_sm_nblocks(spa),
        (u_longlong_t)zfs_log_sm_blksz,
        (longlong_t)((read_logs_endtime - read_logs_starttime) / 1000000));

out:
    /*
     * Now that the metaslabs contain their unflushed changes:
     * [1] recalculate their actual allocated space
     * [2] recalculate their weights
     * [3] sum up the memory usage of their unflushed range trees
     * [4] optionally load them, if debug_load is set
     *
     * Note that even in the case where we get here because of an
     * error (e.g. error != 0), we still want to update the fields
     * below in order to have a proper teardown in spa_unload().
     */
    for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed);
        m != NULL; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) {
        mutex_enter(&m->ms_lock);
        m->ms_allocated_space = space_map_allocated(m->ms_sm) +
            range_tree_space(m->ms_unflushed_allocs) -
            range_tree_space(m->ms_unflushed_frees);

        vdev_t *vd = m->ms_group->mg_vd;
        metaslab_space_update(vd, m->ms_group->mg_class,
            range_tree_space(m->ms_unflushed_allocs), 0, 0);
        metaslab_space_update(vd, m->ms_group->mg_class,
            -range_tree_space(m->ms_unflushed_frees), 0, 0);

        ASSERT0(m->ms_weight & METASLAB_ACTIVE_MASK);
        metaslab_recalculate_weight_and_sort(m);

        spa->spa_unflushed_stats.sus_memused +=
            metaslab_unflushed_changes_memused(m);

        if (metaslab_debug_load && m->ms_sm != NULL) {
            VERIFY0(metaslab_load(m));
            metaslab_set_selected_txg(m, 0);
        }
        mutex_exit(&m->ms_lock);
    }

    return (error);
}

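/*
 * Read the on-disk VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS array of this
 * top-level vdev (if it exists) and populate ms_unflushed_txg for each of
 * its metaslabs, adding the ones with unflushed changes to
 * spa_metaslabs_by_flushed.
 */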
static int
spa_ld_unflushed_txgs(vdev_t *vd)
{
    spa_t *spa = vd->vdev_spa;
    objset_t *mos = spa_meta_objset(spa);

    if (vd->vdev_top_zap == 0)
        return (0);

    uint64_t object = 0;
    int error = zap_lookup(mos, vd->vdev_top_zap,
        VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
        sizeof (uint64_t), 1, &object);
    if (error == ENOENT)
        return (0);
    else if (error != 0) {
        spa_load_failed(spa, "spa_ld_unflushed_txgs(): failed at "
            "zap_lookup(vdev_top_zap=%llu) [error %d]",
            (u_longlong_t)vd->vdev_top_zap, error);
        return (error);
    }

    for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
        metaslab_t *ms = vd->vdev_ms[m];

        metaslab_unflushed_phys_t entry;
        uint64_t entry_size = sizeof (entry);
        uint64_t entry_offset = ms->ms_id * entry_size;

        error = dmu_read(mos, object,
            entry_offset, entry_size, &entry, 0);
        if (error != 0) {
            spa_load_failed(spa, "spa_ld_unflushed_txgs(): "
                "failed at dmu_read(obj=%llu) [error %d]",
                (u_longlong_t)object, error);
            return (error);
        }

        ms->ms_unflushed_txg = entry.msp_unflushed_txg;
        if (ms->ms_unflushed_txg != 0) {
            mutex_enter(&spa->spa_flushed_ms_lock);
            avl_add(&spa->spa_metaslabs_by_flushed, ms);
            mutex_exit(&spa->spa_flushed_ms_lock);
        }
    }

    return (0);
}

/*
 * Read all the log space map entries into their respective
 * metaslab unflushed trees and keep them sorted by TXG in the
 * SPA's metadata. In addition, set up all the metadata for the
 * memory and the block heuristics.
 */
int
spa_ld_log_spacemaps(spa_t *spa)
{
    int error;

    spa_log_sm_set_blocklimit(spa);

    for (uint64_t c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
        vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
        error = spa_ld_unflushed_txgs(vd);
        if (error != 0)
            return (error);
    }

    error = spa_ld_log_sm_metadata(spa);
    if (error != 0)
        return (error);

    /*
     * Note: we don't actually expect anything to change at this point
     * but we grab the config lock so we don't fail any assertions
     * when using vdev_lookup_top().
     */
    spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
    error = spa_ld_log_sm_data(spa);
    spa_config_exit(spa, SCL_CONFIG, FTAG);

    return (error);
}

ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_amt, ULONG, ZMOD_RW,
	"Specific hard-limit in memory that ZFS allows to be used for "
	"unflushed changes");

ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_ppm, ULONG, ZMOD_RW,
	"Percentage of the overall system memory that ZFS allows to be "
	"used for unflushed changes (value is calculated over 1000000 for "
	"finer granularity)");

ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_max, ULONG, ZMOD_RW,
	"Hard limit (upper-bound) in the size of the space map log "
	"in terms of blocks.");

ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_min, ULONG, ZMOD_RW,
	"Lower-bound limit for the maximum amount of blocks allowed in "
	"log spacemap (see zfs_unflushed_log_block_max)");

ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_pct, ULONG, ZMOD_RW,
	"Tunable used to determine the number of blocks that can be used for "
	"the spacemap log, expressed as a percentage of the total number of "
	"metaslabs in the pool (e.g. 400 means the number of log blocks is "
	"capped at 4 times the number of metaslabs)");

ZFS_MODULE_PARAM(zfs, zfs_, max_log_walking, ULONG, ZMOD_RW,
	"The number of past TXGs that the flushing algorithm of the log "
	"spacemap feature uses to estimate incoming log blocks");

ZFS_MODULE_PARAM(zfs, zfs_, max_logsm_summary_length, ULONG, ZMOD_RW,
	"Maximum number of rows allowed in the summary of the spacemap log");

ZFS_MODULE_PARAM(zfs, zfs_, min_metaslabs_to_flush, ULONG, ZMOD_RW,
	"Minimum number of metaslabs to flush per dirty TXG");

ZFS_MODULE_PARAM(zfs, zfs_, keep_log_spacemaps_at_export, INT, ZMOD_RW,
	"Prevent the log spacemaps from being flushed and destroyed "
	"during pool export/destroy");