module/zfs/spa_misc.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or https://opensource.org/licenses/CDDL-1.0.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
  24  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  25  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  26  * Copyright 2013 Saso Kiselkov. All rights reserved.
  27  * Copyright (c) 2017 Datto Inc.
  28  * Copyright (c) 2017, Intel Corporation.
  29  * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
  30  * Copyright (c) 2023, Klara Inc.
  31  */
  32
  33 #include <sys/zfs_context.h>
  34 #include <sys/zfs_chksum.h>
  35 #include <sys/spa_impl.h>
  36 #include <sys/zio.h>
  37 #include <sys/zio_checksum.h>
  38 #include <sys/zio_compress.h>
  39 #include <sys/dmu.h>
  40 #include <sys/dmu_tx.h>
  41 #include <sys/zap.h>
  42 #include <sys/zil.h>
  43 #include <sys/vdev_impl.h>
  44 #include <sys/vdev_initialize.h>
  45 #include <sys/vdev_trim.h>
  46 #include <sys/vdev_file.h>
  47 #include <sys/vdev_raidz.h>
  48 #include <sys/metaslab.h>
  49 #include <sys/uberblock_impl.h>
  50 #include <sys/txg.h>
  51 #include <sys/avl.h>
  52 #include <sys/unique.h>
  53 #include <sys/dsl_pool.h>
  54 #include <sys/dsl_dir.h>
  55 #include <sys/dsl_prop.h>
  56 #include <sys/fm/util.h>
  57 #include <sys/dsl_scan.h>
  58 #include <sys/fs/zfs.h>
  59 #include <sys/metaslab_impl.h>
  60 #include <sys/arc.h>
  61 #include <sys/brt.h>
  62 #include <sys/ddt.h>
  63 #include <sys/kstat.h>
  64 #include "zfs_prop.h"
  65 #include <sys/btree.h>
  66 #include <sys/zfeature.h>
  67 #include <sys/qat.h>
  68 #include <sys/zstd/zstd.h>
  69
  70 /*
  71  * SPA locking
  72  *
  73  * There are three basic locks for managing spa_t structures:
  74  *
  75  * spa_namespace_lock (global mutex)
  76  *
  77  *      This lock must be acquired to do any of the following:
  78  *
  79  *              - Lookup a spa_t by name
  80  *              - Add or remove a spa_t from the namespace
  81  *              - Increase spa_refcount from non-zero
  82  *              - Check if spa_refcount is zero
  83  *              - Rename a spa_t
  84  *              - add/remove/attach/detach devices
  85  *              - Held for the duration of create/destroy/import/export
  86  *
  87  *      It does not need to handle recursion.  A create or destroy may
  88  *      reference objects (files or zvols) in other pools, but by
  89  *      definition they must have an existing reference, and will never need
  90  *      to lookup a spa_t by name.
  91  *
  92  * spa_refcount (per-spa zfs_refcount_t protected by mutex)
  93  *
   94  *      This reference count keeps track of any active users of the spa_t.  The
  95  *      spa_t cannot be destroyed or freed while this is non-zero.  Internally,
  96  *      the refcount is never really 'zero' - opening a pool implicitly keeps
  97  *      some references in the DMU.  Internally we check against spa_minref, but
  98  *      present the image of a zero/non-zero value to consumers.
  99  *
 100  * spa_config_lock[] (per-spa array of rwlocks)
 101  *
 102  *      This protects the spa_t from config changes, and must be held in
 103  *      the following circumstances:
 104  *
 105  *              - RW_READER to perform I/O to the spa
 106  *              - RW_WRITER to change the vdev config
 107  *
 108  * The locking order is fairly straightforward:
 109  *
 110  *              spa_namespace_lock      ->      spa_refcount
 111  *
 112  *      The namespace lock must be acquired to increase the refcount from 0
 113  *      or to check if it is zero.
 114  *
 115  *              spa_refcount            ->      spa_config_lock[]
 116  *
 117  *      There must be at least one valid reference on the spa_t to acquire
 118  *      the config lock.
 119  *
 120  *              spa_namespace_lock      ->      spa_config_lock[]
 121  *
 122  *      The namespace lock must always be taken before the config lock.
 123  *
 124  *
 125  * The spa_namespace_lock can be acquired directly and is globally visible.
 126  *
 127  * The namespace is manipulated using the following functions, all of which
 128  * require the spa_namespace_lock to be held.
 129  *
 130  *      spa_lookup()            Lookup a spa_t by name.
 131  *
 132  *      spa_add()               Create a new spa_t in the namespace.
 133  *
 134  *      spa_remove()            Remove a spa_t from the namespace.  This also
 135  *                              frees up any memory associated with the spa_t.
 136  *
 137  *      spa_next()              Returns the next spa_t in the system, or the
 138  *                              first if NULL is passed.
 139  *
 140  *      spa_evict_all()         Shutdown and remove all spa_t structures in
 141  *                              the system.
 142  *
 143  *      spa_guid_exists()       Determine whether a pool/device guid exists.
 144  *
 145  * The spa_refcount is manipulated using the following functions:
 146  *
 147  *      spa_open_ref()          Adds a reference to the given spa_t.  Must be
 148  *                              called with spa_namespace_lock held if the
 149  *                              refcount is currently zero.
 150  *
 151  *      spa_close()             Remove a reference from the spa_t.  This will
 152  *                              not free the spa_t or remove it from the
 153  *                              namespace.  No locking is required.
 154  *
 155  *      spa_refcount_zero()     Returns true if the refcount is currently
 156  *                              zero.  Must be called with spa_namespace_lock
 157  *                              held.
 158  *
 159  * The spa_config_lock[] is an array of rwlocks, ordered as follows:
 160  * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV.
 161  * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}().
 162  *
 163  * To read the configuration, it suffices to hold one of these locks as reader.
 164  * To modify the configuration, you must hold all locks as writer.  To modify
 165  * vdev state without altering the vdev tree's topology (e.g. online/offline),
 166  * you must hold SCL_STATE and SCL_ZIO as writer.
 167  *
 168  * We use these distinct config locks to avoid recursive lock entry.
 169  * For example, spa_sync() (which holds SCL_CONFIG as reader) induces
 170  * block allocations (SCL_ALLOC), which may require reading space maps
 171  * from disk (dmu_read() -> zio_read() -> SCL_ZIO).
 172  *
 173  * The spa config locks cannot be normal rwlocks because we need the
 174  * ability to hand off ownership.  For example, SCL_ZIO is acquired
 175  * by the issuing thread and later released by an interrupt thread.
 176  * They do, however, obey the usual write-wanted semantics to prevent
 177  * writer (i.e. system administrator) starvation.
 178  *
 179  * The lock acquisition rules are as follows:
 180  *
 181  * SCL_CONFIG
 182  *      Protects changes to the vdev tree topology, such as vdev
 183  *      add/remove/attach/detach.  Protects the dirty config list
 184  *      (spa_config_dirty_list) and the set of spares and l2arc devices.
 185  *
 186  * SCL_STATE
 187  *      Protects changes to pool state and vdev state, such as vdev
 188  *      online/offline/fault/degrade/clear.  Protects the dirty state list
 189  *      (spa_state_dirty_list) and global pool state (spa_state).
 190  *
 191  * SCL_ALLOC
 192  *      Protects changes to metaslab groups and classes.
 193  *      Held as reader by metaslab_alloc() and metaslab_claim().
 194  *
 195  * SCL_ZIO
 196  *      Held by bp-level zios (those which have no io_vd upon entry)
 197  *      to prevent changes to the vdev tree.  The bp-level zio implicitly
 198  *      protects all of its vdev child zios, which do not hold SCL_ZIO.
 199  *
 200  * SCL_FREE
 201  *      Protects changes to metaslab groups and classes.
 202  *      Held as reader by metaslab_free().  SCL_FREE is distinct from
 203  *      SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free
 204  *      blocks in zio_done() while another i/o that holds either
 205  *      SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete.
 206  *
 207  * SCL_VDEV
 208  *      Held as reader to prevent changes to the vdev tree during trivial
 209  *      inquiries such as bp_get_dsize().  SCL_VDEV is distinct from the
 210  *      other locks, and lower than all of them, to ensure that it's safe
 211  *      to acquire regardless of caller context.
 212  *
 213  * In addition, the following rules apply:
 214  *
 215  * (a)  spa_props_lock protects pool properties, spa_config and spa_config_list.
 216  *      The lock ordering is SCL_CONFIG > spa_props_lock.
 217  *
 218  * (b)  I/O operations on leaf vdevs.  For any zio operation that takes
 219  *      an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(),
 220  *      or zio_write_phys() -- the caller must ensure that the config cannot
  221  *      change in the interim, and that the vdev cannot be reopened.
 222  *      SCL_STATE as reader suffices for both.
 223  *
 224  * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
 225  *
 226  *      spa_vdev_enter()        Acquire the namespace lock and the config lock
 227  *                              for writing.
 228  *
 229  *      spa_vdev_exit()         Release the config lock, wait for all I/O
 230  *                              to complete, sync the updated configs to the
 231  *                              cache, and release the namespace lock.
 232  *
 233  * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit().
 234  * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual
  235  * locking is always based on spa_namespace_lock and spa_config_lock[].
 236  */
 237
 238 static avl_tree_t spa_namespace_avl;
 239 kmutex_t spa_namespace_lock;
 240 static kcondvar_t spa_namespace_cv;
 241 static const int spa_max_replication_override = SPA_DVAS_PER_BP;
 242
 243 static kmutex_t spa_spare_lock;
 244 static avl_tree_t spa_spare_avl;
 245 static kmutex_t spa_l2cache_lock;
 246 static avl_tree_t spa_l2cache_avl;
 247
 248 spa_mode_t spa_mode_global = SPA_MODE_UNINIT;
 249
 250 #ifdef ZFS_DEBUG
 251 /*
  252  * Everything except dprintf, set_error, and indirect_remap is on
 253  * by default in debug builds.
 254  */
 255 int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SET_ERROR |
 256     ZFS_DEBUG_INDIRECT_REMAP);
 257 #else
 258 int zfs_flags = 0;
 259 #endif
 260
 261 /*
 262  * zfs_recover can be set to nonzero to attempt to recover from
 263  * otherwise-fatal errors, typically caused by on-disk corruption.  When
 264  * set, calls to zfs_panic_recover() will turn into warning messages.
 265  * This should only be used as a last resort, as it typically results
 266  * in leaked space, or worse.
 267  */
 268 int zfs_recover = B_FALSE;
 269
 270 /*
 271  * If destroy encounters an EIO while reading metadata (e.g. indirect
 272  * blocks), space referenced by the missing metadata can not be freed.
 273  * Normally this causes the background destroy to become "stalled", as
 274  * it is unable to make forward progress.  While in this stalled state,
 275  * all remaining space to free from the error-encountering filesystem is
 276  * "temporarily leaked".  Set this flag to cause it to ignore the EIO,
 277  * permanently leak the space from indirect blocks that can not be read,
 278  * and continue to free everything else that it can.
 279  *
 280  * The default, "stalling" behavior is useful if the storage partially
 281  * fails (i.e. some but not all i/os fail), and then later recovers.  In
 282  * this case, we will be able to continue pool operations while it is
 283  * partially failed, and when it recovers, we can continue to free the
 284  * space, with no leaks.  However, note that this case is actually
 285  * fairly rare.
 286  *
 287  * Typically pools either (a) fail completely (but perhaps temporarily,
 288  * e.g. a top-level vdev going offline), or (b) have localized,
 289  * permanent errors (e.g. disk returns the wrong data due to bit flip or
 290  * firmware bug).  In case (a), this setting does not matter because the
 291  * pool will be suspended and the sync thread will not be able to make
 292  * forward progress regardless.  In case (b), because the error is
 293  * permanent, the best we can do is leak the minimum amount of space,
 294  * which is what setting this flag will do.  Therefore, it is reasonable
 295  * for this flag to normally be set, but we chose the more conservative
 296  * approach of not setting it, so that there is no possibility of
 297  * leaking space in the "partial temporary" failure case.
 298  */
 299 int zfs_free_leak_on_eio = B_FALSE;
 300
 301 /*
 302  * Expiration time in milliseconds. This value has two meanings. First it is
 303  * used to determine when the spa_deadman() logic should fire. By default the
 304  * spa_deadman() will fire if spa_sync() has not completed in 600 seconds.
 305  * Secondly, the value determines if an I/O is considered "hung". Any I/O that
 306  * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
 307  * in one of three behaviors controlled by zfs_deadman_failmode.
 308  */
 309 uint64_t zfs_deadman_synctime_ms = 600000UL;  /* 10 min. */
 310
 311 /*
 312  * This value controls the maximum amount of time zio_wait() will block for an
 313  * outstanding IO.  By default this is 300 seconds at which point the "hung"
 314  * behavior will be applied as described for zfs_deadman_synctime_ms.
 315  */
 316 uint64_t zfs_deadman_ziotime_ms = 300000UL;  /* 5 min. */
 317
 318 /*
 319  * Check time in milliseconds. This defines the frequency at which we check
 320  * for hung I/O.
 321  */
 322 uint64_t zfs_deadman_checktime_ms = 60000UL;  /* 1 min. */
 323
 324 /*
 325  * By default the deadman is enabled.
 326  */
 327 int zfs_deadman_enabled = B_TRUE;
 328
 329 /*
 330  * Controls the behavior of the deadman when it detects a "hung" I/O.
 331  * Valid values are zfs_deadman_failmode=<wait|continue|panic>.
 332  *
 333  * wait     - Wait for the "hung" I/O (default)
 334  * continue - Attempt to recover from a "hung" I/O
 335  * panic    - Panic the system
 336  */
 337 const char *zfs_deadman_failmode = "wait";
 338
 339 /*
 340  * The worst case is single-sector max-parity RAID-Z blocks, in which
 341  * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
 342  * times the size; so just assume that.  Add to this the fact that
 343  * we can have up to 3 DVAs per bp, and one more factor of 2 because
 344  * the block may be dittoed with up to 3 DVAs by ddt_sync().  All together,
 345  * the worst case is:
 346  *     (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24
 347  */
 348 uint_t spa_asize_inflation = 24;
 349
 350 /*
 351  * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in
 352  * the pool to be consumed (bounded by spa_max_slop).  This ensures that we
 353  * don't run the pool completely out of space, due to unaccounted changes (e.g.
 354  * to the MOS).  It also limits the worst-case time to allocate space.  If we
 355  * have less than this amount of free space, most ZPL operations (e.g.  write,
 356  * create) will return ENOSPC.  The ZIL metaslabs (spa_embedded_log_class) are
 357  * also part of this 3.2% of space which can't be consumed by normal writes;
 358  * the slop space "proper" (spa_get_slop_space()) is decreased by the embedded
 359  * log space.
 360  *
 361  * Certain operations (e.g. file removal, most administrative actions) can
 362  * use half the slop space.  They will only return ENOSPC if less than half
 363  * the slop space is free.  Typically, once the pool has less than the slop
 364  * space free, the user will use these operations to free up space in the pool.
 365  * These are the operations that call dsl_pool_adjustedsize() with the netfree
 366  * argument set to TRUE.
 367  *
 368  * Operations that are almost guaranteed to free up space in the absence of
 369  * a pool checkpoint can use up to three quarters of the slop space
  370  * (e.g. zfs destroy).
 371  *
 372  * A very restricted set of operations are always permitted, regardless of
 373  * the amount of free space.  These are the operations that call
 374  * dsl_sync_task(ZFS_SPACE_CHECK_NONE). If these operations result in a net
 375  * increase in the amount of space used, it is possible to run the pool
 376  * completely out of space, causing it to be permanently read-only.
 377  *
 378  * Note that on very small pools, the slop space will be larger than
 379  * 3.2%, in an effort to have it be at least spa_min_slop (128MB),
 380  * but we never allow it to be more than half the pool size.
 381  *
 382  * Further, on very large pools, the slop space will be smaller than
 383  * 3.2%, to avoid reserving much more space than we actually need; bounded
 384  * by spa_max_slop (128GB).
 385  *
 386  * See also the comments in zfs_space_check_t.
 387  */
 388 uint_t spa_slop_shift = 5;
 389 static const uint64_t spa_min_slop = 128ULL * 1024 * 1024;
 390 static const uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024;
 391 static const int spa_allocators = 4;
 392
 393
 394 void
 395 spa_load_failed(spa_t *spa, const char *fmt, ...)
 396 {
 397         va_list adx;
 398         char buf[256];
 399
 400         va_start(adx, fmt);
 401         (void) vsnprintf(buf, sizeof (buf), fmt, adx);
 402         va_end(adx);
 403
 404         zfs_dbgmsg("spa_load(%s, config %s): FAILED: %s", spa->spa_name,
 405             spa->spa_trust_config ? "trusted" : "untrusted", buf);
 406 }
 407
 408 void
 409 spa_load_note(spa_t *spa, const char *fmt, ...)
 410 {
 411         va_list adx;
 412         char buf[256];
 413
 414         va_start(adx, fmt);
 415         (void) vsnprintf(buf, sizeof (buf), fmt, adx);
 416         va_end(adx);
 417
 418         zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name,
 419             spa->spa_trust_config ? "trusted" : "untrusted", buf);
 420 }
 421
 422 /*
 423  * By default dedup and user data indirects land in the special class
 424  */
 425 static int zfs_ddt_data_is_special = B_TRUE;
 426 static int zfs_user_indirect_is_special = B_TRUE;
 427
 428 /*
 429  * The percentage of special class final space reserved for metadata only.
 430  * Once we allocate 100 - zfs_special_class_metadata_reserve_pct we only
 431  * let metadata into the class.
 432  */
 433 static uint_t zfs_special_class_metadata_reserve_pct = 25;
 434
 435 /*
 436  * ==========================================================================
 437  * SPA config locking
 438  * ==========================================================================
 439  */
 440 static void
 441 spa_config_lock_init(spa_t *spa)
 442 {
 443         for (int i = 0; i < SCL_LOCKS; i++) {
 444                 spa_config_lock_t *scl = &spa->spa_config_lock[i];
 445                 mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
 446                 cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
 447                 scl->scl_writer = NULL;
 448                 scl->scl_write_wanted = 0;
 449                 scl->scl_count = 0;
 450         }
 451 }
 452
 453 static void
 454 spa_config_lock_destroy(spa_t *spa)
 455 {
 456         for (int i = 0; i < SCL_LOCKS; i++) {
 457                 spa_config_lock_t *scl = &spa->spa_config_lock[i];
 458                 mutex_destroy(&scl->scl_lock);
 459                 cv_destroy(&scl->scl_cv);
 460                 ASSERT(scl->scl_writer == NULL);
 461                 ASSERT(scl->scl_write_wanted == 0);
 462                 ASSERT(scl->scl_count == 0);
 463         }
 464 }
 465
 466 int
 467 spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw)
 468 {
 469         for (int i = 0; i < SCL_LOCKS; i++) {
 470                 spa_config_lock_t *scl = &spa->spa_config_lock[i];
 471                 if (!(locks & (1 << i)))
 472                         continue;
 473                 mutex_enter(&scl->scl_lock);
 474                 if (rw == RW_READER) {
 475                         if (scl->scl_writer || scl->scl_write_wanted) {
 476                                 mutex_exit(&scl->scl_lock);
 477                                 spa_config_exit(spa, locks & ((1 << i) - 1),
 478                                     tag);
 479                                 return (0);
 480                         }
 481                 } else {
 482                         ASSERT(scl->scl_writer != curthread);
 483                         if (scl->scl_count != 0) {
 484                                 mutex_exit(&scl->scl_lock);
 485                                 spa_config_exit(spa, locks & ((1 << i) - 1),
 486                                     tag);
 487                                 return (0);
 488                         }
 489                         scl->scl_writer = curthread;
 490                 }
 491                 scl->scl_count++;
 492                 mutex_exit(&scl->scl_lock);
 493         }
 494         return (1);
 495 }
 496
 497 static void
 498 spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw,
 499     int mmp_flag)
 500 {
 501         (void) tag;
 502         int wlocks_held = 0;
 503
 504         ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY);
 505
 506         for (int i = 0; i < SCL_LOCKS; i++) {
 507                 spa_config_lock_t *scl = &spa->spa_config_lock[i];
 508                 if (scl->scl_writer == curthread)
 509                         wlocks_held |= (1 << i);
 510                 if (!(locks & (1 << i)))
 511                         continue;
 512                 mutex_enter(&scl->scl_lock);
 513                 if (rw == RW_READER) {
 514                         while (scl->scl_writer ||
 515                             (!mmp_flag && scl->scl_write_wanted)) {
 516                                 cv_wait(&scl->scl_cv, &scl->scl_lock);
 517                         }
 518                 } else {
 519                         ASSERT(scl->scl_writer != curthread);
 520                         while (scl->scl_count != 0) {
 521                                 scl->scl_write_wanted++;
 522                                 cv_wait(&scl->scl_cv, &scl->scl_lock);
 523                                 scl->scl_write_wanted--;
 524                         }
 525                         scl->scl_writer = curthread;
 526                 }
 527                 scl->scl_count++;
 528                 mutex_exit(&scl->scl_lock);
 529         }
 530         ASSERT3U(wlocks_held, <=, locks);
 531 }
 532
 533 void
 534 spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw)
 535 {
 536         spa_config_enter_impl(spa, locks, tag, rw, 0);
 537 }
 538
 539 /*
 540  * The spa_config_enter_mmp() allows the mmp thread to cut in front of
 541  * outstanding write lock requests. This is needed since the mmp updates are
 542  * time sensitive and failure to service them promptly will result in a
 543  * suspended pool. This pool suspension has been seen in practice when there is
 544  * a single disk in a pool that is responding slowly and presumably about to
 545  * fail.
 546  */
 547
 548 void
 549 spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, krw_t rw)
 550 {
 551         spa_config_enter_impl(spa, locks, tag, rw, 1);
 552 }
 553
 554 void
 555 spa_config_exit(spa_t *spa, int locks, const void *tag)
 556 {
 557         (void) tag;
 558         for (int i = SCL_LOCKS - 1; i >= 0; i--) {
 559                 spa_config_lock_t *scl = &spa->spa_config_lock[i];
 560                 if (!(locks & (1 << i)))
 561                         continue;
 562                 mutex_enter(&scl->scl_lock);
 563                 ASSERT(scl->scl_count > 0);
 564                 if (--scl->scl_count == 0) {
 565                         ASSERT(scl->scl_writer == NULL ||
 566                             scl->scl_writer == curthread);
 567                         scl->scl_writer = NULL; /* OK in either case */
 568                         cv_broadcast(&scl->scl_cv);
 569                 }
 570                 mutex_exit(&scl->scl_lock);
 571         }
 572 }
 573
 574 int
 575 spa_config_held(spa_t *spa, int locks, krw_t rw)
 576 {
 577         int locks_held = 0;
 578
 579         for (int i = 0; i < SCL_LOCKS; i++) {
 580                 spa_config_lock_t *scl = &spa->spa_config_lock[i];
 581                 if (!(locks & (1 << i)))
 582                         continue;
 583                 if ((rw == RW_READER && scl->scl_count != 0) ||
 584                     (rw == RW_WRITER && scl->scl_writer == curthread))
 585                         locks_held |= 1 << i;
 586         }
 587
 588         return (locks_held);
 589 }
 590
 591 /*
 592  * ==========================================================================
 593  * SPA namespace functions
 594  * ==========================================================================
 595  */
 596
 597 /*
 598  * Lookup the named spa_t in the AVL tree.  The spa_namespace_lock must be held.
 599  * Returns NULL if no matching spa_t is found.
 600  */
 601 spa_t *
 602 spa_lookup(const char *name)
 603 {
 604         static spa_t search;    /* spa_t is large; don't allocate on stack */
 605         spa_t *spa;
 606         avl_index_t where;
 607         char *cp;
 608
 609         ASSERT(MUTEX_HELD(&spa_namespace_lock));
 610
 611         (void) strlcpy(search.spa_name, name, sizeof (search.spa_name));
 612
 613         /*
 614          * If it's a full dataset name, figure out the pool name and
 615          * just use that.
 616          */
 617         cp = strpbrk(search.spa_name, "/@#");
 618         if (cp != NULL)
 619                 *cp = '\0';
 620
 621         spa = avl_find(&spa_namespace_avl, &search, &where);
 622
 623         return (spa);
 624 }
 625
 626 /*
 627  * Fires when spa_sync has not completed within zfs_deadman_synctime_ms.
 628  * If the zfs_deadman_enabled flag is set then it inspects all vdev queues
 629  * looking for potentially hung I/Os.
 630  */
 631 void
 632 spa_deadman(void *arg)
 633 {
 634         spa_t *spa = arg;
 635
 636         /* Disable the deadman if the pool is suspended. */
 637         if (spa_suspended(spa))
 638                 return;
 639
 640         zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
 641             (gethrtime() - spa->spa_sync_starttime) / NANOSEC,
 642             (u_longlong_t)++spa->spa_deadman_calls);
 643         if (zfs_deadman_enabled)
 644                 vdev_deadman(spa->spa_root_vdev, FTAG);
 645
 646         spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
 647             spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
 648             MSEC_TO_TICK(zfs_deadman_checktime_ms));
 649 }
 650
 651 static int
 652 spa_log_sm_sort_by_txg(const void *va, const void *vb)
 653 {
 654         const spa_log_sm_t *a = va;
 655         const spa_log_sm_t *b = vb;
 656
 657         return (TREE_CMP(a->sls_txg, b->sls_txg));
 658 }
 659
 660 /*
 661  * Create an uninitialized spa_t with the given name.  Requires
 662  * spa_namespace_lock.  The caller must ensure that the spa_t doesn't already
 663  * exist by calling spa_lookup() first.
 664  */
 665 spa_t *
 666 spa_add(const char *name, nvlist_t *config, const char *altroot)
 667 {
 668         spa_t *spa;
 669         spa_config_dirent_t *dp;
 670
 671         ASSERT(MUTEX_HELD(&spa_namespace_lock));
 672
 673         spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
 674
 675         mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
 676         mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
 677         mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
 678         mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL);
 679         mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
 680         mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
 681         mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
 682         mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL);
 683         mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
 684         mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
 685         mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
 686         mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL);
 687         mutex_init(&spa->spa_flushed_ms_lock, NULL, MUTEX_DEFAULT, NULL);
 688         mutex_init(&spa->spa_activities_lock, NULL, MUTEX_DEFAULT, NULL);
 689
 690         cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
 691         cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
 692         cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
 693         cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
 694         cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
 695         cv_init(&spa->spa_activities_cv, NULL, CV_DEFAULT, NULL);
 696         cv_init(&spa->spa_waiters_cv, NULL, CV_DEFAULT, NULL);
 697
 698         for (int t = 0; t < TXG_SIZE; t++)
 699                 bplist_create(&spa->spa_free_bplist[t]);
 700
 701         (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
 702         spa->spa_state = POOL_STATE_UNINITIALIZED;
 703         spa->spa_freeze_txg = UINT64_MAX;
 704         spa->spa_final_txg = UINT64_MAX;
 705         spa->spa_load_max_txg = UINT64_MAX;
 706         spa->spa_proc = &p0;
 707         spa->spa_proc_state = SPA_PROC_NONE;
 708         spa->spa_trust_config = B_TRUE;
 709         spa->spa_hostid = zone_get_hostid(NULL);
 710
 711         spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
 712         spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms);
 713         spa_set_deadman_failmode(spa, zfs_deadman_failmode);
 714
 715         zfs_refcount_create(&spa->spa_refcount);
 716         spa_config_lock_init(spa);
 717         spa_stats_init(spa);
 718
 719         avl_add(&spa_namespace_avl, spa);
 720
 721         /*
 722          * Set the alternate root, if there is one.
 723          */
 724         if (altroot)
 725                 spa->spa_root = spa_strdup(altroot);
 726
 727         spa->spa_alloc_count = spa_allocators;
 728         spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count *
 729             sizeof (spa_alloc_t), KM_SLEEP);
 730         for (int i = 0; i < spa->spa_alloc_count; i++) {
 731                 mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT,
 732                     NULL);
 733                 avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
 734                     sizeof (zio_t), offsetof(zio_t, io_queue_node.a));
 735         }
 736         avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
 737             sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node));
 738         avl_create(&spa->spa_sm_logs_by_txg, spa_log_sm_sort_by_txg,
 739             sizeof (spa_log_sm_t), offsetof(spa_log_sm_t, sls_node));
 740         list_create(&spa->spa_log_summary, sizeof (log_summary_entry_t),
 741             offsetof(log_summary_entry_t, lse_node));
 742
 743         /*
 744          * Every pool starts with the default cachefile
 745          */
 746         list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t),
 747             offsetof(spa_config_dirent_t, scd_link));
 748
 749         dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP);
 750         dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path);
 751         list_insert_head(&spa->spa_config_list, dp);
 752
 753         VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME,
 754             KM_SLEEP) == 0);
 755
 756         if (config != NULL) {
 757                 nvlist_t *features;
 758
 759                 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
 760                     &features) == 0) {
 761                         VERIFY(nvlist_dup(features, &spa->spa_label_features,
 762                             0) == 0);
 763                 }
 764
 765                 VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
 766         }
 767
 768         if (spa->spa_label_features == NULL) {
 769                 VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME,
 770                     KM_SLEEP) == 0);
 771         }
 772
 773         spa->spa_min_ashift = INT_MAX;
 774         spa->spa_max_ashift = 0;
 775         spa->spa_min_alloc = INT_MAX;
 776         spa->spa_gcd_alloc = INT_MAX;
 777
 778         /* Reset cached value */
 779         spa->spa_dedup_dspace = ~0ULL;
 780
 781         /*
 782          * As a pool is being created, treat all features as disabled by
 783          * setting SPA_FEATURE_DISABLED for all entries in the feature
 784          * refcount cache.
 785          */
 786         for (int i = 0; i < SPA_FEATURES; i++) {
 787                 spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED;
 788         }
 789
 790         list_create(&spa->spa_leaf_list, sizeof (vdev_t),
 791             offsetof(vdev_t, vdev_leaf_node));
 792
 793         return (spa);
 794 }
 795
/*
 * Removes a spa_t from the namespace, freeing up any memory used.  Requires
 * spa_namespace_lock.  This is called only after the spa_t has been closed and
 * deactivated.
 *
 * The assertions below additionally require that the pool state is
 * POOL_STATE_UNINITIALIZED, that spa_refcount holds no references and that
 * spa_waiters is zero.  The spa_t itself is freed at the end, so the caller
 * must not touch 'spa' after this function returns.
 */
void
spa_remove(spa_t *spa)
{
        spa_config_dirent_t *dp;

        ASSERT(MUTEX_HELD(&spa_namespace_lock));
        ASSERT(spa_state(spa) == POOL_STATE_UNINITIALIZED);
        ASSERT3U(zfs_refcount_count(&spa->spa_refcount), ==, 0);
        ASSERT0(spa->spa_waiters);

        nvlist_free(spa->spa_config_splitting);

        /*
         * Take the pool out of the global namespace tree and wake every
         * thread blocked on spa_namespace_cv.
         */
        avl_remove(&spa_namespace_avl, spa);
        cv_broadcast(&spa_namespace_cv);

        /* The alternate root string is only present if one was set. */
        if (spa->spa_root)
                spa_strfree(spa->spa_root);

        /* Drain the cachefile list; scd_path may legitimately be NULL. */
        while ((dp = list_remove_head(&spa->spa_config_list)) != NULL) {
                if (dp->scd_path != NULL)
                        spa_strfree(dp->scd_path);
                kmem_free(dp, sizeof (spa_config_dirent_t));
        }

        /* Tear down each per-allocator tree and lock, then the array. */
        for (int i = 0; i < spa->spa_alloc_count; i++) {
                avl_destroy(&spa->spa_allocs[i].spaa_tree);
                mutex_destroy(&spa->spa_allocs[i].spaa_lock);
        }
        kmem_free(spa->spa_allocs, spa->spa_alloc_count *
            sizeof (spa_alloc_t));

        /* Destroy the remaining (by now empty) trees and lists. */
        avl_destroy(&spa->spa_metaslabs_by_flushed);
        avl_destroy(&spa->spa_sm_logs_by_txg);
        list_destroy(&spa->spa_log_summary);
        list_destroy(&spa->spa_config_list);
        list_destroy(&spa->spa_leaf_list);

        nvlist_free(spa->spa_label_features);
        nvlist_free(spa->spa_load_info);
        nvlist_free(spa->spa_feat_stats);
        /* NOTE(review): presumably releases the cached config; confirm. */
        spa_config_set(spa, NULL);

        zfs_refcount_destroy(&spa->spa_refcount);

        spa_stats_destroy(spa);
        spa_config_lock_destroy(spa);

        for (int t = 0; t < TXG_SIZE; t++)
                bplist_destroy(&spa->spa_free_bplist[t]);

        zio_checksum_templates_free(spa);

        /* Condition variables go first, then the mutexes. */
        cv_destroy(&spa->spa_async_cv);
        cv_destroy(&spa->spa_evicting_os_cv);
        cv_destroy(&spa->spa_proc_cv);
        cv_destroy(&spa->spa_scrub_io_cv);
        cv_destroy(&spa->spa_suspend_cv);
        cv_destroy(&spa->spa_activities_cv);
        cv_destroy(&spa->spa_waiters_cv);

        mutex_destroy(&spa->spa_flushed_ms_lock);
        mutex_destroy(&spa->spa_async_lock);
        mutex_destroy(&spa->spa_errlist_lock);
        mutex_destroy(&spa->spa_errlog_lock);
        mutex_destroy(&spa->spa_evicting_os_lock);
        mutex_destroy(&spa->spa_history_lock);
        mutex_destroy(&spa->spa_proc_lock);
        mutex_destroy(&spa->spa_props_lock);
        mutex_destroy(&spa->spa_cksum_tmpls_lock);
        mutex_destroy(&spa->spa_scrub_lock);
        mutex_destroy(&spa->spa_suspend_lock);
        mutex_destroy(&spa->spa_vdev_top_lock);
        mutex_destroy(&spa->spa_feat_stats_lock);
        mutex_destroy(&spa->spa_activities_lock);

        /* Finally release the spa_t itself. */
        kmem_free(spa, sizeof (spa_t));
}
 878
 879 /*
 880  * Given a pool, return the next pool in the namespace, or NULL if there is
 881  * none.  If 'prev' is NULL, return the first pool.
 882  */
 883 spa_t *
 884 spa_next(spa_t *prev)
 885 {
 886         ASSERT(MUTEX_HELD(&spa_namespace_lock));
 887
 888         if (prev)
 889                 return (AVL_NEXT(&spa_namespace_avl, prev));
 890         else
 891                 return (avl_first(&spa_namespace_avl));
 892 }
 893
 894 /*
 895  * ==========================================================================
 896  * SPA refcount functions
 897  * ==========================================================================
 898  */
 899
 900 /*
 901  * Add a reference to the given spa_t.  Must have at least one reference, or
 902  * have the namespace lock held.
 903  */
 904 void
 905 spa_open_ref(spa_t *spa, const void *tag)
 906 {
 907         ASSERT(zfs_refcount_count(&spa->spa_refcount) >= spa->spa_minref ||
 908             MUTEX_HELD(&spa_namespace_lock));
 909         (void) zfs_refcount_add(&spa->spa_refcount, tag);
 910 }
 911
 912 /*
 913  * Remove a reference to the given spa_t.  Must have at least one reference, or
 914  * have the namespace lock held.
 915  */
 916 void
 917 spa_close(spa_t *spa, const void *tag)
 918 {
 919         ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref ||
 920             MUTEX_HELD(&spa_namespace_lock));
 921         (void) zfs_refcount_remove(&spa->spa_refcount, tag);
 922 }
 923
 924 /*
 925  * Remove a reference to the given spa_t held by a dsl dir that is
 926  * being asynchronously released.  Async releases occur from a taskq
 927  * performing eviction of dsl datasets and dirs.  The namespace lock
 928  * isn't held and the hold by the object being evicted may contribute to
 929  * spa_minref (e.g. dataset or directory released during pool export),
 930  * so the asserts in spa_close() do not apply.
 931  */
 932 void
 933 spa_async_close(spa_t *spa, const void *tag)
 934 {
 935         (void) zfs_refcount_remove(&spa->spa_refcount, tag);
 936 }
 937
 938 /*
 939  * Check to see if the spa refcount is zero.  Must be called with
 940  * spa_namespace_lock held.  We really compare against spa_minref, which is the
 941  * number of references acquired when opening a pool
 942  */
 943 boolean_t
 944 spa_refcount_zero(spa_t *spa)
 945 {
 946         ASSERT(MUTEX_HELD(&spa_namespace_lock));
 947
 948         return (zfs_refcount_count(&spa->spa_refcount) == spa->spa_minref);
 949 }
 950
 951 /*
 952  * ==========================================================================
 953  * SPA spare and l2cache tracking
 954  * ==========================================================================
 955  */
 956
 957 /*
 958  * Hot spares and cache devices are tracked using the same code below,
 959  * for 'auxiliary' devices.
 960  */
 961
/*
 * Tracking record for one auxiliary device, used by the spa_aux_*() helpers
 * below on behalf of both the spare and the l2cache trees.
 */
typedef struct spa_aux {
        uint64_t        aux_guid;       /* vdev guid; the comparison key */
        uint64_t        aux_pool;       /* guid of activating pool, else 0 */
        avl_node_t      aux_avl;        /* AVL linkage (presumably) */
        int             aux_count;      /* holds taken via spa_aux_add() */
} spa_aux_t;
 968
 969 static inline int
 970 spa_aux_compare(const void *a, const void *b)
 971 {
 972         const spa_aux_t *sa = (const spa_aux_t *)a;
 973         const spa_aux_t *sb = (const spa_aux_t *)b;
 974
 975         return (TREE_CMP(sa->aux_guid, sb->aux_guid));
 976 }
 977
 978 static void
 979 spa_aux_add(vdev_t *vd, avl_tree_t *avl)
 980 {
 981         avl_index_t where;
 982         spa_aux_t search;
 983         spa_aux_t *aux;
 984
 985         search.aux_guid = vd->vdev_guid;
 986         if ((aux = avl_find(avl, &search, &where)) != NULL) {
 987                 aux->aux_count++;
 988         } else {
 989                 aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP);
 990                 aux->aux_guid = vd->vdev_guid;
 991                 aux->aux_count = 1;
 992                 avl_insert(avl, aux, where);
 993         }
 994 }
 995
 996 static void
 997 spa_aux_remove(vdev_t *vd, avl_tree_t *avl)
 998 {
 999         spa_aux_t search;
1000         spa_aux_t *aux;
1001         avl_index_t where;
1002
1003         search.aux_guid = vd->vdev_guid;
1004         aux = avl_find(avl, &search, &where);
1005
1006         ASSERT(aux != NULL);
1007
1008         if (--aux->aux_count == 0) {
1009                 avl_remove(avl, aux);
1010                 kmem_free(aux, sizeof (spa_aux_t));
1011         } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) {
1012                 aux->aux_pool = 0ULL;
1013         }
1014 }
1015
1016 static boolean_t
1017 spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl)
1018 {
1019         spa_aux_t search, *found;
1020
1021         search.aux_guid = guid;
1022         found = avl_find(avl, &search, NULL);
1023
1024         if (pool) {
1025                 if (found)
1026                         *pool = found->aux_pool;
1027                 else
1028                         *pool = 0ULL;
1029         }
1030
1031         if (refcnt) {
1032                 if (found)
1033                         *refcnt = found->aux_count;
1034                 else
1035                         *refcnt = 0;
1036         }
1037
1038         return (found != NULL);
1039 }
1040
1041 static void
1042 spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
1043 {
1044         spa_aux_t search, *found;
1045         avl_index_t where;
1046
1047         search.aux_guid = vd->vdev_guid;
1048         found = avl_find(avl, &search, &where);
1049         ASSERT(found != NULL);
1050         ASSERT(found->aux_pool == 0ULL);
1051
1052         found->aux_pool = spa_guid(vd->vdev_spa);
1053 }
1054
1055 /*
1056  * Spares are tracked globally due to the following constraints:
1057  *
1058  *      - A spare may be part of multiple pools.
1059  *      - A spare may be added to a pool even if it's actively in use within
1060  *        another pool.
1061  *      - A spare in use in any pool can only be the source of a replacement if
1062  *        the target is a spare in the same pool.
1063  *
1064  * We keep track of all spares on the system through the use of a reference
1065  * counted AVL tree.  When a vdev is added as a spare, or used as a replacement
1066  * spare, then we bump the reference count in the AVL tree.  In addition, we set
1067  * the 'vdev_isspare' member to indicate that the device is a spare (active or
1068  * inactive).  When a spare is made active (used to replace a device in the
 * pool), we also keep track of which pool it's been made a part of.
1070  *
1071  * The 'spa_spare_lock' protects the AVL tree.  These functions are normally
1072  * called under the spa_namespace lock as part of vdev reconfiguration.  The
1073  * separate spare lock exists for the status query path, which does not need to
1074  * be completely consistent with respect to other vdev configuration changes.
1075  */
1076
/*
 * Comparator for spare records; defers entirely to spa_aux_compare().
 */
static int
spa_spare_compare(const void *a, const void *b)
{
        int order = spa_aux_compare(a, b);

        return (order);
}
1082
/*
 * Register 'vd' as a spare: take a hold on its guid in the global spare tree
 * and flag the vdev as a spare.  The vdev must not already be flagged.  The
 * whole update is done under spa_spare_lock.
 */
void
spa_spare_add(vdev_t *vd)
{
        mutex_enter(&spa_spare_lock);
        ASSERT(!vd->vdev_isspare);
        spa_aux_add(vd, &spa_spare_avl);
        vd->vdev_isspare = B_TRUE;
        mutex_exit(&spa_spare_lock);
}
1092
/*
 * Undo spa_spare_add(): drop the hold on the guid of 'vd' in the global spare
 * tree and clear the vdev's spare flag.  The vdev must currently be flagged
 * as a spare.  The whole update is done under spa_spare_lock.
 */
void
spa_spare_remove(vdev_t *vd)
{
        mutex_enter(&spa_spare_lock);
        ASSERT(vd->vdev_isspare);
        spa_aux_remove(vd, &spa_spare_avl);
        vd->vdev_isspare = B_FALSE;
        mutex_exit(&spa_spare_lock);
}
1102
1103 boolean_t
1104 spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt)
1105 {
1106         boolean_t found;
1107
1108         mutex_enter(&spa_spare_lock);
1109         found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl);
1110         mutex_exit(&spa_spare_lock);
1111
1112         return (found);
1113 }
1114
/*
 * Record that the spare 'vd' is now active in its own pool.  The vdev must
 * already be flagged as a spare.  Done under spa_spare_lock.
 */
void
spa_spare_activate(vdev_t *vd)
{
        mutex_enter(&spa_spare_lock);
        ASSERT(vd->vdev_isspare);
        spa_aux_activate(vd, &spa_spare_avl);
        mutex_exit(&spa_spare_lock);
}
1123
1124 /*
1125  * Level 2 ARC devices are tracked globally for the same reasons as spares.
1126  * Cache devices currently only support one pool per cache device, and so
1127  * for these devices the aux reference count is currently unused beyond 1.
1128  */
1129
/*
 * Comparator for l2cache records; defers entirely to spa_aux_compare().
 */
static int
spa_l2cache_compare(const void *a, const void *b)
{
        int order = spa_aux_compare(a, b);

        return (order);
}
1135
/*
 * Register 'vd' as a cache device: take a hold on its guid in the global
 * l2cache tree and flag the vdev as an l2cache device.  The vdev must not
 * already be flagged.  The whole update is done under spa_l2cache_lock.
 */
void
spa_l2cache_add(vdev_t *vd)
{
        mutex_enter(&spa_l2cache_lock);
        ASSERT(!vd->vdev_isl2cache);
        spa_aux_add(vd, &spa_l2cache_avl);
        vd->vdev_isl2cache = B_TRUE;
        mutex_exit(&spa_l2cache_lock);
}
1145
/*
 * Undo spa_l2cache_add(): drop the hold on the guid of 'vd' in the global
 * l2cache tree and clear the vdev's l2cache flag.  The vdev must currently be
 * flagged as an l2cache device.  Done under spa_l2cache_lock.
 */
void
spa_l2cache_remove(vdev_t *vd)
{
        mutex_enter(&spa_l2cache_lock);
        ASSERT(vd->vdev_isl2cache);
        spa_aux_remove(vd, &spa_l2cache_avl);
        vd->vdev_isl2cache = B_FALSE;
        mutex_exit(&spa_l2cache_lock);
}
1155
1156 boolean_t
1157 spa_l2cache_exists(uint64_t guid, uint64_t *pool)
1158 {
1159         boolean_t found;
1160
1161         mutex_enter(&spa_l2cache_lock);
1162         found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl);
1163         mutex_exit(&spa_l2cache_lock);
1164
1165         return (found);
1166 }
1167
/*
 * Record that the cache device 'vd' is now active in its own pool.  The vdev
 * must already be flagged as an l2cache device.  Done under spa_l2cache_lock.
 */
void
spa_l2cache_activate(vdev_t *vd)
{
        mutex_enter(&spa_l2cache_lock);
        ASSERT(vd->vdev_isl2cache);
        spa_aux_activate(vd, &spa_l2cache_avl);
        mutex_exit(&spa_l2cache_lock);
}
1176
1177 /*
1178  * ==========================================================================
1179  * SPA vdev locking
1180  * ==========================================================================
1181  */
1182
1183 /*
1184  * Lock the given spa_t for the purpose of adding or removing a vdev.
1185  * Grabs the global spa_namespace_lock plus the spa config lock for writing.
1186  * It returns the next transaction group for the spa_t.
1187  */
1188 uint64_t
1189 spa_vdev_enter(spa_t *spa)
1190 {
1191         mutex_enter(&spa->spa_vdev_top_lock);
1192         mutex_enter(&spa_namespace_lock);
1193
1194         vdev_autotrim_stop_all(spa);
1195
1196         return (spa_vdev_config_enter(spa));
1197 }
1198
1199 /*
1200  * The same as spa_vdev_enter() above but additionally takes the guid of
1201  * the vdev being detached.  When there is a rebuild in process it will be
1202  * suspended while the vdev tree is modified then resumed by spa_vdev_exit().
1203  * The rebuild is canceled if only a single child remains after the detach.
1204  */
1205 uint64_t
1206 spa_vdev_detach_enter(spa_t *spa, uint64_t guid)
1207 {
1208         mutex_enter(&spa->spa_vdev_top_lock);
1209         mutex_enter(&spa_namespace_lock);
1210
1211         vdev_autotrim_stop_all(spa);
1212
1213         if (guid != 0) {
1214                 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
1215                 if (vd) {
1216                         vdev_rebuild_stop_wait(vd->vdev_top);
1217                 }
1218         }
1219
1220         return (spa_vdev_config_enter(spa));
1221 }
1222
1223 /*
1224  * Internal implementation for spa_vdev_enter().  Used when a vdev
1225  * operation requires multiple syncs (i.e. removing a device) while
1226  * keeping the spa_namespace_lock held.
1227  */
1228 uint64_t
1229 spa_vdev_config_enter(spa_t *spa)
1230 {
1231         ASSERT(MUTEX_HELD(&spa_namespace_lock));
1232
1233         spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
1234
1235         return (spa_last_synced_txg(spa) + 1);
1236 }
1237
1238 /*
1239  * Used in combination with spa_vdev_config_enter() to allow the syncing
1240  * of multiple transactions without releasing the spa_namespace_lock.
1241  */
1242 void
1243 spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error,
1244     const char *tag)
1245 {
1246         ASSERT(MUTEX_HELD(&spa_namespace_lock));
1247
1248         int config_changed = B_FALSE;
1249
1250         ASSERT(txg > spa_last_synced_txg(spa));
1251
1252         spa->spa_pending_vdev = NULL;
1253
1254         /*
1255          * Reassess the DTLs.
1256          */
1257         vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE, B_FALSE);
1258
1259         if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
1260                 config_changed = B_TRUE;
1261                 spa->spa_config_generation++;
1262         }
1263
1264         /*
1265          * Verify the metaslab classes.
1266          */
1267         ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
1268         ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
1269         ASSERT(metaslab_class_validate(spa_embedded_log_class(spa)) == 0);
1270         ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0);
1271         ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0);
1272
1273         spa_config_exit(spa, SCL_ALL, spa);
1274
1275         /*
1276          * Panic the system if the specified tag requires it.  This
1277          * is useful for ensuring that configurations are updated
1278          * transactionally.
1279          */
1280         if (zio_injection_enabled)
1281                 zio_handle_panic_injection(spa, tag, 0);
1282
1283         /*
1284          * Note: this txg_wait_synced() is important because it ensures
1285          * that there won't be more than one config change per txg.
1286          * This allows us to use the txg as the generation number.
1287          */
1288         if (error == 0)
1289                 txg_wait_synced(spa->spa_dsl_pool, txg);
1290
1291         if (vd != NULL) {
1292                 ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
1293                 if (vd->vdev_ops->vdev_op_leaf) {
1294                         mutex_enter(&vd->vdev_initialize_lock);
1295                         vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED,
1296                             NULL);
1297                         mutex_exit(&vd->vdev_initialize_lock);
1298
1299                         mutex_enter(&vd->vdev_trim_lock);
1300                         vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL);
1301                         mutex_exit(&vd->vdev_trim_lock);
1302                 }
1303
1304                 /*
1305                  * The vdev may be both a leaf and top-level device.
1306                  */
1307                 vdev_autotrim_stop_wait(vd);
1308
1309                 spa_config_enter(spa, SCL_STATE_ALL, spa, RW_WRITER);
1310                 vdev_free(vd);
1311                 spa_config_exit(spa, SCL_STATE_ALL, spa);
1312         }
1313
1314         /*
1315          * If the config changed, update the config cache.
1316          */
1317         if (config_changed)
1318                 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE);
1319 }
1320
/*
 * Unlock the spa_t after adding or removing a vdev.  Besides undoing the
 * locking of spa_vdev_enter(), we also want to make sure the transactions
 * have synced to disk, and then update the global configuration cache with
 * the new information.
 *
 * Autotrim and rebuild activity is restarted first.  'error' is passed
 * through unchanged as the return value.
 */
int
spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
{
        vdev_autotrim_restart(spa);
        vdev_rebuild_restart(spa);

        spa_vdev_config_exit(spa, vd, txg, error, FTAG);
        /* Release in the reverse of the order used by spa_vdev_enter(). */
        mutex_exit(&spa_namespace_lock);
        mutex_exit(&spa->spa_vdev_top_lock);

        return (error);
}
1339
1340 /*
1341  * Lock the given spa_t for the purpose of changing vdev state.
1342  */
1343 void
1344 spa_vdev_state_enter(spa_t *spa, int oplocks)
1345 {
1346         int locks = SCL_STATE_ALL | oplocks;
1347
1348         /*
1349          * Root pools may need to read of the underlying devfs filesystem
1350          * when opening up a vdev.  Unfortunately if we're holding the
1351          * SCL_ZIO lock it will result in a deadlock when we try to issue
1352          * the read from the root filesystem.  Instead we "prefetch"
1353          * the associated vnodes that we need prior to opening the
1354          * underlying devices and cache them so that we can prevent
1355          * any I/O when we are doing the actual open.
1356          */
1357         if (spa_is_root(spa)) {
1358                 int low = locks & ~(SCL_ZIO - 1);
1359                 int high = locks & ~low;
1360
1361                 spa_config_enter(spa, high, spa, RW_WRITER);
1362                 vdev_hold(spa->spa_root_vdev);
1363                 spa_config_enter(spa, low, spa, RW_WRITER);
1364         } else {
1365                 spa_config_enter(spa, locks, spa, RW_WRITER);
1366         }
1367         spa->spa_vdev_locks = locks;
1368 }
1369
1370 int
1371 spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
1372 {
1373         boolean_t config_changed = B_FALSE;
1374         vdev_t *vdev_top;
1375
1376         if (vd == NULL || vd == spa->spa_root_vdev) {
1377                 vdev_top = spa->spa_root_vdev;
1378         } else {
1379                 vdev_top = vd->vdev_top;
1380         }
1381
1382         if (vd != NULL || error == 0)
1383                 vdev_dtl_reassess(vdev_top, 0, 0, B_FALSE, B_FALSE);
1384
1385         if (vd != NULL) {
1386                 if (vd != spa->spa_root_vdev)
1387                         vdev_state_dirty(vdev_top);
1388
1389                 config_changed = B_TRUE;
1390                 spa->spa_config_generation++;
1391         }
1392
1393         if (spa_is_root(spa))
1394                 vdev_rele(spa->spa_root_vdev);
1395
1396         ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
1397         spa_config_exit(spa, spa->spa_vdev_locks, spa);
1398
1399         /*
1400          * If anything changed, wait for it to sync.  This ensures that,
1401          * from the system administrator's perspective, zpool(8) commands
1402          * are synchronous.  This is important for things like zpool offline:
1403          * when the command completes, you expect no further I/O from ZFS.
1404          */
1405         if (vd != NULL)
1406                 txg_wait_synced(spa->spa_dsl_pool, 0);
1407
1408         /*
1409          * If the config changed, update the config cache.
1410          */
1411         if (config_changed) {
1412                 mutex_enter(&spa_namespace_lock);
1413                 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE);
1414                 mutex_exit(&spa_namespace_lock);
1415         }
1416
1417         return (error);
1418 }
1419
1420 /*
1421  * ==========================================================================
1422  * Miscellaneous functions
1423  * ==========================================================================
1424  */
1425
1426 void
1427 spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx)
1428 {
1429         if (!nvlist_exists(spa->spa_label_features, feature)) {
1430                 fnvlist_add_boolean(spa->spa_label_features, feature);
1431                 /*
1432                  * When we are creating the pool (tx_txg==TXG_INITIAL), we can't
1433                  * dirty the vdev config because lock SCL_CONFIG is not held.
1434                  * Thankfully, in this case we don't need to dirty the config
1435                  * because it will be written out anyway when we finish
1436                  * creating the pool.
1437                  */
1438                 if (tx->tx_txg != TXG_INITIAL)
1439                         vdev_config_dirty(spa->spa_root_vdev);
1440         }
1441 }
1442
1443 void
1444 spa_deactivate_mos_feature(spa_t *spa, const char *feature)
1445 {
1446         if (nvlist_remove_all(spa->spa_label_features, feature) == 0)
1447                 vdev_config_dirty(spa->spa_root_vdev);
1448 }
1449
1450 /*
1451  * Return the spa_t associated with given pool_guid, if it exists.  If
1452  * device_guid is non-zero, determine whether the pool exists *and* contains
1453  * a device with the specified device_guid.
1454  */
1455 spa_t *
1456 spa_by_guid(uint64_t pool_guid, uint64_t device_guid)
1457 {
1458         spa_t *spa;
1459         avl_tree_t *t = &spa_namespace_avl;
1460
1461         ASSERT(MUTEX_HELD(&spa_namespace_lock));
1462
1463         for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
1464                 if (spa->spa_state == POOL_STATE_UNINITIALIZED)
1465                         continue;
1466                 if (spa->spa_root_vdev == NULL)
1467                         continue;
1468                 if (spa_guid(spa) == pool_guid) {
1469                         if (device_guid == 0)
1470                                 break;
1471
1472                         if (vdev_lookup_by_guid(spa->spa_root_vdev,
1473                             device_guid) != NULL)
1474                                 break;
1475
1476                         /*
1477                          * Check any devices we may be in the process of adding.
1478                          */
1479                         if (spa->spa_pending_vdev) {
1480                                 if (vdev_lookup_by_guid(spa->spa_pending_vdev,
1481                                     device_guid) != NULL)
1482                                         break;
1483                         }
1484                 }
1485         }
1486
1487         return (spa);
1488 }
1489
1490 /*
1491  * Determine whether a pool with the given pool_guid exists.
1492  */
1493 boolean_t
1494 spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
1495 {
1496         return (spa_by_guid(pool_guid, device_guid) != NULL);
1497 }
1498
1499 char *
1500 spa_strdup(const char *s)
1501 {
1502         size_t len;
1503         char *new;
1504
1505         len = strlen(s);
1506         new = kmem_alloc(len + 1, KM_SLEEP);
1507         memcpy(new, s, len + 1);
1508
1509         return (new);
1510 }
1511
/*
 * Free a string obtained from spa_strdup().  The size handed to kmem_free()
 * is derived from the string's current length, so 's' must not be NULL.
 */
void
spa_strfree(char *s)
{
        size_t size = strlen(s) + 1;

        kmem_free(s, size);
}
1517
1518 uint64_t
1519 spa_generate_guid(spa_t *spa)
1520 {
1521         uint64_t guid;
1522
1523         if (spa != NULL) {
1524                 do {
1525                         (void) random_get_pseudo_bytes((void *)&guid,
1526                             sizeof (guid));
1527                 } while (guid == 0 || spa_guid_exists(spa_guid(spa), guid));
1528         } else {
1529                 do {
1530                         (void) random_get_pseudo_bytes((void *)&guid,
1531                             sizeof (guid));
1532                 } while (guid == 0 || spa_guid_exists(guid, 0));
1533         }
1534
1535         return (guid);
1536 }
1537
/*
 * Format a description of block pointer 'bp' into 'buf' (at most 'buflen'
 * bytes) by way of the SNPRINTF_BLKPTR macro.  The type, checksum and
 * compression names are gathered here first and handed to the macro.
 *
 * 'bp' may be NULL; in that case no names are looked up.
 */
void
snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
{
        /*
         * NOTE(review): 'type' stays uninitialized when bp is NULL;
         * presumably SNPRINTF_BLKPTR does not read it then -- confirm.
         */
        char type[256];
        const char *checksum = NULL;
        const char *compress = NULL;

        if (bp != NULL) {
                if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) {
                        /*
                         * Types with DMU_OT_NEWTYPE set are described by
                         * their byteswap function name and whether they
                         * are metadata or data.
                         */
                        dmu_object_byteswap_t bswap =
                            DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
                        (void) snprintf(type, sizeof (type), "bswap %s %s",
                            DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ?
                            "metadata" : "data",
                            dmu_ot_byteswap[bswap].ob_name);
                } else {
                        /* Other types take their name from the dmu_ot table. */
                        (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
                            sizeof (type));
                }
                /* The checksum name is left NULL for embedded block pointers. */
                if (!BP_IS_EMBEDDED(bp)) {
                        checksum =
                            zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
                }
                compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
        }

        SNPRINTF_BLKPTR(kmem_scnprintf, ' ', buf, buflen, bp, type, checksum,
            compress);
}
1567
1568 void
1569 spa_freeze(spa_t *spa)
1570 {
1571         uint64_t freeze_txg = 0;
1572
1573         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1574         if (spa->spa_freeze_txg == UINT64_MAX) {
1575                 freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
1576                 spa->spa_freeze_txg = freeze_txg;
1577         }
1578         spa_config_exit(spa, SCL_ALL, FTAG);
1579         if (freeze_txg != 0)
1580                 txg_wait_synced(spa_get_dsl(spa), freeze_txg);
1581 }
1582
1583 void
1584 zfs_panic_recover(const char *fmt, ...)
1585 {
1586         va_list adx;
1587
1588         va_start(adx, fmt);
1589         vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx);
1590         va_end(adx);
1591 }
1592
/*
 * Minimal stand-in for strtoull() that understands only lowercase
 * hexadecimal digits and performs no overflow checking.
 *
 * Digits are consumed from the start of 'str' until the terminating NUL or
 * the first character that is not in [0-9a-f].  If 'nptr' is non-NULL it
 * receives a pointer to that stopping position.  Returns the accumulated
 * value, which is 0 when no digit was consumed.
 */
uint64_t
zfs_strtonum(const char *str, char **nptr)
{
        const char *cursor = str;
        uint64_t result = 0;

        for (;; cursor++) {
                char ch = *cursor;
                int nibble;

                if (ch >= '0' && ch <= '9') {
                        nibble = ch - '0';
                } else if (ch >= 'a' && ch <= 'f') {
                        nibble = ch - 'a' + 10;
                } else {
                        /* NUL or any other character ends the scan. */
                        break;
                }

                result = result * 16 + nibble;
        }

        if (nptr != NULL)
                *nptr = (char *)cursor;

        return (result);
}
1623
1624 void
1625 spa_activate_allocation_classes(spa_t *spa, dmu_tx_t *tx)
1626 {
1627         /*
1628          * We bump the feature refcount for each special vdev added to the pool
1629          */
1630         ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES));
1631         spa_feature_incr(spa, SPA_FEATURE_ALLOCATION_CLASSES, tx);
1632 }
1633
1634 /*
1635  * ==========================================================================
1636  * Accessor functions
1637  * ==========================================================================
1638  */
1639
1640 boolean_t
1641 spa_shutting_down(spa_t *spa)
1642 {
1643         return (spa->spa_async_suspended);
1644 }
1645
1646 dsl_pool_t *
1647 spa_get_dsl(spa_t *spa)
1648 {
1649         return (spa->spa_dsl_pool);
1650 }
1651
1652 boolean_t
1653 spa_is_initializing(spa_t *spa)
1654 {
1655         return (spa->spa_is_initializing);
1656 }
1657
1658 boolean_t
1659 spa_indirect_vdevs_loaded(spa_t *spa)
1660 {
1661         return (spa->spa_indirect_vdevs_loaded);
1662 }
1663
1664 blkptr_t *
1665 spa_get_rootblkptr(spa_t *spa)
1666 {
1667         return (&spa->spa_ubsync.ub_rootbp);
1668 }
1669
1670 void
1671 spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
1672 {
1673         spa->spa_uberblock.ub_rootbp = *bp;
1674 }
1675
1676 void
1677 spa_altroot(spa_t *spa, char *buf, size_t buflen)
1678 {
1679         if (spa->spa_root == NULL)
1680                 buf[0] = '\0';
1681         else
1682                 (void) strlcpy(buf, spa->spa_root, buflen);
1683 }
1684
1685 uint32_t
1686 spa_sync_pass(spa_t *spa)
1687 {
1688         return (spa->spa_sync_pass);
1689 }
1690
1691 char *
1692 spa_name(spa_t *spa)
1693 {
1694         return (spa->spa_name);
1695 }
1696
1697 uint64_t
1698 spa_guid(spa_t *spa)
1699 {
1700         dsl_pool_t *dp = spa_get_dsl(spa);
1701         uint64_t guid;
1702
1703         /*
1704          * If we fail to parse the config during spa_load(), we can go through
1705          * the error path (which posts an ereport) and end up here with no root
1706          * vdev.  We stash the original pool guid in 'spa_config_guid' to handle
1707          * this case.
1708          */
1709         if (spa->spa_root_vdev == NULL)
1710                 return (spa->spa_config_guid);
1711
1712         guid = spa->spa_last_synced_guid != 0 ?
1713             spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid;
1714
1715         /*
1716          * Return the most recently synced out guid unless we're
1717          * in syncing context.
1718          */
1719         if (dp && dsl_pool_sync_context(dp))
1720                 return (spa->spa_root_vdev->vdev_guid);
1721         else
1722                 return (guid);
1723 }
1724
1725 uint64_t
1726 spa_load_guid(spa_t *spa)
1727 {
1728         /*
1729          * This is a GUID that exists solely as a reference for the
1730          * purposes of the arc.  It is generated at load time, and
1731          * is never written to persistent storage.
1732          */
1733         return (spa->spa_load_guid);
1734 }
1735
1736 uint64_t
1737 spa_last_synced_txg(spa_t *spa)
1738 {
1739         return (spa->spa_ubsync.ub_txg);
1740 }
1741
1742 uint64_t
1743 spa_first_txg(spa_t *spa)
1744 {
1745         return (spa->spa_first_txg);
1746 }
1747
1748 uint64_t
1749 spa_syncing_txg(spa_t *spa)
1750 {
1751         return (spa->spa_syncing_txg);
1752 }
1753
1754 /*
1755  * Return the last txg where data can be dirtied. The final txgs
1756  * will be used to just clear out any deferred frees that remain.
1757  */
1758 uint64_t
1759 spa_final_dirty_txg(spa_t *spa)
1760 {
1761         return (spa->spa_final_txg - TXG_DEFER_SIZE);
1762 }
1763
1764 pool_state_t
1765 spa_state(spa_t *spa)
1766 {
1767         return (spa->spa_state);
1768 }
1769
1770 spa_load_state_t
1771 spa_load_state(spa_t *spa)
1772 {
1773         return (spa->spa_load_state);
1774 }
1775
1776 uint64_t
1777 spa_freeze_txg(spa_t *spa)
1778 {
1779         return (spa->spa_freeze_txg);
1780 }
1781
1782 /*
1783  * Return the inflated asize for a logical write in bytes. This is used by the
1784  * DMU to calculate the space a logical write will require on disk.
1785  * If lsize is smaller than the largest physical block size allocatable on this
1786  * pool we use its value instead, since the write will end up using the whole
1787  * block anyway.
1788  */
1789 uint64_t
1790 spa_get_worst_case_asize(spa_t *spa, uint64_t lsize)
1791 {
1792         if (lsize == 0)
1793                 return (0);     /* No inflation needed */
1794         return (MAX(lsize, 1 << spa->spa_max_ashift) * spa_asize_inflation);
1795 }
1796
1797 /*
1798  * Return the amount of slop space in bytes.  It is typically 1/32 of the pool
1799  * (3.2%), minus the embedded log space.  On very small pools, it may be
1800  * slightly larger than this.  On very large pools, it will be capped to
1801  * the value of spa_max_slop.  The embedded log space is not included in
1802  * spa_dspace.  By subtracting it, the usable space (per "zfs list") is a
1803  * constant 97% of the total space, regardless of metaslab size (assuming the
1804  * default spa_slop_shift=5 and a non-tiny pool).
1805  *
1806  * See the comment above spa_slop_shift for more details.
1807  */
1808 uint64_t
1809 spa_get_slop_space(spa_t *spa)
1810 {
1811         uint64_t space = 0;
1812         uint64_t slop = 0;
1813
1814         /*
1815          * Make sure spa_dedup_dspace has been set.
1816          */
1817         if (spa->spa_dedup_dspace == ~0ULL)
1818                 spa_update_dspace(spa);
1819
1820         /*
1821          * spa_get_dspace() includes the space only logically "used" by
1822          * deduplicated data, so since it's not useful to reserve more
1823          * space with more deduplicated data, we subtract that out here.
1824          */
1825         space = spa_get_dspace(spa) - spa->spa_dedup_dspace;
1826         slop = MIN(space >> spa_slop_shift, spa_max_slop);
1827
1828         /*
1829          * Subtract the embedded log space, but no more than half the (3.2%)
1830          * unusable space.  Note, the "no more than half" is only relevant if
1831          * zfs_embedded_slog_min_ms >> spa_slop_shift < 2, which is not true by
1832          * default.
1833          */
1834         uint64_t embedded_log =
1835             metaslab_class_get_dspace(spa_embedded_log_class(spa));
1836         slop -= MIN(embedded_log, slop >> 1);
1837
1838         /*
1839          * Slop space should be at least spa_min_slop, but no more than half
1840          * the entire pool.
1841          */
1842         slop = MAX(slop, MIN(space >> 1, spa_min_slop));
1843         return (slop);
1844 }
1845
1846 uint64_t
1847 spa_get_dspace(spa_t *spa)
1848 {
1849         return (spa->spa_dspace);
1850 }
1851
1852 uint64_t
1853 spa_get_checkpoint_space(spa_t *spa)
1854 {
1855         return (spa->spa_checkpoint_info.sci_dspace);
1856 }
1857
1858 void
1859 spa_update_dspace(spa_t *spa)
1860 {
1861         spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
1862             ddt_get_dedup_dspace(spa) + brt_get_dspace(spa);
1863         if (spa->spa_nonallocating_dspace > 0) {
1864                 /*
1865                  * Subtract the space provided by all non-allocating vdevs that
1866                  * contribute to dspace.  If a file is overwritten, its old
1867                  * blocks are freed and new blocks are allocated.  If there are
1868                  * no snapshots of the file, the available space should remain
1869                  * the same.  The old blocks could be freed from the
1870                  * non-allocating vdev, but the new blocks must be allocated on
1871                  * other (allocating) vdevs.  By reserving the entire size of
1872                  * the non-allocating vdevs (including allocated space), we
1873                  * ensure that there will be enough space on the allocating
1874                  * vdevs for this file overwrite to succeed.
1875                  *
1876                  * Note that the DMU/DSL doesn't actually know or care
1877                  * how much space is allocated (it does its own tracking
1878                  * of how much space has been logically used).  So it
1879                  * doesn't matter that the data we are moving may be
1880                  * allocated twice (on the old device and the new device).
1881                  */
1882                 ASSERT3U(spa->spa_dspace, >=, spa->spa_nonallocating_dspace);
1883                 spa->spa_dspace -= spa->spa_nonallocating_dspace;
1884         }
1885 }
1886
1887 /*
1888  * Return the failure mode that has been set to this pool. The default
1889  * behavior will be to block all I/Os when a complete failure occurs.
1890  */
1891 uint64_t
1892 spa_get_failmode(spa_t *spa)
1893 {
1894         return (spa->spa_failmode);
1895 }
1896
1897 boolean_t
1898 spa_suspended(spa_t *spa)
1899 {
1900         return (spa->spa_suspended != ZIO_SUSPEND_NONE);
1901 }
1902
1903 uint64_t
1904 spa_version(spa_t *spa)
1905 {
1906         return (spa->spa_ubsync.ub_version);
1907 }
1908
1909 boolean_t
1910 spa_deflate(spa_t *spa)
1911 {
1912         return (spa->spa_deflate);
1913 }
1914
1915 metaslab_class_t *
1916 spa_normal_class(spa_t *spa)
1917 {
1918         return (spa->spa_normal_class);
1919 }
1920
1921 metaslab_class_t *
1922 spa_log_class(spa_t *spa)
1923 {
1924         return (spa->spa_log_class);
1925 }
1926
1927 metaslab_class_t *
1928 spa_embedded_log_class(spa_t *spa)
1929 {
1930         return (spa->spa_embedded_log_class);
1931 }
1932
1933 metaslab_class_t *
1934 spa_special_class(spa_t *spa)
1935 {
1936         return (spa->spa_special_class);
1937 }
1938
1939 metaslab_class_t *
1940 spa_dedup_class(spa_t *spa)
1941 {
1942         return (spa->spa_dedup_class);
1943 }
1944
1945 /*
1946  * Locate an appropriate allocation class
1947  */
1948 metaslab_class_t *
1949 spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype,
1950     uint_t level, uint_t special_smallblk)
1951 {
1952         /*
1953          * ZIL allocations determine their class in zio_alloc_zil().
1954          */
1955         ASSERT(objtype != DMU_OT_INTENT_LOG);
1956
1957         boolean_t has_special_class = spa->spa_special_class->mc_groups != 0;
1958
1959         if (DMU_OT_IS_DDT(objtype)) {
1960                 if (spa->spa_dedup_class->mc_groups != 0)
1961                         return (spa_dedup_class(spa));
1962                 else if (has_special_class && zfs_ddt_data_is_special)
1963                         return (spa_special_class(spa));
1964                 else
1965                         return (spa_normal_class(spa));
1966         }
1967
1968         /* Indirect blocks for user data can land in special if allowed */
1969         if (level > 0 && (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) {
1970                 if (has_special_class && zfs_user_indirect_is_special)
1971                         return (spa_special_class(spa));
1972                 else
1973                         return (spa_normal_class(spa));
1974         }
1975
1976         if (DMU_OT_IS_METADATA(objtype) || level > 0) {
1977                 if (has_special_class)
1978                         return (spa_special_class(spa));
1979                 else
1980                         return (spa_normal_class(spa));
1981         }
1982
1983         /*
1984          * Allow small file blocks in special class in some cases (like
1985          * for the dRAID vdev feature). But always leave a reserve of
1986          * zfs_special_class_metadata_reserve_pct exclusively for metadata.
1987          */
1988         if (DMU_OT_IS_FILE(objtype) &&
1989             has_special_class && size <= special_smallblk) {
1990                 metaslab_class_t *special = spa_special_class(spa);
1991                 uint64_t alloc = metaslab_class_get_alloc(special);
1992                 uint64_t space = metaslab_class_get_space(special);
1993                 uint64_t limit =
1994                     (space * (100 - zfs_special_class_metadata_reserve_pct))
1995                     / 100;
1996
1997                 if (alloc < limit)
1998                         return (special);
1999         }
2000
2001         return (spa_normal_class(spa));
2002 }
2003
/*
 * Insert 'os' at the head of the pool's spa_evicting_os_list while holding
 * spa_evicting_os_lock.
 */
void
spa_evicting_os_register(spa_t *spa, objset_t *os)
{
        mutex_enter(&spa->spa_evicting_os_lock);
        list_insert_head(&spa->spa_evicting_os_list, os);
        mutex_exit(&spa->spa_evicting_os_lock);
}
2011
/*
 * Remove 'os' from the pool's spa_evicting_os_list and broadcast on
 * spa_evicting_os_cv, all under spa_evicting_os_lock.  The broadcast wakes
 * threads blocked in spa_evicting_os_wait().
 */
void
spa_evicting_os_deregister(spa_t *spa, objset_t *os)
{
        mutex_enter(&spa->spa_evicting_os_lock);
        list_remove(&spa->spa_evicting_os_list, os);
        cv_broadcast(&spa->spa_evicting_os_cv);
        mutex_exit(&spa->spa_evicting_os_lock);
}
2020
2021 void
2022 spa_evicting_os_wait(spa_t *spa)
2023 {
2024         mutex_enter(&spa->spa_evicting_os_lock);
2025         while (!list_is_empty(&spa->spa_evicting_os_list))
2026                 cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock);
2027         mutex_exit(&spa->spa_evicting_os_lock);
2028
2029         dmu_buf_user_evict_wait();
2030 }
2031
2032 int
2033 spa_max_replication(spa_t *spa)
2034 {
2035         /*
2036          * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to
2037          * handle BPs with more than one DVA allocated.  Set our max
2038          * replication level accordingly.
2039          */
2040         if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS)
2041                 return (1);
2042         return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
2043 }
2044
2045 int
2046 spa_prev_software_version(spa_t *spa)
2047 {
2048         return (spa->spa_prev_software_version);
2049 }
2050
2051 uint64_t
2052 spa_deadman_synctime(spa_t *spa)
2053 {
2054         return (spa->spa_deadman_synctime);
2055 }
2056
2057 spa_autotrim_t
2058 spa_get_autotrim(spa_t *spa)
2059 {
2060         return (spa->spa_autotrim);
2061 }
2062
2063 uint64_t
2064 spa_deadman_ziotime(spa_t *spa)
2065 {
2066         return (spa->spa_deadman_ziotime);
2067 }
2068
2069 uint64_t
2070 spa_get_deadman_failmode(spa_t *spa)
2071 {
2072         return (spa->spa_deadman_failmode);
2073 }
2074
2075 void
2076 spa_set_deadman_failmode(spa_t *spa, const char *failmode)
2077 {
2078         if (strcmp(failmode, "wait") == 0)
2079                 spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT;
2080         else if (strcmp(failmode, "continue") == 0)
2081                 spa->spa_deadman_failmode = ZIO_FAILURE_MODE_CONTINUE;
2082         else if (strcmp(failmode, "panic") == 0)
2083                 spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC;
2084         else
2085                 spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT;
2086 }
2087
2088 void
2089 spa_set_deadman_ziotime(hrtime_t ns)
2090 {
2091         spa_t *spa = NULL;
2092
2093         if (spa_mode_global != SPA_MODE_UNINIT) {
2094                 mutex_enter(&spa_namespace_lock);
2095                 while ((spa = spa_next(spa)) != NULL)
2096                         spa->spa_deadman_ziotime = ns;
2097                 mutex_exit(&spa_namespace_lock);
2098         }
2099 }
2100
2101 void
2102 spa_set_deadman_synctime(hrtime_t ns)
2103 {
2104         spa_t *spa = NULL;
2105
2106         if (spa_mode_global != SPA_MODE_UNINIT) {
2107                 mutex_enter(&spa_namespace_lock);
2108                 while ((spa = spa_next(spa)) != NULL)
2109                         spa->spa_deadman_synctime = ns;
2110                 mutex_exit(&spa_namespace_lock);
2111         }
2112 }
2113
2114 uint64_t
2115 dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
2116 {
2117         uint64_t asize = DVA_GET_ASIZE(dva);
2118         uint64_t dsize = asize;
2119
2120         ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
2121
2122         if (asize != 0 && spa->spa_deflate) {
2123                 vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
2124                 if (vd != NULL)
2125                         dsize = (asize >> SPA_MINBLOCKSHIFT) *
2126                             vd->vdev_deflate_ratio;
2127         }
2128
2129         return (dsize);
2130 }
2131
2132 uint64_t
2133 bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
2134 {
2135         uint64_t dsize = 0;
2136
2137         for (int d = 0; d < BP_GET_NDVAS(bp); d++)
2138                 dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
2139
2140         return (dsize);
2141 }
2142
2143 uint64_t
2144 bp_get_dsize(spa_t *spa, const blkptr_t *bp)
2145 {
2146         uint64_t dsize = 0;
2147
2148         spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2149
2150         for (int d = 0; d < BP_GET_NDVAS(bp); d++)
2151                 dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
2152
2153         spa_config_exit(spa, SCL_VDEV, FTAG);
2154
2155         return (dsize);
2156 }
2157
2158 uint64_t
2159 spa_dirty_data(spa_t *spa)
2160 {
2161         return (spa->spa_dsl_pool->dp_dirty_total);
2162 }
2163
2164 /*
2165  * ==========================================================================
2166  * SPA Import Progress Routines
2167  * ==========================================================================
2168  */
2169
/*
 * One entry of the import progress list.  Entries are created by
 * spa_import_progress_add(), looked up by pool_guid in the
 * spa_import_progress_set_*() routines, and printed by
 * spa_import_progress_show().
 */
typedef struct spa_import_progress {
        uint64_t                pool_guid;      /* unique id for updates */
        char                    *pool_name;     /* spa_strdup'd copy, or NULL */
        spa_load_state_t        spa_load_state; /* last reported load state */
        uint64_t                mmp_sec_remaining;      /* MMP activity check */
        uint64_t                spa_load_max_txg;       /* rewind txg */
        procfs_list_node_t      smh_node;       /* procfs list linkage */
} spa_import_progress_t;
2178
/*
 * Global list of in-progress imports.  Allocated in
 * spa_import_progress_init() and freed in spa_import_progress_destroy().
 */
spa_history_list_t *spa_import_progress_list = NULL;
2180
/*
 * Print the column header line of the import progress list.  The column
 * widths match those used by spa_import_progress_show().  Always returns 0.
 */
static int
spa_import_progress_show_header(struct seq_file *f)
{
        seq_printf(f, "%-20s %-14s %-14s %-12s %s\n",
            "pool_guid",
            "load_state",
            "multihost_secs",
            "max_txg",
            "pool_name");

        return (0);
}
2189
2190 static int
2191 spa_import_progress_show(struct seq_file *f, void *data)
2192 {
2193         spa_import_progress_t *sip = (spa_import_progress_t *)data;
2194
2195         seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %s\n",
2196             (u_longlong_t)sip->pool_guid, (u_longlong_t)sip->spa_load_state,
2197             (u_longlong_t)sip->mmp_sec_remaining,
2198             (u_longlong_t)sip->spa_load_max_txg,
2199             (sip->pool_name ? sip->pool_name : "-"));
2200
2201         return (0);
2202 }
2203
2204 /* Remove oldest elements from list until there are no more than 'size' left */
2205 static void
2206 spa_import_progress_truncate(spa_history_list_t *shl, unsigned int size)
2207 {
2208         spa_import_progress_t *sip;
2209         while (shl->size > size) {
2210                 sip = list_remove_head(&shl->procfs_list.pl_list);
2211                 if (sip->pool_name)
2212                         spa_strfree(sip->pool_name);
2213                 kmem_free(sip, sizeof (spa_import_progress_t));
2214                 shl->size--;
2215         }
2216
2217         IMPLY(size == 0, list_is_empty(&shl->procfs_list.pl_list));
2218 }
2219
2220 static void
2221 spa_import_progress_init(void)
2222 {
2223         spa_import_progress_list = kmem_zalloc(sizeof (spa_history_list_t),
2224             KM_SLEEP);
2225
2226         spa_import_progress_list->size = 0;
2227
2228         spa_import_progress_list->procfs_list.pl_private =
2229             spa_import_progress_list;
2230
2231         procfs_list_install("zfs",
2232             NULL,
2233             "import_progress",
2234             0644,
2235             &spa_import_progress_list->procfs_list,
2236             spa_import_progress_show,
2237             spa_import_progress_show_header,
2238             NULL,
2239             offsetof(spa_import_progress_t, smh_node));
2240 }
2241
2242 static void
2243 spa_import_progress_destroy(void)
2244 {
2245         spa_history_list_t *shl = spa_import_progress_list;
2246         procfs_list_uninstall(&shl->procfs_list);
2247         spa_import_progress_truncate(shl, 0);
2248         procfs_list_destroy(&shl->procfs_list);
2249         kmem_free(shl, sizeof (spa_history_list_t));
2250 }
2251
2252 int
2253 spa_import_progress_set_state(uint64_t pool_guid,
2254     spa_load_state_t load_state)
2255 {
2256         spa_history_list_t *shl = spa_import_progress_list;
2257         spa_import_progress_t *sip;
2258         int error = ENOENT;
2259
2260         if (shl->size == 0)
2261                 return (0);
2262
2263         mutex_enter(&shl->procfs_list.pl_lock);
2264         for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL;
2265             sip = list_prev(&shl->procfs_list.pl_list, sip)) {
2266                 if (sip->pool_guid == pool_guid) {
2267                         sip->spa_load_state = load_state;
2268                         error = 0;
2269                         break;
2270                 }
2271         }
2272         mutex_exit(&shl->procfs_list.pl_lock);
2273
2274         return (error);
2275 }
2276
2277 int
2278 spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t load_max_txg)
2279 {
2280         spa_history_list_t *shl = spa_import_progress_list;
2281         spa_import_progress_t *sip;
2282         int error = ENOENT;
2283
2284         if (shl->size == 0)
2285                 return (0);
2286
2287         mutex_enter(&shl->procfs_list.pl_lock);
2288         for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL;
2289             sip = list_prev(&shl->procfs_list.pl_list, sip)) {
2290                 if (sip->pool_guid == pool_guid) {
2291                         sip->spa_load_max_txg = load_max_txg;
2292                         error = 0;
2293                         break;
2294                 }
2295         }
2296         mutex_exit(&shl->procfs_list.pl_lock);
2297
2298         return (error);
2299 }
2300
2301 int
2302 spa_import_progress_set_mmp_check(uint64_t pool_guid,
2303     uint64_t mmp_sec_remaining)
2304 {
2305         spa_history_list_t *shl = spa_import_progress_list;
2306         spa_import_progress_t *sip;
2307         int error = ENOENT;
2308
2309         if (shl->size == 0)
2310                 return (0);
2311
2312         mutex_enter(&shl->procfs_list.pl_lock);
2313         for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL;
2314             sip = list_prev(&shl->procfs_list.pl_list, sip)) {
2315                 if (sip->pool_guid == pool_guid) {
2316                         sip->mmp_sec_remaining = mmp_sec_remaining;
2317                         error = 0;
2318                         break;
2319                 }
2320         }
2321         mutex_exit(&shl->procfs_list.pl_lock);
2322
2323         return (error);
2324 }
2325
2326 /*
2327  * A new import is in progress, add an entry.
2328  */
2329 void
2330 spa_import_progress_add(spa_t *spa)
2331 {
2332         spa_history_list_t *shl = spa_import_progress_list;
2333         spa_import_progress_t *sip;
2334         const char *poolname = NULL;
2335
2336         sip = kmem_zalloc(sizeof (spa_import_progress_t), KM_SLEEP);
2337         sip->pool_guid = spa_guid(spa);
2338
2339         (void) nvlist_lookup_string(spa->spa_config, ZPOOL_CONFIG_POOL_NAME,
2340             &poolname);
2341         if (poolname == NULL)
2342                 poolname = spa_name(spa);
2343         sip->pool_name = spa_strdup(poolname);
2344         sip->spa_load_state = spa_load_state(spa);
2345
2346         mutex_enter(&shl->procfs_list.pl_lock);
2347         procfs_list_add(&shl->procfs_list, sip);
2348         shl->size++;
2349         mutex_exit(&shl->procfs_list.pl_lock);
2350 }
2351
2352 void
2353 spa_import_progress_remove(uint64_t pool_guid)
2354 {
2355         spa_history_list_t *shl = spa_import_progress_list;
2356         spa_import_progress_t *sip;
2357
2358         mutex_enter(&shl->procfs_list.pl_lock);
2359         for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL;
2360             sip = list_prev(&shl->procfs_list.pl_list, sip)) {
2361                 if (sip->pool_guid == pool_guid) {
2362                         if (sip->pool_name)
2363                                 spa_strfree(sip->pool_name);
2364                         list_remove(&shl->procfs_list.pl_list, sip);
2365                         shl->size--;
2366                         kmem_free(sip, sizeof (spa_import_progress_t));
2367                         break;
2368                 }
2369         }
2370         mutex_exit(&shl->procfs_list.pl_lock);
2371 }
2372
2373 /*
2374  * ==========================================================================
2375  * Initialization and Termination
2376  * ==========================================================================
2377  */
2378
2379 static int
2380 spa_name_compare(const void *a1, const void *a2)
2381 {
2382         const spa_t *s1 = a1;
2383         const spa_t *s2 = a2;
2384         int s;
2385
2386         s = strcmp(s1->spa_name, s2->spa_name);
2387
2388         return (TREE_ISIGN(s));
2389 }
2390
/*
 * Boot-time initialization: only calls spa_config_load().
 */
void
spa_boot_init(void)
{
        spa_config_load();
}
2396
/*
 * Initialize the global SPA state: the namespace, spare and l2cache locks
 * and AVL trees, the namespace cv and spa_mode_global.  Then run the init
 * routine of each subsystem listed below and call spa_config_load().
 *
 * In userland builds (!_KERNEL), when the mode is not SPA_MODE_READ and
 * dprintf_find_string("watch") is true, a SIGSEGV handler
 * (arc_buf_sigsegv) is installed and arc_watch is set on success.
 */
void
spa_init(spa_mode_t mode)
{
        mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);

        /* The pool namespace is keyed by pool name. */
        avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
            offsetof(spa_t, spa_avl));

        avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t),
            offsetof(spa_aux_t, aux_avl));

        avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
            offsetof(spa_aux_t, aux_avl));

        spa_mode_global = mode;

#ifndef _KERNEL
        if (spa_mode_global != SPA_MODE_READ && dprintf_find_string("watch")) {
                struct sigaction sa;

                sa.sa_flags = SA_SIGINFO;
                sigemptyset(&sa.sa_mask);
                sa.sa_sigaction = arc_buf_sigsegv;

                /* A failure here is reported but is not fatal. */
                if (sigaction(SIGSEGV, &sa, NULL) == -1) {
                        perror("could not enable watchpoints: "
                            "sigaction(SIGSEGV, ...) = ");
                } else {
                        arc_watch = B_TRUE;
                }
        }
#endif

        /* Subsystem initialization; spa_fini() undoes most of these. */
        fm_init();
        zfs_refcount_init();
        unique_init();
        zfs_btree_init();
        metaslab_stat_init();
        brt_init();
        ddt_init();
        zio_init();
        dmu_init();
        zil_init();
        vdev_mirror_stat_init();
        vdev_raidz_math_init();
        vdev_file_init();
        zfs_prop_init();
        chksum_init();
        zpool_prop_init();
        zpool_feature_init();
        spa_config_load();
        vdev_prop_init();
        l2arc_start();
        scan_init();
        qat_init();
        spa_import_progress_init();
}
2457
/*
 * Tear down the global SPA state set up by spa_init(): stop l2arc, evict
 * all pools, run the fini routine of each subsystem listed below, and
 * destroy the global AVL trees, cv and locks.
 */
void
spa_fini(void)
{
        l2arc_stop();

        spa_evict_all();

        vdev_file_fini();
        vdev_mirror_stat_fini();
        vdev_raidz_math_fini();
        chksum_fini();
        zil_fini();
        dmu_fini();
        zio_fini();
        ddt_fini();
        brt_fini();
        metaslab_stat_fini();
        zfs_btree_fini();
        unique_fini();
        zfs_refcount_fini();
        fm_fini();
        scan_fini();
        qat_fini();
        spa_import_progress_destroy();

        /* Global containers and synchronization objects go last. */
        avl_destroy(&spa_namespace_avl);
        avl_destroy(&spa_spare_avl);
        avl_destroy(&spa_l2cache_avl);

        cv_destroy(&spa_namespace_cv);
        mutex_destroy(&spa_namespace_lock);
        mutex_destroy(&spa_spare_lock);
        mutex_destroy(&spa_l2cache_lock);
}
2492
2493 /*
2494  * Return whether this pool has a dedicated slog device. No locking needed.
2495  * It's not a problem if the wrong answer is returned as it's only for
2496  * performance and not correctness.
2497  */
2498 boolean_t
2499 spa_has_slogs(spa_t *spa)
2500 {
2501         return (spa->spa_log_class->mc_groups != 0);
2502 }
2503
2504 spa_log_state_t
2505 spa_get_log_state(spa_t *spa)
2506 {
2507         return (spa->spa_log_state);
2508 }
2509
/*
 * Set the pool's spa_log_state to 'state'.  No lock is taken here.
 */
void
spa_set_log_state(spa_t *spa, spa_log_state_t state)
{
        spa->spa_log_state = state;
}
2515
2516 boolean_t
2517 spa_is_root(spa_t *spa)
2518 {
2519         return (spa->spa_is_root);
2520 }
2521
2522 boolean_t
2523 spa_writeable(spa_t *spa)
2524 {
2525         return (!!(spa->spa_mode & SPA_MODE_WRITE) && spa->spa_trust_config);
2526 }
2527
2528 /*
2529  * Returns true if there is a pending sync task in any of the current
2530  * syncing txg, the current quiescing txg, or the current open txg.
2531  */
2532 boolean_t
2533 spa_has_pending_synctask(spa_t *spa)
2534 {
2535         return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks) ||
2536             !txg_all_lists_empty(&spa->spa_dsl_pool->dp_early_sync_tasks));
2537 }
2538
2539 spa_mode_t
2540 spa_mode(spa_t *spa)
2541 {
2542         return (spa->spa_mode);
2543 }
2544
2545 uint64_t
2546 spa_bootfs(spa_t *spa)
2547 {
2548         return (spa->spa_bootfs);
2549 }
2550
2551 uint64_t
2552 spa_delegation(spa_t *spa)
2553 {
2554         return (spa->spa_delegation);
2555 }
2556
2557 objset_t *
2558 spa_meta_objset(spa_t *spa)
2559 {
2560         return (spa->spa_meta_objset);
2561 }
2562
2563 enum zio_checksum
2564 spa_dedup_checksum(spa_t *spa)
2565 {
2566         return (spa->spa_dedup_checksum);
2567 }
2568
2569 /*
2570  * Reset pool scan stat per scan pass (or reboot).
2571  */
2572 void
2573 spa_scan_stat_init(spa_t *spa)
2574 {
2575         /* data not stored on disk */
2576         spa->spa_scan_pass_start = gethrestime_sec();
2577         if (dsl_scan_is_paused_scrub(spa->spa_dsl_pool->dp_scan))
2578                 spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start;
2579         else
2580                 spa->spa_scan_pass_scrub_pause = 0;
2581
2582         if (dsl_errorscrub_is_paused(spa->spa_dsl_pool->dp_scan))
2583                 spa->spa_scan_pass_errorscrub_pause = spa->spa_scan_pass_start;
2584         else
2585                 spa->spa_scan_pass_errorscrub_pause = 0;
2586
2587         spa->spa_scan_pass_scrub_spent_paused = 0;
2588         spa->spa_scan_pass_exam = 0;
2589         spa->spa_scan_pass_issued = 0;
2590
2591         // error scrub stats
2592         spa->spa_scan_pass_errorscrub_spent_paused = 0;
2593 }
2594
2595 /*
2596  * Get scan stats for zpool status reports
2597  */
2598 int
2599 spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
2600 {
2601         dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL;
2602
2603         if (scn == NULL || (scn->scn_phys.scn_func == POOL_SCAN_NONE &&
2604             scn->errorscrub_phys.dep_func == POOL_SCAN_NONE))
2605                 return (SET_ERROR(ENOENT));
2606
2607         memset(ps, 0, sizeof (pool_scan_stat_t));
2608
2609         /* data stored on disk */
2610         ps->pss_func = scn->scn_phys.scn_func;
2611         ps->pss_state = scn->scn_phys.scn_state;
2612         ps->pss_start_time = scn->scn_phys.scn_start_time;
2613         ps->pss_end_time = scn->scn_phys.scn_end_time;
2614         ps->pss_to_examine = scn->scn_phys.scn_to_examine;
2615         ps->pss_examined = scn->scn_phys.scn_examined;
2616         ps->pss_skipped = scn->scn_phys.scn_skipped;
2617         ps->pss_processed = scn->scn_phys.scn_processed;
2618         ps->pss_errors = scn->scn_phys.scn_errors;
2619
2620         /* data not stored on disk */
2621         ps->pss_pass_exam = spa->spa_scan_pass_exam;
2622         ps->pss_pass_start = spa->spa_scan_pass_start;
2623         ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause;
2624         ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused;
2625         ps->pss_pass_issued = spa->spa_scan_pass_issued;
2626         ps->pss_issued =
2627             scn->scn_issued_before_pass + spa->spa_scan_pass_issued;
2628
2629         /* error scrub data stored on disk */
2630         ps->pss_error_scrub_func = scn->errorscrub_phys.dep_func;
2631         ps->pss_error_scrub_state = scn->errorscrub_phys.dep_state;
2632         ps->pss_error_scrub_start = scn->errorscrub_phys.dep_start_time;
2633         ps->pss_error_scrub_end = scn->errorscrub_phys.dep_end_time;
2634         ps->pss_error_scrub_examined = scn->errorscrub_phys.dep_examined;
2635         ps->pss_error_scrub_to_be_examined =
2636             scn->errorscrub_phys.dep_to_examine;
2637
2638         /* error scrub data not stored on disk */
2639         ps->pss_pass_error_scrub_pause = spa->spa_scan_pass_errorscrub_pause;
2640
2641         return (0);
2642 }
2643
2644 int
2645 spa_maxblocksize(spa_t *spa)
2646 {
2647         if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
2648                 return (SPA_MAXBLOCKSIZE);
2649         else
2650                 return (SPA_OLD_MAXBLOCKSIZE);
2651 }
2652
2653
2654 /*
2655  * Returns the txg that the last device removal completed. No indirect mappings
2656  * have been added since this txg.
2657  */
2658 uint64_t
2659 spa_get_last_removal_txg(spa_t *spa)
2660 {
2661         uint64_t vdevid;
2662         uint64_t ret = -1ULL;
2663
2664         spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2665         /*
2666          * sr_prev_indirect_vdev is only modified while holding all the
2667          * config locks, so it is sufficient to hold SCL_VDEV as reader when
2668          * examining it.
2669          */
2670         vdevid = spa->spa_removing_phys.sr_prev_indirect_vdev;
2671
2672         while (vdevid != -1ULL) {
2673                 vdev_t *vd = vdev_lookup_top(spa, vdevid);
2674                 vdev_indirect_births_t *vib = vd->vdev_indirect_births;
2675
2676                 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
2677
2678                 /*
2679                  * If the removal did not remap any data, we don't care.
2680                  */
2681                 if (vdev_indirect_births_count(vib) != 0) {
2682                         ret = vdev_indirect_births_last_entry_txg(vib);
2683                         break;
2684                 }
2685
2686                 vdevid = vd->vdev_indirect_config.vic_prev_indirect_vdev;
2687         }
2688         spa_config_exit(spa, SCL_VDEV, FTAG);
2689
2690         IMPLY(ret != -1ULL,
2691             spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
2692
2693         return (ret);
2694 }
2695
2696 int
2697 spa_maxdnodesize(spa_t *spa)
2698 {
2699         if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE))
2700                 return (DNODE_MAX_SIZE);
2701         else
2702                 return (DNODE_MIN_SIZE);
2703 }
2704
2705 boolean_t
2706 spa_multihost(spa_t *spa)
2707 {
2708         return (spa->spa_multihost ? B_TRUE : B_FALSE);
2709 }
2710
2711 uint32_t
2712 spa_get_hostid(spa_t *spa)
2713 {
2714         return (spa->spa_hostid);
2715 }
2716
2717 boolean_t
2718 spa_trust_config(spa_t *spa)
2719 {
2720         return (spa->spa_trust_config);
2721 }
2722
2723 uint64_t
2724 spa_missing_tvds_allowed(spa_t *spa)
2725 {
2726         return (spa->spa_missing_tvds_allowed);
2727 }
2728
2729 space_map_t *
2730 spa_syncing_log_sm(spa_t *spa)
2731 {
2732         return (spa->spa_syncing_log_sm);
2733 }
2734
/*
 * Record `missing` in the pool's spa_missing_tvds field.  No locking or
 * validation is performed here.
 */
void
spa_set_missing_tvds(spa_t *spa, uint64_t missing)
{
        spa->spa_missing_tvds = missing;
}
2740
2741 /*
2742  * Return the pool state string ("ONLINE", "DEGRADED", "SUSPENDED", etc).
2743  */
2744 const char *
2745 spa_state_to_name(spa_t *spa)
2746 {
2747         ASSERT3P(spa, !=, NULL);
2748
2749         /*
2750          * it is possible for the spa to exist, without root vdev
2751          * as the spa transitions during import/export
2752          */
2753         vdev_t *rvd = spa->spa_root_vdev;
2754         if (rvd == NULL) {
2755                 return ("TRANSITIONING");
2756         }
2757         vdev_state_t state = rvd->vdev_state;
2758         vdev_aux_t aux = rvd->vdev_stat.vs_aux;
2759
2760         if (spa_suspended(spa))
2761                 return ("SUSPENDED");
2762
2763         switch (state) {
2764         case VDEV_STATE_CLOSED:
2765         case VDEV_STATE_OFFLINE:
2766                 return ("OFFLINE");
2767         case VDEV_STATE_REMOVED:
2768                 return ("REMOVED");
2769         case VDEV_STATE_CANT_OPEN:
2770                 if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG)
2771                         return ("FAULTED");
2772                 else if (aux == VDEV_AUX_SPLIT_POOL)
2773                         return ("SPLIT");
2774                 else
2775                         return ("UNAVAIL");
2776         case VDEV_STATE_FAULTED:
2777                 return ("FAULTED");
2778         case VDEV_STATE_DEGRADED:
2779                 return ("DEGRADED");
2780         case VDEV_STATE_HEALTHY:
2781                 return ("ONLINE");
2782         default:
2783                 break;
2784         }
2785
2786         return ("UNKNOWN");
2787 }
2788
2789 boolean_t
2790 spa_top_vdevs_spacemap_addressable(spa_t *spa)
2791 {
2792         vdev_t *rvd = spa->spa_root_vdev;
2793         for (uint64_t c = 0; c < rvd->vdev_children; c++) {
2794                 if (!vdev_is_spacemap_addressable(rvd->vdev_child[c]))
2795                         return (B_FALSE);
2796         }
2797         return (B_TRUE);
2798 }
2799
2800 boolean_t
2801 spa_has_checkpoint(spa_t *spa)
2802 {
2803         return (spa->spa_checkpoint_txg != 0);
2804 }
2805
2806 boolean_t
2807 spa_importing_readonly_checkpoint(spa_t *spa)
2808 {
2809         return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) &&
2810             spa->spa_mode == SPA_MODE_READ);
2811 }
2812
2813 uint64_t
2814 spa_min_claim_txg(spa_t *spa)
2815 {
2816         uint64_t checkpoint_txg = spa->spa_uberblock.ub_checkpoint_txg;
2817
2818         if (checkpoint_txg != 0)
2819                 return (checkpoint_txg + 1);
2820
2821         return (spa->spa_first_txg);
2822 }
2823
2824 /*
2825  * If there is a checkpoint, async destroys may consume more space from
2826  * the pool instead of freeing it. In an attempt to save the pool from
2827  * getting suspended when it is about to run out of space, we stop
2828  * processing async destroys.
2829  */
2830 boolean_t
2831 spa_suspend_async_destroy(spa_t *spa)
2832 {
2833         dsl_pool_t *dp = spa_get_dsl(spa);
2834
2835         uint64_t unreserved = dsl_pool_unreserved_space(dp,
2836             ZFS_SPACE_CHECK_EXTRA_RESERVED);
2837         uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes;
2838         uint64_t avail = (unreserved > used) ? (unreserved - used) : 0;
2839
2840         if (spa_has_checkpoint(spa) && avail == 0)
2841                 return (B_TRUE);
2842
2843         return (B_FALSE);
2844 }
2845
2846 #if defined(_KERNEL)
2847
2848 int
2849 param_set_deadman_failmode_common(const char *val)
2850 {
2851         spa_t *spa = NULL;
2852         char *p;
2853
2854         if (val == NULL)
2855                 return (SET_ERROR(EINVAL));
2856
2857         if ((p = strchr(val, '\n')) != NULL)
2858                 *p = '\0';
2859
2860         if (strcmp(val, "wait") != 0 && strcmp(val, "continue") != 0 &&
2861             strcmp(val, "panic"))
2862                 return (SET_ERROR(EINVAL));
2863
2864         if (spa_mode_global != SPA_MODE_UNINIT) {
2865                 mutex_enter(&spa_namespace_lock);
2866                 while ((spa = spa_next(spa)) != NULL)
2867                         spa_set_deadman_failmode(spa, val);
2868                 mutex_exit(&spa_namespace_lock);
2869         }
2870
2871         return (0);
2872 }
2873 #endif
2874
/*
 * Symbols from the SPA layer published through EXPORT_SYMBOL, grouped by
 * purpose.  Most entries name a function; spa_namespace_lock is the lock
 * object itself.
 */

/* Namespace manipulation */
EXPORT_SYMBOL(spa_lookup);
EXPORT_SYMBOL(spa_add);
EXPORT_SYMBOL(spa_remove);
EXPORT_SYMBOL(spa_next);

/* Refcount functions */
EXPORT_SYMBOL(spa_open_ref);
EXPORT_SYMBOL(spa_close);
EXPORT_SYMBOL(spa_refcount_zero);

/* Pool configuration lock */
EXPORT_SYMBOL(spa_config_tryenter);
EXPORT_SYMBOL(spa_config_enter);
EXPORT_SYMBOL(spa_config_exit);
EXPORT_SYMBOL(spa_config_held);

/* Pool vdev add/remove lock */
EXPORT_SYMBOL(spa_vdev_enter);
EXPORT_SYMBOL(spa_vdev_exit);

/* Pool vdev state change lock */
EXPORT_SYMBOL(spa_vdev_state_enter);
EXPORT_SYMBOL(spa_vdev_state_exit);

/* Accessor functions */
EXPORT_SYMBOL(spa_shutting_down);
EXPORT_SYMBOL(spa_get_dsl);
EXPORT_SYMBOL(spa_get_rootblkptr);
EXPORT_SYMBOL(spa_set_rootblkptr);
EXPORT_SYMBOL(spa_altroot);
EXPORT_SYMBOL(spa_sync_pass);
EXPORT_SYMBOL(spa_name);
EXPORT_SYMBOL(spa_guid);
EXPORT_SYMBOL(spa_last_synced_txg);
EXPORT_SYMBOL(spa_first_txg);
EXPORT_SYMBOL(spa_syncing_txg);
EXPORT_SYMBOL(spa_version);
EXPORT_SYMBOL(spa_state);
EXPORT_SYMBOL(spa_load_state);
EXPORT_SYMBOL(spa_freeze_txg);
EXPORT_SYMBOL(spa_get_dspace);
EXPORT_SYMBOL(spa_update_dspace);
EXPORT_SYMBOL(spa_deflate);
EXPORT_SYMBOL(spa_normal_class);
EXPORT_SYMBOL(spa_log_class);
EXPORT_SYMBOL(spa_special_class);
EXPORT_SYMBOL(spa_preferred_class);
EXPORT_SYMBOL(spa_max_replication);
EXPORT_SYMBOL(spa_prev_software_version);
EXPORT_SYMBOL(spa_get_failmode);
EXPORT_SYMBOL(spa_suspended);
EXPORT_SYMBOL(spa_bootfs);
EXPORT_SYMBOL(spa_delegation);
EXPORT_SYMBOL(spa_meta_objset);
EXPORT_SYMBOL(spa_maxblocksize);
EXPORT_SYMBOL(spa_maxdnodesize);

/* Miscellaneous support routines */
EXPORT_SYMBOL(spa_guid_exists);
EXPORT_SYMBOL(spa_strdup);
EXPORT_SYMBOL(spa_strfree);
EXPORT_SYMBOL(spa_generate_guid);
EXPORT_SYMBOL(snprintf_blkptr);
EXPORT_SYMBOL(spa_freeze);
EXPORT_SYMBOL(spa_upgrade);
EXPORT_SYMBOL(spa_evict_all);
EXPORT_SYMBOL(spa_lookup_by_guid);
EXPORT_SYMBOL(spa_has_spare);
EXPORT_SYMBOL(dva_get_dsize_sync);
EXPORT_SYMBOL(bp_get_dsize_sync);
EXPORT_SYMBOL(bp_get_dsize);
EXPORT_SYMBOL(spa_has_slogs);
EXPORT_SYMBOL(spa_is_root);
EXPORT_SYMBOL(spa_writeable);
EXPORT_SYMBOL(spa_mode);
EXPORT_SYMBOL(spa_namespace_lock);
EXPORT_SYMBOL(spa_trust_config);
EXPORT_SYMBOL(spa_missing_tvds_allowed);
EXPORT_SYMBOL(spa_set_missing_tvds);
EXPORT_SYMBOL(spa_state_to_name);
EXPORT_SYMBOL(spa_importing_readonly_checkpoint);
EXPORT_SYMBOL(spa_min_claim_txg);
EXPORT_SYMBOL(spa_suspend_async_destroy);
EXPORT_SYMBOL(spa_has_checkpoint);
EXPORT_SYMBOL(spa_top_vdevs_spacemap_addressable);
2961
/*
 * Module parameter declarations.
 *
 * ZFS_MODULE_PARAM entries list, in order: a scope, a name prefix, the
 * parameter name, a type tag, a permission flag (ZMOD_RW here, presumably
 * read-write) and a human-readable description.  ZFS_MODULE_PARAM_CALL
 * entries additionally name explicit set and get handlers in place of the
 * type tag.
 */

/* General zfs parameters. */
ZFS_MODULE_PARAM(zfs, zfs_, flags, UINT, ZMOD_RW,
        "Set additional debugging flags");

ZFS_MODULE_PARAM(zfs, zfs_, recover, INT, ZMOD_RW,
        "Set to attempt to recover from fatal errors");

ZFS_MODULE_PARAM(zfs, zfs_, free_leak_on_eio, INT, ZMOD_RW,
        "Set to ignore IO errors during free and permanently leak the space");

/* Deadman timer parameters with default handling. */
ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, checktime_ms, U64, ZMOD_RW,
        "Dead I/O check interval in milliseconds");

ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, enabled, INT, ZMOD_RW,
        "Enable deadman timer");

ZFS_MODULE_PARAM(zfs_spa, spa_, asize_inflation, UINT, ZMOD_RW,
        "SPA size estimate multiplication factor");

/* Special allocation class placement parameters. */
ZFS_MODULE_PARAM(zfs, zfs_, ddt_data_is_special, INT, ZMOD_RW,
        "Place DDT data into the special class");

ZFS_MODULE_PARAM(zfs, zfs_, user_indirect_is_special, INT, ZMOD_RW,
        "Place user data indirect blocks into the special class");

/* BEGIN CSTYLED */
/* Deadman timer parameters that use custom set handlers. */
ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, failmode,
        param_set_deadman_failmode, param_get_charp, ZMOD_RW,
        "Failmode for deadman timer");

ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, synctime_ms,
        param_set_deadman_synctime, spl_param_get_u64, ZMOD_RW,
        "Pool sync expiration time in milliseconds");

ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, ziotime_ms,
        param_set_deadman_ziotime, spl_param_get_u64, ZMOD_RW,
        "IO expiration time in milliseconds");

ZFS_MODULE_PARAM(zfs, zfs_, special_class_metadata_reserve_pct, UINT, ZMOD_RW,
        "Small file blocks in special vdevs depends on this much "
        "free space available");
/* END CSTYLED */

/* slop_shift is set through param_set_slop_shift. */
ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift,
        param_get_uint, ZMOD_RW, "Reserved free space in pool");