module/zfs/vdev_initialize.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or https://opensource.org/licenses/CDDL-1.0.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2016, 2024 by Delphix. All rights reserved.
  24  */
  25
  26 #include <sys/spa.h>
  27 #include <sys/spa_impl.h>
  28 #include <sys/txg.h>
  29 #include <sys/vdev_impl.h>
  30 #include <sys/metaslab_impl.h>
  31 #include <sys/dsl_synctask.h>
  32 #include <sys/zap.h>
  33 #include <sys/dmu_tx.h>
  34 #include <sys/vdev_initialize.h>
  35
  36 /*
  37  * Value that is written to disk during initialization.
  38  */
  39 static uint64_t zfs_initialize_value = 0xdeadbeefdeadbeeeULL;
  40
  41 /* maximum number of I/Os outstanding per leaf vdev */
  42 static const int zfs_initialize_limit = 1;
  43
  44 /* size of initializing writes; default 1MiB, see zfs_remove_max_segment */
  45 static uint64_t zfs_initialize_chunk_size = 1024 * 1024;
  46
  47 static boolean_t
  48 vdev_initialize_should_stop(vdev_t *vd)
  49 {
  50         return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) ||
  51             vd->vdev_detached || vd->vdev_top->vdev_removing ||
  52             vd->vdev_top->vdev_rz_expanding);
  53 }
  54
  55 static void
  56 vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx)
  57 {
  58         /*
  59          * We pass in the guid instead of the vdev_t since the vdev may
  60          * have been freed prior to the sync task being processed. This
  61          * happens when a vdev is detached as we call spa_config_vdev_exit(),
  62          * stop the initializing thread, schedule the sync task, and free
  63          * the vdev. Later when the scheduled sync task is invoked, it would
  64          * find that the vdev has been freed.
  65          */
  66         uint64_t guid = *(uint64_t *)arg;
  67         uint64_t txg = dmu_tx_get_txg(tx);
  68         kmem_free(arg, sizeof (uint64_t));
  69
  70         vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
  71         if (vd == NULL || vd->vdev_top->vdev_removing ||
  72             !vdev_is_concrete(vd) || vd->vdev_top->vdev_rz_expanding)
  73                 return;
  74
  75         uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK];
  76         vd->vdev_initialize_offset[txg & TXG_MASK] = 0;
  77
  78         VERIFY(vd->vdev_leaf_zap != 0);
  79
  80         objset_t *mos = vd->vdev_spa->spa_meta_objset;
  81
  82         if (last_offset > 0) {
  83                 vd->vdev_initialize_last_offset = last_offset;
  84                 VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
  85                     VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
  86                     sizeof (last_offset), 1, &last_offset, tx));
  87         }
  88         if (vd->vdev_initialize_action_time > 0) {
  89                 uint64_t val = (uint64_t)vd->vdev_initialize_action_time;
  90                 VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
  91                     VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val),
  92                     1, &val, tx));
  93         }
  94
  95         uint64_t initialize_state = vd->vdev_initialize_state;
  96         VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
  97             VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1,
  98             &initialize_state, tx));
  99 }
 100
 101 static void
 102 vdev_initialize_zap_remove_sync(void *arg, dmu_tx_t *tx)
 103 {
 104         uint64_t guid = *(uint64_t *)arg;
 105
 106         kmem_free(arg, sizeof (uint64_t));
 107
 108         vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
 109         if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
 110                 return;
 111
 112         ASSERT3S(vd->vdev_initialize_state, ==, VDEV_INITIALIZE_NONE);
 113         ASSERT3U(vd->vdev_leaf_zap, !=, 0);
 114
 115         vd->vdev_initialize_last_offset = 0;
 116         vd->vdev_initialize_action_time = 0;
 117
 118         objset_t *mos = vd->vdev_spa->spa_meta_objset;
 119         int error;
 120
 121         error = zap_remove(mos, vd->vdev_leaf_zap,
 122             VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, tx);
 123         VERIFY(error == 0 || error == ENOENT);
 124
 125         error = zap_remove(mos, vd->vdev_leaf_zap,
 126             VDEV_LEAF_ZAP_INITIALIZE_STATE, tx);
 127         VERIFY(error == 0 || error == ENOENT);
 128
 129         error = zap_remove(mos, vd->vdev_leaf_zap,
 130             VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, tx);
 131         VERIFY(error == 0 || error == ENOENT);
 132 }
 133
 134 static void
 135 vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
 136 {
 137         ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
 138         spa_t *spa = vd->vdev_spa;
 139
 140         if (new_state == vd->vdev_initialize_state)
 141                 return;
 142
 143         /*
 144          * Copy the vd's guid, this will be freed by the sync task.
 145          */
 146         uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
 147         *guid = vd->vdev_guid;
 148
 149         /*
 150          * If we're suspending, then preserving the original start time.
 151          */
 152         if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
 153                 vd->vdev_initialize_action_time = gethrestime_sec();
 154         }
 155
 156         vdev_initializing_state_t old_state = vd->vdev_initialize_state;
 157         vd->vdev_initialize_state = new_state;
 158
 159         dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 160         VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 161
 162         if (new_state != VDEV_INITIALIZE_NONE) {
 163                 dsl_sync_task_nowait(spa_get_dsl(spa),
 164                     vdev_initialize_zap_update_sync, guid, tx);
 165         } else {
 166                 dsl_sync_task_nowait(spa_get_dsl(spa),
 167                     vdev_initialize_zap_remove_sync, guid, tx);
 168         }
 169
 170         switch (new_state) {
 171         case VDEV_INITIALIZE_ACTIVE:
 172                 spa_history_log_internal(spa, "initialize", tx,
 173                     "vdev=%s activated", vd->vdev_path);
 174                 break;
 175         case VDEV_INITIALIZE_SUSPENDED:
 176                 spa_history_log_internal(spa, "initialize", tx,
 177                     "vdev=%s suspended", vd->vdev_path);
 178                 break;
 179         case VDEV_INITIALIZE_CANCELED:
 180                 if (old_state == VDEV_INITIALIZE_ACTIVE ||
 181                     old_state == VDEV_INITIALIZE_SUSPENDED)
 182                         spa_history_log_internal(spa, "initialize", tx,
 183                             "vdev=%s canceled", vd->vdev_path);
 184                 break;
 185         case VDEV_INITIALIZE_COMPLETE:
 186                 spa_history_log_internal(spa, "initialize", tx,
 187                     "vdev=%s complete", vd->vdev_path);
 188                 break;
 189         case VDEV_INITIALIZE_NONE:
 190                 spa_history_log_internal(spa, "uninitialize", tx,
 191                     "vdev=%s", vd->vdev_path);
 192                 break;
 193         default:
 194                 panic("invalid state %llu", (unsigned long long)new_state);
 195         }
 196
 197         dmu_tx_commit(tx);
 198
 199         if (new_state != VDEV_INITIALIZE_ACTIVE)
 200                 spa_notify_waiters(spa);
 201 }
 202
 203 static void
 204 vdev_initialize_cb(zio_t *zio)
 205 {
 206         vdev_t *vd = zio->io_vd;
 207         mutex_enter(&vd->vdev_initialize_io_lock);
 208         if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
 209                 /*
 210                  * The I/O failed because the vdev was unavailable; roll the
 211                  * last offset back. (This works because spa_sync waits on
 212                  * spa_txg_zio before it runs sync tasks.)
 213                  */
 214                 uint64_t *off =
 215                     &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK];
 216                 *off = MIN(*off, zio->io_offset);
 217         } else {
 218                 /*
 219                  * Since initializing is best-effort, we ignore I/O errors and
 220                  * rely on vdev_probe to determine if the errors are more
 221                  * critical.
 222                  */
 223                 if (zio->io_error != 0)
 224                         vd->vdev_stat.vs_initialize_errors++;
 225
 226                 vd->vdev_initialize_bytes_done += zio->io_orig_size;
 227         }
 228         ASSERT3U(vd->vdev_initialize_inflight, >, 0);
 229         vd->vdev_initialize_inflight--;
 230         cv_broadcast(&vd->vdev_initialize_io_cv);
 231         mutex_exit(&vd->vdev_initialize_io_lock);
 232
 233         spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
 234 }
 235
 236 /* Takes care of physical writing and limiting # of concurrent ZIOs. */
 237 static int
 238 vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data)
 239 {
 240         spa_t *spa = vd->vdev_spa;
 241
 242         /* Limit inflight initializing I/Os */
 243         mutex_enter(&vd->vdev_initialize_io_lock);
 244         while (vd->vdev_initialize_inflight >= zfs_initialize_limit) {
 245                 cv_wait(&vd->vdev_initialize_io_cv,
 246                     &vd->vdev_initialize_io_lock);
 247         }
 248         vd->vdev_initialize_inflight++;
 249         mutex_exit(&vd->vdev_initialize_io_lock);
 250
 251         dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 252         VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 253         uint64_t txg = dmu_tx_get_txg(tx);
 254
 255         spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
 256         mutex_enter(&vd->vdev_initialize_lock);
 257
 258         if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) {
 259                 uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
 260                 *guid = vd->vdev_guid;
 261
 262                 /* This is the first write of this txg. */
 263                 dsl_sync_task_nowait(spa_get_dsl(spa),
 264                     vdev_initialize_zap_update_sync, guid, tx);
 265         }
 266
 267         /*
 268          * We know the vdev struct will still be around since all
 269          * consumers of vdev_free must stop the initialization first.
 270          */
 271         if (vdev_initialize_should_stop(vd)) {
 272                 mutex_enter(&vd->vdev_initialize_io_lock);
 273                 ASSERT3U(vd->vdev_initialize_inflight, >, 0);
 274                 vd->vdev_initialize_inflight--;
 275                 mutex_exit(&vd->vdev_initialize_io_lock);
 276                 spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
 277                 mutex_exit(&vd->vdev_initialize_lock);
 278                 dmu_tx_commit(tx);
 279                 return (SET_ERROR(EINTR));
 280         }
 281         mutex_exit(&vd->vdev_initialize_lock);
 282
 283         vd->vdev_initialize_offset[txg & TXG_MASK] = start + size;
 284         zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start,
 285             size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL,
 286             ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE));
 287         /* vdev_initialize_cb releases SCL_STATE_ALL */
 288
 289         dmu_tx_commit(tx);
 290
 291         return (0);
 292 }
 293
 294 /*
 295  * Callback to fill each ABD chunk with zfs_initialize_value. len must be
 296  * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD
 297  * allocation will guarantee these for us.
 298  */
 299 static int
 300 vdev_initialize_block_fill(void *buf, size_t len, void *unused)
 301 {
 302         (void) unused;
 303
 304         ASSERT0(len % sizeof (uint64_t));
 305         for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) {
 306                 *(uint64_t *)((char *)(buf) + i) = zfs_initialize_value;
 307         }
 308         return (0);
 309 }
 310
 311 static abd_t *
 312 vdev_initialize_block_alloc(void)
 313 {
 314         /* Allocate ABD for filler data */
 315         abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE);
 316
 317         ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t));
 318         (void) abd_iterate_func(data, 0, zfs_initialize_chunk_size,
 319             vdev_initialize_block_fill, NULL);
 320
 321         return (data);
 322 }
 323
 324 static void
 325 vdev_initialize_block_free(abd_t *data)
 326 {
 327         abd_free(data);
 328 }
 329
 330 static int
 331 vdev_initialize_ranges(vdev_t *vd, abd_t *data)
 332 {
 333         range_tree_t *rt = vd->vdev_initialize_tree;
 334         zfs_btree_t *bt = &rt->rt_root;
 335         zfs_btree_index_t where;
 336
 337         for (range_seg_t *rs = zfs_btree_first(bt, &where); rs != NULL;
 338             rs = zfs_btree_next(bt, &where, &where)) {
 339                 uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt);
 340
 341                 /* Split range into legally-sized physical chunks */
 342                 uint64_t writes_required =
 343                     ((size - 1) / zfs_initialize_chunk_size) + 1;
 344
 345                 for (uint64_t w = 0; w < writes_required; w++) {
 346                         int error;
 347
 348                         error = vdev_initialize_write(vd,
 349                             VDEV_LABEL_START_SIZE + rs_get_start(rs, rt) +
 350                             (w * zfs_initialize_chunk_size),
 351                             MIN(size - (w * zfs_initialize_chunk_size),
 352                             zfs_initialize_chunk_size), data);
 353                         if (error != 0)
 354                                 return (error);
 355                 }
 356         }
 357         return (0);
 358 }
 359
 360 static void
 361 vdev_initialize_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs)
 362 {
 363         uint64_t *last_rs_end = (uint64_t *)arg;
 364
 365         if (physical_rs->rs_end > *last_rs_end)
 366                 *last_rs_end = physical_rs->rs_end;
 367 }
 368
 369 static void
 370 vdev_initialize_xlate_progress(void *arg, range_seg64_t *physical_rs)
 371 {
 372         vdev_t *vd = (vdev_t *)arg;
 373
 374         uint64_t size = physical_rs->rs_end - physical_rs->rs_start;
 375         vd->vdev_initialize_bytes_est += size;
 376
 377         if (vd->vdev_initialize_last_offset > physical_rs->rs_end) {
 378                 vd->vdev_initialize_bytes_done += size;
 379         } else if (vd->vdev_initialize_last_offset > physical_rs->rs_start &&
 380             vd->vdev_initialize_last_offset < physical_rs->rs_end) {
 381                 vd->vdev_initialize_bytes_done +=
 382                     vd->vdev_initialize_last_offset - physical_rs->rs_start;
 383         }
 384 }
 385
 386 static void
 387 vdev_initialize_calculate_progress(vdev_t *vd)
 388 {
 389         ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
 390             spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
 391         ASSERT(vd->vdev_leaf_zap != 0);
 392
 393         vd->vdev_initialize_bytes_est = 0;
 394         vd->vdev_initialize_bytes_done = 0;
 395
 396         for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
 397                 metaslab_t *msp = vd->vdev_top->vdev_ms[i];
 398                 mutex_enter(&msp->ms_lock);
 399
 400                 uint64_t ms_free = (msp->ms_size -
 401                     metaslab_allocated_space(msp)) /
 402                     vdev_get_ndisks(vd->vdev_top);
 403
 404                 /*
 405                  * Convert the metaslab range to a physical range
 406                  * on our vdev. We use this to determine if we are
 407                  * in the middle of this metaslab range.
 408                  */
 409                 range_seg64_t logical_rs, physical_rs, remain_rs;
 410                 logical_rs.rs_start = msp->ms_start;
 411                 logical_rs.rs_end = msp->ms_start + msp->ms_size;
 412
 413                 /* Metaslab space after this offset has not been initialized */
 414                 vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs);
 415                 if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) {
 416                         vd->vdev_initialize_bytes_est += ms_free;
 417                         mutex_exit(&msp->ms_lock);
 418                         continue;
 419                 }
 420
 421                 /* Metaslab space before this offset has been initialized */
 422                 uint64_t last_rs_end = physical_rs.rs_end;
 423                 if (!vdev_xlate_is_empty(&remain_rs)) {
 424                         vdev_xlate_walk(vd, &remain_rs,
 425                             vdev_initialize_xlate_last_rs_end, &last_rs_end);
 426                 }
 427
 428                 if (vd->vdev_initialize_last_offset > last_rs_end) {
 429                         vd->vdev_initialize_bytes_done += ms_free;
 430                         vd->vdev_initialize_bytes_est += ms_free;
 431                         mutex_exit(&msp->ms_lock);
 432                         continue;
 433                 }
 434
 435                 /*
 436                  * If we get here, we're in the middle of initializing this
 437                  * metaslab. Load it and walk the free tree for more accurate
 438                  * progress estimation.
 439                  */
 440                 VERIFY0(metaslab_load(msp));
 441
 442                 zfs_btree_index_t where;
 443                 range_tree_t *rt = msp->ms_allocatable;
 444                 for (range_seg_t *rs =
 445                     zfs_btree_first(&rt->rt_root, &where); rs;
 446                     rs = zfs_btree_next(&rt->rt_root, &where,
 447                     &where)) {
 448                         logical_rs.rs_start = rs_get_start(rs, rt);
 449                         logical_rs.rs_end = rs_get_end(rs, rt);
 450
 451                         vdev_xlate_walk(vd, &logical_rs,
 452                             vdev_initialize_xlate_progress, vd);
 453                 }
 454                 mutex_exit(&msp->ms_lock);
 455         }
 456 }
 457
 458 static int
 459 vdev_initialize_load(vdev_t *vd)
 460 {
 461         int err = 0;
 462         ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
 463             spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
 464         ASSERT(vd->vdev_leaf_zap != 0);
 465
 466         if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE ||
 467             vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) {
 468                 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
 469                     vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
 470                     sizeof (vd->vdev_initialize_last_offset), 1,
 471                     &vd->vdev_initialize_last_offset);
 472                 if (err == ENOENT) {
 473                         vd->vdev_initialize_last_offset = 0;
 474                         err = 0;
 475                 }
 476         }
 477
 478         vdev_initialize_calculate_progress(vd);
 479         return (err);
 480 }
 481
 482 static void
 483 vdev_initialize_xlate_range_add(void *arg, range_seg64_t *physical_rs)
 484 {
 485         vdev_t *vd = arg;
 486
 487         /* Only add segments that we have not visited yet */
 488         if (physical_rs->rs_end <= vd->vdev_initialize_last_offset)
 489                 return;
 490
 491         /* Pick up where we left off mid-range. */
 492         if (vd->vdev_initialize_last_offset > physical_rs->rs_start) {
 493                 zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
 494                     "(%llu, %llu)", vd->vdev_path,
 495                     (u_longlong_t)physical_rs->rs_start,
 496                     (u_longlong_t)physical_rs->rs_end,
 497                     (u_longlong_t)vd->vdev_initialize_last_offset,
 498                     (u_longlong_t)physical_rs->rs_end);
 499                 ASSERT3U(physical_rs->rs_end, >,
 500                     vd->vdev_initialize_last_offset);
 501                 physical_rs->rs_start = vd->vdev_initialize_last_offset;
 502         }
 503
 504         ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start);
 505
 506         range_tree_add(vd->vdev_initialize_tree, physical_rs->rs_start,
 507             physical_rs->rs_end - physical_rs->rs_start);
 508 }
 509
 510 /*
 511  * Convert the logical range into a physical range and add it to our
 512  * avl tree.
 513  */
 514 static void
 515 vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
 516 {
 517         vdev_t *vd = arg;
 518         range_seg64_t logical_rs;
 519         logical_rs.rs_start = start;
 520         logical_rs.rs_end = start + size;
 521
 522         ASSERT(vd->vdev_ops->vdev_op_leaf);
 523         vdev_xlate_walk(vd, &logical_rs, vdev_initialize_xlate_range_add, arg);
 524 }
 525
 526 static __attribute__((noreturn)) void
 527 vdev_initialize_thread(void *arg)
 528 {
 529         vdev_t *vd = arg;
 530         spa_t *spa = vd->vdev_spa;
 531         int error = 0;
 532         uint64_t ms_count = 0;
 533
 534         ASSERT(vdev_is_concrete(vd));
 535         spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 536
 537         vd->vdev_initialize_last_offset = 0;
 538         VERIFY0(vdev_initialize_load(vd));
 539
 540         abd_t *deadbeef = vdev_initialize_block_alloc();
 541
 542         vd->vdev_initialize_tree = range_tree_create(NULL, RANGE_SEG64, NULL,
 543             0, 0);
 544
 545         for (uint64_t i = 0; !vd->vdev_detached &&
 546             i < vd->vdev_top->vdev_ms_count; i++) {
 547                 metaslab_t *msp = vd->vdev_top->vdev_ms[i];
 548                 boolean_t unload_when_done = B_FALSE;
 549
 550                 /*
 551                  * If we've expanded the top-level vdev or it's our
 552                  * first pass, calculate our progress.
 553                  */
 554                 if (vd->vdev_top->vdev_ms_count != ms_count) {
 555                         vdev_initialize_calculate_progress(vd);
 556                         ms_count = vd->vdev_top->vdev_ms_count;
 557                 }
 558
 559                 spa_config_exit(spa, SCL_CONFIG, FTAG);
 560                 metaslab_disable(msp);
 561                 mutex_enter(&msp->ms_lock);
 562                 if (!msp->ms_loaded && !msp->ms_loading)
 563                         unload_when_done = B_TRUE;
 564                 VERIFY0(metaslab_load(msp));
 565
 566                 range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
 567                     vd);
 568                 mutex_exit(&msp->ms_lock);
 569
 570                 error = vdev_initialize_ranges(vd, deadbeef);
 571                 metaslab_enable(msp, B_TRUE, unload_when_done);
 572                 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 573
 574                 range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL);
 575                 if (error != 0)
 576                         break;
 577         }
 578
 579         spa_config_exit(spa, SCL_CONFIG, FTAG);
 580         mutex_enter(&vd->vdev_initialize_io_lock);
 581         while (vd->vdev_initialize_inflight > 0) {
 582                 cv_wait(&vd->vdev_initialize_io_cv,
 583                     &vd->vdev_initialize_io_lock);
 584         }
 585         mutex_exit(&vd->vdev_initialize_io_lock);
 586
 587         range_tree_destroy(vd->vdev_initialize_tree);
 588         vdev_initialize_block_free(deadbeef);
 589         vd->vdev_initialize_tree = NULL;
 590
 591         mutex_enter(&vd->vdev_initialize_lock);
 592         if (!vd->vdev_initialize_exit_wanted) {
 593                 if (vdev_writeable(vd)) {
 594                         vdev_initialize_change_state(vd,
 595                             VDEV_INITIALIZE_COMPLETE);
 596                 } else if (vd->vdev_faulted) {
 597                         vdev_initialize_change_state(vd,
 598                             VDEV_INITIALIZE_CANCELED);
 599                 }
 600         }
 601         ASSERT(vd->vdev_initialize_thread != NULL ||
 602             vd->vdev_initialize_inflight == 0);
 603
 604         /*
 605          * Drop the vdev_initialize_lock while we sync out the
 606          * txg since it's possible that a device might be trying to
 607          * come online and must check to see if it needs to restart an
 608          * initialization. That thread will be holding the spa_config_lock
 609          * which would prevent the txg_wait_synced from completing.
 610          */
 611         mutex_exit(&vd->vdev_initialize_lock);
 612         txg_wait_synced(spa_get_dsl(spa), 0);
 613         mutex_enter(&vd->vdev_initialize_lock);
 614
 615         vd->vdev_initialize_thread = NULL;
 616         cv_broadcast(&vd->vdev_initialize_cv);
 617         mutex_exit(&vd->vdev_initialize_lock);
 618
 619         thread_exit();
 620 }
 621
 622 /*
 623  * Initiates a device. Caller must hold vdev_initialize_lock.
 624  * Device must be a leaf and not already be initializing.
 625  */
 626 void
 627 vdev_initialize(vdev_t *vd)
 628 {
 629         ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
 630         ASSERT(vd->vdev_ops->vdev_op_leaf);
 631         ASSERT(vdev_is_concrete(vd));
 632         ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
 633         ASSERT(!vd->vdev_detached);
 634         ASSERT(!vd->vdev_initialize_exit_wanted);
 635         ASSERT(!vd->vdev_top->vdev_removing);
 636         ASSERT(!vd->vdev_top->vdev_rz_expanding);
 637
 638         vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE);
 639         vd->vdev_initialize_thread = thread_create(NULL, 0,
 640             vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
 641 }
 642
 643 /*
 644  * Uninitializes a device. Caller must hold vdev_initialize_lock.
 645  * Device must be a leaf and not already be initializing.
 646  */
 647 void
 648 vdev_uninitialize(vdev_t *vd)
 649 {
 650         ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
 651         ASSERT(vd->vdev_ops->vdev_op_leaf);
 652         ASSERT(vdev_is_concrete(vd));
 653         ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
 654         ASSERT(!vd->vdev_detached);
 655         ASSERT(!vd->vdev_initialize_exit_wanted);
 656         ASSERT(!vd->vdev_top->vdev_removing);
 657
 658         vdev_initialize_change_state(vd, VDEV_INITIALIZE_NONE);
 659 }
 660
 661 /*
 662  * Wait for the initialize thread to be terminated (cancelled or stopped).
 663  */
 664 static void
 665 vdev_initialize_stop_wait_impl(vdev_t *vd)
 666 {
 667         ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
 668
 669         while (vd->vdev_initialize_thread != NULL)
 670                 cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock);
 671
 672         ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
 673         vd->vdev_initialize_exit_wanted = B_FALSE;
 674 }
 675
 676 /*
 677  * Wait for vdev initialize threads which were either to cleanly exit.
 678  */
 679 void
 680 vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list)
 681 {
 682         (void) spa;
 683         vdev_t *vd;
 684
 685         ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
 686             spa->spa_export_thread == curthread);
 687
 688         while ((vd = list_remove_head(vd_list)) != NULL) {
 689                 mutex_enter(&vd->vdev_initialize_lock);
 690                 vdev_initialize_stop_wait_impl(vd);
 691                 mutex_exit(&vd->vdev_initialize_lock);
 692         }
 693 }
 694
 695 /*
 696  * Stop initializing a device, with the resultant initializing state being
 697  * tgt_state.  For blocking behavior pass NULL for vd_list.  Otherwise, when
 698  * a list_t is provided the stopping vdev is inserted in to the list.  Callers
 699  * are then required to call vdev_initialize_stop_wait() to block for all the
 700  * initialization threads to exit.  The caller must hold vdev_initialize_lock
 701  * and must not be writing to the spa config, as the initializing thread may
 702  * try to enter the config as a reader before exiting.
 703  */
 704 void
 705 vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state,
 706     list_t *vd_list)
 707 {
 708         ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER));
 709         ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
 710         ASSERT(vd->vdev_ops->vdev_op_leaf);
 711         ASSERT(vdev_is_concrete(vd));
 712
 713         /*
 714          * Allow cancel requests to proceed even if the initialize thread
 715          * has stopped.
 716          */
 717         if (vd->vdev_initialize_thread == NULL &&
 718             tgt_state != VDEV_INITIALIZE_CANCELED) {
 719                 return;
 720         }
 721
 722         vdev_initialize_change_state(vd, tgt_state);
 723         vd->vdev_initialize_exit_wanted = B_TRUE;
 724
 725         if (vd_list == NULL) {
 726                 vdev_initialize_stop_wait_impl(vd);
 727         } else {
 728                 ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
 729                     vd->vdev_spa->spa_export_thread == curthread);
 730                 list_insert_tail(vd_list, vd);
 731         }
 732 }
 733
 734 static void
 735 vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state,
 736     list_t *vd_list)
 737 {
 738         if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) {
 739                 mutex_enter(&vd->vdev_initialize_lock);
 740                 vdev_initialize_stop(vd, tgt_state, vd_list);
 741                 mutex_exit(&vd->vdev_initialize_lock);
 742                 return;
 743         }
 744
 745         for (uint64_t i = 0; i < vd->vdev_children; i++) {
 746                 vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state,
 747                     vd_list);
 748         }
 749 }
 750
 751 /*
 752  * Convenience function to stop initializing of a vdev tree and set all
 753  * initialize thread pointers to NULL.
 754  */
 755 void
 756 vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
 757 {
 758         spa_t *spa = vd->vdev_spa;
 759         list_t vd_list;
 760
 761         ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
 762             spa->spa_export_thread == curthread);
 763
 764         list_create(&vd_list, sizeof (vdev_t),
 765             offsetof(vdev_t, vdev_initialize_node));
 766
 767         vdev_initialize_stop_all_impl(vd, tgt_state, &vd_list);
 768         vdev_initialize_stop_wait(spa, &vd_list);
 769
 770         if (vd->vdev_spa->spa_sync_on) {
 771                 /* Make sure that our state has been synced to disk */
 772                 txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
 773         }
 774
 775         list_destroy(&vd_list);
 776 }
 777
 778 void
 779 vdev_initialize_restart(vdev_t *vd)
 780 {
 781         ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
 782             vd->vdev_spa->spa_load_thread == curthread);
 783         ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
 784
 785         if (vd->vdev_leaf_zap != 0) {
 786                 mutex_enter(&vd->vdev_initialize_lock);
 787                 uint64_t initialize_state = VDEV_INITIALIZE_NONE;
 788                 int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
 789                     vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE,
 790                     sizeof (initialize_state), 1, &initialize_state);
 791                 ASSERT(err == 0 || err == ENOENT);
 792                 vd->vdev_initialize_state = initialize_state;
 793
 794                 uint64_t timestamp = 0;
 795                 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
 796                     vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME,
 797                     sizeof (timestamp), 1, &timestamp);
 798                 ASSERT(err == 0 || err == ENOENT);
 799                 vd->vdev_initialize_action_time = timestamp;
 800
 801                 if ((vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
 802                     vd->vdev_offline) && !vd->vdev_top->vdev_rz_expanding) {
 803                         /* load progress for reporting, but don't resume */
 804                         VERIFY0(vdev_initialize_load(vd));
 805                 } else if (vd->vdev_initialize_state ==
 806                     VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd) &&
 807                     !vd->vdev_top->vdev_removing &&
 808                     !vd->vdev_top->vdev_rz_expanding &&
 809                     vd->vdev_initialize_thread == NULL) {
 810                         vdev_initialize(vd);
 811                 }
 812
 813                 mutex_exit(&vd->vdev_initialize_lock);
 814         }
 815
 816         for (uint64_t i = 0; i < vd->vdev_children; i++) {
 817                 vdev_initialize_restart(vd->vdev_child[i]);
 818         }
 819 }
 820
 821 EXPORT_SYMBOL(vdev_initialize);
 822 EXPORT_SYMBOL(vdev_uninitialize);
 823 EXPORT_SYMBOL(vdev_initialize_stop);
 824 EXPORT_SYMBOL(vdev_initialize_stop_all);
 825 EXPORT_SYMBOL(vdev_initialize_stop_wait);
 826 EXPORT_SYMBOL(vdev_initialize_restart);
 827
 828 ZFS_MODULE_PARAM(zfs, zfs_, initialize_value, U64, ZMOD_RW,
 829         "Value written during zpool initialize");
 830
 831 ZFS_MODULE_PARAM(zfs, zfs_, initialize_chunk_size, U64, ZMOD_RW,
 832         "Size in bytes of writes by zpool initialize");