 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Portions Copyright 2011 Martin Matuska
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 */
#include <sys/zfs_context.h>
#include <sys/txg_impl.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/callb.h>
/*
 * ZFS Transaction Groups
 * ----------------------
 *
 * ZFS transaction groups are, as the name implies, groups of transactions
 * that act on persistent state. ZFS asserts consistency at the granularity of
 * these transaction groups. Each successive transaction group (txg) is
 * assigned a 64-bit consecutive identifier. There are three active
 * transaction group states: open, quiescing, or syncing. At any given time,
 * there may be an active txg associated with each state; each active txg may
 * either be processing, or blocked waiting to enter the next state. There may
 * be up to three active txgs, and there is always a txg in the open state
 * (though it may be blocked waiting to enter the quiescing state). In broad
 * strokes, transactions -- operations that change in-memory structures -- are
 * accepted into the txg in the open state, and are completed while the txg is
 * in the open or quiescing states. The accumulated changes are written to
 * disk in the syncing state.
 *
 * Open
 *
 * When a new txg becomes active, it first enters the open state. New
 * transactions -- updates to in-memory structures -- are assigned to the
 * currently open txg. There is always a txg in the open state so that ZFS can
 * accept new changes (though the txg may refuse new changes if it has hit
 * some limit). ZFS advances the open txg to the next state for a variety of
 * reasons such as it hitting a time or size threshold, or the execution of an
 * administrative action that must be completed in the syncing state.
 *
 * Quiescing
 *
 * After a txg exits the open state, it enters the quiescing state. The
 * quiescing state is intended to provide a buffer between accepting new
 * transactions in the open state and writing them out to stable storage in
 * the syncing state. While quiescing, transactions can continue their
 * operation without delaying either of the other states. Typically, a txg is
 * in the quiescing state very briefly since the operations are bounded by
 * software latencies rather than, say, slower I/O latencies. After all
 * transactions complete, the txg is ready to enter the next state.
 *
 * Syncing
 *
 * In the syncing state, the in-memory state built up during the open and (to
 * a lesser degree) the quiescing states is written to stable storage. The
 * process of writing out modified data can, in turn, modify more data. For
 * example, when we write new blocks, we need to allocate space for them; those
 * allocations modify metadata (space maps)... which themselves must be
 * written to stable storage. During the sync state, ZFS iterates, writing out
 * data until it converges and all in-memory changes have been written out.
 * The first such pass is the largest as it encompasses all the modified user
 * data (as opposed to filesystem metadata). Subsequent passes typically have
 * far less data to write as they consist exclusively of filesystem metadata.
 *
 * To ensure convergence, after a certain number of passes ZFS begins
 * overwriting locations on stable storage that had been allocated earlier in
 * the syncing state (and subsequently freed). ZFS usually allocates new
 * blocks to optimize for large, continuous writes. For the syncing state to
 * converge, however, it must complete a pass where no new blocks are allocated
 * since each allocation requires a modification of persistent metadata.
 * Further, to hasten convergence, after a prescribed number of passes, ZFS
 * also defers frees, and stops compressing.
 *
 * In addition to writing out user data, we must also execute synctasks during
 * the syncing context. A synctask is the mechanism by which some
 * administrative activities work such as creating and destroying snapshots or
 * datasets. Note that when a synctask is initiated it enters the open txg,
 * and ZFS then pushes that txg as quickly as possible to completion of the
 * syncing state in order to reduce the latency of the administrative
 * activity. To complete the syncing state, ZFS writes out a new uberblock,
 * the root of the tree of blocks that comprise all state stored on the ZFS
 * pool. Finally, if there is a quiesced txg waiting, we signal that it can
 * now transition to the syncing state.
 */
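
/*
 * Illustrative sketch (not part of this file's logic): one way a change
 * typically travels through the states described above, using the DMU
 * transaction API.  The objset, object, offset, and size below are
 * hypothetical placeholders and error handling is elided.
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_write(tx, object, offset, size);
 *	if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
 *		dmu_tx_abort(tx);	-- never entered an open txg
 *	} else {
 *		... modify in-memory state under this tx ...
 *		dmu_tx_commit(tx);	-- change now rides the open txg
 *	}
 *	txg_wait_synced(dp, 0);		-- block until it reaches stable storage
 */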

static void txg_sync_thread(dsl_pool_t *dp);
static void txg_quiesce_thread(dsl_pool_t *dp);

int zfs_txg_timeout = 5;	/* max seconds worth of delta per txg */

/*
 * Prepare the txg subsystem.
 */
void
txg_init(dsl_pool_t *dp, uint64_t txg)
{
	tx_state_t *tx = &dp->dp_tx;
	int c;

	bzero(tx, sizeof (tx_state_t));

	tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP);

	for (c = 0; c < max_ncpus; c++) {
		int i;

		mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_DEFAULT,
		    NULL);
		for (i = 0; i < TXG_SIZE; i++) {
			cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT,
			    NULL);
			list_create(&tx->tx_cpu[c].tc_callbacks[i],
			    sizeof (dmu_tx_callback_t),
			    offsetof(dmu_tx_callback_t, dcb_node));
		}
	}

	mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL);

	cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL);

	tx->tx_open_txg = txg;
}

/*
 * Close down the txg subsystem.
 */
void
txg_fini(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;
	int c;

	ASSERT(tx->tx_threads == 0);

	mutex_destroy(&tx->tx_sync_lock);

	cv_destroy(&tx->tx_sync_more_cv);
	cv_destroy(&tx->tx_sync_done_cv);
	cv_destroy(&tx->tx_quiesce_more_cv);
	cv_destroy(&tx->tx_quiesce_done_cv);
	cv_destroy(&tx->tx_exit_cv);

	for (c = 0; c < max_ncpus; c++) {
		int i;

		mutex_destroy(&tx->tx_cpu[c].tc_open_lock);
		mutex_destroy(&tx->tx_cpu[c].tc_lock);
		for (i = 0; i < TXG_SIZE; i++) {
			cv_destroy(&tx->tx_cpu[c].tc_cv[i]);
			list_destroy(&tx->tx_cpu[c].tc_callbacks[i]);
		}
	}

	if (tx->tx_commit_cb_taskq != NULL)
		taskq_destroy(tx->tx_commit_cb_taskq);

	kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t));

	bzero(tx, sizeof (tx_state_t));
}

/*
 * Start syncing transaction groups.
 */
void
txg_sync_start(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;

	mutex_enter(&tx->tx_sync_lock);

	dprintf("pool %p\n", dp);

	ASSERT(tx->tx_threads == 0);

	tx->tx_threads = 2;

	tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread,
	    dp, 0, &p0, TS_RUN, minclsyspri);

	/*
	 * The sync thread can need a larger-than-default stack size on
	 * 32-bit x86.  This is due in part to nested pools and
	 * scrub_visitbp() recursion.
	 */
	tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread,
	    dp, 0, &p0, TS_RUN, minclsyspri);

	mutex_exit(&tx->tx_sync_lock);
}

static void
txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr)
{
	CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG);
	mutex_enter(&tx->tx_sync_lock);
}

static void
txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
{
	ASSERT(*tpp != NULL);
	*tpp = NULL;
	tx->tx_threads--;
	cv_broadcast(&tx->tx_exit_cv);
	CALLB_CPR_EXIT(cpr);		/* drops &tx->tx_sync_lock */
	thread_exit();
}

static void
txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time)
{
	CALLB_CPR_SAFE_BEGIN(cpr);

	if (time)
		(void) cv_timedwait(cv, &tx->tx_sync_lock,
		    ddi_get_lbolt() + time);
	else
		cv_wait(cv, &tx->tx_sync_lock);

	CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock);
}

/*
 * Stop syncing transaction groups.
 */
void
txg_sync_stop(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;

	dprintf("pool %p\n", dp);

	/*
	 * Finish off any work in progress.
	 */
	ASSERT(tx->tx_threads == 2);

	/*
	 * We need to ensure that we've vacated the deferred space_maps.
	 */
	txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE);

	/*
	 * Wake all sync threads and wait for them to die.
	 */
	mutex_enter(&tx->tx_sync_lock);

	ASSERT(tx->tx_threads == 2);

	tx->tx_exiting = 1;

	cv_broadcast(&tx->tx_quiesce_more_cv);
	cv_broadcast(&tx->tx_quiesce_done_cv);
	cv_broadcast(&tx->tx_sync_more_cv);

	while (tx->tx_threads != 0)
		cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock);

	tx->tx_exiting = 0;

	mutex_exit(&tx->tx_sync_lock);
}

uint64_t
txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
{
	tx_state_t *tx = &dp->dp_tx;
	tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID];
	uint64_t txg;

	mutex_enter(&tc->tc_open_lock);
	txg = tx->tx_open_txg;

	mutex_enter(&tc->tc_lock);
	tc->tc_count[txg & TXG_MASK]++;
	mutex_exit(&tc->tc_lock);

	th->th_cpu = tc;
	th->th_txg = txg;

	return (txg);
}

void
txg_rele_to_quiesce(txg_handle_t *th)
{
	tx_cpu_t *tc = th->th_cpu;

	ASSERT(!MUTEX_HELD(&tc->tc_lock));
	mutex_exit(&tc->tc_open_lock);
}

void
txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks)
{
	tx_cpu_t *tc = th->th_cpu;
	int g = th->th_txg & TXG_MASK;

	mutex_enter(&tc->tc_lock);
	list_move_tail(&tc->tc_callbacks[g], tx_callbacks);
	mutex_exit(&tc->tc_lock);
}

void
txg_rele_to_sync(txg_handle_t *th)
{
	tx_cpu_t *tc = th->th_cpu;
	int g = th->th_txg & TXG_MASK;

	mutex_enter(&tc->tc_lock);
	ASSERT(tc->tc_count[g] != 0);
	if (--tc->tc_count[g] == 0)
		cv_broadcast(&tc->tc_cv[g]);
	mutex_exit(&tc->tc_lock);

	th->th_cpu = NULL;	/* defensive */
}
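
/*
 * Illustrative sketch (hypothetical caller): the hold/release pairs above
 * bracket a transaction's residence in the open txg.  In the DMU,
 * dmu_tx_assign() performs roughly the first two steps and dmu_tx_commit()
 * the last.
 *
 *	txg_handle_t th;
 *	uint64_t txg = txg_hold_open(dp, &th);	-- also holds tc_open_lock
 *	txg_rele_to_quiesce(&th);	-- drop tc_open_lock, keep the count
 *	... make in-memory changes for txg ...
 *	txg_rele_to_sync(&th);		-- count drops; txg may now quiesce
 */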

/*
 * Blocks until all transactions in the group are committed.
 *
 * On return, the transaction group has reached a stable state in which it can
 * then be passed off to the syncing context.
 */
static void
txg_quiesce(dsl_pool_t *dp, uint64_t txg)
{
	tx_state_t *tx = &dp->dp_tx;
	int g = txg & TXG_MASK;
	int c;

	/*
	 * Grab all tc_open_locks so nobody else can get into this txg.
	 */
	for (c = 0; c < max_ncpus; c++)
		mutex_enter(&tx->tx_cpu[c].tc_open_lock);

	ASSERT(txg == tx->tx_open_txg);
	tx->tx_open_txg++;
	tx->tx_open_time = gethrtime();

	DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg);
	DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg);

	/*
	 * Now that we've incremented tx_open_txg, we can let threads
	 * enter the next transaction group.
	 */
	for (c = 0; c < max_ncpus; c++)
		mutex_exit(&tx->tx_cpu[c].tc_open_lock);

	/*
	 * Quiesce the transaction group by waiting for everyone to txg_exit().
	 */
	for (c = 0; c < max_ncpus; c++) {
		tx_cpu_t *tc = &tx->tx_cpu[c];
		mutex_enter(&tc->tc_lock);
		while (tc->tc_count[g] != 0)
			cv_wait(&tc->tc_cv[g], &tc->tc_lock);
		mutex_exit(&tc->tc_lock);
	}
}

static void
txg_do_callbacks(list_t *cb_list)
{
	dmu_tx_do_callbacks(cb_list, 0);

	list_destroy(cb_list);

	kmem_free(cb_list, sizeof (list_t));
}

/*
 * Dispatch the commit callbacks registered on this txg to worker threads.
 *
 * If no callbacks are registered for a given TXG, nothing happens.
 * This function creates a taskq for the associated pool, if needed.
 */
static void
txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
{
	int c;
	tx_state_t *tx = &dp->dp_tx;
	list_t *cb_list;

	for (c = 0; c < max_ncpus; c++) {
		tx_cpu_t *tc = &tx->tx_cpu[c];
		/*
		 * No need to lock tx_cpu_t at this point, since this can
		 * only be called once a txg has been synced.
		 */

		int g = txg & TXG_MASK;

		if (list_is_empty(&tc->tc_callbacks[g]))
			continue;

		if (tx->tx_commit_cb_taskq == NULL) {
			/*
			 * Commit callback taskq hasn't been created yet.
			 */
			tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb",
			    max_ncpus, minclsyspri, max_ncpus, max_ncpus * 2,
			    TASKQ_PREPOPULATE);
		}

		cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
		list_create(cb_list, sizeof (dmu_tx_callback_t),
		    offsetof(dmu_tx_callback_t, dcb_node));

		list_move_tail(cb_list, &tc->tc_callbacks[g]);

		(void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *)
		    txg_do_callbacks, cb_list, TQ_SLEEP);
	}
}
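
/*
 * Illustrative sketch (hypothetical callback): the callbacks dispatched above
 * are registered against an open transaction by consumers that want to know
 * when their change has reached stable storage.
 *
 *	static void
 *	my_commit_cb(void *arg, int error)  -- error != 0 if the txg was aborted
 *	{
 *		...
 *	}
 *
 *	dmu_tx_callback_register(tx, my_commit_cb, arg);
 *	dmu_tx_commit(tx);		-- my_commit_cb runs after this txg syncs
 */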

static void
txg_sync_thread(dsl_pool_t *dp)
{
	spa_t *spa = dp->dp_spa;
	tx_state_t *tx = &dp->dp_tx;
	callb_cpr_t cpr;
	uint64_t start, delta;

	txg_thread_enter(tx, &cpr);

	start = delta = 0;
	for (;;) {
		uint64_t timeout = zfs_txg_timeout * hz;
		uint64_t timer;
		uint64_t txg;

		/*
		 * We sync when we're scanning, there's someone waiting
		 * on us, or the quiesce thread has handed off a txg to
		 * us, or we have reached our timeout.
		 */
		timer = (delta >= timeout ? 0 : timeout - delta);
		while (!dsl_scan_active(dp->dp_scan) &&
		    !tx->tx_exiting && timer > 0 &&
		    tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
		    tx->tx_quiesced_txg == 0 &&
		    dp->dp_dirty_total < zfs_dirty_data_sync) {
			dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
			    tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
			txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
			delta = ddi_get_lbolt() - start;
			timer = (delta > timeout ? 0 : timeout - delta);
		}

		/*
		 * Wait until the quiesce thread hands off a txg to us,
		 * prompting it to do so if necessary.
		 */
		while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) {
			if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
				tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
			cv_broadcast(&tx->tx_quiesce_more_cv);
			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
		}

		if (tx->tx_exiting)
			txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);

		/*
		 * Consume the quiesced txg which has been handed off to
		 * us.  This may cause the quiescing thread to now be
		 * able to quiesce another txg, so we must signal it.
		 */
		txg = tx->tx_quiesced_txg;
		tx->tx_quiesced_txg = 0;
		tx->tx_syncing_txg = txg;
		DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg);
		cv_broadcast(&tx->tx_quiesce_more_cv);

		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
		    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
		mutex_exit(&tx->tx_sync_lock);

		start = ddi_get_lbolt();
		spa_sync(spa, txg);
		delta = ddi_get_lbolt() - start;

		mutex_enter(&tx->tx_sync_lock);
		tx->tx_synced_txg = txg;
		tx->tx_syncing_txg = 0;
		DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg);
		cv_broadcast(&tx->tx_sync_done_cv);

		/*
		 * Dispatch commit callbacks to worker threads.
		 */
		txg_dispatch_callbacks(dp, txg);
	}
}

static void
txg_quiesce_thread(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;
	callb_cpr_t cpr;

	txg_thread_enter(tx, &cpr);

	for (;;) {
		uint64_t txg;

		/*
		 * We quiesce when there's someone waiting on us.
		 * However, we can only have one txg in "quiescing" or
		 * "quiesced, waiting to sync" state.  So we wait until
		 * the "quiesced, waiting to sync" txg has been consumed
		 * by the sync thread.
		 */
		while (!tx->tx_exiting &&
		    (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
		    tx->tx_quiesced_txg != 0))
			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);

		if (tx->tx_exiting)
			txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread);

		txg = tx->tx_open_txg;
		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
		    txg, tx->tx_quiesce_txg_waiting,
		    tx->tx_sync_txg_waiting);
		mutex_exit(&tx->tx_sync_lock);
		txg_quiesce(dp, txg);
		mutex_enter(&tx->tx_sync_lock);

		/*
		 * Hand this txg off to the sync thread.
		 */
		dprintf("quiesce done, handing off txg %llu\n", txg);
		tx->tx_quiesced_txg = txg;
		DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg);
		cv_broadcast(&tx->tx_sync_more_cv);
		cv_broadcast(&tx->tx_quiesce_done_cv);
	}
}

/*
 * Delay this thread by delay nanoseconds if we are still in the open
 * transaction group and there is already a waiting txg quiescing or quiesced.
 * Abort the delay if this txg stalls or enters the quiescing state.
 */
void
txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution)
{
	tx_state_t *tx = &dp->dp_tx;
	hrtime_t start = gethrtime();

	/* don't delay if this txg could transition to quiescing immediately */
	if (tx->tx_open_txg > txg ||
	    tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1)
		return;

	mutex_enter(&tx->tx_sync_lock);
	if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) {
		mutex_exit(&tx->tx_sync_lock);
		return;
	}

	while (gethrtime() - start < delay &&
	    tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) {
		(void) cv_timedwait_hires(&tx->tx_quiesce_more_cv,
		    &tx->tx_sync_lock, delay, resolution, 0);
	}

	mutex_exit(&tx->tx_sync_lock);
}
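
/*
 * Illustrative sketch (hypothetical values): the DMU write throttle uses
 * txg_delay() to slow a dirtying thread when too much dirty data has
 * accumulated, e.g.
 *
 *	txg_delay(dp, tx->tx_txg, MSEC2NSEC(10), MSEC2NSEC(10));
 *
 * The delay is bounded: it ends early if this txg stalls or can enter the
 * quiescing state.
 */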

void
txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
{
	tx_state_t *tx = &dp->dp_tx;

	ASSERT(!dsl_pool_config_held(dp));

	mutex_enter(&tx->tx_sync_lock);
	ASSERT(tx->tx_threads == 2);
	if (txg == 0)
		txg = tx->tx_open_txg + TXG_DEFER_SIZE;
	if (tx->tx_sync_txg_waiting < txg)
		tx->tx_sync_txg_waiting = txg;
	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
	    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
	while (tx->tx_synced_txg < txg) {
		dprintf("broadcasting sync more "
		    "tx_synced=%llu waiting=%llu dp=%p\n",
		    tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
		cv_broadcast(&tx->tx_sync_more_cv);
		cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
	}
	mutex_exit(&tx->tx_sync_lock);
}
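
/*
 * Illustrative sketch: a txg argument of 0 above means "whatever is currently
 * open plus the deferred window", which is the common way callers force
 * pending changes out to stable storage:
 *
 *	txg_wait_synced(dp, 0);		-- wait for everything dirty right now
 *	txg_wait_synced(dp, txg);	-- or wait for one specific txg
 */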

void
txg_wait_open(dsl_pool_t *dp, uint64_t txg)
{
	tx_state_t *tx = &dp->dp_tx;

	ASSERT(!dsl_pool_config_held(dp));

	mutex_enter(&tx->tx_sync_lock);
	ASSERT(tx->tx_threads == 2);
	if (txg == 0)
		txg = tx->tx_open_txg + 1;
	if (tx->tx_quiesce_txg_waiting < txg)
		tx->tx_quiesce_txg_waiting = txg;
	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
	    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
	while (tx->tx_open_txg < txg) {
		cv_broadcast(&tx->tx_quiesce_more_cv);
		cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
	}
	mutex_exit(&tx->tx_sync_lock);
}

/*
 * If there isn't a txg syncing or in the pipeline, push another txg through
 * the pipeline by quiescing the open txg.
 */
void
txg_kick(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;

	ASSERT(!dsl_pool_config_held(dp));

	mutex_enter(&tx->tx_sync_lock);
	if (tx->tx_syncing_txg == 0 &&
	    tx->tx_quiesce_txg_waiting <= tx->tx_open_txg &&
	    tx->tx_sync_txg_waiting <= tx->tx_synced_txg &&
	    tx->tx_quiesced_txg <= tx->tx_synced_txg) {
		tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1;
		cv_broadcast(&tx->tx_quiesce_more_cv);
	}
	mutex_exit(&tx->tx_sync_lock);
}

boolean_t
txg_stalled(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;
	return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg);
}

boolean_t
txg_sync_waiting(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;

	return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting ||
	    tx->tx_quiesced_txg != 0);
}

/*
 * Verify that this txg is active (open, quiescing, syncing).  Non-active
 * txg's should not be manipulated.
 */
void
txg_verify(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa_get_dsl(spa);
	if (txg <= TXG_INITIAL || txg == ZILTEST_TXG)
		return;
	ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
	ASSERT3U(txg, >=, dp->dp_tx.tx_synced_txg);
	ASSERT3U(txg, >=, dp->dp_tx.tx_open_txg - TXG_CONCURRENT_STATES);
}

/*
 * Per-txg object lists.
 */
void
txg_list_create(txg_list_t *tl, spa_t *spa, size_t offset)
{
	int t;

	mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL);

	tl->tl_offset = offset;
	tl->tl_spa = spa;

	for (t = 0; t < TXG_SIZE; t++)
		tl->tl_head[t] = NULL;
}

void
txg_list_destroy(txg_list_t *tl)
{
	int t;

	for (t = 0; t < TXG_SIZE; t++)
		ASSERT(txg_list_empty(tl, t));

	mutex_destroy(&tl->tl_lock);
}

boolean_t
txg_list_empty(txg_list_t *tl, uint64_t txg)
{
	txg_verify(tl->tl_spa, txg);
	return (tl->tl_head[txg & TXG_MASK] == NULL);
}

/*
 * Returns true if all txg lists are empty.
 *
 * Warning: this is inherently racy (an item could be added immediately
 * after this function returns).  We don't bother with the lock because
 * it wouldn't change the semantics.
 */
boolean_t
txg_all_lists_empty(txg_list_t *tl)
{
	for (int i = 0; i < TXG_SIZE; i++) {
		if (!txg_list_empty(tl, i)) {
			return (B_FALSE);
		}
	}
	return (B_TRUE);
}

/*
 * Add an entry to the list (unless it's already on the list).
 * Returns B_TRUE if it was actually added.
 */
boolean_t
txg_list_add(txg_list_t *tl, void *p, uint64_t txg)
{
	int t = txg & TXG_MASK;
	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
	boolean_t add;

	txg_verify(tl->tl_spa, txg);
	mutex_enter(&tl->tl_lock);
	add = (tn->tn_member[t] == 0);
	if (add) {
		tn->tn_member[t] = 1;
		tn->tn_next[t] = tl->tl_head[t];
		tl->tl_head[t] = tn;
	}
	mutex_exit(&tl->tl_lock);

	return (add);
}
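
/*
 * Illustrative sketch: how sync code typically uses these lists -- tag an
 * object as dirty in the currently open txg, then drain that txg's bucket
 * while syncing that txg (the dirty dsl_dir list is one such user):
 *
 *	(void) txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg);
 *	...
 *	while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL)
 *		dsl_dir_sync(dd, tx);
 */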

/*
 * Add an entry to the end of the list, unless it's already on the list.
 * (walks list to find end)
 * Returns B_TRUE if it was actually added.
 */
boolean_t
txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg)
{
	int t = txg & TXG_MASK;
	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
	boolean_t add;

	txg_verify(tl->tl_spa, txg);
	mutex_enter(&tl->tl_lock);
	add = (tn->tn_member[t] == 0);
	if (add) {
		txg_node_t **tp;

		for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t])
			continue;

		tn->tn_member[t] = 1;
		tn->tn_next[t] = NULL;
		*tp = tn;
	}
	mutex_exit(&tl->tl_lock);

	return (add);
}

/*
 * Remove the head of the list and return it.
 */
void *
txg_list_remove(txg_list_t *tl, uint64_t txg)
{
	int t = txg & TXG_MASK;
	txg_node_t *tn;
	void *p = NULL;

	txg_verify(tl->tl_spa, txg);
	mutex_enter(&tl->tl_lock);
	if ((tn = tl->tl_head[t]) != NULL) {
		p = (char *)tn - tl->tl_offset;
		tl->tl_head[t] = tn->tn_next[t];
		tn->tn_next[t] = NULL;
		tn->tn_member[t] = 0;
	}
	mutex_exit(&tl->tl_lock);

	return (p);
}

/*
 * Remove a specific item from the list and return it.
 */
void *
txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg)
{
	int t = txg & TXG_MASK;
	txg_node_t *tn, **tp;

	txg_verify(tl->tl_spa, txg);
	mutex_enter(&tl->tl_lock);

	for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) {
		if ((char *)tn - tl->tl_offset == p) {
			*tp = tn->tn_next[t];
			tn->tn_next[t] = NULL;
			tn->tn_member[t] = 0;
			mutex_exit(&tl->tl_lock);
			return (p);
		}
	}

	mutex_exit(&tl->tl_lock);

	return (NULL);
}

boolean_t
txg_list_member(txg_list_t *tl, void *p, uint64_t txg)
{
	int t = txg & TXG_MASK;
	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);

	txg_verify(tl->tl_spa, txg);
	return (tn->tn_member[t] != 0);
}

/*
 * Walk a txg list -- only safe if you know it's not changing.
 */
void *
txg_list_head(txg_list_t *tl, uint64_t txg)
{
	int t = txg & TXG_MASK;
	txg_node_t *tn = tl->tl_head[t];

	txg_verify(tl->tl_spa, txg);
	return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
}

void *
txg_list_next(txg_list_t *tl, void *p, uint64_t txg)
{
	int t = txg & TXG_MASK;
	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);

	txg_verify(tl->tl_spa, txg);
	tn = tn->tn_next[t];

	return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
}
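
/*
 * Illustrative sketch: walking a per-txg list with the accessors above
 * (only safe while the list is known not to be changing):
 *
 *	for (void *p = txg_list_head(tl, txg); p != NULL;
 *	    p = txg_list_next(tl, p, txg)) {
 *		... examine p ...
 *	}
 */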