/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
int zfs_no_write_throttle = 0;
int zfs_write_limit_shift = 3;			/* 1/8th of physical memory */
int zfs_txg_synctime = 5;			/* target secs to sync a txg */

uint64_t zfs_write_limit_min = 32 << 20;	/* min write limit is 32MB */
uint64_t zfs_write_limit_max = 0;		/* max data payload per txg */
uint64_t zfs_write_limit_inflated = 0;
uint64_t zfs_write_limit_override = 0;

kmutex_t zfs_write_limit_lock;

static pgcnt_t old_physmem = 0;
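
/*
 * A worked example of how these tunables combine (illustrative
 * numbers, not measured defaults): with 4GB of physical memory and
 * zfs_write_limit_shift = 3, dsl_pool_sync() computes
 * zfs_write_limit_max = ptob(physmem) >> 3 = 512MB, inflates it by
 * the pool's worst-case replication factor via spa_get_asize(), and
 * never lets the effective limit fall below zfs_write_limit_min
 * (32MB).  Setting zfs_write_limit_override forces a fixed limit;
 * zfs_no_write_throttle disables write throttling entirely.
 */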
static int
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
{
	uint64_t obj;
	int err;

	err = zap_lookup(dp->dp_meta_objset,
	    dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
	    name, sizeof (obj), 1, &obj);
	if (err)
		return (err);

	return (dsl_dir_open_obj(dp, obj, name, dp, ddp));
}
static dsl_pool_t *
dsl_pool_open_impl(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp;
	blkptr_t *bp = spa_get_rootblkptr(spa);

	dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
	dp->dp_spa = spa;
	dp->dp_meta_rootbp = *bp;
	rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL);
	dp->dp_write_limit = zfs_write_limit_min;
	txg_init(dp, txg);

	txg_list_create(&dp->dp_dirty_datasets,
	    offsetof(dsl_dataset_t, ds_dirty_link));
	txg_list_create(&dp->dp_dirty_dirs,
	    offsetof(dsl_dir_t, dd_dirty_link));
	txg_list_create(&dp->dp_sync_tasks,
	    offsetof(dsl_sync_task_group_t, dstg_node));
	list_create(&dp->dp_synced_datasets, sizeof (dsl_dataset_t),
	    offsetof(dsl_dataset_t, ds_synced_link));

	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&dp->dp_scrub_cancel_lock, NULL, MUTEX_DEFAULT, NULL);

	return (dp);
}
int
dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	objset_impl_t *osi;

	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
	err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &osi);
	if (err)
		goto out;
	dp->dp_meta_objset = &osi->os;

	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
	    &dp->dp_root_dir_obj);
	if (err)
		goto out;

	err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir);
	if (err)
		goto out;

	err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
	if (err)
		goto out;

	if (spa_version(spa) >= SPA_VERSION_ORIGIN) {
		err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
		if (err)
			goto out;
		err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
		    FTAG, &ds);
		if (err == 0) {
			err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
			    dp, &dp->dp_origin_snap);
			dsl_dataset_rele(ds, FTAG);
		}
		dsl_dir_close(dd, dp);
		if (err)
			goto out;
	}

	/* get scrub status */
	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1,
	    &dp->dp_scrub_func);
	if (err == 0) {
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1,
		    &dp->dp_scrub_queue_obj);
		if (err)
			goto out;
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1,
		    &dp->dp_scrub_min_txg);
		if (err)
			goto out;
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
		    &dp->dp_scrub_max_txg);
		if (err)
			goto out;
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
		    &dp->dp_scrub_bookmark);
		if (err)
			goto out;
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
		    &spa->spa_scrub_errors);
		if (err)
			goto out;

		if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) {
			/*
			 * A new-type scrub was in progress on an old
			 * pool.  Restart from the beginning, since the
			 * old software may have changed the pool in the
			 * meantime.
			 */
			dsl_pool_scrub_restart(dp);
		}
	} else {
		/*
		 * It's OK if there is no scrub in progress (and if
		 * there was an I/O error, ignore it).
		 */
		err = 0;
	}

out:
	rw_exit(&dp->dp_config_rwlock);
	if (err)
		dsl_pool_close(dp);
	else
		*dpp = dp;

	return (err);
}
void
dsl_pool_close(dsl_pool_t *dp)
{
	/* drop our references from dsl_pool_open() */

	/*
	 * Since we held the origin_snap from "syncing" context (which
	 * includes pool-opening context), it actually only got a "ref"
	 * and not a hold, so just drop that here.
	 */
	if (dp->dp_origin_snap)
		dsl_dataset_drop_ref(dp->dp_origin_snap, dp);
	if (dp->dp_mos_dir)
		dsl_dir_close(dp->dp_mos_dir, dp);
	if (dp->dp_root_dir)
		dsl_dir_close(dp->dp_root_dir, dp);

	/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
	if (dp->dp_meta_objset)
		dmu_objset_evict(NULL, dp->dp_meta_objset->os);

	txg_list_destroy(&dp->dp_dirty_datasets);
	txg_list_destroy(&dp->dp_dirty_dirs);
	txg_list_destroy(&dp->dp_sync_tasks);
	list_destroy(&dp->dp_synced_datasets);

	arc_flush(dp->dp_spa);
	txg_fini(dp);
	rw_destroy(&dp->dp_config_rwlock);
	mutex_destroy(&dp->dp_lock);
	mutex_destroy(&dp->dp_scrub_cancel_lock);
	if (dp->dp_blkstats)
		kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
	kmem_free(dp, sizeof (dsl_pool_t));
}
dsl_pool_t *
dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
	objset_impl_t *osip;
	dsl_dataset_t *ds;
	uint64_t dsobj;

	/* create and open the MOS (meta-objset) */
	dp->dp_meta_objset = &dmu_objset_create_impl(spa,
	    NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx)->os;

	/* create the pool directory */
	err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
	ASSERT3U(err, ==, 0);

	/* create and open the root dir */
	dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
	VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir));

	/* create and open the meta-objset dir */
	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
	VERIFY(0 == dsl_pool_open_special_dir(dp,
	    MOS_DIR_NAME, &dp->dp_mos_dir));

	if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
		dsl_pool_create_origin(dp, tx);

	/* create the root dataset */
	dsobj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);

	/* create the root objset */
	VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
	osip = dmu_objset_create_impl(dp->dp_spa, ds,
	    dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
#ifdef _KERNEL
	zfs_create_fs(&osip->os, kcred, zplprops, tx);
#endif
	dsl_dataset_rele(ds, FTAG);

	dmu_tx_commit(tx);

	return (dp);
}
void
dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
{
	zio_t *zio;
	dmu_tx_t *tx;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	dsl_sync_task_group_t *dstg;
	objset_impl_t *mosi = dp->dp_meta_objset->os;
	hrtime_t start, write_time;
	uint64_t data_written;
	int err;

	tx = dmu_tx_create_assigned(dp, txg);

	dp->dp_read_overhead = 0;
	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
		if (!list_link_active(&ds->ds_synced_link))
			list_insert_tail(&dp->dp_synced_datasets, ds);
		else
			dmu_buf_rele(ds->ds_dbuf, ds);
		dsl_dataset_sync(ds, zio, tx);
	}
	DTRACE_PROBE(pool_sync__1setup);

	start = gethrtime();
	err = zio_wait(zio);
	write_time = gethrtime() - start;
	ASSERT(err == 0);
	DTRACE_PROBE(pool_sync__2rootzio);

	while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg))
		dsl_sync_task_group_sync(dstg, tx);
	DTRACE_PROBE(pool_sync__3task);

	start = gethrtime();
	while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
		dsl_dir_sync(dd, tx);
	write_time += gethrtime() - start;

	if (spa_sync_pass(dp->dp_spa) == 1)
		dsl_pool_scrub_sync(dp, tx);

	start = gethrtime();
	if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
	    list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) {
		zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
		dmu_objset_sync(mosi, zio, tx);
		err = zio_wait(zio);
		ASSERT(err == 0);
		dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
		spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
	}
	write_time += gethrtime() - start;
	DTRACE_PROBE2(pool_sync__4io, hrtime_t, write_time,
	    hrtime_t, dp->dp_read_overhead);
	write_time -= dp->dp_read_overhead;

	dmu_tx_commit(tx);

	data_written = dp->dp_space_towrite[txg & TXG_MASK];
	dp->dp_space_towrite[txg & TXG_MASK] = 0;
	ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0);

	/*
	 * If the write limit max has not been explicitly set, set it
	 * to a fraction of available physical memory (default 1/8th).
	 * Note that we must inflate the limit because the spa
	 * inflates write sizes to account for data replication.
	 * Check this each sync phase to catch changing memory size.
	 */
	if (physmem != old_physmem && zfs_write_limit_shift) {
		mutex_enter(&zfs_write_limit_lock);
		old_physmem = physmem;
		zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
		zfs_write_limit_inflated = MAX(zfs_write_limit_min,
		    spa_get_asize(dp->dp_spa, zfs_write_limit_max));
		mutex_exit(&zfs_write_limit_lock);
	}

	/*
	 * Attempt to keep the sync time consistent by adjusting the
	 * amount of write traffic allowed into each transaction group.
	 * Weight the throughput calculation towards the current value:
	 *	thru = 3/4 old_thru + 1/4 new_thru
	 */
	ASSERT(zfs_write_limit_min > 0);
	if (data_written > zfs_write_limit_min / 8 && write_time > 0) {
		uint64_t throughput = (data_written * NANOSEC) / write_time;
		if (dp->dp_throughput)
			dp->dp_throughput = throughput / 4 +
			    3 * dp->dp_throughput / 4;
		else
			dp->dp_throughput = throughput;
		dp->dp_write_limit = MIN(zfs_write_limit_inflated,
		    MAX(zfs_write_limit_min,
		    dp->dp_throughput * zfs_txg_synctime));
	}
}
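
/*
 * A worked example of the write-limit feedback above (illustrative
 * numbers only): if 400MB was written and the sync I/O took 2
 * seconds, the new throughput sample is 200MB/s.  With a previous
 * dp_throughput of 100MB/s, the smoothed value becomes
 * 200/4 + 3*100/4 = 125MB/s, and with zfs_txg_synctime = 5 the next
 * txg may accept up to 125MB/s * 5s = 625MB, clamped to the range
 * [zfs_write_limit_min, zfs_write_limit_inflated].
 */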
void
dsl_pool_zil_clean(dsl_pool_t *dp)
{
	dsl_dataset_t *ds;

	while (ds = list_head(&dp->dp_synced_datasets)) {
		list_remove(&dp->dp_synced_datasets, ds);
		ASSERT(ds->ds_user_ptr != NULL);
		zil_clean(((objset_impl_t *)ds->ds_user_ptr)->os_zil);
		dmu_buf_rele(ds->ds_dbuf, ds);
	}
}
/*
 * TRUE if the current thread is the tx_sync_thread or if we
 * are being called from SPA context during pool initialization.
 */
int
dsl_pool_sync_context(dsl_pool_t *dp)
{
	return (curthread == dp->dp_tx.tx_sync_thread ||
	    spa_get_dsl(dp->dp_spa) == NULL);
}
uint64_t
dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
{
	uint64_t space, resv;

	/*
	 * Reserve about 1.6% (1/64), or at least 32MB, for allocation
	 * efficiency.
	 * XXX The intent log is not accounted for, so it must fit
	 * within this slop.
	 *
	 * If we're trying to assess whether it's OK to do a free,
	 * cut the reservation in half to allow forward progress
	 * (e.g. make it possible to rm(1) files from a full pool).
	 */
	space = spa_get_dspace(dp->dp_spa);
	resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
	if (netfree)
		resv >>= 1;

	return (space - resv);
}
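
/*
 * For example (illustrative numbers): on a pool with 1TB of
 * deflated space, resv = MAX(1TB >> 6, SPA_MINDEVSIZE >> 1) =
 * MAX(16GB, 32MB) = 16GB, so callers see 1TB - 16GB of usable
 * space.  When netfree is set (assessing a free), only 8GB is held
 * back, so deletions can still make forward progress on a full pool.
 */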
int
dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx)
{
	uint64_t reserved = 0;
	uint64_t write_limit = (zfs_write_limit_override ?
	    zfs_write_limit_override : dp->dp_write_limit);

	if (zfs_no_write_throttle) {
		atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK],
		    space);
		return (0);
	}

	/*
	 * Check to see if we have exceeded the maximum allowed IO for
	 * this transaction group.  We can do this without locks since
	 * a little slop here is ok.  Note that we do the reserved check
	 * with only half the requested reserve: this is because the
	 * reserve requests are worst-case, and we really don't want to
	 * throttle based off of worst-case estimates.
	 */
	if (write_limit > 0) {
		reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK]
		    + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2;

		if (reserved && reserved > write_limit)
			return (ERESTART);
	}

	atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space);

	/*
	 * If this transaction group is over 7/8ths capacity, delay
	 * the caller 1 clock tick.  This will slow down the "fill"
	 * rate until the sync process can catch up with us.
	 */
	if (reserved && reserved > (write_limit - (write_limit >> 3)))
		txg_delay(dp, tx->tx_txg, 1);

	return (0);
}
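
/*
 * A worked example of the throttle check above (illustrative
 * numbers): with a 512MB write limit, 300MB already in
 * dp_space_towrite and 500MB of worst-case temporary reserves,
 * reserved = 300MB + 500MB/2 = 550MB > 512MB, so the caller gets
 * ERESTART and must retry in a later txg.  At 300MB + 300MB/2 =
 * 450MB the reserve is granted, but since 450MB exceeds 7/8 of
 * 512MB (448MB), the caller is also delayed one clock tick to slow
 * the fill rate.
 */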
void
dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
{
	ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space);
	atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space);
}
void
dsl_pool_memory_pressure(dsl_pool_t *dp)
{
	uint64_t space_inuse = 0;
	int i;

	if (dp->dp_write_limit == zfs_write_limit_min)
		return;

	for (i = 0; i < TXG_SIZE; i++) {
		space_inuse += dp->dp_space_towrite[i];
		space_inuse += dp->dp_tempreserved[i];
	}
	dp->dp_write_limit = MAX(zfs_write_limit_min,
	    MIN(dp->dp_write_limit, space_inuse / 4));
}
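
/*
 * Under memory pressure the limit is pulled down toward 1/4 of the
 * space currently in flight, but never below zfs_write_limit_min.
 * For example (illustrative numbers): with a 512MB limit and 400MB
 * in flight across all txgs, the new limit is
 * MAX(32MB, MIN(512MB, 100MB)) = 100MB.
 */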
void
dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
{
	if (space > 0) {
		mutex_enter(&dp->dp_lock);
		dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space;
		mutex_exit(&dp->dp_lock);
	}
}
/* ARGSUSED */
static int
upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
{
	dmu_tx_t *tx = arg;
	dsl_dataset_t *ds, *prev = NULL;
	int err;
	dsl_pool_t *dp = spa_get_dsl(spa);

	err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
	if (err)
		return (err);

	while (ds->ds_phys->ds_prev_snap_obj != 0) {
		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
		    FTAG, &prev);
		if (err) {
			dsl_dataset_rele(ds, FTAG);
			return (err);
		}

		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object)
			break;
		dsl_dataset_rele(ds, FTAG);
		ds = prev;
		prev = NULL;
	}

	if (prev == NULL) {
		prev = dp->dp_origin_snap;

		/*
		 * The $ORIGIN can't have any data, or the accounting
		 * will be wrong.
		 */
		ASSERT(prev->ds_phys->ds_bp.blk_birth == 0);

		/* The origin doesn't get attached to itself */
		if (ds->ds_object == prev->ds_object) {
			dsl_dataset_rele(ds, FTAG);
			return (0);
		}

		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ds->ds_phys->ds_prev_snap_obj = prev->ds_object;
		ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg;

		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
		ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object;

		dmu_buf_will_dirty(prev->ds_dbuf, tx);
		prev->ds_phys->ds_num_children++;

		if (ds->ds_phys->ds_next_snap_obj == 0) {
			ASSERT(ds->ds_prev == NULL);
			VERIFY(0 == dsl_dataset_hold_obj(dp,
			    ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
		}
	}

	ASSERT(ds->ds_dir->dd_phys->dd_origin_obj == prev->ds_object);
	ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object);

	if (prev->ds_phys->ds_next_clones_obj == 0) {
		prev->ds_phys->ds_next_clones_obj =
		    zap_create(dp->dp_meta_objset,
		    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
	}
	VERIFY(0 == zap_add_int(dp->dp_meta_objset,
	    prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx));

	dsl_dataset_rele(ds, FTAG);
	if (prev != dp->dp_origin_snap)
		dsl_dataset_rele(prev, FTAG);
	return (0);
}
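
/*
 * A sketch of what upgrade_clones_cb() does, using a hypothetical
 * dataset layout: for a clone created before SPA_VERSION_ORIGIN,
 * the walk above follows ds_prev_snap_obj back until it finds a
 * snapshot whose ds_next_snap_obj is not the dataset it came from;
 * that snapshot is the clone's origin, and the dataset is recorded
 * in its ds_next_clones_obj ZAP.  If the walk instead reaches the
 * start of the chain (prev == NULL), the dataset is not a clone,
 * so it is parented to the pool-wide $ORIGIN snapshot.
 */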
void
dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dp->dp_origin_snap != NULL);

	(void) dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb,
	    tx, DS_FIND_CHILDREN);
}
void
dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
{
	uint64_t dsobj;
	dsl_dataset_t *ds;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dp->dp_origin_snap == NULL);

	/* create the origin dir, ds, & snap-ds */
	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
	dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
	    NULL, 0, kcred, tx);
	VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
	dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, kcred, tx);
	VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
	    dp, &dp->dp_origin_snap));
	dsl_dataset_rele(ds, FTAG);
	rw_exit(&dp->dp_config_rwlock);
}