/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>

uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
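
/*
 * For scale: with the default metaslab_aliquot of 512K, a top-level vdev
 * with four children gets mg_aliquot = 4 * 512K = 2M in
 * metaslab_group_create(), which is roughly how much is allocated from
 * that group before the rotor moves on (see metaslab_alloc_dva()).
 */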

/*
 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */
metaslab_class_t *
metaslab_class_create(void)
{
	metaslab_class_t *mc;

	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

	mc->mc_rotor = NULL;

	return (mc);
}

void
metaslab_class_destroy(metaslab_class_t *mc)
{
	metaslab_group_t *mg;

	while ((mg = mc->mc_rotor) != NULL) {
		metaslab_class_remove(mc, mg);
		metaslab_group_destroy(mg);
	}

	kmem_free(mc, sizeof (metaslab_class_t));
}

void
metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg)
{
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(mg->mg_class == NULL);

	if ((mgprev = mc->mc_rotor) == NULL) {
		mg->mg_prev = mg;
		mg->mg_next = mg;
	} else {
		mgnext = mgprev->mg_next;

		mg->mg_prev = mgprev;
		mg->mg_next = mgnext;
		mgprev->mg_next = mg;
		mgnext->mg_prev = mg;
	}

	mc->mc_rotor = mg;
	mg->mg_class = mc;
}

void
metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg)
{
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(mg->mg_class == mc);

	mgprev = mg->mg_prev;
	mgnext = mg->mg_next;

	if (mg == mgnext) {
		mc->mc_rotor = NULL;
	} else {
		mc->mc_rotor = mgnext;
		mgprev->mg_next = mgnext;
		mgnext->mg_prev = mgprev;
	}

	mg->mg_prev = NULL;
	mg->mg_next = NULL;
	mg->mg_class = NULL;
}

/*
 * ==========================================================================
 * Metaslab groups
 * ==========================================================================
 */
static int
metaslab_compare(const void *x1, const void *x2)
{
	const metaslab_t *m1 = x1;
	const metaslab_t *m2 = x2;

	if (m1->ms_weight < m2->ms_weight)
		return (1);
	if (m1->ms_weight > m2->ms_weight)
		return (-1);

	/*
	 * If the weights are identical, use the offset to force uniqueness.
	 */
	if (m1->ms_map.sm_start < m2->ms_map.sm_start)
		return (-1);
	if (m1->ms_map.sm_start > m2->ms_map.sm_start)
		return (1);

	ASSERT3P(m1, ==, m2);

	return (0);
}

metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
{
	metaslab_group_t *mg;

	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
	mg->mg_aliquot = metaslab_aliquot * MAX(1, vd->vdev_children);
	mg->mg_vd = vd;
	metaslab_class_add(mc, mg);

	return (mg);
}

void
metaslab_group_destroy(metaslab_group_t *mg)
{
	avl_destroy(&mg->mg_metaslab_tree);
	mutex_destroy(&mg->mg_lock);
	kmem_free(mg, sizeof (metaslab_group_t));
}

static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == NULL);
	msp->ms_group = mg;
	msp->ms_weight = 0;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	/*
	 * Although in principle the weight can be any value, in
	 * practice we do not use values in the range [1, 510].
	 */
	ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}
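
/*
 * For reference: SPA_MINBLOCKSIZE is 512, so the weights passed in here
 * are either 0 or at least 511; the range [1, 510] mentioned above simply
 * never occurs in practice.
 */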

/*
 * ==========================================================================
 * The first-fit block allocator
 * ==========================================================================
 */
static void
metaslab_ff_load(space_map_t *sm)
{
	ASSERT(sm->sm_ppd == NULL);
	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
}

static void
metaslab_ff_unload(space_map_t *sm)
{
	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
	sm->sm_ppd = NULL;
}

static uint64_t
metaslab_ff_alloc(space_map_t *sm, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	uint64_t align = size & -size;
	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
	space_seg_t *ss, ssearch;
	avl_index_t where;

	ssearch.ss_start = *cursor;
	ssearch.ss_end = *cursor + size;

	ss = avl_find(t, &ssearch, &where);
	if (ss == NULL)
		ss = avl_nearest(t, where, AVL_AFTER);

	while (ss != NULL) {
		uint64_t offset = P2ROUNDUP(ss->ss_start, align);

		if (offset + size <= ss->ss_end) {
			*cursor = offset + size;
			return (offset);
		}
		ss = AVL_NEXT(t, ss);
	}

	/*
	 * If we know we've searched the whole map (*cursor == 0), give up.
	 * Otherwise, reset the cursor to the beginning and try again.
	 */
	if (*cursor == 0)
		return (-1ULL);

	*cursor = 0;
	return (metaslab_ff_alloc(sm, size));
}
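
/*
 * Example of the cursor arithmetic above: a 4K request has
 * align = size & -size = 4K and highbit(4K) == 13, so it advances cursor
 * slot 12 of the 64-slot array hung off sm_ppd.  Each power-of-two
 * alignment class thus keeps its own first-fit cursor, so differently
 * sized allocations don't keep rescanning the same segments.
 */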

/* ARGSUSED */
static void
metaslab_ff_claim(space_map_t *sm, uint64_t start, uint64_t size)
{
	/* No need to update cursor */
}

/* ARGSUSED */
static void
metaslab_ff_free(space_map_t *sm, uint64_t start, uint64_t size)
{
	/* No need to update cursor */
}

static space_map_ops_t metaslab_ff_ops = {
	metaslab_ff_load,
	metaslab_ff_unload,
	metaslab_ff_alloc,
	metaslab_ff_claim,
	metaslab_ff_free
};
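
/*
 * This table is the allocation policy handed to space_map_load() by
 * metaslab_activate() below; the space map code dispatches through it
 * to the first-fit routines above for load, unload, alloc, claim and free.
 */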

/*
 * ==========================================================================
 * Metaslabs
 * ==========================================================================
 */
metaslab_t *
metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
    uint64_t start, uint64_t size, uint64_t txg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_t *msp;

	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
	mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);

	msp->ms_smo_syncing = *smo;

	/*
	 * We create the main space map here, but we don't create the
	 * allocmaps and freemaps until metaslab_sync_done().  This serves
	 * two purposes: it allows metaslab_sync_done() to detect the
	 * addition of new space; and for debugging, it ensures that we'd
	 * data fault on any attempt to use this metaslab before it's ready.
	 */
	space_map_create(&msp->ms_map, start, size,
	    vd->vdev_ashift, &msp->ms_lock);

	metaslab_group_add(mg, msp);

	/*
	 * If we're opening an existing pool (txg == 0) or creating
	 * a new one (txg == TXG_INITIAL), all space is available now.
	 * If we're adding space to an existing pool, the new space
	 * does not become available until after this txg has synced.
	 */
	if (txg <= TXG_INITIAL)
		metaslab_sync_done(msp, 0);

	if (txg != 0) {
		/*
		 * The vdev is dirty, but the metaslab isn't -- it just needs
		 * to have metaslab_sync_done() invoked from vdev_sync_done().
		 * [We could just dirty the metaslab, but that would cause us
		 * to allocate a space map object for it, which is wasteful
		 * and would mess up the locality logic in metaslab_weight().]
		 */
		ASSERT(TXG_CLEAN(txg) == spa_last_synced_txg(vd->vdev_spa));
		vdev_dirty(vd, 0, NULL, txg);
		vdev_dirty(vd, VDD_METASLAB, msp, TXG_CLEAN(txg));
	}

	return (msp);
}

void
metaslab_fini(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;
	int t;

	vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
	    -msp->ms_smo.smo_alloc, B_TRUE);

	metaslab_group_remove(mg, msp);

	mutex_enter(&msp->ms_lock);

	space_map_unload(&msp->ms_map);
	space_map_destroy(&msp->ms_map);

	for (t = 0; t < TXG_SIZE; t++) {
		space_map_destroy(&msp->ms_allocmap[t]);
		space_map_destroy(&msp->ms_freemap[t]);
	}

	mutex_exit(&msp->ms_lock);
	mutex_destroy(&msp->ms_lock);

	kmem_free(msp, sizeof (metaslab_t));
}

#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
#define	METASLAB_ACTIVE_MASK		\
	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
#define	METASLAB_SMO_BONUS_MULTIPLIER	2
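
/*
 * Informally, the weight is laid out as follows: the low bits carry the
 * free-space/bandwidth value computed in metaslab_weight(), while bits 63
 * and 62 flag a metaslab that is currently active as a primary or
 * secondary allocation target.  Since metaslab_compare() sorts on the
 * full 64-bit weight, setting either activation bit floats a metaslab
 * ahead of every inactive one.
 */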

static uint64_t
metaslab_weight(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;
	space_map_t *sm = &msp->ms_map;
	space_map_obj_t *smo = &msp->ms_smo;
	vdev_t *vd = mg->mg_vd;
	uint64_t weight, space;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * The baseline weight is the metaslab's free space.
	 */
	space = sm->sm_size - smo->smo_alloc;
	weight = space;

	/*
	 * Modern disks have uniform bit density and constant angular velocity.
	 * Therefore, the outer recording zones are faster (higher bandwidth)
	 * than the inner zones by the ratio of outer to inner track diameter,
	 * which is typically around 2:1.  We account for this by assigning
	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
	 * In effect, this means that we'll select the metaslab with the most
	 * free bandwidth rather than simply the one with the most free space.
	 */
	weight = 2 * weight -
	    ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
	ASSERT(weight >= space && weight <= 2 * space);
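
	/*
	 * Worked example: on a vdev with 100 metaslabs, the outermost
	 * metaslab (index 0) gets weight = 2 * space, the middle one about
	 * 1.5 * space, and the innermost just over 1 * space -- matching
	 * the ~2:1 outer-to-inner bandwidth ratio described above.
	 */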

	/*
	 * For locality, assign higher weight to metaslabs we've used before.
	 */
	if (smo->smo_object != 0)
		weight *= METASLAB_SMO_BONUS_MULTIPLIER;
	ASSERT(weight >= space &&
	    weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space);

	/*
	 * If this metaslab is one we're actively using, adjust its weight to
	 * make it preferable to any inactive metaslab so we'll polish it off.
	 */
	weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);

	return (weight);
}

static int
metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
{
	space_map_t *sm = &msp->ms_map;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
		int error = space_map_load(sm, &metaslab_ff_ops,
		    SM_FREE, &msp->ms_smo,
		    msp->ms_group->mg_vd->vdev_spa->spa_meta_objset);
		if (error) {
			metaslab_group_sort(msp->ms_group, msp, 0);
			return (error);
		}
		metaslab_group_sort(msp->ms_group, msp,
		    msp->ms_weight | activation_weight);
	}
	ASSERT(sm->sm_loaded);
	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);

	return (0);
}

static void
metaslab_passivate(metaslab_t *msp, uint64_t size)
{
	/*
	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
	 * this metaslab again.  In that case, it had better be empty,
	 * or we would be leaving space on the table.
	 */
	ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0);
	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
}

/*
 * Write a metaslab to disk in the context of the specified transaction group.
 */
void
metaslab_sync(metaslab_t *msp, uint64_t txg)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_meta_objset;
	space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
	space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
	space_map_t *sm = &msp->ms_map;
	space_map_obj_t *smo = &msp->ms_smo_syncing;
	dmu_buf_t *db;
	dmu_tx_t *tx;
	int t;

	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

	/*
	 * The only state that can actually be changing concurrently with
	 * metaslab_sync() is the metaslab's ms_map.  No other thread can
	 * be modifying this txg's allocmap, freemap, freed_map, or smo.
	 * Therefore, we only hold ms_lock to satisfy space_map ASSERTs.
	 * We drop it whenever we call into the DMU, because the DMU
	 * can call down to us (e.g. via zio_free()) at any time.
	 */
	mutex_enter(&msp->ms_lock);

	if (smo->smo_object == 0) {
		ASSERT(smo->smo_objsize == 0);
		ASSERT(smo->smo_alloc == 0);
		mutex_exit(&msp->ms_lock);
		smo->smo_object = dmu_object_alloc(mos,
		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
		ASSERT(smo->smo_object != 0);
		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
		    (sm->sm_start >> vd->vdev_ms_shift),
		    sizeof (uint64_t), &smo->smo_object, tx);
		mutex_enter(&msp->ms_lock);
	}

	space_map_walk(freemap, space_map_add, freed_map);

	if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >=
	    2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) {
		/*
		 * The in-core space map representation is twice as compact
		 * as the on-disk one, so it's time to condense the latter
		 * by generating a pure allocmap from first principles.
		 *
		 * This metaslab is 100% allocated,
		 * minus the content of the in-core map (sm),
		 * minus what's been freed this txg (freed_map),
		 * minus allocations from txgs in the future
		 * (because they haven't been committed yet).
		 */
		space_map_vacate(allocmap, NULL, NULL);
		space_map_vacate(freemap, NULL, NULL);

		space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size);

		space_map_walk(sm, space_map_remove, allocmap);
		space_map_walk(freed_map, space_map_remove, allocmap);

		for (t = 1; t < TXG_CONCURRENT_STATES; t++)
			space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
			    space_map_remove, allocmap);

		mutex_exit(&msp->ms_lock);
		space_map_truncate(smo, mos, tx);
		mutex_enter(&msp->ms_lock);
	}
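
	/*
	 * Rough illustration of the condensing test above: with 1,000 free
	 * segments in the in-core map, the break-even point is
	 * 2 * 8 bytes * 1,000 = 16K; once the on-disk space map object has
	 * grown past that, rewriting it from scratch is cheaper than
	 * continuing to append to it.
	 */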

	space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
	space_map_sync(freemap, SM_FREE, smo, mos, tx);

	mutex_exit(&msp->ms_lock);

	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(smo, db->db_data, sizeof (*smo));
	dmu_buf_rele(db, FTAG);

	dmu_tx_commit(tx);
}

/*
 * Called after a transaction group has completely synced to mark
 * all of the metaslab's free space as usable.
 */
void
metaslab_sync_done(metaslab_t *msp, uint64_t txg)
{
	space_map_obj_t *smo = &msp->ms_smo;
	space_map_obj_t *smosync = &msp->ms_smo_syncing;
	space_map_t *sm = &msp->ms_map;
	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	int t;

	mutex_enter(&msp->ms_lock);

	/*
	 * If this metaslab is just becoming available, initialize its
	 * allocmaps and freemaps and add its capacity to the vdev.
	 */
	if (freed_map->sm_size == 0) {
		for (t = 0; t < TXG_SIZE; t++) {
			space_map_create(&msp->ms_allocmap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);
			space_map_create(&msp->ms_freemap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);
		}
		vdev_space_update(vd, sm->sm_size, 0, B_TRUE);
	}

	vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc, B_TRUE);

	ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
	ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);

	/*
	 * If there's a space_map_load() in progress, wait for it to complete
	 * so that we have a consistent view of the in-core space map.
	 * Then, add everything we freed in this txg to the map.
	 */
	space_map_load_wait(sm);
	space_map_vacate(freed_map, sm->sm_loaded ? space_map_free : NULL, sm);

	*smo = *smosync;

	/*
	 * If the map is loaded but no longer active, evict it as soon as all
	 * future allocations have synced.  (If we unloaded it now and then
	 * loaded a moment later, the map wouldn't reflect those allocations.)
	 */
	if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
		int evictable = 1;

		for (t = 1; t < TXG_CONCURRENT_STATES; t++)
			if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
				evictable = 0;

		if (evictable)
			space_map_unload(sm);
	}

	metaslab_group_sort(mg, msp, metaslab_weight(msp));

	mutex_exit(&msp->ms_lock);
}

static uint64_t
metaslab_distance(metaslab_t *msp, dva_t *dva)
{
	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
	uint64_t start = msp->ms_map.sm_start >> ms_shift;

	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
		return (1ULL << 63);

	if (offset < start)
		return ((start - offset) << ms_shift);
	if (offset > start)
		return ((offset - start) << ms_shift);
	return (0);
}
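
/*
 * Example: with 1GB metaslabs (ms_shift == 30), a DVA three metaslabs
 * away on the same vdev yields a distance of 3GB, while a DVA on any
 * other vdev yields 1ULL << 63 -- effectively infinite for the
 * target_distance comparison in metaslab_group_alloc() below.
 */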

static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
    uint64_t min_distance, dva_t *dva, int d)
{
	metaslab_t *msp = NULL;
	uint64_t offset = -1ULL;
	avl_tree_t *t = &mg->mg_metaslab_tree;
	uint64_t activation_weight;
	uint64_t target_distance;
	int i;

	activation_weight = METASLAB_WEIGHT_PRIMARY;
	for (i = 0; i < d; i++)
		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id)
			activation_weight = METASLAB_WEIGHT_SECONDARY;

	for (;;) {
		mutex_enter(&mg->mg_lock);
		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
			if (msp->ms_weight < size) {
				mutex_exit(&mg->mg_lock);
				return (-1ULL);
			}

			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
				break;

			target_distance = min_distance +
			    (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);

			for (i = 0; i < d; i++)
				if (metaslab_distance(msp, &dva[i]) <
				    target_distance)
					break;
			if (i == d)
				break;
		}
		mutex_exit(&mg->mg_lock);
		if (msp == NULL)
			return (-1ULL);

		mutex_enter(&msp->ms_lock);

		/*
		 * Ensure that the metaslab we have selected is still
		 * capable of handling our request. It's possible that
		 * another thread may have changed the weight while we
		 * were blocked on the metaslab lock.
		 */
		if (msp->ms_weight < size) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
			metaslab_passivate(msp,
			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if (metaslab_activate(msp, activation_weight) != 0) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
			break;

		metaslab_passivate(msp, size - 1);

		mutex_exit(&msp->ms_lock);
	}

	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);

	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);

	mutex_exit(&msp->ms_lock);

	return (offset);
}
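
/*
 * A note on min_distance, as used above: metaslab_alloc_dva() passes in
 * vd->vdev_asize >> dshift, with dshift starting at 3, so ditto copies
 * initially have to land roughly 1/8 of the vdev apart (halved on each
 * retry pass), which keeps redundant copies physically spread out until
 * space pressure forces the standard to relax.
 */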

/*
 * Allocate a block for the specified i/o.
 */
static int
metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
{
	metaslab_group_t *mg, *rotor;
	vdev_t *vd;
	int dshift = 3;
	int all_zero;
	uint64_t offset = -1ULL;
	uint64_t asize;
	uint64_t distance;

	ASSERT(!DVA_IS_VALID(&dva[d]));

	/*
	 * For testing, make some blocks above a certain size be gang blocks.
	 */
	if (psize >= metaslab_gang_bang && (lbolt & 3) == 0)
		return (ENOSPC);

	/*
	 * Start at the rotor and loop through all mgs until we find something.
	 * Note that there's no locking on mc_rotor or mc_allocated because
	 * nothing actually breaks if we miss a few updates -- we just won't
	 * allocate quite as evenly.  It all balances out over time.
	 *
	 * If we are doing ditto or log blocks, try to spread them across
	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
	 * allocated all of our ditto blocks, then try and spread them out on
	 * that vdev as much as possible.  If it turns out to not be possible,
	 * gradually lower our standards until anything becomes acceptable.
	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
	 * gives us hope of containing our fault domains to something we're
	 * able to reason about.  Otherwise, any two top-level vdev failures
	 * will guarantee the loss of data.  With consecutive allocation,
	 * only two adjacent top-level vdev failures will result in data loss.
	 *
	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
	 * ourselves on the same vdev as our gang block header.  That
	 * way, we can hope for locality in vdev_cache, plus it makes our
	 * fault domains something tractable.
	 */
	if (hintdva) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
		if (flags & METASLAB_HINTBP_AVOID)
			mg = vd->vdev_mg->mg_next;
		else
			mg = vd->vdev_mg;
	} else if (d != 0) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
		mg = vd->vdev_mg->mg_next;
	} else {
		mg = mc->mc_rotor;
	}

	/*
	 * If the hint put us into the wrong class, just follow the rotor.
	 */
	if (mg->mg_class != mc)
		mg = mc->mc_rotor;

	rotor = mg;
top:
	all_zero = B_TRUE;
	do {
		vd = mg->mg_vd;
		/*
		 * Don't allocate from faulted devices.
		 */
		if (!vdev_allocatable(vd))
			goto next;
		/*
		 * Avoid writing single-copy data to a failing vdev
		 */
		if ((vd->vdev_stat.vs_write_errors > 0 ||
		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
		    d == 0 && dshift == 3) {
			all_zero = B_FALSE;
			goto next;
		}

		ASSERT(mg->mg_class == mc);

		distance = vd->vdev_asize >> dshift;
		if (distance <= (1ULL << vd->vdev_ms_shift))
			distance = 0;
		else
			all_zero = B_FALSE;

		asize = vdev_psize_to_asize(vd, psize);
		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);

		offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d);
		if (offset != -1ULL) {
			/*
			 * If we've just selected this metaslab group,
			 * figure out whether the corresponding vdev is
			 * over- or under-used relative to the pool,
			 * and set an allocation bias to even it out.
			 */
			if (mc->mc_allocated == 0) {
				vdev_stat_t *vs = &vd->vdev_stat;
				uint64_t alloc, space;
				int64_t vu, su;

				alloc = spa_get_alloc(spa);
				space = spa_get_space(spa);

				/*
				 * Determine percent used in units of 0..1024.
				 * (This is just to avoid floating point.)
				 */
				vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
				su = (alloc << 10) / (space + 1);

				/*
				 * Bias by at most +/- 25% of the aliquot.
				 */
				mg->mg_bias = ((su - vu) *
				    (int64_t)mg->mg_aliquot) / (1024 * 4);
			}
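
			/*
			 * Example of the bias math: if this vdev is 60%
			 * full (vu ~ 614/1024) but the pool overall is 40%
			 * full (su ~ 410/1024), mg_bias works out to about
			 * (410 - 614) / 4096 of the aliquot, i.e. roughly
			 * -5%, so the rotor leaves this vdev slightly
			 * earlier than usual until usage evens out.
			 */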

			if (atomic_add_64_nv(&mc->mc_allocated, asize) >=
			    mg->mg_aliquot + mg->mg_bias) {
				mc->mc_rotor = mg->mg_next;
				mc->mc_allocated = 0;
			}
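
			/*
			 * In round numbers: with the default 512K aliquot
			 * and a four-child top-level vdev (mg_aliquot = 2M,
			 * bias 0), the rotor advances to the next group
			 * after roughly 2M of allocations have been charged
			 * to this one.
			 */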

			DVA_SET_VDEV(&dva[d], vd->vdev_id);
			DVA_SET_OFFSET(&dva[d], offset);
			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
			DVA_SET_ASIZE(&dva[d], asize);

			return (0);
		}
next:
		mc->mc_rotor = mg->mg_next;
		mc->mc_allocated = 0;
	} while ((mg = mg->mg_next) != rotor);

	if (!all_zero) {
		dshift++;
		ASSERT(dshift < 64);
		goto top;
	}

	bzero(&dva[d], sizeof (dva_t));

	return (ENOSPC);
}

/*
 * Free the block represented by DVA in the context of the specified
 * transaction group.
 */
static void
metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd;
	metaslab_t *msp;

	ASSERT(DVA_IS_VALID(dva));

	if (txg > spa_freeze_txg(spa))
		return;

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
		    (u_longlong_t)vdev, (u_longlong_t)offset);
		ASSERT(0);
		return;
	}

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	if (now) {
		space_map_remove(&msp->ms_allocmap[txg & TXG_MASK],
		    offset, size);
		space_map_free(&msp->ms_map, offset, size);
	} else {
		if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0)
			vdev_dirty(vd, VDD_METASLAB, msp, txg);
		space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
	}

	mutex_exit(&msp->ms_lock);
}

/*
 * Intent log support: upon opening the pool after a crash, notify the SPA
 * of blocks that the intent log has allocated for immediate write, but
 * which are still considered free by the SPA because the last transaction
 * group didn't commit yet.
 */
static int
metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd;
	metaslab_t *msp;
	int error;

	ASSERT(DVA_IS_VALID(dva));

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
		return (ENXIO);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
	if (error || txg == 0) {	/* txg == 0 indicates dry run */
		mutex_exit(&msp->ms_lock);
		return (error);
	}

	space_map_claim(&msp->ms_map, offset, size);

	if (spa_mode & FWRITE) {	/* don't dirty if we're zdb(1M) */
		if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
			vdev_dirty(vd, VDD_METASLAB, msp, txg);
		space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
	}

	mutex_exit(&msp->ms_lock);

	return (0);
}

int
metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
    int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
{
	dva_t *dva = bp->blk_dva;
	dva_t *hintdva = hintbp->blk_dva;
	int error = 0;

	ASSERT(bp->blk_birth == 0);

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
		spa_config_exit(spa, SCL_ALLOC, FTAG);
		return (ENOSPC);
	}

	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
	ASSERT(BP_GET_NDVAS(bp) == 0);
	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));

	for (int d = 0; d < ndvas; d++) {
		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
		    txg, flags);
		if (error) {
			for (d--; d >= 0; d--) {
				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
				bzero(&dva[d], sizeof (dva_t));
			}
			spa_config_exit(spa, SCL_ALLOC, FTAG);
			return (error);
		}
	}

	ASSERT(BP_GET_NDVAS(bp) == ndvas);

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	bp->blk_birth = txg;

	return (0);
}

void
metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(!now || bp->blk_birth >= spa->spa_syncing_txg);

	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++)
		metaslab_free_dva(spa, &dva[d], txg, now);

	spa_config_exit(spa, SCL_FREE, FTAG);
}
*spa
, const blkptr_t
*bp
, uint64_t txg
)
1023 const dva_t
*dva
= bp
->blk_dva
;
1024 int ndvas
= BP_GET_NDVAS(bp
);
1027 ASSERT(!BP_IS_HOLE(bp
));
1031 * First do a dry run to make sure all DVAs are claimable,
1032 * so we don't have to unwind from partial failures below.
1034 if ((error
= metaslab_claim(spa
, bp
, 0)) != 0)
1038 spa_config_enter(spa
, SCL_ALLOC
, FTAG
, RW_READER
);
1040 for (int d
= 0; d
< ndvas
; d
++)
1041 if ((error
= metaslab_claim_dva(spa
, &dva
[d
], txg
)) != 0)
1044 spa_config_exit(spa
, SCL_ALLOC
, FTAG
);
1046 ASSERT(error
== 0 || txg
== 0);