/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
#pragma ident	"%Z%%M%	%I%	%E% SMI"
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/zio.h>
#include <sys/space_map.h>
/*
 * Space map routines.
 * NOTE: caller is responsible for all locking.
 */
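/*
 * Compare segments by starting offset.  Any two overlapping segments
 * compare equal, so an avl_find() on a range locates the segment (if
 * any) that intersects it.
 */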
static int
space_map_seg_compare(const void *x1, const void *x2)
{
	const space_seg_t *s1 = x1;
	const space_seg_t *s2 = x2;

	if (s1->ss_start < s2->ss_start) {
		if (s1->ss_end > s2->ss_start)
			return (0);
		return (-1);
	}
	if (s1->ss_start > s2->ss_start) {
		if (s1->ss_start < s2->ss_end)
			return (0);
		return (1);
	}
	return (0);
}
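/*
 * Initialize an empty in-core space map covering [start, start + size);
 * all offsets and sizes passed in later must be multiples of
 * 1ULL << shift.
 */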
void
space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift,
	kmutex_t *lp)
{
	bzero(sm, sizeof (*sm));

	avl_create(&sm->sm_root, space_map_seg_compare,
	    sizeof (space_seg_t), offsetof(struct space_seg, ss_node));

	sm->sm_start = start;
	sm->sm_size = size;
	sm->sm_shift = shift;
	sm->sm_lock = lp;

	cv_init(&sm->sm_load_cv, NULL, CV_DEFAULT, NULL);
}
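/*
 * Tear down a space map; it must already be unloaded and empty.
 */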
void
space_map_destroy(space_map_t *sm)
{
	ASSERT(!sm->sm_loaded && !sm->sm_loading);
	VERIFY3U(sm->sm_space, ==, 0);
	avl_destroy(&sm->sm_root);
	cv_destroy(&sm->sm_load_cv);
}
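/*
 * Add the range [start, start + size) to the map, coalescing it with
 * any adjacent segments.  Adding an already-present range triggers
 * zfs_panic_recover().
 */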
void
space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
{
	avl_index_t where;
	space_seg_t ssearch, *ss_before, *ss_after, *ss;
	uint64_t end = start + size;
	int merge_before, merge_after;

	ASSERT(MUTEX_HELD(sm->sm_lock));
	VERIFY(size != 0);
	VERIFY3U(start, >=, sm->sm_start);
	VERIFY3U(end, <=, sm->sm_start + sm->sm_size);
	VERIFY(sm->sm_space + size <= sm->sm_size);
	VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
	VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);

	ssearch.ss_start = start;
	ssearch.ss_end = end;
	ss = avl_find(&sm->sm_root, &ssearch, &where);

	if (ss != NULL && ss->ss_start <= start && ss->ss_end >= end) {
		zfs_panic_recover("zfs: allocating allocated segment "
		    "(offset=%llu size=%llu)\n",
		    (longlong_t)start, (longlong_t)size);
		return;
	}

	/* Make sure we don't overlap with either of our neighbors */
	VERIFY(ss == NULL);

	ss_before = avl_nearest(&sm->sm_root, where, AVL_BEFORE);
	ss_after = avl_nearest(&sm->sm_root, where, AVL_AFTER);

	merge_before = (ss_before != NULL && ss_before->ss_end == start);
	merge_after = (ss_after != NULL && ss_after->ss_start == end);

	if (merge_before && merge_after) {
		avl_remove(&sm->sm_root, ss_before);
		ss_after->ss_start = ss_before->ss_start;
		kmem_free(ss_before, sizeof (*ss_before));
	} else if (merge_before) {
		ss_before->ss_end = end;
	} else if (merge_after) {
		ss_after->ss_start = start;
	} else {
		ss = kmem_alloc(sizeof (*ss), KM_SLEEP);
		ss->ss_start = start;
		ss->ss_end = end;
		avl_insert(&sm->sm_root, ss, where);
	}

	sm->sm_space += size;
}
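/*
 * Remove the range [start, start + size) from the map.  The range must
 * lie entirely within a single existing segment, which is trimmed or
 * split as needed; freeing an unallocated range triggers
 * zfs_panic_recover().
 */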
void
space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
{
	avl_index_t where;
	space_seg_t ssearch, *ss, *newseg;
	uint64_t end = start + size;
	int left_over, right_over;

	ASSERT(MUTEX_HELD(sm->sm_lock));
	VERIFY(size != 0);
	VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
	VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);

	ssearch.ss_start = start;
	ssearch.ss_end = end;
	ss = avl_find(&sm->sm_root, &ssearch, &where);

	/* Make sure we completely overlap with someone */
	if (ss == NULL) {
		zfs_panic_recover("zfs: freeing free segment "
		    "(offset=%llu size=%llu)",
		    (longlong_t)start, (longlong_t)size);
		return;
	}
	VERIFY3U(ss->ss_start, <=, start);
	VERIFY3U(ss->ss_end, >=, end);
	VERIFY(sm->sm_space - size <= sm->sm_size);

	left_over = (ss->ss_start != start);
	right_over = (ss->ss_end != end);

	if (left_over && right_over) {
		newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP);
		newseg->ss_start = end;
		newseg->ss_end = ss->ss_end;
		ss->ss_end = start;
		avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER);
	} else if (left_over) {
		ss->ss_end = start;
	} else if (right_over) {
		ss->ss_start = end;
	} else {
		avl_remove(&sm->sm_root, ss);
		kmem_free(ss, sizeof (*ss));
	}

	sm->sm_space -= size;
}
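/*
 * Return nonzero if the entire range [start, start + size) is covered
 * by a single segment in the map.
 */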
int
space_map_contains(space_map_t *sm, uint64_t start, uint64_t size)
{
	avl_index_t where;
	space_seg_t ssearch, *ss;
	uint64_t end = start + size;

	ASSERT(MUTEX_HELD(sm->sm_lock));
	VERIFY(size != 0);
	VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
	VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);

	ssearch.ss_start = start;
	ssearch.ss_end = end;
	ss = avl_find(&sm->sm_root, &ssearch, &where);

	return (ss != NULL && ss->ss_start <= start && ss->ss_end >= end);
}
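/*
 * Empty the map, applying func(mdest, start, size) to each segment
 * (if func is non-NULL) before freeing it.
 */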
void
space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
{
	space_seg_t *ss;
	void *cookie = NULL;

	ASSERT(MUTEX_HELD(sm->sm_lock));

	while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
		if (func != NULL)
			func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
		kmem_free(ss, sizeof (*ss));
	}
	sm->sm_space = 0;
}
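/*
 * Apply func(mdest, start, size) to each segment without modifying
 * the map.
 */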
void
space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
{
	space_seg_t *ss;

	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
		func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
}
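/*
 * Remove any portion of the map that intersects [start, start + size).
 * Unlike space_map_remove(), the range need not be fully allocated.
 */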
void
space_map_excise(space_map_t *sm, uint64_t start, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	avl_index_t where;
	space_seg_t *ss, search;
	uint64_t end = start + size;
	uint64_t rm_start, rm_end;

	ASSERT(MUTEX_HELD(sm->sm_lock));

	search.ss_start = start;
	search.ss_end = start;

	for (;;) {
		ss = avl_find(t, &search, &where);

		if (ss == NULL)
			ss = avl_nearest(t, where, AVL_AFTER);

		if (ss == NULL || ss->ss_start >= end)
			break;

		rm_start = MAX(ss->ss_start, start);
		rm_end = MIN(ss->ss_end, end);

		space_map_remove(sm, rm_start, rm_end - rm_start);
	}
}
/*
 * Replace smd with the union of smd and sms.
 */
void
space_map_union(space_map_t *smd, space_map_t *sms)
{
	avl_tree_t *t = &sms->sm_root;
	space_seg_t *ss;

	ASSERT(MUTEX_HELD(smd->sm_lock));

	/*
	 * For each source segment, remove any intersections with the
	 * destination, then add the source segment to the destination.
	 */
	for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) {
		space_map_excise(smd, ss->ss_start, ss->ss_end - ss->ss_start);
		space_map_add(smd, ss->ss_start, ss->ss_end - ss->ss_start);
	}
}
/*
 * Wait for any in-progress space_map_load() to complete.
 */
void
space_map_load_wait(space_map_t *sm)
{
	ASSERT(MUTEX_HELD(sm->sm_lock));

	while (sm->sm_loading)
		cv_wait(&sm->sm_load_cv, sm->sm_lock);
}
/*
 * Note: space_map_load() will drop sm_lock across dmu_read() calls.
 * The caller must be OK with this.
 */
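/*
 * A minimal caller sketch (illustrative only; smo, ops, and os come
 * from the surrounding metaslab code):
 *
 *	mutex_enter(sm->sm_lock);
 *	error = space_map_load(sm, ops, SM_FREE, smo, os);
 *	mutex_exit(sm->sm_lock);
 *
 * Because the lock is dropped internally, any state observed under
 * sm_lock before the call may have changed by the time it returns.
 */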
int
space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype,
	space_map_obj_t *smo, objset_t *os)
{
	uint64_t *entry, *entry_map, *entry_map_end;
	uint64_t bufsize, size, offset, end, space;
	uint64_t mapstart = sm->sm_start;
	int error = 0;

	ASSERT(MUTEX_HELD(sm->sm_lock));

	space_map_load_wait(sm);

	if (sm->sm_loaded)
		return (0);

	sm->sm_loading = B_TRUE;
	end = smo->smo_objsize;
	space = smo->smo_alloc;

	ASSERT(sm->sm_ops == NULL);
	VERIFY3U(sm->sm_space, ==, 0);

	if (maptype == SM_FREE) {
		space_map_add(sm, sm->sm_start, sm->sm_size);
		space = sm->sm_size - space;
	}

	bufsize = 1ULL << SPACE_MAP_BLOCKSHIFT;
	entry_map = zio_buf_alloc(bufsize);

	mutex_exit(sm->sm_lock);
	if (end > bufsize)
		dmu_prefetch(os, smo->smo_object, bufsize, end - bufsize);
	mutex_enter(sm->sm_lock);

	for (offset = 0; offset < end; offset += bufsize) {
		size = MIN(end - offset, bufsize);
		VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
		VERIFY(size != 0);

		dprintf("object=%llu offset=%llx size=%llx\n",
		    smo->smo_object, offset, size);

		mutex_exit(sm->sm_lock);
		error = dmu_read(os, smo->smo_object, offset, size, entry_map);
		mutex_enter(sm->sm_lock);
		if (error != 0)
			break;

		entry_map_end = entry_map + (size / sizeof (uint64_t));
		for (entry = entry_map; entry < entry_map_end; entry++) {
			uint64_t e = *entry;

			if (SM_DEBUG_DECODE(e))	/* Skip debug entries */
				continue;

			(SM_TYPE_DECODE(e) == maptype ?
			    space_map_add : space_map_remove)(sm,
			    (SM_OFFSET_DECODE(e) << sm->sm_shift) + mapstart,
			    SM_RUN_DECODE(e) << sm->sm_shift);
		}
	}

	if (error == 0) {
		VERIFY3U(sm->sm_space, ==, space);

		sm->sm_loaded = B_TRUE;
		sm->sm_ops = ops;
		if (ops != NULL)
			ops->smop_load(sm);
	} else {
		space_map_vacate(sm, NULL, NULL);
	}

	zio_buf_free(entry_map, bufsize);

	sm->sm_loading = B_FALSE;

	cv_broadcast(&sm->sm_load_cv);

	return (error);
}
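/*
 * Discard the in-core contents and detach the ops vector; the on-disk
 * space map object is untouched.
 */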
void
space_map_unload(space_map_t *sm)
{
	ASSERT(MUTEX_HELD(sm->sm_lock));

	if (sm->sm_loaded && sm->sm_ops != NULL)
		sm->sm_ops->smop_unload(sm);

	sm->sm_loaded = B_FALSE;
	sm->sm_ops = NULL;

	space_map_vacate(sm, NULL, NULL);
}
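/*
 * Allocate a run of the given size via the ops vector and remove it
 * from the map; returns the starting offset, or -1ULL on failure.
 */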
uint64_t
space_map_alloc(space_map_t *sm, uint64_t size)
{
	uint64_t start;

	start = sm->sm_ops->smop_alloc(sm, size);
	if (start != -1ULL)
		space_map_remove(sm, start, size);
	return (start);
}
void
space_map_claim(space_map_t *sm, uint64_t start, uint64_t size)
{
	sm->sm_ops->smop_claim(sm, start, size);
	space_map_remove(sm, start, size);
}
void
space_map_free(space_map_t *sm, uint64_t start, uint64_t size)
{
	space_map_add(sm, start, size);
	sm->sm_ops->smop_free(sm, start, size);
}
/*
 * Note: space_map_sync() will drop sm_lock across dmu_write() calls.
 */
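/*
 * Each on-disk map entry is a single uint64_t word: either a debug
 * entry or an (offset, type, run-length) triple.  Runs longer than
 * SM_RUN_MAX are split into multiple entries below.
 */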
void
space_map_sync(space_map_t *sm, uint8_t maptype,
	space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
{
	spa_t *spa = dmu_objset_spa(os);
	void *cookie = NULL;
	space_seg_t *ss;
	uint64_t bufsize, start, size, run_len;
	uint64_t *entry, *entry_map, *entry_map_end;

	ASSERT(MUTEX_HELD(sm->sm_lock));

	if (sm->sm_space == 0)
		return;

	dprintf("object %4llu, txg %llu, pass %d, %c, count %lu, space %llx\n",
	    smo->smo_object, dmu_tx_get_txg(tx), spa_sync_pass(spa),
	    maptype == SM_ALLOC ? 'A' : 'F', avl_numnodes(&sm->sm_root),
	    sm->sm_space);

	if (maptype == SM_ALLOC)
		smo->smo_alloc += sm->sm_space;
	else
		smo->smo_alloc -= sm->sm_space;

	bufsize = (8 + avl_numnodes(&sm->sm_root)) * sizeof (uint64_t);
	bufsize = MIN(bufsize, 1ULL << SPACE_MAP_BLOCKSHIFT);
	entry_map = zio_buf_alloc(bufsize);
	entry_map_end = entry_map + (bufsize / sizeof (uint64_t));
	entry = entry_map;

	*entry++ = SM_DEBUG_ENCODE(1) |
	    SM_DEBUG_ACTION_ENCODE(maptype) |
	    SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) |
	    SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));

	while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
		size = ss->ss_end - ss->ss_start;
		start = (ss->ss_start - sm->sm_start) >> sm->sm_shift;

		sm->sm_space -= size;
		size >>= sm->sm_shift;

		while (size) {
			run_len = MIN(size, SM_RUN_MAX);

			if (entry == entry_map_end) {
				mutex_exit(sm->sm_lock);
				dmu_write(os, smo->smo_object,
				    smo->smo_objsize, bufsize, entry_map, tx);
				mutex_enter(sm->sm_lock);
				smo->smo_objsize += bufsize;
				entry = entry_map;
			}

			*entry++ = SM_OFFSET_ENCODE(start) |
			    SM_TYPE_ENCODE(maptype) |
			    SM_RUN_ENCODE(run_len);

			start += run_len;
			size -= run_len;
		}
		kmem_free(ss, sizeof (*ss));
	}

	if (entry != entry_map) {
		size = (entry - entry_map) * sizeof (uint64_t);
		mutex_exit(sm->sm_lock);
		dmu_write(os, smo->smo_object, smo->smo_objsize,
		    size, entry_map, tx);
		mutex_enter(sm->sm_lock);
		smo->smo_objsize += size;
	}

	zio_buf_free(entry_map, bufsize);

	VERIFY3U(sm->sm_space, ==, 0);
}
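/*
 * Free the entire on-disk object and reset the bookkeeping in smo.
 */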
void
space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
{
	VERIFY(dmu_free_range(os, smo->smo_object, 0, -1ULL, tx) == 0);

	smo->smo_objsize = 0;
	smo->smo_alloc = 0;
}