/*
 * Copyright (C) 2012 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-array.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "array"

/*----------------------------------------------------------------*/

/*
 * The array is implemented as a fully populated btree, which points to
 * blocks that contain the packed values.  This is more space efficient
 * than just using a btree since we don't store 1 key per value.
 */
struct array_block {
	__le32 csum;
	__le32 max_entries;
	__le32 nr_entries;
	__le32 value_size;
	__le64 blocknr; /* Block this node is supposed to live in. */
} __packed;
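
/*
 * The values themselves are packed back to back immediately after this
 * header within the same metadata block; element_at() below simply steps
 * past the header (ab + 1) and indexes by the value size.
 */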

/*----------------------------------------------------------------*/

/*
 * Validator methods.  As usual we calculate a checksum, and also write the
 * block location into the header (paranoia about ssds remapping areas by
 * mistake).
 */
#define CSUM_XOR 595846735

static void array_block_prepare_for_write(struct dm_block_validator *v,
					   struct dm_block *b,
					   size_t size_of_block)
{
	struct array_block *bh_le = dm_block_data(b);

	bh_le->blocknr = cpu_to_le64(dm_block_location(b));
	bh_le->csum = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries,
						 size_of_block - sizeof(__le32),
						 CSUM_XOR));
}

static int array_block_check(struct dm_block_validator *v,
			     struct dm_block *b,
			     size_t size_of_block)
{
	struct array_block *bh_le = dm_block_data(b);
	__le32 csum_disk;

	if (dm_block_location(b) != le64_to_cpu(bh_le->blocknr)) {
		DMERR_LIMIT("array_block_check failed: blocknr %llu != wanted %llu",
			    (unsigned long long) le64_to_cpu(bh_le->blocknr),
			    (unsigned long long) dm_block_location(b));
		return -ENOTBLK;
	}

	csum_disk = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries,
					       size_of_block - sizeof(__le32),
					       CSUM_XOR));
	if (csum_disk != bh_le->csum) {
		DMERR_LIMIT("array_block_check failed: csum %u != wanted %u",
			    (unsigned) le32_to_cpu(csum_disk),
			    (unsigned) le32_to_cpu(bh_le->csum));
		return -EILSEQ;
	}

	return 0;
}

static struct dm_block_validator array_validator = {
	.name = "array",
	.prepare_for_write = array_block_prepare_for_write,
	.check = array_block_check
};

/*----------------------------------------------------------------*/

/*
 * Functions for manipulating the array blocks.
 */

/*
 * Returns a pointer to a value within an array block.
 *
 * index - The index into _this_ specific block.
 */
static void *element_at(struct dm_array_info *info, struct array_block *ab,
			unsigned index)
{
	unsigned char *entry = (unsigned char *) (ab + 1);

	entry += index * info->value_type.size;

	return entry;
}

/*
 * Utility function that calls one of the value_type methods on every value
 * in an array block.
 */
static void on_entries(struct dm_array_info *info, struct array_block *ab,
		       void (*fn)(void *, const void *))
{
	unsigned i, nr_entries = le32_to_cpu(ab->nr_entries);

	for (i = 0; i < nr_entries; i++)
		fn(info->value_type.context, element_at(info, ab, i));
}

/*
 * Increment every value in an array block.
 */
static void inc_ablock_entries(struct dm_array_info *info, struct array_block *ab)
{
	struct dm_btree_value_type *vt = &info->value_type;

	if (vt->inc)
		on_entries(info, ab, vt->inc);
}

/*
 * Decrement every value in an array block.
 */
static void dec_ablock_entries(struct dm_array_info *info, struct array_block *ab)
{
	struct dm_btree_value_type *vt = &info->value_type;

	if (vt->dec)
		on_entries(info, ab, vt->dec);
}

/*
 * Each array block can hold this many values.
 */
static uint32_t calc_max_entries(size_t value_size, size_t size_of_block)
{
	return (size_of_block - sizeof(struct array_block)) / value_size;
}
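
/*
 * For example, with a 4096 byte metadata block and 8 byte values, the
 * 24 byte array_block header leaves room for (4096 - 24) / 8 = 509
 * entries per block.  (Illustrative figures only; the real block size
 * comes from the block manager at run time.)
 */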

/*
 * Allocate a new array block.  The caller will need to unlock block.
 */
static int alloc_ablock(struct dm_array_info *info, size_t size_of_block,
			uint32_t max_entries,
			struct dm_block **block, struct array_block **ab)
{
	int r;

	r = dm_tm_new_block(info->btree_info.tm, &array_validator, block);
	if (r)
		return r;

	(*ab) = dm_block_data(*block);
	(*ab)->max_entries = cpu_to_le32(max_entries);
	(*ab)->nr_entries = cpu_to_le32(0);
	(*ab)->value_size = cpu_to_le32(info->value_type.size);

	return 0;
}

/*
 * Pad an array block out with a particular value.  Every instance will
 * cause an increment of the value_type.  new_nr must not be less than
 * the current number of entries.
 */
static void fill_ablock(struct dm_array_info *info, struct array_block *ab,
			const void *value, unsigned new_nr)
{
	unsigned i;
	uint32_t nr_entries;
	struct dm_btree_value_type *vt = &info->value_type;

	BUG_ON(new_nr > le32_to_cpu(ab->max_entries));
	BUG_ON(new_nr < le32_to_cpu(ab->nr_entries));

	nr_entries = le32_to_cpu(ab->nr_entries);
	for (i = nr_entries; i < new_nr; i++) {
		if (vt->inc)
			vt->inc(vt->context, value);
		memcpy(element_at(info, ab, i), value, vt->size);
	}
	ab->nr_entries = cpu_to_le32(new_nr);
}

/*
 * Remove some entries from the back of an array block.  Every value
 * removed will be decremented.  new_nr must be <= the current number of
 * entries.
 */
static void trim_ablock(struct dm_array_info *info, struct array_block *ab,
			unsigned new_nr)
{
	unsigned i;
	uint32_t nr_entries;
	struct dm_btree_value_type *vt = &info->value_type;

	BUG_ON(new_nr > le32_to_cpu(ab->max_entries));
	BUG_ON(new_nr > le32_to_cpu(ab->nr_entries));

	nr_entries = le32_to_cpu(ab->nr_entries);
	for (i = nr_entries; i > new_nr; i--)
		if (vt->dec)
			vt->dec(vt->context, element_at(info, ab, i - 1));
	ab->nr_entries = cpu_to_le32(new_nr);
}

/*
 * Read locks a block, and coerces it to an array block.  The caller must
 * unlock 'block' when finished.
 */
static int get_ablock(struct dm_array_info *info, dm_block_t b,
		      struct dm_block **block, struct array_block **ab)
{
	int r;

	r = dm_tm_read_lock(info->btree_info.tm, b, &array_validator, block);
	if (r)
		return r;

	*ab = dm_block_data(*block);

	return 0;
}

/*
 * Unlocks an array block.
 */
static int unlock_ablock(struct dm_array_info *info, struct dm_block *block)
{
	return dm_tm_unlock(info->btree_info.tm, block);
}

/*----------------------------------------------------------------*/

/*
 * Btree manipulation.
 */

/*
 * Looks up an array block in the btree, and then read locks it.
 *
 * index is the index of the array_block (ie. the array index / max_entries).
 */
static int lookup_ablock(struct dm_array_info *info, dm_block_t root,
			 unsigned index, struct dm_block **block,
			 struct array_block **ab)
{
	int r;
	uint64_t key = index;
	__le64 block_le;

	r = dm_btree_lookup(&info->btree_info, root, &key, &block_le);
	if (r)
		return r;

	return get_ablock(info, le64_to_cpu(block_le), block, ab);
}
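
/*
 * To illustrate the index split used throughout this file: assuming 509
 * entries fit in an array block, array index 1000 lives in array block
 * 1000 / 509 = 1 (the btree key used here), at entry 1000 % 509 = 491
 * within that block.
 */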

/*
 * Insert an array block into the btree.  The block is _not_ unlocked.
 */
static int insert_ablock(struct dm_array_info *info, uint64_t index,
			 struct dm_block *block, dm_block_t *root)
{
	__le64 block_le = cpu_to_le64(dm_block_location(block));

	__dm_bless_for_disk(block_le);
	return dm_btree_insert(&info->btree_info, *root, &index, &block_le, root);
}

/*
 * Looks up an array block in the btree.  Then shadows it, and updates the
 * btree to point to this new shadow.  'root' is an input/output parameter
 * for both the current root block, and the new one.
 */
static int shadow_ablock(struct dm_array_info *info, dm_block_t *root,
			 unsigned index, struct dm_block **block,
			 struct array_block **ab)
{
	int r, inc;
	uint64_t key = index;
	dm_block_t b;
	__le64 block_le;

	/*
	 * lookup
	 */
	r = dm_btree_lookup(&info->btree_info, *root, &key, &block_le);
	if (r)
		return r;
	b = le64_to_cpu(block_le);

	/*
	 * shadow
	 */
	r = dm_tm_shadow_block(info->btree_info.tm, b,
			       &array_validator, block, &inc);
	if (r)
		return r;

	*ab = dm_block_data(*block);
	if (inc)
		inc_ablock_entries(info, *ab);

	/*
	 * Reinsert.
	 *
	 * The shadow op will often be a noop.  Only insert if it really
	 * copied data.
	 */
	if (dm_block_location(*block) != b) {
		/*
		 * dm_tm_shadow_block will have already decremented the old
		 * block, but it is still referenced by the btree.  We
		 * increment to stop the insert decrementing it below zero
		 * when overwriting the old value.
		 */
		dm_tm_inc(info->btree_info.tm, b);
		r = insert_ablock(info, index, *block, root);
	}

	return r;
}
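
/*
 * Note that when the shadow operation actually copies the block (inc is
 * set), the entries exist in both the old and the new copy for as long
 * as the old block remains referenced, which is why inc_ablock_entries()
 * bumps the reference count of every value in the new copy.
 */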

/*
 * Allocate a new array block, and fill it with some values.
 */
static int insert_new_ablock(struct dm_array_info *info, size_t size_of_block,
			     uint32_t max_entries,
			     unsigned block_index, uint32_t nr,
			     const void *value, dm_block_t *root)
{
	int r;
	struct dm_block *block;
	struct array_block *ab;

	r = alloc_ablock(info, size_of_block, max_entries, &block, &ab);
	if (r)
		return r;

	fill_ablock(info, ab, value, nr);
	r = insert_ablock(info, block_index, block, root);
	unlock_ablock(info, block);

	return r;
}

static int insert_full_ablocks(struct dm_array_info *info, size_t size_of_block,
			       unsigned begin_block, unsigned end_block,
			       unsigned max_entries, const void *value,
			       dm_block_t *root)
{
	int r = 0;

	for (; !r && begin_block != end_block; begin_block++)
		r = insert_new_ablock(info, size_of_block, max_entries,
				      begin_block, max_entries, value, root);

	return r;
}

/*
 * There are a bunch of functions involved with resizing an array.  This
 * structure holds information that is commonly needed by them.  Purely
 * here to reduce parameter count.
 */
struct resize {
	/*
	 * Describes the array.
	 */
	struct dm_array_info *info;

	/*
	 * The current root of the array.  This gets updated.
	 */
	dm_block_t root;

	/*
	 * Metadata block size.  Used to calculate the nr entries in an
	 * array block.
	 */
	size_t size_of_block;

	/*
	 * Maximum nr entries in an array block.
	 */
	unsigned max_entries;

	/*
	 * nr of completely full blocks in the array.
	 *
	 * 'old' refers to before the resize, 'new' after.
	 */
	unsigned old_nr_full_blocks, new_nr_full_blocks;

	/*
	 * Number of entries in the final block.  0 iff only full blocks in
	 * the array.
	 */
	unsigned old_nr_entries_in_last_block, new_nr_entries_in_last_block;

	/*
	 * The default value used when growing the array.
	 */
	const void *value;
};
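
/*
 * Worked example (figures assumed for illustration): growing an array
 * from 1000 to 1500 entries with 509 entries per block gives
 * old_nr_full_blocks = 1, old_nr_entries_in_last_block = 491,
 * new_nr_full_blocks = 2 and new_nr_entries_in_last_block = 482.  The
 * resize therefore pads the old tail block out to 509 entries and then
 * adds a fresh tail block holding the remaining 482.
 */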

/*
 * Removes a consecutive set of array blocks from the btree.  The values
 * in the blocks are decremented as a side effect of the btree remove.
 *
 * begin_index - the index of the first array block to remove.
 * end_index - the one-past-the-end value.  ie. this block is not removed.
 */
static int drop_blocks(struct resize *resize, unsigned begin_index,
		       unsigned end_index)
{
	int r;

	while (begin_index != end_index) {
		uint64_t key = begin_index++;
		r = dm_btree_remove(&resize->info->btree_info, resize->root,
				    &key, &resize->root);
		if (r)
			return r;
	}

	return 0;
}

/*
 * Calculates how many blocks are needed for the array.
 */
static unsigned total_nr_blocks_needed(unsigned nr_full_blocks,
				       unsigned nr_entries_in_last_block)
{
	return nr_full_blocks + (nr_entries_in_last_block ? 1 : 0);
}

/*
 * Shrink an array.
 */
static int shrink(struct resize *resize)
{
	int r;
	unsigned begin, end;
	struct dm_block *block;
	struct array_block *ab;

	/*
	 * Lose some blocks from the back?
	 */
	if (resize->new_nr_full_blocks < resize->old_nr_full_blocks) {
		begin = total_nr_blocks_needed(resize->new_nr_full_blocks,
					       resize->new_nr_entries_in_last_block);
		end = total_nr_blocks_needed(resize->old_nr_full_blocks,
					     resize->old_nr_entries_in_last_block);

		r = drop_blocks(resize, begin, end);
		if (r)
			return r;
	}

	/*
	 * Trim the new tail block.
	 */
	if (resize->new_nr_entries_in_last_block) {
		r = shadow_ablock(resize->info, &resize->root,
				  resize->new_nr_full_blocks, &block, &ab);
		if (r)
			return r;

		trim_ablock(resize->info, ab, resize->new_nr_entries_in_last_block);
		unlock_ablock(resize->info, block);
	}

	return 0;
}

/*
 * Grow an array.
 */
static int grow_extend_tail_block(struct resize *resize, uint32_t new_nr_entries)
{
	int r;
	struct dm_block *block;
	struct array_block *ab;

	r = shadow_ablock(resize->info, &resize->root,
			  resize->old_nr_full_blocks, &block, &ab);
	if (r)
		return r;

	fill_ablock(resize->info, ab, resize->value, new_nr_entries);
	unlock_ablock(resize->info, block);

	return r;
}

static int grow_add_tail_block(struct resize *resize)
{
	return insert_new_ablock(resize->info, resize->size_of_block,
				 resize->max_entries,
				 resize->new_nr_full_blocks,
				 resize->new_nr_entries_in_last_block,
				 resize->value, &resize->root);
}

static int grow_needs_more_blocks(struct resize *resize)
{
	int r;
	unsigned old_nr_blocks = resize->old_nr_full_blocks;

	if (resize->old_nr_entries_in_last_block > 0) {
		old_nr_blocks++;

		r = grow_extend_tail_block(resize, resize->max_entries);
		if (r)
			return r;
	}

	r = insert_full_ablocks(resize->info, resize->size_of_block,
				old_nr_blocks,
				resize->new_nr_full_blocks,
				resize->max_entries, resize->value,
				&resize->root);
	if (r)
		return r;

	if (resize->new_nr_entries_in_last_block)
		r = grow_add_tail_block(resize);

	return r;
}

static int grow(struct resize *resize)
{
	if (resize->new_nr_full_blocks > resize->old_nr_full_blocks)
		return grow_needs_more_blocks(resize);

	else if (resize->old_nr_entries_in_last_block)
		return grow_extend_tail_block(resize, resize->new_nr_entries_in_last_block);

	else
		return grow_add_tail_block(resize);
}

/*----------------------------------------------------------------*/

/*
 * These are the value_type functions for the btree elements, which point
 * to array blocks.
 */
static void block_inc(void *context, const void *value)
{
	__le64 block_le;
	struct dm_array_info *info = context;

	memcpy(&block_le, value, sizeof(block_le));
	dm_tm_inc(info->btree_info.tm, le64_to_cpu(block_le));
}

static void block_dec(void *context, const void *value)
{
	int r;
	uint64_t b;
	__le64 block_le;
	uint32_t ref_count;
	struct dm_block *block;
	struct array_block *ab;
	struct dm_array_info *info = context;

	memcpy(&block_le, value, sizeof(block_le));
	b = le64_to_cpu(block_le);

	r = dm_tm_ref(info->btree_info.tm, b, &ref_count);
	if (r) {
		DMERR_LIMIT("couldn't get reference count for block %llu",
			    (unsigned long long) b);
		return;
	}

	if (ref_count == 1) {
		/*
		 * We're about to drop the last reference to this ablock.
		 * So we need to decrement the ref count of the contents.
		 */
		r = get_ablock(info, b, &block, &ab);
		if (r) {
			DMERR_LIMIT("couldn't get array block %llu",
				    (unsigned long long) b);
			return;
		}

		dec_ablock_entries(info, ab);
		unlock_ablock(info, block);
	}

	dm_tm_dec(info->btree_info.tm, b);
}

static int block_equal(void *context, const void *value1, const void *value2)
{
	return !memcmp(value1, value2, sizeof(__le64));
}

/*----------------------------------------------------------------*/

void dm_array_info_init(struct dm_array_info *info,
			struct dm_transaction_manager *tm,
			struct dm_btree_value_type *vt)
{
	struct dm_btree_value_type *bvt = &info->btree_info.value_type;

	memcpy(&info->value_type, vt, sizeof(info->value_type));
	info->btree_info.tm = tm;
	info->btree_info.levels = 1;

	bvt->context = info;
	bvt->size = sizeof(__le64);
	bvt->inc = block_inc;
	bvt->dec = block_dec;
	bvt->equal = block_equal;
}
EXPORT_SYMBOL_GPL(dm_array_info_init);

int dm_array_empty(struct dm_array_info *info, dm_block_t *root)
{
	return dm_btree_empty(&info->btree_info, root);
}
EXPORT_SYMBOL_GPL(dm_array_empty);
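
/*
 * Rough usage sketch for the exported interface (caller-side names such
 * as 'tm', 'my_vt' and the little-endian value buffers are illustrative,
 * not part of this file):
 *
 *	struct dm_array_info info;
 *	dm_block_t root;
 *
 *	dm_array_info_init(&info, tm, &my_vt);
 *	r = dm_array_empty(&info, &root);
 *	r = dm_array_resize(&info, root, 0, nr_entries, &default_le, &root);
 *	r = dm_array_set_value(&info, root, 0, &value_le, &root);
 *	r = dm_array_get_value(&info, root, 0, &value_le);
 *
 * Each mutating call hands back a new root, which must be used for
 * subsequent operations and eventually committed by the caller.
 */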

static int array_resize(struct dm_array_info *info, dm_block_t root,
			uint32_t old_size, uint32_t new_size,
			const void *value, dm_block_t *new_root)
{
	int r;
	struct resize resize;

	if (old_size == new_size)
		return 0;

	resize.info = info;
	resize.root = root;
	resize.size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm));
	resize.max_entries = calc_max_entries(info->value_type.size,
					      resize.size_of_block);

	resize.old_nr_full_blocks = old_size / resize.max_entries;
	resize.old_nr_entries_in_last_block = old_size % resize.max_entries;
	resize.new_nr_full_blocks = new_size / resize.max_entries;
	resize.new_nr_entries_in_last_block = new_size % resize.max_entries;
	resize.value = value;

	r = ((new_size > old_size) ? grow : shrink)(&resize);
	if (r)
		return r;

	*new_root = resize.root;

	return 0;
}

int dm_array_resize(struct dm_array_info *info, dm_block_t root,
		    uint32_t old_size, uint32_t new_size,
		    const void *value, dm_block_t *new_root)
	__dm_written_to_disk(value)
{
	int r = array_resize(info, root, old_size, new_size, value, new_root);
	__dm_unbless_for_disk(value);
	return r;
}
EXPORT_SYMBOL_GPL(dm_array_resize);

int dm_array_del(struct dm_array_info *info, dm_block_t root)
{
	return dm_btree_del(&info->btree_info, root);
}
EXPORT_SYMBOL_GPL(dm_array_del);

int dm_array_get_value(struct dm_array_info *info, dm_block_t root,
		       uint32_t index, void *value_le)
{
	int r;
	struct dm_block *block;
	struct array_block *ab;
	size_t size_of_block;
	unsigned entry, max_entries;

	size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm));
	max_entries = calc_max_entries(info->value_type.size, size_of_block);

	r = lookup_ablock(info, root, index / max_entries, &block, &ab);
	if (r)
		return r;

	entry = index % max_entries;
	if (entry >= le32_to_cpu(ab->nr_entries))
		r = -ENODATA;
	else
		memcpy(value_le, element_at(info, ab, entry),
		       info->value_type.size);

	unlock_ablock(info, block);
	return r;
}
EXPORT_SYMBOL_GPL(dm_array_get_value);

static int array_set_value(struct dm_array_info *info, dm_block_t root,
			   uint32_t index, const void *value, dm_block_t *new_root)
{
	int r;
	struct dm_block *block;
	struct array_block *ab;
	size_t size_of_block;
	unsigned max_entries;
	unsigned entry;
	void *old_value;
	struct dm_btree_value_type *vt = &info->value_type;

	size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm));
	max_entries = calc_max_entries(info->value_type.size, size_of_block);

	r = shadow_ablock(info, &root, index / max_entries, &block, &ab);
	if (r)
		return r;

	*new_root = root;
	entry = index % max_entries;
	if (entry >= le32_to_cpu(ab->nr_entries)) {
		r = -ENODATA;
		goto out;
	}

	old_value = element_at(info, ab, entry);
	if (vt->dec &&
	    (!vt->equal || !vt->equal(vt->context, old_value, value))) {
		vt->dec(vt->context, old_value);
		if (vt->inc)
			vt->inc(vt->context, value);
	}

	memcpy(old_value, value, info->value_type.size);

out:
	unlock_ablock(info, block);
	return r;
}

int dm_array_set_value(struct dm_array_info *info, dm_block_t root,
		       uint32_t index, const void *value, dm_block_t *new_root)
	__dm_written_to_disk(value)
{
	int r;

	r = array_set_value(info, root, index, value, new_root);
	__dm_unbless_for_disk(value);
	return r;
}
EXPORT_SYMBOL_GPL(dm_array_set_value);

struct walk_info {
	struct dm_array_info *info;
	int (*fn)(void *context, uint64_t key, void *leaf);
	void *context;
};

static int walk_ablock(void *context, uint64_t *keys, void *leaf)
{
	struct walk_info *wi = context;

	int r;
	unsigned i;
	__le64 block_le;
	unsigned nr_entries, max_entries;
	struct dm_block *block;
	struct array_block *ab;

	memcpy(&block_le, leaf, sizeof(block_le));
	r = get_ablock(wi->info, le64_to_cpu(block_le), &block, &ab);
	if (r)
		return r;

	max_entries = le32_to_cpu(ab->max_entries);
	nr_entries = le32_to_cpu(ab->nr_entries);
	for (i = 0; i < nr_entries; i++) {
		r = wi->fn(wi->context, keys[0] * max_entries + i,
			   element_at(wi->info, ab, i));

		if (r)
			break;
	}

	unlock_ablock(wi->info, block);
	return r;
}

int dm_array_walk(struct dm_array_info *info, dm_block_t root,
		  int (*fn)(void *, uint64_t key, void *leaf),
		  void *context)
{
	struct walk_info wi;

	wi.info = info;
	wi.fn = fn;
	wi.context = context;

	return dm_btree_walk(&info->btree_info, root, walk_ablock, &wi);
}
EXPORT_SYMBOL_GPL(dm_array_walk);
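
/*
 * A minimal sketch of a walk callback, assuming 8 byte little-endian
 * values (the names below are illustrative only):
 *
 *	static int dump_entry(void *context, uint64_t key, void *leaf)
 *	{
 *		__le64 v;
 *
 *		memcpy(&v, leaf, sizeof(v));
 *		DMINFO("entry %llu = %llu", (unsigned long long) key,
 *		       (unsigned long long) le64_to_cpu(v));
 *		return 0;
 *	}
 *
 * dm_array_walk(&info, root, dump_entry, NULL) then visits every entry;
 * a non-zero return from the callback stops the walk of the current
 * array block and is propagated back to the caller.
 */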

/*----------------------------------------------------------------*/