Merge remote-tracking branch 'moduleh/module.h-split'
[linux-2.6/next.git] / drivers / md / persistent-data / dm-btree.c
blob408b762532a777659905ca4870dc58b131b48396
1 /*
2 * Copyright (C) 2011 Red Hat, Inc. All rights reserved.
4 * This file is released under the GPL.
5 */
7 #include "dm-btree-internal.h"
8 #include "dm-space-map.h"
9 #include "dm-transaction-manager.h"
11 #include <linux/export.h>
12 #include <linux/device-mapper.h>
14 #define DM_MSG_PREFIX "btree"
16 /*----------------------------------------------------------------
17 * Array manipulation
18 *--------------------------------------------------------------*/
19 static void memcpy_disk(void *dest, const void *src, size_t len)
20 __dm_written_to_disk(src)
22 memcpy(dest, src, len);
23 __dm_unbless_for_disk(src);
26 static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
27 unsigned index, void *elt)
28 __dm_written_to_disk(elt)
30 if (index < nr_elts)
31 memmove(base + (elt_size * (index + 1)),
32 base + (elt_size * index),
33 (nr_elts - index) * elt_size);
35 memcpy_disk(base + (elt_size * index), elt, elt_size);
38 /*----------------------------------------------------------------*/
40 /* makes the assumption that no two keys are the same. */
41 static int bsearch(struct node *n, uint64_t key, int want_hi)
43 int lo = -1, hi = le32_to_cpu(n->header.nr_entries);
45 while (hi - lo > 1) {
46 int mid = lo + ((hi - lo) / 2);
47 uint64_t mid_key = le64_to_cpu(n->keys[mid]);
49 if (mid_key == key)
50 return mid;
52 if (mid_key < key)
53 lo = mid;
54 else
55 hi = mid;
58 return want_hi ? hi : lo;
61 int lower_bound(struct node *n, uint64_t key)
63 return bsearch(n, key, 0);
66 void inc_children(struct dm_transaction_manager *tm, struct node *n,
67 struct dm_btree_value_type *vt)
69 unsigned i;
70 uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
72 if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
73 for (i = 0; i < nr_entries; i++)
74 dm_tm_inc(tm, value64(n, i));
75 else if (vt->inc)
76 for (i = 0; i < nr_entries; i++)
77 vt->inc(vt->context,
78 value_ptr(n, i, vt->size));
81 static int insert_at(size_t value_size, struct node *node, unsigned index,
82 uint64_t key, void *value)
83 __dm_written_to_disk(value)
85 uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
86 __le64 key_le = cpu_to_le64(key);
88 if (index > nr_entries ||
89 index >= le32_to_cpu(node->header.max_entries)) {
90 DMERR("too many entries in btree node for insert");
91 __dm_unbless_for_disk(value);
92 return -ENOMEM;
95 __dm_bless_for_disk(&key_le);
97 array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
98 array_insert(value_base(node), value_size, nr_entries, index, value);
99 node->header.nr_entries = cpu_to_le32(nr_entries + 1);
101 return 0;
104 /*----------------------------------------------------------------*/
107 * We want 3n entries (for some n). This works more nicely for repeated
108 * insert remove loops than (2n + 1).
110 static uint32_t calc_max_entries(size_t value_size, size_t block_size)
112 uint32_t total, n;
113 size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */
115 block_size -= sizeof(struct node_header);
116 total = block_size / elt_size;
117 n = total / 3; /* rounds down */
119 return 3 * n;
122 int dm_btree_create(struct dm_btree_info *info, dm_block_t *root)
124 int r;
125 struct dm_block *b;
126 struct node *n;
127 size_t block_size;
128 uint32_t max_entries;
130 r = new_block(info, &b);
131 if (r < 0)
132 return r;
134 block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
135 max_entries = calc_max_entries(info->value_type.size, block_size);
137 n = dm_block_data(b);
138 memset(n, 0, block_size);
139 n->header.flags = cpu_to_le32(LEAF_NODE);
140 n->header.nr_entries = cpu_to_le32(0);
141 n->header.max_entries = cpu_to_le32(max_entries);
142 n->header.value_size = cpu_to_le32(info->value_type.size);
144 *root = dm_block_location(b);
146 return unlock_block(info, b);
148 EXPORT_SYMBOL_GPL(dm_btree_create);
150 /*----------------------------------------------------------------*/
/*
 * Deletion uses a recursive algorithm, since we have limited stack space
 * we explicitly manage our own stack on the heap.
 */
#define MAX_SPINE_DEPTH 64

struct frame {
	struct dm_block *b;	/* read-locked block for this level */
	struct node *n;		/* data of @b */
	unsigned level;		/* btree level this node belongs to */
	unsigned nr_children;
	unsigned current_child;	/* next child index to visit */
};

struct del_stack {
	struct dm_transaction_manager *tm;
	int top;		/* index of topmost frame, -1 when empty */
	struct frame spine[MAX_SPINE_DEPTH];
};
171 static int top_frame(struct del_stack *s, struct frame **f)
173 if (s->top < 0) {
174 DMERR("btree deletion stack empty");
175 return -EINVAL;
178 *f = s->spine + s->top;
180 return 0;
183 static int unprocessed_frames(struct del_stack *s)
185 return s->top >= 0;
188 static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
190 int r;
191 uint32_t ref_count;
193 if (s->top >= MAX_SPINE_DEPTH - 1) {
194 DMERR("btree deletion stack out of memory");
195 return -ENOMEM;
198 r = dm_tm_ref(s->tm, b, &ref_count);
199 if (r)
200 return r;
202 if (ref_count > 1)
204 * This is a shared node, so we can just decrement its
205 * reference counter and leave the children.
207 dm_tm_dec(s->tm, b);
209 else {
210 struct frame *f = s->spine + ++s->top;
212 r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
213 if (r) {
214 s->top--;
215 return r;
218 f->n = dm_block_data(f->b);
219 f->level = level;
220 f->nr_children = le32_to_cpu(f->n->header.nr_entries);
221 f->current_child = 0;
224 return 0;
227 static void pop_frame(struct del_stack *s)
229 struct frame *f = s->spine + s->top--;
231 dm_tm_dec(s->tm, dm_block_location(f->b));
232 dm_tm_unlock(s->tm, f->b);
235 int dm_btree_destroy(struct dm_btree_info *info, dm_block_t root)
237 int r;
238 struct del_stack *s;
240 s = kmalloc(sizeof(*s), GFP_KERNEL);
241 if (!s)
242 return -ENOMEM;
244 s->tm = info->tm;
245 s->top = -1;
247 r = push_frame(s, root, 1);
248 if (r)
249 goto out;
251 while (unprocessed_frames(s)) {
252 uint32_t flags;
253 struct frame *f;
254 dm_block_t b;
256 r = top_frame(s, &f);
257 if (r)
258 goto out;
260 if (f->current_child >= f->nr_children) {
261 pop_frame(s);
262 continue;
265 flags = le32_to_cpu(f->n->header.flags);
266 if (flags & INTERNAL_NODE) {
267 b = value64(f->n, f->current_child);
268 f->current_child++;
269 r = push_frame(s, b, f->level);
270 if (r)
271 goto out;
273 } else if (f->level != (info->levels - 1)) {
274 b = value64(f->n, f->current_child);
275 f->current_child++;
276 r = push_frame(s, b, f->level + 1);
277 if (r)
278 goto out;
280 } else {
281 if (info->value_type.dec) {
282 unsigned i;
284 for (i = 0; i < f->nr_children; i++)
285 info->value_type.dec(info->value_type.context,
286 value_ptr(f->n, i, info->value_type.size));
288 f->current_child = f->nr_children;
292 out:
293 kfree(s);
294 return r;
296 EXPORT_SYMBOL_GPL(dm_btree_destroy);
298 // FIXME Implement or remove this fn before final submission.
299 int dm_btree_delete_gt(struct dm_btree_info *info, dm_block_t root, uint64_t *key,
300 dm_block_t *new_root)
302 /* FIXME: implement */
303 return 0;
305 EXPORT_SYMBOL_GPL(dm_btree_delete_gt);
307 /*----------------------------------------------------------------*/
309 static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
310 int (*search_fn)(struct node *, uint64_t),
311 uint64_t *result_key, void *v, size_t value_size)
313 int i, r;
314 uint32_t flags, nr_entries;
316 do {
317 r = ro_step(s, block);
318 if (r < 0)
319 return r;
321 i = search_fn(ro_node(s), key);
323 flags = le32_to_cpu(ro_node(s)->header.flags);
324 nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
325 if (i < 0 || i >= nr_entries)
326 return -ENODATA;
328 if (flags & INTERNAL_NODE)
329 block = value64(ro_node(s), i);
331 } while (!(flags & LEAF_NODE));
333 *result_key = le64_to_cpu(ro_node(s)->keys[i]);
334 memcpy(v, value_ptr(ro_node(s), i, value_size), value_size);
336 return 0;
339 int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
340 uint64_t *keys, void *value_le)
342 unsigned level, last_level = info->levels - 1;
343 int r = -ENODATA;
344 uint64_t rkey;
345 __le64 internal_value_le;
346 struct ro_spine spine;
348 init_ro_spine(&spine, info);
349 for (level = 0; level < info->levels; level++) {
350 size_t size;
351 void *value_p;
353 if (level == last_level) {
354 value_p = value_le;
355 size = info->value_type.size;
357 } else {
358 value_p = &internal_value_le;
359 size = sizeof(uint64_t);
362 r = btree_lookup_raw(&spine, root, keys[level],
363 lower_bound, &rkey,
364 value_p, size);
366 if (!r) {
367 if (rkey != keys[level]) {
368 exit_ro_spine(&spine);
369 return -ENODATA;
371 } else {
372 exit_ro_spine(&spine);
373 return r;
376 root = le64_to_cpu(internal_value_le);
378 exit_ro_spine(&spine);
380 return r;
382 EXPORT_SYMBOL_GPL(dm_btree_lookup);
385 * Splits a node by creating a sibling node and shifting half the nodes
386 * contents across. Assumes there is a parent node, and it has room for
387 * another child.
389 * Before:
390 * +--------+
391 * | Parent |
392 * +--------+
395 * +----------+
396 * | A ++++++ |
397 * +----------+
400 * After:
401 * +--------+
402 * | Parent |
403 * +--------+
404 * | |
405 * v +------+
406 * +---------+ |
407 * | A* +++ | v
408 * +---------+ +-------+
409 * | B +++ |
410 * +-------+
412 * Where A* is a shadow of A.
414 static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
415 unsigned parent_index, uint64_t key)
417 int r;
418 size_t size;
419 unsigned nr_left, nr_right;
420 struct dm_block *left, *right, *parent;
421 struct node *ln, *rn, *pn;
422 __le64 location;
424 left = shadow_current(s);
426 r = new_block(s->info, &right);
427 if (r < 0)
428 return r;
430 ln = dm_block_data(left);
431 rn = dm_block_data(right);
433 nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
434 nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;
436 ln->header.nr_entries = cpu_to_le32(nr_left);
438 rn->header.flags = ln->header.flags;
439 rn->header.nr_entries = cpu_to_le32(nr_right);
440 rn->header.max_entries = ln->header.max_entries;
441 rn->header.value_size = ln->header.value_size;
442 memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));
444 size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
445 sizeof(uint64_t) : s->info->value_type.size;
446 memcpy(value_ptr(rn, 0, size), value_ptr(ln, nr_left, size),
447 size * nr_right);
450 * Patch up the parent
452 parent = shadow_parent(s);
454 pn = dm_block_data(parent);
455 location = cpu_to_le64(dm_block_location(left));
456 __dm_bless_for_disk(&location);
457 memcpy_disk(value_ptr(pn, parent_index, sizeof(__le64)),
458 &location, sizeof(__le64));
460 location = cpu_to_le64(dm_block_location(right));
461 __dm_bless_for_disk(&location);
463 r = insert_at(sizeof(__le64), pn, parent_index + 1,
464 le64_to_cpu(rn->keys[0]), &location);
465 if (r)
466 return r;
468 if (key < le64_to_cpu(rn->keys[0])) {
469 unlock_block(s->info, right);
470 s->nodes[1] = left;
471 } else {
472 unlock_block(s->info, left);
473 s->nodes[1] = right;
476 return 0;
480 * Splits a node by creating two new children beneath the given node.
482 * Before:
483 * +----------+
484 * | A ++++++ |
485 * +----------+
488 * After:
489 * +------------+
490 * | A (shadow) |
491 * +------------+
492 * | |
493 * +------+ +----+
494 * | |
495 * v v
496 * +-------+ +-------+
497 * | B +++ | | C +++ |
498 * +-------+ +-------+
500 static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
502 int r;
503 size_t size;
504 unsigned nr_left, nr_right;
505 struct dm_block *left, *right, *new_parent;
506 struct node *pn, *ln, *rn;
507 __le64 val;
509 new_parent = shadow_current(s);
511 r = new_block(s->info, &left);
512 if (r < 0)
513 return r;
515 r = new_block(s->info, &right);
516 if (r < 0) {
517 /* FIXME: put left */
518 return r;
521 pn = dm_block_data(new_parent);
522 ln = dm_block_data(left);
523 rn = dm_block_data(right);
525 nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
526 nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;
528 ln->header.flags = pn->header.flags;
529 ln->header.nr_entries = cpu_to_le32(nr_left);
530 ln->header.max_entries = pn->header.max_entries;
531 ln->header.value_size = pn->header.value_size;
533 rn->header.flags = pn->header.flags;
534 rn->header.nr_entries = cpu_to_le32(nr_right);
535 rn->header.max_entries = pn->header.max_entries;
536 rn->header.value_size = pn->header.value_size;
538 memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
539 memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));
541 size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
542 sizeof(__le64) : s->info->value_type.size;
543 memcpy(value_ptr(ln, 0, size), value_ptr(pn, 0, size), nr_left * size);
544 memcpy(value_ptr(rn, 0, size), value_ptr(pn, nr_left, size),
545 nr_right * size);
547 /* new_parent should just point to l and r now */
548 pn->header.flags = cpu_to_le32(INTERNAL_NODE);
549 pn->header.nr_entries = cpu_to_le32(2);
550 pn->header.max_entries = cpu_to_le32(
551 calc_max_entries(sizeof(__le64),
552 dm_bm_block_size(
553 dm_tm_get_bm(s->info->tm))));
554 pn->header.value_size = cpu_to_le32(sizeof(__le64));
556 val = cpu_to_le64(dm_block_location(left));
557 __dm_bless_for_disk(&val);
558 pn->keys[0] = ln->keys[0];
559 memcpy_disk(value_ptr(pn, 0, sizeof(__le64)), &val, sizeof(__le64));
561 val = cpu_to_le64(dm_block_location(right));
562 __dm_bless_for_disk(&val);
563 pn->keys[1] = rn->keys[0];
564 memcpy_disk(value_ptr(pn, 1, sizeof(__le64)), &val, sizeof(__le64));
567 * rejig the spine. This is ugly, since it knows too
568 * much about the spine
570 if (s->nodes[0] != new_parent) {
571 unlock_block(s->info, s->nodes[0]);
572 s->nodes[0] = new_parent;
574 if (key < le64_to_cpu(rn->keys[0])) {
575 unlock_block(s->info, right);
576 s->nodes[1] = left;
577 } else {
578 unlock_block(s->info, left);
579 s->nodes[1] = right;
581 s->count = 2;
583 return 0;
586 static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
587 struct dm_btree_value_type *vt,
588 uint64_t key, unsigned *index)
590 int r, i = *index, inc, top = 1;
591 struct node *node;
593 for (;;) {
594 r = shadow_step(s, root, vt, &inc);
595 if (r < 0)
596 return r;
598 node = dm_block_data(shadow_current(s));
599 if (inc)
600 inc_children(s->info->tm, node, vt);
603 * We have to patch up the parent node, ugly, but I don't
604 * see a way to do this automatically as part of the spine
605 * op.
607 if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unness. */
608 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
610 __dm_bless_for_disk(&location);
611 memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(uint64_t)),
612 &location, sizeof(__le64));
615 node = dm_block_data(shadow_current(s));
617 if (node->header.nr_entries == node->header.max_entries) {
618 if (top)
619 r = btree_split_beneath(s, key);
620 else
621 r = btree_split_sibling(s, root, i, key);
623 if (r < 0)
624 return r;
627 node = dm_block_data(shadow_current(s));
629 i = lower_bound(node, key);
631 if (le32_to_cpu(node->header.flags) & LEAF_NODE)
632 break;
634 if (i < 0) {
635 /* change the bounds on the lowest key */
636 node->keys[0] = cpu_to_le64(key);
637 i = 0;
640 root = value64(node, i);
641 top = 0;
644 if (i < 0 || le64_to_cpu(node->keys[i]) != key)
645 i++;
647 /* we're about to overwrite this value, so undo the increment for it */
648 /* FIXME: shame that inc information is leaking outside the spine.
649 * Plus inc is just plain wrong in the event of a split */
650 if (le64_to_cpu(node->keys[i]) == key && inc)
651 if (vt->dec)
652 vt->dec(vt->context, value_ptr(node, i, vt->size));
654 *index = i;
655 return 0;
658 static int insert(struct dm_btree_info *info, dm_block_t root,
659 uint64_t *keys, void *value, dm_block_t *new_root,
660 int *inserted)
661 __dm_written_to_disk(value)
663 int r, need_insert;
664 unsigned level, index = -1, last_level = info->levels - 1;
665 dm_block_t block = root;
666 struct shadow_spine spine;
667 struct node *n;
668 struct dm_btree_value_type le64_type;
670 le64_type.context = NULL;
671 le64_type.size = sizeof(__le64);
672 le64_type.inc = NULL;
673 le64_type.dec = NULL;
674 le64_type.equal = NULL;
676 init_shadow_spine(&spine, info);
678 for (level = 0; level < (info->levels - 1); level++) {
679 r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
680 if (r < 0)
681 goto bad;
683 n = dm_block_data(shadow_current(&spine));
684 need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
685 (le64_to_cpu(n->keys[index]) != keys[level]));
687 if (need_insert) {
688 dm_block_t new_tree;
689 __le64 new_le;
691 r = dm_btree_create(info, &new_tree);
692 if (r < 0)
693 goto bad;
695 new_le = cpu_to_le64(new_tree);
696 __dm_bless_for_disk(&new_le);
698 r = insert_at(sizeof(uint64_t), n, index,
699 keys[level], &new_le);
700 if (r)
701 goto bad;
704 if (level < last_level)
705 block = value64(n, index);
708 r = btree_insert_raw(&spine, block, &info->value_type,
709 keys[level], &index);
710 if (r < 0)
711 goto bad;
713 n = dm_block_data(shadow_current(&spine));
714 need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
715 (le64_to_cpu(n->keys[index]) != keys[level]));
717 if (need_insert) {
718 if (inserted)
719 *inserted = 1;
721 r = insert_at(info->value_type.size, n, index,
722 keys[level], value);
723 if (r)
724 goto bad_unblessed;
725 } else {
726 if (inserted)
727 *inserted = 0;
729 if (info->value_type.dec &&
730 (!info->value_type.equal ||
731 !info->value_type.equal(
732 info->value_type.context,
733 value_ptr(n, index, info->value_type.size),
734 value))) {
735 info->value_type.dec(info->value_type.context,
736 value_ptr(n, index, info->value_type.size));
738 memcpy_disk(value_ptr(n, index, info->value_type.size),
739 value, info->value_type.size);
742 *new_root = shadow_root(&spine);
743 exit_shadow_spine(&spine);
745 return 0;
747 bad:
748 __dm_unbless_for_disk(value);
749 bad_unblessed:
750 exit_shadow_spine(&spine);
751 return r;
754 int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
755 uint64_t *keys, void *value, dm_block_t *new_root)
756 __dm_written_to_disk(value)
758 return insert(info, root, keys, value, new_root, NULL);
760 EXPORT_SYMBOL_GPL(dm_btree_insert);
762 int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
763 uint64_t *keys, void *value, dm_block_t *new_root,
764 int *inserted)
765 __dm_written_to_disk(value)
767 return insert(info, root, keys, value, new_root, inserted);
769 EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
771 /*----------------------------------------------------------------*/
773 int dm_btree_clone(struct dm_btree_info *info, dm_block_t root,
774 dm_block_t *clone)
776 int r;
777 struct dm_block *b, *orig_b;
778 struct node *b_node, *orig_node;
780 /* Copy the root node */
781 r = new_block(info, &b);
782 if (r < 0)
783 return r;
785 r = dm_tm_read_lock(info->tm, root, &btree_node_validator, &orig_b);
786 if (r < 0) {
787 dm_block_t location = dm_block_location(b);
789 unlock_block(info, b);
790 dm_tm_dec(info->tm, location);
793 *clone = dm_block_location(b);
794 b_node = dm_block_data(b);
795 orig_node = dm_block_data(orig_b);
797 memcpy(b_node, orig_node,
798 dm_bm_block_size(dm_tm_get_bm(info->tm)));
799 dm_tm_unlock(info->tm, orig_b);
800 inc_children(info->tm, b_node, &info->value_type);
801 dm_tm_unlock(info->tm, b);
803 return 0;
805 EXPORT_SYMBOL_GPL(dm_btree_clone);
807 /*----------------------------------------------------------------*/
809 static int find_highest_key(struct ro_spine *s, dm_block_t block,
810 uint64_t *result_key, dm_block_t *next_block)
812 int i, r;
813 uint32_t flags;
815 do {
816 r = ro_step(s, block);
817 if (r < 0)
818 return r;
820 flags = le32_to_cpu(ro_node(s)->header.flags);
821 i = le32_to_cpu(ro_node(s)->header.nr_entries);
822 if (!i)
823 return -ENODATA;
824 else
825 i--;
827 *result_key = le64_to_cpu(ro_node(s)->keys[i]);
828 if (next_block || flags & INTERNAL_NODE)
829 block = value64(ro_node(s), i);
831 } while (flags & INTERNAL_NODE);
833 if (next_block)
834 *next_block = block;
835 return 0;
838 int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
839 uint64_t *result_keys)
841 int r = 0, count = 0, level;
842 struct ro_spine spine;
844 init_ro_spine(&spine, info);
845 for (level = 0; level < info->levels; level++) {
846 r = find_highest_key(&spine, root, result_keys + level,
847 level == info->levels - 1 ? NULL : &root);
848 if (r == -ENODATA) {
849 r = 0;
850 break;
852 } else if (r)
853 break;
855 count++;
857 exit_ro_spine(&spine);
859 return r ? r : count;
861 EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);