/*
 * Code for working with individual keys, and sorted sets of keys within a
 * btree node
 *
 * Copyright 2012 Google, Inc.
 */

#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__

#include "util.h"
#include "bset.h"

#include <linux/console.h>
#include <linux/random.h>
#include <linux/prefetch.h>
#ifdef CONFIG_BCACHE_DEBUG

void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set)
{
	struct bkey *k, *next;

	for (k = i->start; k < bset_bkey_last(i); k = next) {
		next = bkey_next(k);

		printk(KERN_ERR "block %u key %u/%u: ", set,
		       (unsigned) ((u64 *) k - i->d), i->keys);

		if (b->ops->key_dump)
			b->ops->key_dump(b, k);
		else
			printk("%llu:%llu\n", KEY_INODE(k), KEY_OFFSET(k));

		if (next < bset_bkey_last(i) &&
		    bkey_cmp(k, b->ops->is_extents ?
			     &START_KEY(next) : next) > 0)
			printk(KERN_ERR "Key skipped backwards\n");
	}
}
void bch_dump_bucket(struct btree_keys *b)
{
	unsigned i;

	console_lock();
	for (i = 0; i <= b->nsets; i++)
		bch_dump_bset(b, b->set[i].data,
			      bset_sector_offset(b, b->set[i].data));
	console_unlock();
}
int __bch_count_data(struct btree_keys *b)
{
	unsigned ret = 0;
	struct btree_iter iter;
	struct bkey *k;

	if (b->ops->is_extents)
		for_each_key(b, k, &iter)
			ret += KEY_SIZE(k);
	return ret;
}
void __bch_check_keys(struct btree_keys *b, const char *fmt, ...)
{
	va_list args;
	struct bkey *k, *p = NULL;
	struct btree_iter iter;
	const char *err;

	for_each_key(b, k, &iter) {
		if (b->ops->is_extents) {
			err = "Keys out of order";
			if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0)
				goto bug;

			if (bch_ptr_invalid(b, k))
				continue;

			err = "Overlapping keys";
			if (p && bkey_cmp(p, &START_KEY(k)) > 0)
				goto bug;
		} else {
			if (bch_ptr_bad(b, k))
				continue;

			err = "Duplicate keys";
			if (p && !bkey_cmp(p, k))
				goto bug;
		}
		p = k;
	}
#if 0
	err = "Key larger than btree node key";
	if (p && bkey_cmp(p, &b->key) > 0)
		goto bug;
#endif
	return;
bug:
	bch_dump_bucket(b);

	va_start(args, fmt);
	vprintk(fmt, args);
	va_end(args);

	panic("bch_check_keys error: %s:\n", err);
}
static void bch_btree_iter_next_check(struct btree_iter *iter)
{
	struct bkey *k = iter->data->k, *next = bkey_next(k);

	if (next < iter->data->end &&
	    bkey_cmp(k, iter->b->ops->is_extents ?
		     &START_KEY(next) : next) > 0) {
		bch_dump_bucket(iter->b);
		panic("Key skipped backwards\n");
	}
}

#else

static inline void bch_btree_iter_next_check(struct btree_iter *iter) {}

#endif
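/*
 * Keylists.
 *
 * Descriptive note (added): __bch_keylist_realloc() makes room for @u64s more
 * 64-bit words in the keylist.  Keys start out in the inline buffer; the first
 * time the list outgrows KEYLIST_INLINE they are copied into a heap buffer,
 * which is then krealloc()ed as needed.
 */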
int __bch_keylist_realloc(struct keylist *l, unsigned u64s)
{
	size_t oldsize = bch_keylist_nkeys(l);
	size_t newsize = oldsize + u64s;
	uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p;
	uint64_t *new_keys;

	newsize = roundup_pow_of_two(newsize);

	if (newsize <= KEYLIST_INLINE ||
	    roundup_pow_of_two(oldsize) == newsize)
		return 0;

	new_keys = krealloc(old_keys, sizeof(uint64_t) * newsize, GFP_NOIO);

	if (!new_keys)
		return -ENOMEM;

	if (!old_keys)
		memcpy(new_keys, l->inline_keys, sizeof(uint64_t) * oldsize);

	l->keys_p = new_keys;
	l->top_p = new_keys + oldsize;

	return 0;
}

struct bkey *bch_keylist_pop(struct keylist *l)
{
	struct bkey *k = l->keys;

	if (k == l->top)
		return NULL;

	while (bkey_next(k) != l->top)
		k = bkey_next(k);

	return l->top = k;
}

void bch_keylist_pop_front(struct keylist *l)
{
	l->top_p -= bkey_u64s(l->keys);

	memmove(l->keys,
		bkey_next(l->keys),
		bch_keylist_bytes(l));
}
/* Key/pointer manipulation */

void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src,
			      unsigned i)
{
	BUG_ON(i > KEY_PTRS(src));

	/* Only copy the header, key, and one pointer. */
	memcpy(dest, src, 2 * sizeof(uint64_t));
	dest->ptr[0] = src->ptr[i];
	SET_KEY_PTRS(dest, 1);
	/* We didn't copy the checksum so clear that bit. */
	SET_KEY_CSUM(dest, 0);
}

bool __bch_cut_front(const struct bkey *where, struct bkey *k)
{
	unsigned i;
	uint64_t len = 0;

	if (bkey_cmp(where, &START_KEY(k)) <= 0)
		return false;

	if (bkey_cmp(where, k) < 0)
		len = KEY_OFFSET(k) - KEY_OFFSET(where);
	else
		bkey_copy_key(k, where);

	for (i = 0; i < KEY_PTRS(k); i++)
		SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + KEY_SIZE(k) - len);

	BUG_ON(len > KEY_SIZE(k));

	SET_KEY_SIZE(k, len);

	return true;
}
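/*
 * Illustrative example (added): for an extent key the offset is the *end* of
 * the extent, so a key with KEY_OFFSET == 10 and KEY_SIZE == 5 covers sectors
 * 5..10.  Cutting its front at offset 7 leaves len = 10 - 7 = 3, i.e. the key
 * now covers 7..10, and each pointer offset is bumped by the 2 sectors that
 * were dropped (old KEY_SIZE minus the new len).
 */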
bool __bch_cut_back(const struct bkey *where, struct bkey *k)
{
	unsigned len = 0;

	if (bkey_cmp(where, k) >= 0)
		return false;

	BUG_ON(KEY_INODE(where) != KEY_INODE(k));

	if (bkey_cmp(where, &START_KEY(k)) > 0)
		len = KEY_OFFSET(where) - KEY_START(k);

	bkey_copy_key(k, where);

	BUG_ON(len > KEY_SIZE(k));

	SET_KEY_SIZE(k, len);

	return true;
}
/* Auxiliary search trees */

/* 32 bits total: */
#define BKEY_MID_BITS		3
#define BKEY_EXPONENT_BITS	7
#define BKEY_MANTISSA_BITS	(32 - BKEY_MID_BITS - BKEY_EXPONENT_BITS)
#define BKEY_MANTISSA_MASK	((1 << BKEY_MANTISSA_BITS) - 1)

struct bkey_float {
	unsigned	exponent:BKEY_EXPONENT_BITS;
	unsigned	m:BKEY_MID_BITS;
	unsigned	mantissa:BKEY_MANTISSA_BITS;
} __packed;
/*
 * BSET_CACHELINE was originally intended to match the hardware cacheline size -
 * it used to be 64, but I realized the lookup code would touch slightly less
 * memory if it was 128.
 *
 * It defines the number of bytes (in struct bset) per struct bkey_float in
 * the auxiliary search tree - when we're done searching the bkey_float tree we
 * have this many bytes left that we do a linear search over.
 *
 * Since (after level 5) every level of the bset_tree is on a new cacheline,
 * we're touching one fewer cacheline in the bset tree in exchange for one more
 * cacheline in the linear search - but the linear search might stop before it
 * gets to the second cacheline.
 */

#define BSET_CACHELINE		128
/* Space required for the btree node keys */
static inline size_t btree_keys_bytes(struct btree_keys *b)
{
	return PAGE_SIZE << b->page_order;
}

static inline size_t btree_keys_cachelines(struct btree_keys *b)
{
	return btree_keys_bytes(b) / BSET_CACHELINE;
}

/* Space required for the auxiliary search trees */
static inline size_t bset_tree_bytes(struct btree_keys *b)
{
	return btree_keys_cachelines(b) * sizeof(struct bkey_float);
}

/* Space required for the prev pointers */
static inline size_t bset_prev_bytes(struct btree_keys *b)
{
	return btree_keys_cachelines(b) * sizeof(uint8_t);
}
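/*
 * Worked example (added, assuming 4 KiB pages): with page_order == 0 a btree
 * node buffer is 4096 bytes, so btree_keys_cachelines() == 32,
 * bset_tree_bytes() == 32 * sizeof(struct bkey_float) == 128 bytes, and
 * bset_prev_bytes() == 32 bytes.
 */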
/* Memory allocation */

void bch_btree_keys_free(struct btree_keys *b)
{
	struct bset_tree *t = b->set;

	if (bset_prev_bytes(b) < PAGE_SIZE)
		kfree(t->prev);
	else
		free_pages((unsigned long) t->prev,
			   get_order(bset_prev_bytes(b)));

	if (bset_tree_bytes(b) < PAGE_SIZE)
		kfree(t->tree);
	else
		free_pages((unsigned long) t->tree,
			   get_order(bset_tree_bytes(b)));

	free_pages((unsigned long) t->data, b->page_order);

	t->prev = NULL;
	t->tree = NULL;
	t->data = NULL;
}
EXPORT_SYMBOL(bch_btree_keys_free);
int bch_btree_keys_alloc(struct btree_keys *b, unsigned page_order, gfp_t gfp)
{
	struct bset_tree *t = b->set;

	BUG_ON(t->data);

	b->page_order = page_order;

	t->data = (void *) __get_free_pages(gfp, b->page_order);
	if (!t->data)
		goto err;

	t->tree = bset_tree_bytes(b) < PAGE_SIZE
		? kmalloc(bset_tree_bytes(b), gfp)
		: (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
	if (!t->tree)
		goto err;

	t->prev = bset_prev_bytes(b) < PAGE_SIZE
		? kmalloc(bset_prev_bytes(b), gfp)
		: (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
	if (!t->prev)
		goto err;

	return 0;
err:
	bch_btree_keys_free(b);
	return -ENOMEM;
}
EXPORT_SYMBOL(bch_btree_keys_alloc);
void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops,
			 bool *expensive_debug_checks)
{
	unsigned i;

	b->ops = ops;
	b->expensive_debug_checks = expensive_debug_checks;
	b->nsets = 0;
	b->last_set_unwritten = 0;

	/* XXX: shouldn't be needed */
	for (i = 0; i < MAX_BSETS; i++)
		b->set[i].size = 0;
	/*
	 * Second loop starts at 1 because b->keys[0]->data is the memory we
	 * allocated
	 */
	for (i = 1; i < MAX_BSETS; i++)
		b->set[i].data = NULL;
}
EXPORT_SYMBOL(bch_btree_keys_init);
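/*
 * Typical lifecycle (sketch added for illustration; the surrounding variable
 * and ops names are hypothetical, only the three exported functions are real):
 *
 *	if (bch_btree_keys_alloc(&b->keys, page_order, GFP_KERNEL))
 *		return -ENOMEM;
 *	bch_btree_keys_init(&b->keys, &my_btree_keys_ops,
 *			    &expensive_debug_checks);
 *	...
 *	bch_btree_keys_free(&b->keys);
 */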
/* Binary tree stuff for auxiliary search trees */

static unsigned inorder_next(unsigned j, unsigned size)
{
	if (j * 2 + 1 < size) {
		j = j * 2 + 1;

		while (j * 2 < size)
			j *= 2;
	} else
		j >>= ffz(j) + 1;

	return j;
}

static unsigned inorder_prev(unsigned j, unsigned size)
{
	if (j * 2 < size) {
		j = j * 2;

		while (j * 2 + 1 < size)
			j = j * 2 + 1;
	} else
		j >>= ffs(j);

	return j;
}
/*
 * I have no idea why this code works... and I'm the one who wrote it
 *
 * However, I do know what it does:
 * Given a binary tree constructed in an array (i.e. how you normally implement
 * a heap), it converts a node in the tree - referenced by array index - to the
 * index it would have if you did an inorder traversal.
 *
 * Also tested for every j, size up to size somewhere around 6 million.
 *
 * The binary tree starts at array index 1, not 0
 * extra is a function of size:
 *	extra = (size - rounddown_pow_of_two(size - 1)) << 1;
 */
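/*
 * Worked example (added for illustration): with size == 7 the tree has nodes
 * 1..6 laid out heap-style (1 is the root, 2 and 3 its children, and so on),
 * and extra == (7 - 4) << 1 == 6.  __to_inorder() then maps
 *
 *	j:		1  2  3  4  5  6
 *	inorder:	4  2  6  1  3  5
 *
 * which is exactly each node's position in an inorder walk of that heap, and
 * __inorder_to_tree() is its inverse.
 */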
static unsigned __to_inorder(unsigned j, unsigned size, unsigned extra)
{
	unsigned b = fls(j);
	unsigned shift = fls(size - 1) - b;

	j  ^= 1 << (b - 1);
	j <<= 1;
	j  |= 1;
	j <<= shift;

	if (j > extra)
		j -= (j - extra) >> 1;

	return j;
}
static unsigned to_inorder(unsigned j, struct bset_tree *t)
{
	return __to_inorder(j, t->size, t->extra);
}
static unsigned __inorder_to_tree(unsigned j, unsigned size, unsigned extra)
{
	unsigned shift;

	if (j > extra)
		j += j - extra;

	shift = ffs(j);

	j >>= shift;
	j  |= roundup_pow_of_two(size) >> shift;

	return j;
}
static unsigned inorder_to_tree(unsigned j, struct bset_tree *t)
{
	return __inorder_to_tree(j, t->size, t->extra);
}
#if 0
void inorder_test(void)
{
	unsigned long done = 0;
	ktime_t start = ktime_get();

	for (unsigned size = 2;
	     size < 65536000;
	     size++) {
		unsigned extra = (size - rounddown_pow_of_two(size - 1)) << 1;
		unsigned i = 1, j = rounddown_pow_of_two(size - 1);

		if (!(size % 4096))
			printk(KERN_NOTICE "loop %u, %llu per us\n", size,
			       done / ktime_us_delta(ktime_get(), start));

		while (1) {
			if (__inorder_to_tree(i, size, extra) != j)
				panic("size %10u j %10u i %10u", size, j, i);

			if (__to_inorder(j, size, extra) != i)
				panic("size %10u j %10u i %10u", size, j, i);

			if (j == rounddown_pow_of_two(size) - 1)
				break;

			BUG_ON(inorder_prev(inorder_next(j, size), size) != j);

			j = inorder_next(j, size);
			i++;
		}

		done += size - 1;
	}
}
#endif
/*
 * Cacheline/offset <-> bkey pointer arithmetic:
 *
 * t->tree is a binary search tree in an array; each node corresponds to a key
 * in one cacheline in t->set (BSET_CACHELINE bytes).
 *
 * This means we don't have to store the full index of the key that a node in
 * the binary tree points to; to_inorder() gives us the cacheline, and then
 * bkey_float->m gives us the offset within that cacheline, in units of 8 bytes.
 *
 * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
 * make this work.
 *
 * To construct the bfloat for an arbitrary key we need to know what the key
 * immediately preceding it is: we have to check if the two keys differ in the
 * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size
 * of the previous key so we can walk backwards to it from t->tree[j]'s key.
 */
static struct bkey *cacheline_to_bkey(struct bset_tree *t, unsigned cacheline,
				      unsigned offset)
{
	return ((void *) t->data) + cacheline * BSET_CACHELINE + offset * 8;
}

static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k)
{
	return ((void *) k - (void *) t->data) / BSET_CACHELINE;
}

static unsigned bkey_to_cacheline_offset(struct bset_tree *t,
					 unsigned cacheline,
					 struct bkey *k)
{
	return (u64 *) k - (u64 *) cacheline_to_bkey(t, cacheline, 0);
}
*tree_to_bkey(struct bset_tree
*t
, unsigned j
)
523 return cacheline_to_bkey(t
, to_inorder(j
, t
), t
->tree
[j
].m
);
526 static struct bkey
*tree_to_prev_bkey(struct bset_tree
*t
, unsigned j
)
528 return (void *) (((uint64_t *) tree_to_bkey(t
, j
)) - t
->prev
[j
]);
532 * For the write set - the one we're currently inserting keys into - we don't
533 * maintain a full search tree, we just keep a simple lookup table in t->prev.
535 static struct bkey
*table_to_bkey(struct bset_tree
*t
, unsigned cacheline
)
537 return cacheline_to_bkey(t
, cacheline
, t
->prev
[cacheline
]);
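/*
 * Descriptive note (added): shrd128() is a 128-bit shift-right-double - it
 * returns the low 64 bits of the 128-bit value (high:low) shifted right by
 * 'shift' bits, which is how bfloat_mantissa() extracts a mantissa window
 * that may straddle a 64-bit word boundary in the key.
 */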
static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift)
{
	low >>= shift;
	low  |= (high << 1) << (63U - shift);
	return low;
}

static inline unsigned bfloat_mantissa(const struct bkey *k,
				       struct bkey_float *f)
{
	const uint64_t *p = &k->low - (f->exponent >> 6);
	return shrd128(p[-1], p[0], f->exponent & 63) & BKEY_MANTISSA_MASK;
}
static void make_bfloat(struct bset_tree *t, unsigned j)
{
	struct bkey_float *f = &t->tree[j];
	struct bkey *m = tree_to_bkey(t, j);
	struct bkey *p = tree_to_prev_bkey(t, j);

	struct bkey *l = is_power_of_2(j)
		? t->data->start
		: tree_to_prev_bkey(t, j >> ffs(j));

	struct bkey *r = is_power_of_2(j + 1)
		? bset_bkey_idx(t->data, t->data->keys - bkey_u64s(&t->end))
		: tree_to_bkey(t, j >> (ffz(j) + 1));

	BUG_ON(m < l || m > r);
	BUG_ON(bkey_next(p) != m);

	if (KEY_INODE(l) != KEY_INODE(r))
		f->exponent = fls64(KEY_INODE(r) ^ KEY_INODE(l)) + 64;
	else
		f->exponent = fls64(r->low ^ l->low);

	f->exponent = max_t(int, f->exponent - BKEY_MANTISSA_BITS, 0);

	/*
	 * Setting f->exponent = 127 flags this node as failed, and causes the
	 * lookup code to fall back to comparing against the original key.
	 */

	if (bfloat_mantissa(m, f) != bfloat_mantissa(p, f))
		f->mantissa = bfloat_mantissa(m, f) - 1;
	else
		f->exponent = 127;
}
static void bset_alloc_tree(struct btree_keys *b, struct bset_tree *t)
{
	if (t != b->set) {
		unsigned j = roundup(t[-1].size,
				     64 / sizeof(struct bkey_float));

		t->tree = t[-1].tree + j;
		t->prev = t[-1].prev + j;
	}

	while (t < b->set + MAX_BSETS)
		t++->size = 0;
}
static void bch_bset_build_unwritten_tree(struct btree_keys *b)
{
	struct bset_tree *t = bset_tree_last(b);

	BUG_ON(b->last_set_unwritten);
	b->last_set_unwritten = 1;

	bset_alloc_tree(b, t);

	if (t->tree != b->set->tree + btree_keys_cachelines(b)) {
		t->prev[0] = bkey_to_cacheline_offset(t, 0, t->data->start);
		t->size = 1;
	}
}
void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic)
{
	if (i != b->set->data) {
		b->set[++b->nsets].data = i;
		i->seq = b->set->data->seq;
	} else
		get_random_bytes(&i->seq, sizeof(uint64_t));

	i->magic	= magic;
	i->version	= 0;
	i->keys		= 0;

	bch_bset_build_unwritten_tree(b);
}
EXPORT_SYMBOL(bch_bset_init_next);
void bch_bset_build_written_tree(struct btree_keys *b)
{
	struct bset_tree *t = bset_tree_last(b);
	struct bkey *prev = NULL, *k = t->data->start;
	unsigned j, cacheline = 1;

	b->last_set_unwritten = 0;

	bset_alloc_tree(b, t);

	t->size = min_t(unsigned,
			bkey_to_cacheline(t, bset_bkey_last(t->data)),
			b->set->tree + btree_keys_cachelines(b) - t->tree);

	if (t->size < 2) {
		t->size = 0;
		return;
	}

	t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;

	/* First we figure out where the first key in each cacheline is */
	for (j = inorder_next(0, t->size);
	     j;
	     j = inorder_next(j, t->size)) {
		while (bkey_to_cacheline(t, k) < cacheline)
			prev = k, k = bkey_next(k);

		t->prev[j] = bkey_u64s(prev);
		t->tree[j].m = bkey_to_cacheline_offset(t, cacheline++, k);
	}

	while (bkey_next(k) != bset_bkey_last(t->data))
		k = bkey_next(k);

	t->end = *k;

	/* Then we build the tree */
	for (j = inorder_next(0, t->size);
	     j;
	     j = inorder_next(j, t->size))
		make_bfloat(t, j);
}
EXPORT_SYMBOL(bch_bset_build_written_tree);
/* Insert */

void bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k)
{
	struct bset_tree *t;
	unsigned inorder, j = 1;

	for (t = b->set; t <= bset_tree_last(b); t++)
		if (k < bset_bkey_last(t->data))
			goto found_set;

	BUG();
found_set:
	if (!t->size || !bset_written(b, t))
		return;

	inorder = bkey_to_cacheline(t, k);

	if (k == t->data->start)
		goto fix_left;

	if (bkey_next(k) == bset_bkey_last(t->data)) {
		t->end = *k;
		goto fix_right;
	}

	j = inorder_to_tree(inorder, t);

	if (j &&
	    j < t->size &&
	    k == tree_to_bkey(t, j))
fix_left:	do {
			make_bfloat(t, j);
			j = j * 2;
		} while (j < t->size);

	j = inorder_to_tree(inorder + 1, t);

	if (j &&
	    j < t->size &&
	    k == tree_to_prev_bkey(t, j))
fix_right:	do {
			make_bfloat(t, j);
			j = j * 2 + 1;
		} while (j < t->size);
}
EXPORT_SYMBOL(bch_bset_fix_invalidated_key);
static void bch_bset_fix_lookup_table(struct btree_keys *b,
				      struct bset_tree *t,
				      struct bkey *k)
{
	unsigned shift = bkey_u64s(k);
	unsigned j = bkey_to_cacheline(t, k);

	/* We're getting called from btree_split() or btree_gc, just bail out */
	if (!t->size)
		return;

	/* k is the key we just inserted; we need to find the entry in the
	 * lookup table for the first key that is strictly greater than k:
	 * it's either k's cacheline or the next one
	 */
	while (j < t->size &&
	       table_to_bkey(t, j) <= k)
		j++;

	/* Adjust all the lookup table entries, and find a new key for any that
	 * have gotten too big
	 */
	for (; j < t->size; j++) {
		t->prev[j] += shift;

		if (t->prev[j] > 7) {
			k = table_to_bkey(t, j - 1);

			while (k < cacheline_to_bkey(t, j, 0))
				k = bkey_next(k);

			t->prev[j] = bkey_to_cacheline_offset(t, j, k);
		}
	}

	if (t->size == b->set->tree + btree_keys_cachelines(b) - t->tree)
		return;

	/* Possibly add a new entry to the end of the lookup table */

	for (k = table_to_bkey(t, t->size - 1);
	     k != bset_bkey_last(t->data);
	     k = bkey_next(k))
		if (t->size == bkey_to_cacheline(t, k)) {
			t->prev[t->size] = bkey_to_cacheline_offset(t, t->size, k);
			t->size++;
		}
}
/*
 * Tries to merge l and r: l should be lower than r
 * Returns true if we were able to merge. If we did merge, l will be the merged
 * key, r will be untouched.
 */
bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r)
{
	if (!b->ops->key_merge)
		return false;

	/*
	 * Generic header checks
	 * Assumes left and right are in order
	 * Left and right must be exactly aligned
	 */
	if (!bch_bkey_equal_header(l, r) ||
	     bkey_cmp(l, &START_KEY(r)))
		return false;

	return b->ops->key_merge(b, l, r);
}
EXPORT_SYMBOL(bch_bkey_try_merge);
void bch_bset_insert(struct btree_keys *b, struct bkey *where,
		     struct bkey *insert)
{
	struct bset_tree *t = bset_tree_last(b);

	BUG_ON(!b->last_set_unwritten);
	BUG_ON(bset_byte_offset(b, t->data) +
	       __set_bytes(t->data, t->data->keys + bkey_u64s(insert)) >
	       PAGE_SIZE << b->page_order);

	memmove((uint64_t *) where + bkey_u64s(insert),
		where,
		(void *) bset_bkey_last(t->data) - (void *) where);

	t->data->keys += bkey_u64s(insert);
	bkey_copy(where, insert);
	bch_bset_fix_lookup_table(b, t, where);
}
EXPORT_SYMBOL(bch_bset_insert);
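/*
 * Descriptive note (added): bch_btree_insert_key() inserts @k into the last
 * (unwritten) bset, after letting the per-ops insert_fixup() hook veto or
 * rewrite the insert.  The return value is one of the BTREE_INSERT_STATUS_*
 * codes used below: NO_INSERT, INSERT, BACK_MERGE, OVERWROTE or FRONT_MERGE.
 */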
unsigned bch_btree_insert_key(struct btree_keys *b, struct bkey *k,
			      struct bkey *replace_key)
{
	unsigned status = BTREE_INSERT_STATUS_NO_INSERT;
	struct bset *i = bset_tree_last(b)->data;
	struct bkey *m, *prev = NULL;
	struct btree_iter iter;

	BUG_ON(b->ops->is_extents && !KEY_SIZE(k));

	m = bch_btree_iter_init(b, &iter, b->ops->is_extents
				? PRECEDING_KEY(&START_KEY(k))
				: PRECEDING_KEY(k));

	if (b->ops->insert_fixup(b, k, &iter, replace_key))
		return status;

	status = BTREE_INSERT_STATUS_INSERT;

	while (m != bset_bkey_last(i) &&
	       bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0)
		prev = m, m = bkey_next(m);

	/* prev is in the tree, if we merge we're done */
	status = BTREE_INSERT_STATUS_BACK_MERGE;
	if (prev &&
	    bch_bkey_try_merge(b, prev, k))
		goto merged;
#if 0
	status = BTREE_INSERT_STATUS_OVERWROTE;
	if (m != bset_bkey_last(i) &&
	    KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
		goto copy;
#endif
	status = BTREE_INSERT_STATUS_FRONT_MERGE;
	if (m != bset_bkey_last(i) &&
	    bch_bkey_try_merge(b, k, m))
		goto copy;

	bch_bset_insert(b, m, k);
copy:	bkey_copy(m, k);
merged:
	return status;
}
EXPORT_SYMBOL(bch_btree_insert_key);
/* Lookup */

struct bset_search_iter {
	struct bkey *l, *r;
};

static struct bset_search_iter bset_search_write_set(struct bset_tree *t,
						     const struct bkey *search)
{
	unsigned li = 0, ri = t->size;

	while (li + 1 != ri) {
		unsigned m = (li + ri) >> 1;

		if (bkey_cmp(table_to_bkey(t, m), search) > 0)
			ri = m;
		else
			li = m;
	}

	return (struct bset_search_iter) {
		table_to_bkey(t, li),
		ri < t->size ? table_to_bkey(t, ri) : bset_bkey_last(t->data)
	};
}
bset_search_tree(struct bset_tree
*t
,
892 const struct bkey
*search
)
895 struct bkey_float
*f
;
896 unsigned inorder
, j
, n
= 1;
900 p
&= ((int) (p
- t
->size
)) >> 31;
902 prefetch(&t
->tree
[p
]);
908 * n = (f->mantissa > bfloat_mantissa())
912 * We need to subtract 1 from f->mantissa for the sign bit trick
913 * to work - that's done in make_bfloat()
915 if (likely(f
->exponent
!= 127))
916 n
= j
* 2 + (((unsigned)
918 bfloat_mantissa(search
, f
))) >> 31);
920 n
= (bkey_cmp(tree_to_bkey(t
, j
), search
) > 0)
923 } while (n
< t
->size
);
925 inorder
= to_inorder(j
, t
);
928 * n would have been the node we recursed to - the low bit tells us if
929 * we recursed left or recursed right.
932 l
= cacheline_to_bkey(t
, inorder
, f
->m
);
934 if (++inorder
!= t
->size
) {
935 f
= &t
->tree
[inorder_next(j
, t
->size
)];
936 r
= cacheline_to_bkey(t
, inorder
, f
->m
);
938 r
= bset_bkey_last(t
->data
);
940 r
= cacheline_to_bkey(t
, inorder
, f
->m
);
943 f
= &t
->tree
[inorder_prev(j
, t
->size
)];
944 l
= cacheline_to_bkey(t
, inorder
, f
->m
);
949 return (struct bset_search_iter
) {l
, r
};
struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t,
			       const struct bkey *search)
{
	struct bset_search_iter i;

	/*
	 * First, we search for a cacheline, then lastly we do a linear search
	 * within that cacheline.
	 *
	 * To search for the cacheline, there's three different possibilities:
	 *  * The set is too small to have a search tree, so we just do a linear
	 *    search over the whole set.
	 *  * The set is the one we're currently inserting into; keeping a full
	 *    auxiliary search tree up to date would be too expensive, so we
	 *    use a much simpler lookup table to do a binary search -
	 *    bset_search_write_set().
	 *  * Or we use the auxiliary search tree we constructed earlier -
	 *    bset_search_tree()
	 */

	if (unlikely(!t->size)) {
		i.l = t->data->start;
		i.r = bset_bkey_last(t->data);
	} else if (bset_written(b, t)) {
		/*
		 * Each node in the auxiliary search tree covers a certain range
		 * of bits, and keys above and below the set it covers might
		 * differ outside those bits - so we have to special case the
		 * start and end - handle that here:
		 */

		if (unlikely(bkey_cmp(search, &t->end) >= 0))
			return bset_bkey_last(t->data);

		if (unlikely(bkey_cmp(search, t->data->start) < 0))
			return t->data->start;

		i = bset_search_tree(t, search);
	} else {
		BUG_ON(!b->nsets &&
		       t->size < bkey_to_cacheline(t, bset_bkey_last(t->data)));

		i = bset_search_write_set(t, search);
	}

	if (btree_keys_expensive_checks(b)) {
		BUG_ON(bset_written(b, t) &&
		       i.l != t->data->start &&
		       bkey_cmp(tree_to_prev_bkey(t,
			  inorder_to_tree(bkey_to_cacheline(t, i.l), t)),
				search) > 0);

		BUG_ON(i.r != bset_bkey_last(t->data) &&
		       bkey_cmp(i.r, search) <= 0);
	}

	while (likely(i.l != i.r) &&
	       bkey_cmp(i.l, search) <= 0)
		i.l = bkey_next(i.l);

	return i.l;
}
EXPORT_SYMBOL(__bch_bset_search);
/* Btree iterator */

typedef bool (btree_iter_cmp_fn)(struct btree_iter_set,
				 struct btree_iter_set);

static inline bool btree_iter_cmp(struct btree_iter_set l,
				  struct btree_iter_set r)
{
	return bkey_cmp(l.k, r.k) > 0;
}
*iter
)
1032 void bch_btree_iter_push(struct btree_iter
*iter
, struct bkey
*k
,
1036 BUG_ON(!heap_add(iter
,
1037 ((struct btree_iter_set
) { k
, end
}),
1041 static struct bkey
*__bch_btree_iter_init(struct btree_keys
*b
,
1042 struct btree_iter
*iter
,
1043 struct bkey
*search
,
1044 struct bset_tree
*start
)
1046 struct bkey
*ret
= NULL
;
1047 iter
->size
= ARRAY_SIZE(iter
->data
);
1050 #ifdef CONFIG_BCACHE_DEBUG
1054 for (; start
<= bset_tree_last(b
); start
++) {
1055 ret
= bch_bset_search(b
, start
, search
);
1056 bch_btree_iter_push(iter
, ret
, bset_bkey_last(start
->data
));
1062 struct bkey
*bch_btree_iter_init(struct btree_keys
*b
,
1063 struct btree_iter
*iter
,
1064 struct bkey
*search
)
1066 return __bch_btree_iter_init(b
, iter
, search
, b
->set
);
1068 EXPORT_SYMBOL(bch_btree_iter_init
);
static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter,
						 btree_iter_cmp_fn *cmp)
{
	struct btree_iter_set unused;
	struct bkey *ret = NULL;

	if (!btree_iter_end(iter)) {
		bch_btree_iter_next_check(iter);

		ret = iter->data->k;
		iter->data->k = bkey_next(iter->data->k);

		if (iter->data->k > iter->data->end) {
			WARN_ONCE(1, "bset was corrupt!\n");
			iter->data->k = iter->data->end;
		}

		if (iter->data->k == iter->data->end)
			heap_pop(iter, unused, cmp);
		else
			heap_sift(iter, 0, cmp);
	}

	return ret;
}

struct bkey *bch_btree_iter_next(struct btree_iter *iter)
{
	return __bch_btree_iter_next(iter, btree_iter_cmp);
}
EXPORT_SYMBOL(bch_btree_iter_next);
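/*
 * Usage sketch (added for illustration): this is roughly what the
 * for_each_key() helper used earlier in this file boils down to - initialize
 * the iterator over all bsets in the node, then pull keys in sorted order
 * until it runs dry:
 *
 *	struct btree_iter iter;
 *	struct bkey *k;
 *
 *	for (bch_btree_iter_init(b, &iter, NULL);
 *	     (k = bch_btree_iter_next(&iter));)
 *		do_something(b, k);
 *
 * (do_something() is a placeholder, not a real function.)
 */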
struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
					struct btree_keys *b, ptr_filter_fn fn)
{
	struct bkey *ret;

	do {
		ret = bch_btree_iter_next(iter);
	} while (ret && fn(b, ret));

	return ret;
}
/* Mergesort */

void bch_bset_sort_state_free(struct bset_sort_state *state)
{
	if (state->pool)
		mempool_destroy(state->pool);
}

int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order)
{
	spin_lock_init(&state->time.lock);

	state->page_order = page_order;
	state->crit_factor = int_sqrt(1 << page_order);

	state->pool = mempool_create_page_pool(1, page_order);
	if (!state->pool)
		return -ENOMEM;

	return 0;
}
EXPORT_SYMBOL(bch_bset_sort_state_init);
static void btree_mergesort(struct btree_keys *b, struct bset *out,
			    struct btree_iter *iter,
			    bool fixup, bool remove_stale)
{
	int i;
	struct bkey *k, *last = NULL;
	BKEY_PADDED(k) tmp;
	bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale
		? bch_ptr_bad
		: bch_ptr_invalid;

	/* Heapify the iterator, using our comparison function */
	for (i = iter->used / 2 - 1; i >= 0; --i)
		heap_sift(iter, i, b->ops->sort_cmp);

	while (!btree_iter_end(iter)) {
		if (b->ops->sort_fixup && fixup)
			k = b->ops->sort_fixup(iter, &tmp.k);
		else
			k = NULL;

		if (!k)
			k = __bch_btree_iter_next(iter, b->ops->sort_cmp);

		if (bad(b, k))
			continue;

		if (!last) {
			last = out->start;
			bkey_copy(last, k);
		} else if (!bch_bkey_try_merge(b, last, k)) {
			last = bkey_next(last);
			bkey_copy(last, k);
		}
	}

	out->keys = last ? (uint64_t *) bkey_next(last) - out->d : 0;

	pr_debug("sorted %i keys", out->keys);
}
static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
			 unsigned start, unsigned order, bool fixup,
			 struct bset_sort_state *state)
{
	uint64_t start_time;
	bool used_mempool = false;
	struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT,
						     order);
	if (!out) {
		struct page *outp;

		BUG_ON(order > state->page_order);

		outp = mempool_alloc(state->pool, GFP_NOIO);
		out = page_address(outp);
		used_mempool = true;
		order = state->page_order;
	}

	start_time = local_clock();

	btree_mergesort(b, out, iter, fixup, false);
	b->nsets = start;

	if (!start && order == b->page_order) {
		/*
		 * Our temporary buffer is the same size as the btree node's
		 * buffer, we can just swap buffers instead of doing a big
		 * memcpy()
		 */

		out->magic	= b->set->data->magic;
		out->seq	= b->set->data->seq;
		out->version	= b->set->data->version;
		swap(out, b->set->data);
	} else {
		b->set[start].data->keys = out->keys;
		memcpy(b->set[start].data->start, out->start,
		       (void *) bset_bkey_last(out) - (void *) out->start);
	}

	if (used_mempool)
		mempool_free(virt_to_page(out), state->pool);
	else
		free_pages((unsigned long) out, order);

	bch_bset_build_written_tree(b);

	if (!start)
		bch_time_stats_update(&state->time, start_time);
}
void bch_btree_sort_partial(struct btree_keys *b, unsigned start,
			    struct bset_sort_state *state)
{
	size_t order = b->page_order, keys = 0;
	struct btree_iter iter;
	int oldsize = bch_count_data(b);

	__bch_btree_iter_init(b, &iter, NULL, &b->set[start]);

	if (start) {
		unsigned i;

		for (i = start; i <= b->nsets; i++)
			keys += b->set[i].data->keys;

		order = get_order(__set_bytes(b->set->data, keys));
	}

	__btree_sort(b, &iter, start, order, false, state);

	EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize);
}
EXPORT_SYMBOL(bch_btree_sort_partial);
void bch_btree_sort_and_fix_extents(struct btree_keys *b,
				    struct btree_iter *iter,
				    struct bset_sort_state *state)
{
	__btree_sort(b, iter, 0, b->page_order, true, state);
}

void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new,
			 struct bset_sort_state *state)
{
	uint64_t start_time = local_clock();

	struct btree_iter iter;
	bch_btree_iter_init(b, &iter, NULL);

	btree_mergesort(b, new->set->data, &iter, false, true);

	bch_time_stats_update(&state->time, start_time);

	new->set->size = 0; // XXX: why?
}
#define SORT_CRIT	(4096 / sizeof(uint64_t))
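/*
 * Descriptive note (added): bch_btree_sort_lazy() walks the sets from newest
 * to oldest, multiplying 'crit' by state->crit_factor at each step, and does
 * a partial resort starting from the first set whose key count falls below
 * that threshold; if no set qualifies and the node is about to run out of
 * bsets (MAX_BSETS), the whole node is sorted instead.
 */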
void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state)
{
	unsigned crit = SORT_CRIT;
	int i;

	/* Don't sort if nothing to do */
	if (!b->nsets)
		goto out;

	for (i = b->nsets - 1; i >= 0; --i) {
		crit *= state->crit_factor;

		if (b->set[i].data->keys < crit) {
			bch_btree_sort_partial(b, i, state);
			return;
		}
	}

	/* Sort if we'd overflow */
	if (b->nsets + 1 == MAX_BSETS) {
		bch_btree_sort(b, state);
		return;
	}

out:
	bch_bset_build_written_tree(b);
}
EXPORT_SYMBOL(bch_btree_sort_lazy);
void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats)
{
	unsigned i;

	for (i = 0; i <= b->nsets; i++) {
		struct bset_tree *t = &b->set[i];
		size_t bytes = t->data->keys * sizeof(uint64_t);
		size_t j;

		if (bset_written(b, t)) {
			stats->sets_written++;
			stats->bytes_written += bytes;

			stats->floats += t->size - 1;

			for (j = 1; j < t->size; j++)
				if (t->tree[j].exponent == 127)
					stats->failed++;
		} else {
			stats->sets_unwritten++;
			stats->bytes_unwritten += bytes;
		}
	}
}