1 /* BPlusTree - BFS B+Tree implementation
3 ** Copyright 2001-2002 pinc Software. All Rights Reserved.
4 ** Released under the terms of the MIT license.
20 #define MAX_NODES_IN_CACHE 20
22 class CacheableNode
: public NodeCache::Cacheable
25 CacheableNode(off_t offset
,bplustree_node
*node
)
32 virtual ~CacheableNode()
38 virtual bool Equals(off_t offset
)
40 return offset
== fOffset
;
44 bplustree_node
*fNode
;
48 NodeCache::NodeCache(BPlusTree
*tree
)
55 Cache
<off_t
>::Cacheable
*
56 NodeCache::NewCacheable(off_t offset
)
58 return new CacheableNode(offset
,fTree
->Node(offset
,false));
63 NodeCache::Get(off_t offset
)
65 CacheableNode
*node
= (CacheableNode
*)Cache
<off_t
>::Get(offset
);
73 BPlusTree::BPlusTree(int32 keyType
,int32 nodeSize
,bool allowDuplicates
)
78 fCurrentNodeOffset(BPLUSTREE_NULL
)
80 SetTo(keyType
,nodeSize
,allowDuplicates
);
84 BPlusTree::BPlusTree(BPositionIO
*stream
,bool allowDuplicates
)
89 fCurrentNodeOffset(BPLUSTREE_NULL
)
91 SetTo(stream
,allowDuplicates
);
95 BPlusTree::BPlusTree()
99 fNodeSize(BPLUSTREE_NODE_SIZE
),
100 fAllowDuplicates(true),
103 fCurrentNodeOffset(BPLUSTREE_NULL
)
108 BPlusTree::~BPlusTree()
114 void BPlusTree::Initialize(int32 nodeSize
)
117 fCache
.Clear(0,true);
124 fNodeSize
= nodeSize
;
125 fHeader
= (bplustree_header
*)malloc(fNodeSize
);
126 memset(fHeader
,0,fNodeSize
);
128 fCurrentNodeOffset
= BPLUSTREE_NULL
;
132 status_t
BPlusTree::SetTo(int32 keyType
,int32 nodeSize
,bool allowDuplicates
)
134 // initializes in-memory B+Tree
136 Initialize(nodeSize
);
138 fAllowDuplicates
= allowDuplicates
;
139 fCache
.SetHoldChanges(true);
141 // initialize b+tree header
142 fHeader
->magic
= BPLUSTREE_MAGIC
;
143 fHeader
->node_size
= fNodeSize
;
144 fHeader
->max_number_of_levels
= 1;
145 fHeader
->data_type
= keyType
;
146 fHeader
->root_node_pointer
= fNodeSize
;
147 fHeader
->free_node_pointer
= BPLUSTREE_NULL
;
148 fHeader
->maximum_size
= fNodeSize
* 2;
150 return fStatus
= B_OK
;
154 status_t
BPlusTree::SetTo(BPositionIO
*stream
,bool allowDuplicates
)
156 // initializes on-disk B+Tree
158 bplustree_header header
;
159 ssize_t read
= stream
->ReadAt(0,&header
,sizeof(bplustree_header
));
161 return fStatus
= read
;
165 stream
->Seek(0,SEEK_END
);
166 off_t size
= stream
->Position();
167 stream
->Seek(0,SEEK_SET
);
169 //dump_bplustree_header(&header);
171 if (header
.magic
!= BPLUSTREE_MAGIC
172 || header
.maximum_size
!= size
173 || (header
.root_node_pointer
% header
.node_size
) != 0
174 || !header
.IsValidLink(header
.root_node_pointer
)
175 || !header
.IsValidLink(header
.free_node_pointer
))
176 return fStatus
= B_BAD_DATA
;
178 fAllowDuplicates
= allowDuplicates
;
180 if (DataStream
*dataStream
= dynamic_cast<DataStream
*>(stream
))
182 uint32 toMode
[] = {S_STR_INDEX
, S_INT_INDEX
, S_UINT_INDEX
, S_LONG_LONG_INDEX
,
183 S_ULONG_LONG_INDEX
, S_FLOAT_INDEX
, S_DOUBLE_INDEX
};
184 uint32 mode
= dataStream
->Mode() & (S_STR_INDEX
| S_INT_INDEX
| S_UINT_INDEX
| S_LONG_LONG_INDEX
185 | S_ULONG_LONG_INDEX
| S_FLOAT_INDEX
| S_DOUBLE_INDEX
);
187 if (header
.data_type
> BPLUSTREE_DOUBLE_TYPE
188 || (dataStream
->Mode() & S_INDEX_DIR
) && toMode
[header
.data_type
] != mode
189 || !dataStream
->IsDirectory())
190 return fStatus
= B_BAD_TYPE
;
192 // although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
193 fAllowDuplicates
= (dataStream
->Mode() & (S_INDEX_DIR
| 0777)) == S_INDEX_DIR
;
195 //printf("allows duplicates? %s\n",fAllowDuplicates ? "yes" : "no");
198 Initialize(header
.node_size
);
200 memcpy(fHeader
,&header
,sizeof(bplustree_header
));
204 bplustree_node
*node
= fCache
.Get(header
.root_node_pointer
);
206 // dump_bplustree_node(node);
208 return fStatus
= node
&& CheckNode(node
) ? B_OK
: B_BAD_DATA
;
212 status_t
BPlusTree::InitCheck()
218 struct validate_info
{
226 BPlusTree::Validate(bool verbose
)
228 // validates the on-disk b+tree
229 // (for now only the node integrity is checked)
234 struct validate_info info
;
236 Stack
<validate_info
> stack
;
237 info
.offset
= fHeader
->root_node_pointer
;
238 info
.from
= BPLUSTREE_NULL
;
242 if (fHeader
->free_node_pointer
!= BPLUSTREE_NULL
) {
243 info
.offset
= fHeader
->free_node_pointer
;
248 char escape
[3] = {0x1b, '[', 0};
249 int32 count
= 0,freeCount
= 0;
252 if (!stack
.Pop(&info
)) {
254 printf(" %" B_PRId32
" B+tree node(s) processed (%" B_PRId32
255 " free node(s)).\n", count
, freeCount
);
260 if (!info
.free
&& verbose
&& ++count
% 10 == 0)
261 printf(" %" B_PRId32
"%s1A\n", count
, escape
);
265 bplustree_node
*node
;
266 if ((node
= fCache
.Get(info
.offset
)) == NULL
267 || !info
.free
&& !CheckNode(node
)) {
269 fprintf(stderr
," B+Tree: Could not get data at position %"
270 B_PRIdOFF
" (referenced by node at %" B_PRIdOFF
")\n",
271 info
.offset
, info
.from
);
272 if ((node
= fCache
.Get(info
.from
)) != NULL
)
273 dump_bplustree_node(node
,fHeader
);
278 info
.from
= info
.offset
;
280 if (node
->overflow_link
!= BPLUSTREE_NULL
&& !info
.free
) {
281 info
.offset
= node
->overflow_link
;
284 //dump_bplustree_node(node,fHeader);
285 off_t
*values
= node
->Values();
286 for (int32 i
= 0;i
< node
->all_key_count
;i
++) {
287 info
.offset
= values
[i
];
290 } else if (node
->left_link
!= BPLUSTREE_NULL
&& info
.free
) {
291 info
.offset
= node
->left_link
;
299 BPlusTree::WriteTo(BPositionIO
*stream
)
301 ssize_t written
= stream
->WriteAt(0,fHeader
,fNodeSize
);
302 if (written
< fNodeSize
)
303 return written
< B_OK
? written
: B_IO_ERROR
;
305 for (off_t offset
= fNodeSize
;offset
< fHeader
->maximum_size
;offset
+= fNodeSize
)
307 bplustree_node
*node
= fCache
.Get(offset
);
311 written
= stream
->WriteAt(offset
,node
,fNodeSize
);
312 if (written
< fNodeSize
)
313 return written
< B_OK
? written
: B_IO_ERROR
;
315 return stream
->SetSize(fHeader
->maximum_size
);
322 void BPlusTree::SetCurrentNode(bplustree_node
*node
,off_t offset
,int8 to
)
324 fCurrentNodeOffset
= offset
;
325 fCurrentKey
= to
== BPLUSTREE_BEGIN
? -1 : node
->all_key_count
;
326 fDuplicateNode
= BPLUSTREE_NULL
;
330 status_t
BPlusTree::Goto(int8 to
)
336 if (stack
.Push(fHeader
->root_node_pointer
) < B_OK
)
339 bplustree_node
*node
;
341 while (stack
.Pop(&pos
) && (node
= fCache
.Get(pos
)) != NULL
&& CheckNode(node
))
343 // is the node a leaf node?
344 if (node
->overflow_link
== BPLUSTREE_NULL
)
346 SetCurrentNode(node
,pos
,to
);
351 if (to
== BPLUSTREE_END
|| node
->all_key_count
== 0)
352 pos
= node
->overflow_link
;
355 if (node
->all_key_length
> fNodeSize
356 || (addr_t
)node
->Values() > (addr_t
)node
+ fNodeSize
357 - 8 * node
->all_key_count
)
360 pos
= *node
->Values();
363 if (stack
.Push(pos
) < B_OK
)
370 status_t
BPlusTree::Traverse(int8 direction
,void *key
,uint16
*keyLength
,uint16 maxLength
,off_t
*value
)
372 if (fCurrentNodeOffset
== BPLUSTREE_NULL
373 && Goto(direction
== BPLUSTREE_FORWARD
? BPLUSTREE_BEGIN
: BPLUSTREE_END
) < B_OK
)
376 bplustree_node
*node
;
378 if (fDuplicateNode
!= BPLUSTREE_NULL
)
380 // regardless of traverse direction the duplicates are always presented in
381 // the same order; since they are all considered as equal, this shouldn't
382 // cause any problems
384 if (!fIsFragment
|| fDuplicate
< fNumDuplicates
)
385 node
= fCache
.Get(bplustree_node::FragmentOffset(fDuplicateNode
));
391 if (!fIsFragment
&& fDuplicate
>= fNumDuplicates
)
393 // if the node is out of duplicates, we go directly to the next one
394 fDuplicateNode
= node
->right_link
;
395 if (fDuplicateNode
!= BPLUSTREE_NULL
396 && (node
= fCache
.Get(fDuplicateNode
)) != NULL
)
398 fNumDuplicates
= node
->CountDuplicates(fDuplicateNode
,false);
402 if (fDuplicate
< fNumDuplicates
)
404 *value
= node
->DuplicateAt(fDuplicateNode
,fIsFragment
,fDuplicate
++);
408 fDuplicateNode
= BPLUSTREE_NULL
;
411 off_t savedNodeOffset
= fCurrentNodeOffset
;
412 if ((node
= fCache
.Get(fCurrentNodeOffset
)) == NULL
|| !CheckNode(node
))
415 fCurrentKey
+= direction
;
417 // is the current key in the current node?
418 while ((direction
== BPLUSTREE_FORWARD
&& fCurrentKey
>= node
->all_key_count
)
419 || (direction
== BPLUSTREE_BACKWARD
&& fCurrentKey
< 0))
421 fCurrentNodeOffset
= direction
== BPLUSTREE_FORWARD
? node
->right_link
: node
->left_link
;
423 // are there any more nodes?
424 if (fCurrentNodeOffset
!= BPLUSTREE_NULL
)
426 node
= fCache
.Get(fCurrentNodeOffset
);
427 if (!node
|| !CheckNode(node
))
431 fCurrentKey
= direction
== BPLUSTREE_FORWARD
? 0 : node
->all_key_count
;
435 // there are no nodes left, so turn back to the last key
436 fCurrentNodeOffset
= savedNodeOffset
;
437 fCurrentKey
= direction
== BPLUSTREE_FORWARD
? node
->all_key_count
: -1;
439 return B_ENTRY_NOT_FOUND
;
443 if (node
->all_key_count
== 0)
444 return B_ERROR
; //B_ENTRY_NOT_FOUND;
447 uint8
*keyStart
= node
->KeyAt(fCurrentKey
,&length
);
449 length
= min_c(length
,maxLength
);
450 memcpy(key
,keyStart
,length
);
452 if (fHeader
->data_type
== BPLUSTREE_STRING_TYPE
) // terminate string type
454 if (length
== maxLength
)
456 ((char *)key
)[length
] = '\0';
460 off_t offset
= node
->Values()[fCurrentKey
];
462 // duplicate fragments?
463 uint8 type
= bplustree_node::LinkType(offset
);
464 if (type
== BPLUSTREE_DUPLICATE_FRAGMENT
|| type
== BPLUSTREE_DUPLICATE_NODE
)
466 fDuplicateNode
= offset
;
468 node
= fCache
.Get(bplustree_node::FragmentOffset(fDuplicateNode
));
472 fIsFragment
= type
== BPLUSTREE_DUPLICATE_FRAGMENT
;
474 fNumDuplicates
= node
->CountDuplicates(offset
,fIsFragment
);
477 // give back first duplicate
479 offset
= node
->DuplicateAt(offset
,fIsFragment
,0);
483 // shouldn't happen, but we're dealing here with corrupt disks...
484 fDuplicateNode
= BPLUSTREE_NULL
;
494 int32
BPlusTree::CompareKeys(const void *key1
, int keyLength1
, const void *key2
, int keyLength2
)
496 switch (fHeader
->data_type
)
498 case BPLUSTREE_STRING_TYPE
:
500 int len
= min_c(keyLength1
,keyLength2
);
501 int result
= strncmp((const char *)key1
,(const char *)key2
,len
);
504 result
= keyLength1
- keyLength2
;
509 case BPLUSTREE_INT32_TYPE
:
510 return *(int32
*)key1
- *(int32
*)key2
;
512 case BPLUSTREE_UINT32_TYPE
:
514 if (*(uint32
*)key1
== *(uint32
*)key2
)
516 else if (*(uint32
*)key1
> *(uint32
*)key2
)
522 case BPLUSTREE_INT64_TYPE
:
524 if (*(int64
*)key1
== *(int64
*)key2
)
526 else if (*(int64
*)key1
> *(int64
*)key2
)
532 case BPLUSTREE_UINT64_TYPE
:
534 if (*(uint64
*)key1
== *(uint64
*)key2
)
536 else if (*(uint64
*)key1
> *(uint64
*)key2
)
542 case BPLUSTREE_FLOAT_TYPE
:
544 float result
= *(float *)key1
- *(float *)key2
;
548 return (result
< 0.0f
) ? -1 : 1;
551 case BPLUSTREE_DOUBLE_TYPE
:
553 double result
= *(double *)key1
- *(double *)key2
;
557 return (result
< 0.0) ? -1 : 1;
564 status_t
BPlusTree::FindKey(bplustree_node
*node
,uint8
*key
,uint16 keyLength
,uint16
*index
,off_t
*next
)
566 if (node
->all_key_count
== 0)
571 *next
= node
->overflow_link
;
572 return B_ENTRY_NOT_FOUND
;
575 off_t
*values
= node
->Values();
578 // binary search in the key array
579 for (int16 first
= 0,last
= node
->all_key_count
- 1;first
<= last
;)
581 uint16 i
= (first
+ last
) >> 1;
584 uint8
*searchKey
= node
->KeyAt(i
,&searchLength
);
586 int32 cmp
= CompareKeys(key
,keyLength
,searchKey
,searchLength
);
594 saveIndex
= first
= i
+ 1;
610 if (saveIndex
== node
->all_key_count
)
611 *next
= node
->overflow_link
;
613 *next
= values
[saveIndex
];
615 return B_ENTRY_NOT_FOUND
;
619 status_t
BPlusTree::SeekDown(Stack
<node_and_key
> &stack
,uint8
*key
,uint16 keyLength
)
621 node_and_key nodeAndKey
;
622 nodeAndKey
.nodeOffset
= fHeader
->root_node_pointer
;
623 nodeAndKey
.keyIndex
= 0;
625 bplustree_node
*node
;
626 while ((node
= fCache
.Get(nodeAndKey
.nodeOffset
)) != NULL
&& CheckNode(node
)) {
627 // if we are already on leaf level, we're done
628 if (node
->overflow_link
== BPLUSTREE_NULL
) {
629 // put the node on the stack
630 // node that the keyIndex is not properly set here!
631 nodeAndKey
.keyIndex
= 0;
632 stack
.Push(nodeAndKey
);
637 status_t status
= FindKey(node
,key
,keyLength
,&nodeAndKey
.keyIndex
,&nextOffset
);
639 if (status
== B_ENTRY_NOT_FOUND
&& nextOffset
== nodeAndKey
.nodeOffset
)
642 // put the node & the correct keyIndex on the stack
643 stack
.Push(nodeAndKey
);
645 nodeAndKey
.nodeOffset
= nextOffset
;
651 void BPlusTree::InsertKey(bplustree_node
*node
,uint8
*key
,uint16 keyLength
,off_t value
,uint16 index
)
653 // should never happen, but who knows?
654 if (index
> node
->all_key_count
)
657 off_t
*values
= node
->Values();
658 uint16
*keyLengths
= node
->KeyLengths();
659 uint8
*keys
= node
->Keys();
661 node
->all_key_count
++;
662 node
->all_key_length
+= keyLength
;
664 off_t
*newValues
= node
->Values();
665 uint16
*newKeyLengths
= node
->KeyLengths();
667 // move values and copy new value into them
668 memmove(newValues
+ index
+ 1,values
+ index
,sizeof(off_t
) * (node
->all_key_count
- 1 - index
));
669 memmove(newValues
,values
,sizeof(off_t
) * index
);
671 newValues
[index
] = value
;
673 // move and update key length index
674 for (uint16 i
= node
->all_key_count
;i
-- > index
+ 1;)
675 newKeyLengths
[i
] = keyLengths
[i
- 1] + keyLength
;
676 memmove(newKeyLengths
,keyLengths
,sizeof(uint16
) * index
);
679 newKeyLengths
[index
] = keyLength
+ (keyStart
= index
> 0 ? newKeyLengths
[index
- 1] : 0);
681 // move keys and copy new key into them
682 int32 size
= node
->all_key_length
- newKeyLengths
[index
];
684 memmove(keys
+ newKeyLengths
[index
],keys
+ newKeyLengths
[index
] - keyLength
,size
);
686 memcpy(keys
+ keyStart
,key
,keyLength
);
690 status_t
BPlusTree::InsertDuplicate(bplustree_node */
*node*/
,uint16
/*index*/)
692 printf("DUPLICATE ENTRY!!\n");
694 // /* handle dup keys */
697 // bt_errno(b) = BT_DUPKEY;
702 // /* paste new data ptr in page */
703 // /* and write it out again. */
705 // p = (off_t *)KEYCLD(op->p);
706 // *(p + keyat) = rrn;
707 // op->flags = BT_CHE_DIRTY;
708 // if(bt_wpage(b,op) == BT_ERR ||
709 // bt_wpage(b,kp) == BT_ERR)
712 // /* mark all as well with tree */
721 status_t
BPlusTree::SplitNode(bplustree_node
*node
,off_t nodeOffset
,uint16
*_keyIndex
,uint8
*key
,uint16
*_keyLength
,off_t
*_value
)
723 if (*_keyIndex
> node
->all_key_count
+ 1)
726 //printf("before (insert \"%s\" (%d bytes) at %d):\n\n",key,*_keyLength,*_keyIndex);
727 //dump_bplustree_node(node,fHeader);
728 //hexdump(node,fNodeSize);
731 bplustree_node
*other
= fCache
.Get(otherOffset
= fHeader
->maximum_size
); //Node(otherOffset = fHeader->maximum_size/*,false*/);
735 uint16
*inKeyLengths
= node
->KeyLengths();
736 off_t
*inKeyValues
= node
->Values();
737 uint8
*inKeys
= node
->Keys();
738 uint8
*outKeys
= other
->Keys();
739 uint16 keyIndex
= *_keyIndex
;
741 // how many keys will fit in one (half) page?
743 // "bytes" is the number of bytes written for the new key,
744 // "bytesBefore" are the bytes before that key
745 // "bytesAfter" are the bytes after the new key, if any
746 int32 bytes
= 0,bytesBefore
= 0,bytesAfter
= 0;
748 size_t size
= fNodeSize
>> 1;
750 for (in
= out
= 0;in
< node
->all_key_count
+ 1;) {
752 bytesBefore
= in
> 0 ? inKeyLengths
[in
- 1] : 0;
753 if (in
== keyIndex
&& !bytes
) {
756 if (keyIndex
< out
) {
757 bytesAfter
= inKeyLengths
[in
] - bytesBefore
;
758 // fix the key lengths for the new node
759 inKeyLengths
[in
] = bytesAfter
+ bytesBefore
+ bytes
;
765 if (round_up(sizeof(bplustree_node
) + bytesBefore
+ bytesAfter
+ bytes
) +
766 out
* (sizeof(uint16
) + sizeof(off_t
)) >= size
) {
767 // we have found the number of keys in the new node!
772 // if the new key was not inserted, set the length of the keys
773 // that can be copied directly
774 if (keyIndex
>= out
&& in
> 0)
775 bytesBefore
= inKeyLengths
[in
- 1];
777 if (bytesBefore
< 0 || bytesAfter
< 0)
780 //printf("put %ld keys in the other node (%ld, %ld, %ld) (in = %ld)\n",out,bytesBefore,bytes,bytesAfter,in);
782 other
->left_link
= node
->left_link
;
783 other
->right_link
= nodeOffset
;
784 other
->all_key_length
= bytes
+ bytesBefore
+ bytesAfter
;
785 other
->all_key_count
= out
;
786 //printf("-> used = %ld (bplustree_node = %ld bytes)\n",other->Used(),sizeof(bplustree_node));
788 uint16
*outKeyLengths
= other
->KeyLengths();
789 off_t
*outKeyValues
= other
->Values();
790 int32 keys
= out
> keyIndex
? keyIndex
: out
;
794 memcpy(outKeys
,inKeys
,bytesBefore
);
795 memcpy(outKeyLengths
,inKeyLengths
,keys
* sizeof(uint16
));
796 memcpy(outKeyValues
,inKeyValues
,keys
* sizeof(off_t
));
799 // copy the newly inserted key
800 memcpy(outKeys
+ bytesBefore
,key
,bytes
);
801 outKeyLengths
[keyIndex
] = bytes
+ bytesBefore
;
802 outKeyValues
[keyIndex
] = *_value
;
805 // copy the keys after the new key
806 memcpy(outKeys
+ bytesBefore
+ bytes
,inKeys
+ bytesBefore
,bytesAfter
);
807 keys
= out
- keyIndex
- 1;
808 memcpy(outKeyLengths
+ keyIndex
+ 1,inKeyLengths
+ keyIndex
,keys
* sizeof(uint16
));
809 memcpy(outKeyValues
+ keyIndex
+ 1,inKeyValues
+ keyIndex
,keys
* sizeof(off_t
));
813 // if the new key was already inserted, we shouldn't use it again
817 int32 total
= bytesBefore
+ bytesAfter
;
819 // if we have split an index node, we have to drop the first key
820 // of the next node (which can also be the new key to insert)
821 if (node
->overflow_link
!= BPLUSTREE_NULL
) {
822 if (in
== keyIndex
) {
823 other
->overflow_link
= *_value
;
826 other
->overflow_link
= inKeyValues
[in
];
827 total
= inKeyLengths
[in
++];
831 // and now the same game for the other page and the rest of the keys
832 // (but with memmove() instead of memcpy(), because they may overlap)
834 bytesBefore
= bytesAfter
= bytes
= 0;
837 while (in
< node
->all_key_count
+ 1) {
838 //printf("in = %ld, keyIndex = %d, bytes = %ld\n",in,keyIndex,bytes);
839 if (in
== keyIndex
&& !bytes
) {
840 bytesBefore
= in
> skip
? inKeyLengths
[in
- 1] : 0;
841 //printf("bytesBefore = %ld\n",bytesBefore);
843 } else if (in
< node
->all_key_count
) {
844 //printf("1.inKeyLength[%ld] = %u\n",in,inKeyLengths[in]);
845 inKeyLengths
[in
] -= total
;
847 inKeyLengths
[in
] += bytes
;
848 bytesAfter
= inKeyLengths
[in
] - bytesBefore
- bytes
;
849 //printf("2.inKeyLength[%ld] = %u, bytesAfter = %ld, bytesBefore = %ld\n",in,inKeyLengths[in],bytesAfter,bytesBefore);
858 //printf("-> out = %ld, keylen = %ld, %ld bytes total\n",out,bytesBefore,round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes) +
859 // out * (sizeof(uint16) + sizeof(off_t)));
860 if (in
> node
->all_key_count
&& keyIndex
< in
)
864 //printf("bytes = (%ld, %ld, %ld), in = %ld, total = %ld\n",bytesBefore,bytes,bytesAfter,in,total);
866 if (keyIndex
>= in
&& keyIndex
- skip
< out
) {
867 bytesAfter
= inKeyLengths
[in
] - bytesBefore
- total
;
868 } else if (keyIndex
< skip
)
869 bytesBefore
= node
->all_key_length
- total
;
871 //printf("bytes = (%ld, %ld, %ld), in = %ld, total = %ld\n",bytesBefore,bytes,bytesAfter,in,total);
873 if (bytesBefore
< 0 || bytesAfter
< 0)
876 node
->left_link
= otherOffset
;
877 // right link, and overflow link can stay the same
878 node
->all_key_length
= bytes
+ bytesBefore
+ bytesAfter
;
879 node
->all_key_count
= out
- 1;
881 // array positions have changed
882 outKeyLengths
= node
->KeyLengths();
883 outKeyValues
= node
->Values();
885 //printf("put %ld keys in the first node (%ld, %ld, %ld) (in = %ld)\n",out,bytesBefore,bytes,bytesAfter,in);
887 // move the keys in the old node: the order is important here,
888 // because we don't want to overwrite any contents
890 keys
= keyIndex
<= skip
? out
: keyIndex
- skip
;
894 memmove(inKeys
,inKeys
+ total
,bytesBefore
);
896 memmove(inKeys
+ bytesBefore
+ bytes
,inKeys
+ total
+ bytesBefore
,bytesAfter
);
899 memmove(outKeyLengths
,inKeyLengths
+ skip
,keys
* sizeof(uint16
));
900 in
= out
- keyIndex
- 1;
902 memmove(outKeyLengths
+ keyIndex
+ 1,inKeyLengths
+ skip
+ keyIndex
,in
* sizeof(uint16
));
905 memmove(outKeyValues
,inKeyValues
+ skip
,keys
* sizeof(off_t
));
907 memmove(outKeyValues
+ keyIndex
+ 1,inKeyValues
+ skip
+ keyIndex
,in
* sizeof(off_t
));
910 // finally, copy the newly inserted key (don't overwrite anything)
911 memcpy(inKeys
+ bytesBefore
,key
,bytes
);
912 outKeyLengths
[keyIndex
] = bytes
+ bytesBefore
;
913 outKeyValues
[keyIndex
] = *_value
;
916 //puts("\n!!!!!!!!!!!!!!! after: !!!!!!!!!!!!!!!\n");
917 //dump_bplustree_node(other,fHeader);
918 //hexdump(other,fNodeSize);
920 //dump_bplustree_node(node,fHeader);
921 //hexdump(node,fNodeSize);
923 // write the updated nodes back
925 fCache
.SetDirty(otherOffset
,true);
926 fCache
.SetDirty(nodeOffset
,true);
928 // update the right link of the node in the left of the new node
929 if (other
->left_link
!= BPLUSTREE_NULL
) {
930 bplustree_node
*left
= fCache
.Get(other
->left_link
);
932 left
->right_link
= otherOffset
;
933 fCache
.SetDirty(other
->left_link
,true);
937 // prepare key to insert in the parent node
939 //printf("skip: %ld, count-1: %u\n",skip,other->all_key_count - 1);
941 uint8
*lastKey
= other
->KeyAt(other
->all_key_count
- 1,&length
);
942 memcpy(key
,lastKey
,length
);
943 *_keyLength
= length
;
944 *_value
= otherOffset
;
950 status_t
BPlusTree::Insert(uint8
*key
,uint16 keyLength
,off_t value
)
952 if (keyLength
< BPLUSTREE_MIN_KEY_LENGTH
|| keyLength
> BPLUSTREE_MAX_KEY_LENGTH
)
955 Stack
<node_and_key
> stack
;
956 if (SeekDown(stack
,key
,keyLength
) != B_OK
)
959 uint8 keyBuffer
[BPLUSTREE_MAX_KEY_LENGTH
+ 1];
961 memcpy(keyBuffer
,key
,keyLength
);
962 keyBuffer
[keyLength
] = 0;
964 off_t valueToInsert
= value
;
966 fCurrentNodeOffset
= BPLUSTREE_NULL
;
968 node_and_key nodeAndKey
;
969 bplustree_node
*node
;
972 while (stack
.Pop(&nodeAndKey
) && (node
= fCache
.Get(nodeAndKey
.nodeOffset
)) != NULL
&& CheckNode(node
))
974 if (count
++ == 0) // first round, check for duplicate entries
976 status_t status
= FindKey(node
,key
,keyLength
,&nodeAndKey
.keyIndex
);
977 if (status
== B_ERROR
)
980 // is this a duplicate entry?
981 if (status
== B_OK
&& node
->overflow_link
== BPLUSTREE_NULL
)
983 if (fAllowDuplicates
)
984 return InsertDuplicate(node
,nodeAndKey
.keyIndex
);
986 return B_NAME_IN_USE
;
990 // is the node big enough to hold the pair?
991 if (int32(round_up(sizeof(bplustree_node
) + node
->all_key_length
+ keyLength
)
992 + (node
->all_key_count
+ 1) * (sizeof(uint16
) + sizeof(off_t
))) < fNodeSize
)
994 InsertKey(node
,keyBuffer
,keyLength
,valueToInsert
,nodeAndKey
.keyIndex
);
995 fCache
.SetDirty(nodeAndKey
.nodeOffset
,true);
1001 // do we need to allocate a new root node? if so, then do
1003 bplustree_node
*rootNode
= NULL
;
1004 off_t newRoot
= BPLUSTREE_NULL
;
1005 if (nodeAndKey
.nodeOffset
== fHeader
->root_node_pointer
) {
1006 rootNode
= fCache
.Get(newRoot
= fHeader
->maximum_size
);
1007 if (rootNode
== NULL
) {
1008 // the tree is most likely dead
1013 if (SplitNode(node
,nodeAndKey
.nodeOffset
,&nodeAndKey
.keyIndex
,keyBuffer
,&keyLength
,&valueToInsert
) < B_OK
) {
1014 if (newRoot
!= BPLUSTREE_NULL
) {
1020 if (newRoot
!= BPLUSTREE_NULL
) {
1021 rootNode
->overflow_link
= nodeAndKey
.nodeOffset
;
1023 InsertKey(rootNode
,keyBuffer
,keyLength
,node
->left_link
,0);
1025 fHeader
->root_node_pointer
= newRoot
;
1026 fHeader
->max_number_of_levels
++;
1029 fCache
.SetDirty(newRoot
,true);
1040 status_t
BPlusTree::Find(uint8
*key
,uint16 keyLength
,off_t
*value
)
1042 if (keyLength
< BPLUSTREE_MIN_KEY_LENGTH
|| keyLength
> BPLUSTREE_MAX_KEY_LENGTH
)
1045 Stack
<node_and_key
> stack
;
1046 if (SeekDown(stack
,key
,keyLength
) != B_OK
)
1049 fCurrentNodeOffset
= BPLUSTREE_NULL
;
1051 node_and_key nodeAndKey
;
1052 bplustree_node
*node
;
1054 if (stack
.Pop(&nodeAndKey
) && (node
= fCache
.Get(nodeAndKey
.nodeOffset
)) != NULL
&& CheckNode(node
))
1056 status_t status
= FindKey(node
,key
,keyLength
,&nodeAndKey
.keyIndex
);
1057 if (status
== B_ERROR
)
1060 SetCurrentNode(node
,nodeAndKey
.nodeOffset
);
1062 if (status
== B_OK
&& node
->overflow_link
== BPLUSTREE_NULL
)
1064 *value
= node
->Values()[nodeAndKey
.keyIndex
];
1068 return B_ENTRY_NOT_FOUND
;
1076 BPlusTree::CheckNode(bplustree_node
*node
)
1078 if (!fHeader
->IsValidLink(node
->left_link
)
1079 || !fHeader
->IsValidLink(node
->right_link
)
1080 || !fHeader
->IsValidLink(node
->overflow_link
)
1081 || (int8
*)node
->Values() + node
->all_key_count
* sizeof(off_t
) > (int8
*)node
+ fNodeSize
)
1088 bplustree_node
*BPlusTree::Node(off_t nodeOffset
,bool check
)
1090 /*printf("1: %d,%d,%d\n",
1091 nodeOffset > fHeader->maximum_size - fNodeSize,
1092 nodeOffset < 0 && nodeOffset != BPLUSTREE_NULL,
1093 (nodeOffset % fNodeSize) != 0);
1095 // the super node is always in memory, and shouldn't
1096 // never be taken out of the cache
1097 if (nodeOffset
> fHeader
->maximum_size
/*- fNodeSize*/
1098 || nodeOffset
<= 0 && nodeOffset
!= BPLUSTREE_NULL
1099 || (nodeOffset
% fNodeSize
) != 0)
1102 bplustree_node
*node
= (bplustree_node
*)malloc(fNodeSize
);
1106 if (nodeOffset
== BPLUSTREE_NULL
|| !fStream
)
1108 node
->left_link
= BPLUSTREE_NULL
;
1109 node
->right_link
= BPLUSTREE_NULL
;
1110 node
->overflow_link
= BPLUSTREE_NULL
;
1111 node
->all_key_count
= 0;
1112 node
->all_key_length
= 0;
1114 else if (fStream
&& fStream
->ReadAt(nodeOffset
,node
,fNodeSize
) < fNodeSize
)
1120 if (check
&& node
!= NULL
)
1122 // sanity checks (links, all_key_count)
1123 if (!fHeader
->IsValidLink(node
->left_link
)
1124 || !fHeader
->IsValidLink(node
->right_link
)
1125 || !fHeader
->IsValidLink(node
->overflow_link
)
1126 || (int8
*)node
->Values() + node
->all_key_count
* sizeof(off_t
) > (int8
*)node
+ fNodeSize
)
1130 if (!fStream
&& nodeOffset
> fHeader
->maximum_size
- fNodeSize
) {
1131 // do some hacks here
1132 fHeader
->maximum_size
+= fNodeSize
;
1139 void BPlusTree::SetHoldChanges(bool hold
)
1141 fCache
.SetHoldChanges(hold
);
1148 uint8
*bplustree_node::KeyAt(int32 index
,uint16
*keyLength
) const
1150 if (index
< 0 || index
> all_key_count
)
1153 uint8
*keyStart
= Keys();
1154 uint16
*keyLengths
= KeyLengths();
1156 *keyLength
= keyLengths
[index
] - (index
!= 0 ? keyLengths
[index
- 1] : 0);
1158 keyStart
+= keyLengths
[index
- 1];
1164 uint8
bplustree_node::CountDuplicates(off_t offset
,bool isFragment
) const
1166 // the duplicate fragment handling is currently hard-coded to a node size
1167 // of 1024 bytes - with future versions of BFS, this may be a problem
1171 uint32 fragment
= 8 * ((uint64
)offset
& 0x3ff);
1173 return ((off_t
*)this)[fragment
];
1175 return overflow_link
;
1179 off_t
bplustree_node::DuplicateAt(off_t offset
,bool isFragment
,int8 index
) const
1183 start
= 8 * ((uint64
)offset
& 0x3ff);
1187 return ((off_t
*)this)[start
+ 1 + index
];