2 * Copyright (c) 1990 The Regents of the University of California.
5 * This code is derived from software contributed to Berkeley by
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 * must display the following acknowledgement:
18 * This product includes software developed by the University of
19 * California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 #if defined(LIBC_SCCS) && !defined(lint)
38 static char sccsid
[] = "@(#)split.c 5.2 (Berkeley) 2/22/91";
39 #endif /* LIBC_SCCS and not lint */
41 #include <sys/types.h>
48 * _BT_SPLIT -- Split a page into two pages.
50 * Splits are caused by insertions, and propogate up the tree in
51 * the usual way. The root page is always page 1 in the file on
52 * disk, so root splits are handled specially. On entry to this
53 * routine, t->bt_curpage is the page to be split.
56 * t -- btree in which to do split.
59 * RET_SUCCESS, RET_ERROR.
62 * Changes the notion of the current page.
70 BTHEADER
*left
, *right
;
71 pgno_t nextpgno
, parent
;
80 h
= (BTHEADER
*) t
->bt_curpage
;
82 /* split root page specially, since it must remain page 1 */
83 if (h
->h_pgno
== P_ROOT
) {
84 return (_bt_splitroot(t
));
88 * This is a little complicated. We go to some trouble to
89 * figure out which of the three possible cases -- in-memory tree,
90 * disk tree (no cache), and disk tree (cache) -- we have, in order
91 * to avoid unnecessary copying. If we have a disk cache, then we
92 * have to do some extra copying, though, since the cache code
93 * manages buffers externally to this code.
96 if (ISDISK(t
) && ISCACHE(t
)) {
97 if ((left
= (BTHEADER
*) malloc((unsigned) t
->bt_psize
))
100 left
->h_pgno
= left
->h_prevpg
= left
->h_nextpg
= P_NONE
;
101 left
->h_flags
= t
->bt_curpage
->h_flags
;
102 left
->h_lower
= (index_t
)
103 (((char *) &(left
->h_linp
[0])) - ((char *) left
));
104 left
->h_upper
= t
->bt_psize
;
107 if ((left
= _bt_allocpg(t
)) == (BTHEADER
*) NULL
)
110 left
->h_pgno
= h
->h_pgno
;
112 if ((right
= _bt_allocpg(t
)) == (BTHEADER
*) NULL
)
114 right
->h_pgno
= ++(t
->bt_npages
);
116 /* now do the split */
117 if (_bt_dopsplit(t
, left
, right
) == RET_ERROR
)
120 right
->h_prevpg
= left
->h_pgno
;
121 nextpgno
= right
->h_nextpg
= h
->h_nextpg
;
122 left
->h_nextpg
= right
->h_pgno
;
123 left
->h_prevpg
= h
->h_prevpg
;
125 /* okay, now use the left half of the page as the new page */
126 if (ISDISK(t
) && ISCACHE(t
)) {
127 (void) bcopy((char *) left
, (char *) t
->bt_curpage
,
129 (void) free ((char *) left
);
130 left
= t
->bt_curpage
;
132 (void) free((char *) t
->bt_curpage
);
133 t
->bt_curpage
= left
;
137 * Write the new pages out. We need them to stay where they are
138 * until we're done updating the parent pages.
141 if (_bt_write(t
, left
, NORELEASE
) == RET_ERROR
)
143 if (_bt_write(t
, right
, NORELEASE
) == RET_ERROR
)
146 /* update 'prev' pointer of old neighbor of left */
147 if (nextpgno
!= P_NONE
) {
148 if (_bt_getpage(t
, nextpgno
) == RET_ERROR
)
151 h
->h_prevpg
= right
->h_pgno
;
152 h
->h_flags
|= F_DIRTY
;
155 if ((parent
= _bt_pop(t
)) != P_NONE
) {
156 if (right
->h_flags
& F_LEAF
) {
157 d
= (DATUM
*) GETDATUM(right
, 0);
159 if (d
->d_flags
& D_BIGKEY
) {
160 bcopy(&(d
->d_bytes
[0]),
163 if (_bt_markchain(t
, oldchain
) == RET_ERROR
)
165 src
= (char *) &oldchain
;
168 src
= (char *) &(d
->d_bytes
[0]);
172 id
= (IDATUM
*) GETDATUM(right
, 0);
175 src
= (char *) &(id
->i_bytes
[0]);
177 nbytes
= len
+ (sizeof(IDATUM
) - sizeof(char));
178 new = (IDATUM
*) malloc((unsigned) nbytes
);
179 if (new == (IDATUM
*) NULL
)
182 new->i_pgno
= right
->h_pgno
;
183 new->i_flags
= flags
;
185 (void) bcopy(src
, (char *) &(new->i_bytes
[0]), len
);
187 nbytes
= LONGALIGN(nbytes
) + sizeof(index_t
);
188 if (_bt_getpage(t
, parent
) == RET_ERROR
)
194 * Split the parent if we need to, then reposition the
195 * tree's current page pointer for the new datum.
197 if ((h
->h_upper
- h
->h_lower
) < nbytes
) {
198 if (_bt_split(t
) == RET_ERROR
)
200 if (_bt_reposition(t
, new, parent
, right
->h_prevpg
)
205 /* okay, now insert the new idatum */
206 if (_bt_inserti(t
, new, right
->h_prevpg
) == RET_ERROR
)
211 * Okay, split is done; don't need right page stapled down anymore.
212 * The page we call 'left' above is the new version of the old
213 * (split) page, so we can't release it.
216 if (_bt_release(t
, right
) == RET_ERROR
)
218 if (ISDISK(t
) && !ISCACHE(t
))
219 (void) free((char *) right
);
221 return (RET_SUCCESS
);
225 * _BT_REPOSITION -- Reposition the current page pointer of a btree.
227 * After splitting a node in the tree in order to make room for
228 * an insertion, we need to figure out which page after the split
229 * should get the item we want to insert. This routine positions
230 * the tree's current page pointer appropriately.
233 * t -- tree to position
234 * new -- the item we want to insert
235 * parent -- parent of the node that we just split
236 * prev -- page number of item directly to the left of
237 * new's position in the tree.
240 * RET_SUCCESS, RET_ERROR.
247 _bt_reposition(t
, new, parent
, prev
)
256 if (parent
== P_ROOT
) {
259 * If we just split the root page, then there are guaranteed
260 * to be exactly two IDATUMs on it. Look at both of them
261 * to see if they point to the page that we want.
264 if (_bt_getpage(t
, parent
) == RET_ERROR
)
267 next
= NEXTINDEX(t
->bt_curpage
);
268 for (i
= 0; i
< next
; i
++) {
269 idx
= (IDATUM
*) GETDATUM(t
->bt_curpage
, i
);
270 if (_bt_getpage(t
, idx
->i_pgno
) == RET_ERROR
)
272 if (_bt_isonpage(t
, new, prev
) == RET_SUCCESS
)
273 return (RET_SUCCESS
);
274 if (_bt_getpage(t
, parent
) == RET_ERROR
)
280 * Get the parent page -- which is where the new item would
281 * have gone -- and figure out whether the new item now goes
282 * on the parent, or the page immediately to the right of
286 if (_bt_getpage(t
, parent
) == RET_ERROR
)
288 if (_bt_isonpage(t
, new, prev
) == RET_SUCCESS
)
289 return (RET_SUCCESS
);
290 if (_bt_getpage(t
, t
->bt_curpage
->h_nextpg
) == RET_ERROR
)
292 if (_bt_isonpage(t
, new, prev
) == RET_SUCCESS
)
293 return (RET_SUCCESS
);
299 * _BT_ISONPAGE -- Is the IDATUM for a given page number on the current page?
301 * This routine is used by _bt_reposition to decide whether the current
302 * page is the correct one on which to insert a new item.
306 * new -- the item that will be inserted (used for binary search)
307 * prev -- page number of page whose IDATUM is immediately to
308 * the left of new's position in the tree.
311 * RET_SUCCESS, RET_ERROR (corresponding to TRUE, FALSE).
315 _bt_isonpage(t
, new, prev
)
320 BTHEADER
*h
= (BTHEADER
*) t
->bt_curpage
;
324 i
= _bt_binsrch(t
, &(new->i_bytes
[0]));
325 while (i
!= 0 && _bt_cmp(t
, &(new->i_bytes
[0]), i
) == 0)
328 idx
= (IDATUM
*) GETDATUM(h
, i
);
329 while (i
< next
&& idx
->i_pgno
!= prev
) {
331 idx
= (IDATUM
*) GETDATUM(h
, i
);
333 if (idx
->i_pgno
== prev
)
334 return (RET_SUCCESS
);
340 * _BT_SPLITROOT -- Split the root of a btree.
342 * The root page for a btree is always page one. This means that in
343 * order to split the root, we need to do extra work.
349 * RET_SUCCESS, RET_ERROR.
352 * Splits root upward in the usual way, adding two new pages
353 * to the tree (rather than just one, as in usual splits).
360 BTHEADER
*h
= t
->bt_curpage
;
361 BTHEADER
*left
, *right
;
370 /* get two new pages for the split */
371 if ((left
= _bt_allocpg(t
)) == (BTHEADER
*) NULL
)
373 left
->h_pgno
= ++(t
->bt_npages
);
374 if ((right
= _bt_allocpg(t
)) == (BTHEADER
*) NULL
)
376 right
->h_pgno
= ++(t
->bt_npages
);
379 if (_bt_dopsplit(t
, left
, right
) == RET_ERROR
)
382 /* connect the new pages correctly */
383 right
->h_prevpg
= left
->h_pgno
;
384 left
->h_nextpg
= right
->h_pgno
;
387 * Write the child pages out now. We need them to remain
388 * where they are until we finish updating parent pages,
392 if (_bt_write(t
, left
, NORELEASE
) == RET_ERROR
)
394 if (_bt_write(t
, right
, NORELEASE
) == RET_ERROR
)
397 /* now change the root page into an internal page */
398 was_leaf
= (h
->h_flags
& F_LEAF
);
399 h
->h_flags
&= ~F_LEAF
;
400 h
->h_lower
= (index_t
) (((char *) (&(h
->h_linp
[0]))) - ((char *) h
));
401 h
->h_upper
= (index_t
) t
->bt_psize
;
402 (void) bzero((char *) &(h
->h_linp
[0]), (int) (h
->h_upper
- h
->h_lower
));
404 /* put two new keys on root page */
411 data
= (DATUM
*) GETDATUM(where_h
, 0);
413 if (where_h
== left
) {
414 len
= 0; /* first key in tree is null */
416 if (data
->d_flags
& D_BIGKEY
) {
417 bcopy(&(data
->d_bytes
[0]),
420 if (_bt_markchain(t
, oldchain
) == RET_ERROR
)
422 src
= (char *) &oldchain
;
425 src
= (char *) &(data
->d_bytes
[0]);
431 idata
= (IDATUM
*) GETDATUM(where_h
, 0);
433 flags
= idata
->i_flags
;
434 src
= &(idata
->i_bytes
[0]);
436 dest
= ((char *) h
) + h
->h_upper
;
437 nbytes
= len
+ (sizeof (IDATUM
) - sizeof(char));
438 dest
-= LONGALIGN(nbytes
);
439 id
= (IDATUM
*) dest
;
441 id
->i_pgno
= where_h
->h_pgno
;
444 (void) bcopy((char *) src
, (char *) &(id
->i_bytes
[0]), len
);
446 h
->h_linp
[NEXTINDEX(h
)] = (index_t
) dest
;
447 h
->h_upper
= (index_t
) dest
;
448 h
->h_lower
+= sizeof(index_t
);
454 where_h
= (BTHEADER
*) NULL
;
457 if (_bt_release(t
, left
) == RET_ERROR
)
459 if (_bt_release(t
, right
) == RET_ERROR
)
463 * That's it, split is done. If we're doing a non-cached disk
464 * btree, we can free up the pages we allocated, as they're on
468 if (ISDISK(t
) && !ISCACHE(t
)) {
469 (void) free ((char *) left
);
470 (void) free ((char *) right
);
473 h
->h_flags
|= F_DIRTY
;
475 return (RET_SUCCESS
);
479 * _BT_DOPSPLIT -- Do the work of splitting a page
481 * This routine takes two page pointers and splits the data on the
482 * current page of the btree between them.
484 * We do a lot of work here to handle duplicate keys on a page; we
485 * have to place these keys carefully to guarantee that later searches
486 * will find them correctly. See comments in the code below for details.
490 * left -- pointer to page to get left half of the data
491 * right -- pointer to page to get right half of the data
498 _bt_dopsplit(t
, left
, right
)
503 BTHEADER
*h
= t
->bt_curpage
;
508 int nbytes
, dsize
, fixedsize
, freespc
;
510 index_t save_lower
, save_upper
, save_i
;
521 * Our strategy is to put half the bytes on each page. We figure
522 * out how many bytes we have total, and then move items until
523 * the last item moved put at least 50% of the data on the left
526 save_key
= (char *) NULL
;
527 psize
= (int) t
->bt_psize
;
528 where
= ((char *) left
) + psize
;
531 nbytes
= psize
- (int) ((char *) &(h
->h_linp
[0]) - ((char *) h
));
535 if (h
->h_flags
& F_LEAF
)
536 fixedsize
= (sizeof(DATUM
) - sizeof(char));
538 fixedsize
= (sizeof(IDATUM
) - sizeof(char));
540 save_key
= (char *) NULL
;
543 for (i
= 0; i
< top
; i
++) {
546 * Internal and leaf pages have different layouts for
547 * data items, but in both cases the first entry in the
548 * data item is a size_t.
550 d
= (DATUM
*) GETDATUM(h
,i
);
551 if (h
->h_flags
& F_LEAF
) {
552 dsize
= d
->d_ksize
+ d
->d_dsize
+ fixedsize
;
554 dsize
= d
->d_ksize
+ fixedsize
;
558 * If a page contains duplicate keys, we have to be
559 * careful about splits. A sequence of duplicate keys
560 * may not begin in the middle of one page and end in
561 * the middle of another; it must begin on a page boundary,
562 * in order for searches on the internal nodes to work
565 if (where_h
== left
) {
566 if (save_key
== (char *) NULL
) {
567 if (h
->h_flags
& F_LEAF
) {
568 if (d
->d_flags
& D_BIGKEY
) {
570 bcopy(&(d
->d_bytes
[0]),
573 if (_bt_getbig(t
, chain
,
580 save_key
= (char *) &(d
->d_bytes
[0]);
583 IDATUM
*id
= (IDATUM
*) d
;
585 if (id
->i_flags
& D_BIGKEY
) {
587 bcopy(&(id
->i_bytes
[0]),
590 if (_bt_getbig(t
, chain
,
602 save_lower
= where_h
->h_lower
;
603 save_upper
= where_h
->h_upper
;
605 if (_bt_cmp(t
, save_key
, i
) != 0) {
606 save_lower
= where_h
->h_lower
;
607 save_upper
= where_h
->h_upper
;
610 if (h
->h_flags
& F_LEAF
) {
612 (void) free(save_key
);
613 if (d
->d_flags
& D_BIGKEY
) {
615 bcopy(&(d
->d_bytes
[0]),
618 if (_bt_getbig(t
, chain
,
625 save_key
= (char *) &(d
->d_bytes
[0]);
628 IDATUM
*id
= (IDATUM
*) d
;
630 if (id
->i_flags
& D_BIGKEY
) {
632 bcopy(&(id
->i_bytes
[0]),
635 if (_bt_getbig(t
, chain
,
649 /* copy data and update page state */
650 where
-= LONGALIGN(dsize
);
651 (void) bcopy((char *) d
, (char *) where
, dsize
);
652 where_h
->h_upper
= where_h
->h_linp
[where_i
] =
653 (index_t
) (where
- (int) where_h
);
654 where_h
->h_lower
+= sizeof(index_t
);
657 /* if we've moved half, switch to the right-hand page */
658 nbytes
-= LONGALIGN(dsize
) + sizeof(index_t
);
659 if ((freespc
- nbytes
) > nbytes
) {
660 nbytes
= 2 * freespc
;
662 /* identical keys go on the same page */
664 /* i gets incremented at loop bottom... */
666 where_h
->h_lower
= save_lower
;
667 where_h
->h_upper
= save_upper
;
669 where
= ((char *) right
) + psize
;
677 * If there was an active scan on the database, and we just
678 * split the page that the cursor was on, we may need to
679 * adjust the cursor to point to the same entry as before the
684 if ((t
->bt_flags
& BTF_SEQINIT
)
685 && (c
->c_pgno
== h
->h_pgno
)
686 && (c
->c_index
>= switch_i
)) {
687 c
->c_pgno
= where_h
->h_pgno
;
688 c
->c_index
-= where_i
;
690 return (RET_SUCCESS
);