Sync usage with man page.
[netbsd-mini2440.git] / lib / libc / db / btree / split.c
blob3dd34868e0e586a17d035fa293f284ea1949226a
1 /*-
2 * Copyright (c) 1990 The Regents of the University of California.
3 * All rights reserved.
5 * This code is derived from software contributed to Berkeley by
6 * Mike Olson.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 * must display the following acknowledgement:
18 * This product includes software developed by the University of
19 * California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
37 #if defined(LIBC_SCCS) && !defined(lint)
38 static char sccsid[] = "@(#)split.c 5.2 (Berkeley) 2/22/91";
39 #endif /* LIBC_SCCS and not lint */
41 #include <sys/types.h>
42 #include <db.h>
43 #include <stdlib.h>
44 #include <string.h>
45 #include "btree.h"
48 * _BT_SPLIT -- Split a page into two pages.
50 * Splits are caused by insertions, and propogate up the tree in
51 * the usual way. The root page is always page 1 in the file on
52 * disk, so root splits are handled specially. On entry to this
53 * routine, t->bt_curpage is the page to be split.
55 * Parameters:
56 * t -- btree in which to do split.
58 * Returns:
59 * RET_SUCCESS, RET_ERROR.
61 * Side Effects:
62 * Changes the notion of the current page.
65 int
66 _bt_split(t)
67 BTREE_P t;
69 BTHEADER *h;
70 BTHEADER *left, *right;
71 pgno_t nextpgno, parent;
72 int nbytes, len;
73 IDATUM *id;
74 DATUM *d;
75 char *src;
76 IDATUM *new;
77 pgno_t oldchain;
78 u_char flags;
80 h = (BTHEADER *) t->bt_curpage;
82 /* split root page specially, since it must remain page 1 */
83 if (h->h_pgno == P_ROOT) {
84 return (_bt_splitroot(t));
88 * This is a little complicated. We go to some trouble to
89 * figure out which of the three possible cases -- in-memory tree,
90 * disk tree (no cache), and disk tree (cache) -- we have, in order
91 * to avoid unnecessary copying. If we have a disk cache, then we
92 * have to do some extra copying, though, since the cache code
93 * manages buffers externally to this code.
96 if (ISDISK(t) && ISCACHE(t)) {
97 if ((left = (BTHEADER *) malloc((unsigned) t->bt_psize))
98 == (BTHEADER *) NULL)
99 return (RET_ERROR);
100 left->h_pgno = left->h_prevpg = left->h_nextpg = P_NONE;
101 left->h_flags = t->bt_curpage->h_flags;
102 left->h_lower = (index_t)
103 (((char *) &(left->h_linp[0])) - ((char *) left));
104 left->h_upper = t->bt_psize;
106 } else {
107 if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL)
108 return (RET_ERROR);
110 left->h_pgno = h->h_pgno;
112 if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL)
113 return (RET_ERROR);
114 right->h_pgno = ++(t->bt_npages);
116 /* now do the split */
117 if (_bt_dopsplit(t, left, right) == RET_ERROR)
118 return (RET_ERROR);
120 right->h_prevpg = left->h_pgno;
121 nextpgno = right->h_nextpg = h->h_nextpg;
122 left->h_nextpg = right->h_pgno;
123 left->h_prevpg = h->h_prevpg;
125 /* okay, now use the left half of the page as the new page */
126 if (ISDISK(t) && ISCACHE(t)) {
127 (void) bcopy((char *) left, (char *) t->bt_curpage,
128 (int) t->bt_psize);
129 (void) free ((char *) left);
130 left = t->bt_curpage;
131 } else {
132 (void) free((char *) t->bt_curpage);
133 t->bt_curpage = left;
137 * Write the new pages out. We need them to stay where they are
138 * until we're done updating the parent pages.
141 if (_bt_write(t, left, NORELEASE) == RET_ERROR)
142 return (RET_ERROR);
143 if (_bt_write(t, right, NORELEASE) == RET_ERROR)
144 return (RET_ERROR);
146 /* update 'prev' pointer of old neighbor of left */
147 if (nextpgno != P_NONE) {
148 if (_bt_getpage(t, nextpgno) == RET_ERROR)
149 return (RET_ERROR);
150 h = t->bt_curpage;
151 h->h_prevpg = right->h_pgno;
152 h->h_flags |= F_DIRTY;
155 if ((parent = _bt_pop(t)) != P_NONE) {
156 if (right->h_flags & F_LEAF) {
157 d = (DATUM *) GETDATUM(right, 0);
158 len = d->d_ksize;
159 if (d->d_flags & D_BIGKEY) {
160 bcopy(&(d->d_bytes[0]),
161 (char *) &oldchain,
162 sizeof(oldchain));
163 if (_bt_markchain(t, oldchain) == RET_ERROR)
164 return (RET_ERROR);
165 src = (char *) &oldchain;
166 flags = D_BIGKEY;
167 } else {
168 src = (char *) &(d->d_bytes[0]);
169 flags = 0;
171 } else {
172 id = (IDATUM *) GETDATUM(right, 0);
173 len = id->i_size;
174 flags = id->i_flags;
175 src = (char *) &(id->i_bytes[0]);
177 nbytes = len + (sizeof(IDATUM) - sizeof(char));
178 new = (IDATUM *) malloc((unsigned) nbytes);
179 if (new == (IDATUM *) NULL)
180 return (RET_ERROR);
181 new->i_size = len;
182 new->i_pgno = right->h_pgno;
183 new->i_flags = flags;
184 if (len > 0)
185 (void) bcopy(src, (char *) &(new->i_bytes[0]), len);
187 nbytes = LONGALIGN(nbytes) + sizeof(index_t);
188 if (_bt_getpage(t, parent) == RET_ERROR)
189 return (RET_ERROR);
191 h = t->bt_curpage;
194 * Split the parent if we need to, then reposition the
195 * tree's current page pointer for the new datum.
197 if ((h->h_upper - h->h_lower) < nbytes) {
198 if (_bt_split(t) == RET_ERROR)
199 return (RET_ERROR);
200 if (_bt_reposition(t, new, parent, right->h_prevpg)
201 == RET_ERROR)
202 return (RET_ERROR);
205 /* okay, now insert the new idatum */
206 if (_bt_inserti(t, new, right->h_prevpg) == RET_ERROR)
207 return (RET_ERROR);
211 * Okay, split is done; don't need right page stapled down anymore.
212 * The page we call 'left' above is the new version of the old
213 * (split) page, so we can't release it.
216 if (_bt_release(t, right) == RET_ERROR)
217 return (RET_ERROR);
218 if (ISDISK(t) && !ISCACHE(t))
219 (void) free((char *) right);
221 return (RET_SUCCESS);
225 * _BT_REPOSITION -- Reposition the current page pointer of a btree.
227 * After splitting a node in the tree in order to make room for
228 * an insertion, we need to figure out which page after the split
229 * should get the item we want to insert. This routine positions
230 * the tree's current page pointer appropriately.
232 * Parameters:
233 * t -- tree to position
234 * new -- the item we want to insert
235 * parent -- parent of the node that we just split
236 * prev -- page number of item directly to the left of
237 * new's position in the tree.
239 * Returns:
240 * RET_SUCCESS, RET_ERROR.
242 * Side Effects:
243 * None.
247 _bt_reposition(t, new, parent, prev)
248 BTREE_P t;
249 IDATUM *new;
250 pgno_t parent;
251 pgno_t prev;
253 index_t i, next;
254 IDATUM *idx;
256 if (parent == P_ROOT) {
259 * If we just split the root page, then there are guaranteed
260 * to be exactly two IDATUMs on it. Look at both of them
261 * to see if they point to the page that we want.
264 if (_bt_getpage(t, parent) == RET_ERROR)
265 return (RET_ERROR);
267 next = NEXTINDEX(t->bt_curpage);
268 for (i = 0; i < next; i++) {
269 idx = (IDATUM *) GETDATUM(t->bt_curpage, i);
270 if (_bt_getpage(t, idx->i_pgno) == RET_ERROR)
271 return (RET_ERROR);
272 if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
273 return (RET_SUCCESS);
274 if (_bt_getpage(t, parent) == RET_ERROR)
275 return (RET_ERROR);
277 } else {
280 * Get the parent page -- which is where the new item would
281 * have gone -- and figure out whether the new item now goes
282 * on the parent, or the page immediately to the right of
283 * the parent.
286 if (_bt_getpage(t, parent) == RET_ERROR)
287 return (RET_ERROR);
288 if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
289 return (RET_SUCCESS);
290 if (_bt_getpage(t, t->bt_curpage->h_nextpg) == RET_ERROR)
291 return (RET_ERROR);
292 if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
293 return (RET_SUCCESS);
295 return (RET_ERROR);
299 * _BT_ISONPAGE -- Is the IDATUM for a given page number on the current page?
301 * This routine is used by _bt_reposition to decide whether the current
302 * page is the correct one on which to insert a new item.
304 * Parameters:
305 * t -- tree to check
306 * new -- the item that will be inserted (used for binary search)
307 * prev -- page number of page whose IDATUM is immediately to
308 * the left of new's position in the tree.
310 * Returns:
311 * RET_SUCCESS, RET_ERROR (corresponding to TRUE, FALSE).
315 _bt_isonpage(t, new, prev)
316 BTREE_P t;
317 IDATUM *new;
318 pgno_t prev;
320 BTHEADER *h = (BTHEADER *) t->bt_curpage;
321 index_t i, next;
322 IDATUM *idx;
324 i = _bt_binsrch(t, &(new->i_bytes[0]));
325 while (i != 0 && _bt_cmp(t, &(new->i_bytes[0]), i) == 0)
326 --i;
327 next = NEXTINDEX(h);
328 idx = (IDATUM *) GETDATUM(h, i);
329 while (i < next && idx->i_pgno != prev) {
330 i++;
331 idx = (IDATUM *) GETDATUM(h, i);
333 if (idx->i_pgno == prev)
334 return (RET_SUCCESS);
335 else
336 return (RET_ERROR);
340 * _BT_SPLITROOT -- Split the root of a btree.
342 * The root page for a btree is always page one. This means that in
343 * order to split the root, we need to do extra work.
345 * Parameters:
346 * t -- tree to split
348 * Returns:
349 * RET_SUCCESS, RET_ERROR.
351 * Side Effects:
352 * Splits root upward in the usual way, adding two new pages
353 * to the tree (rather than just one, as in usual splits).
357 _bt_splitroot(t)
358 BTREE_P t;
360 BTHEADER *h = t->bt_curpage;
361 BTHEADER *left, *right;
362 IDATUM *id;
363 BTHEADER *where_h;
364 char *src, *dest;
365 int len, nbytes;
366 u_long was_leaf;
367 pgno_t oldchain;
368 u_char flags;
370 /* get two new pages for the split */
371 if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL)
372 return (RET_ERROR);
373 left->h_pgno = ++(t->bt_npages);
374 if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL)
375 return (RET_ERROR);
376 right->h_pgno = ++(t->bt_npages);
378 /* do the split */
379 if (_bt_dopsplit(t, left, right) == RET_ERROR)
380 return (RET_ERROR);
382 /* connect the new pages correctly */
383 right->h_prevpg = left->h_pgno;
384 left->h_nextpg = right->h_pgno;
387 * Write the child pages out now. We need them to remain
388 * where they are until we finish updating parent pages,
389 * however.
392 if (_bt_write(t, left, NORELEASE) == RET_ERROR)
393 return (RET_ERROR);
394 if (_bt_write(t, right, NORELEASE) == RET_ERROR)
395 return (RET_ERROR);
397 /* now change the root page into an internal page */
398 was_leaf = (h->h_flags & F_LEAF);
399 h->h_flags &= ~F_LEAF;
400 h->h_lower = (index_t) (((char *) (&(h->h_linp[0]))) - ((char *) h));
401 h->h_upper = (index_t) t->bt_psize;
402 (void) bzero((char *) &(h->h_linp[0]), (int) (h->h_upper - h->h_lower));
404 /* put two new keys on root page */
405 where_h = left;
406 while (where_h) {
407 DATUM *data;
408 IDATUM *idata;
410 if (was_leaf) {
411 data = (DATUM *) GETDATUM(where_h, 0);
413 if (where_h == left) {
414 len = 0; /* first key in tree is null */
415 } else {
416 if (data->d_flags & D_BIGKEY) {
417 bcopy(&(data->d_bytes[0]),
418 (char *) &oldchain,
419 sizeof(oldchain));
420 if (_bt_markchain(t, oldchain) == RET_ERROR)
421 return (RET_ERROR);
422 src = (char *) &oldchain;
423 flags = D_BIGKEY;
424 } else {
425 src = (char *) &(data->d_bytes[0]);
426 flags = 0;
428 len = data->d_ksize;
430 } else {
431 idata = (IDATUM *) GETDATUM(where_h, 0);
432 len = idata->i_size;
433 flags = idata->i_flags;
434 src = &(idata->i_bytes[0]);
436 dest = ((char *) h) + h->h_upper;
437 nbytes = len + (sizeof (IDATUM) - sizeof(char));
438 dest -= LONGALIGN(nbytes);
439 id = (IDATUM *) dest;
440 id->i_size = len;
441 id->i_pgno = where_h->h_pgno;
442 id->i_flags = flags;
443 if (len > 0)
444 (void) bcopy((char *) src, (char *) &(id->i_bytes[0]), len);
445 dest -= ((int) h);
446 h->h_linp[NEXTINDEX(h)] = (index_t) dest;
447 h->h_upper = (index_t) dest;
448 h->h_lower += sizeof(index_t);
450 /* next page */
451 if (where_h == left)
452 where_h = right;
453 else
454 where_h = (BTHEADER *) NULL;
457 if (_bt_release(t, left) == RET_ERROR)
458 return (RET_ERROR);
459 if (_bt_release(t, right) == RET_ERROR)
460 return (RET_ERROR);
463 * That's it, split is done. If we're doing a non-cached disk
464 * btree, we can free up the pages we allocated, as they're on
465 * disk, now.
468 if (ISDISK(t) && !ISCACHE(t)) {
469 (void) free ((char *) left);
470 (void) free ((char *) right);
473 h->h_flags |= F_DIRTY;
475 return (RET_SUCCESS);
479 * _BT_DOPSPLIT -- Do the work of splitting a page
481 * This routine takes two page pointers and splits the data on the
482 * current page of the btree between them.
484 * We do a lot of work here to handle duplicate keys on a page; we
485 * have to place these keys carefully to guarantee that later searches
486 * will find them correctly. See comments in the code below for details.
488 * Parameters:
489 * t -- tree to split
490 * left -- pointer to page to get left half of the data
491 * right -- pointer to page to get right half of the data
493 * Returns:
494 * None.
498 _bt_dopsplit(t, left, right)
499 BTREE_P t;
500 BTHEADER *left;
501 BTHEADER *right;
503 BTHEADER *h = t->bt_curpage;
504 size_t psize;
505 char *where;
506 BTHEADER *where_h;
507 index_t where_i;
508 int nbytes, dsize, fixedsize, freespc;
509 index_t i;
510 index_t save_lower, save_upper, save_i;
511 index_t switch_i;
512 char *save_key;
513 DATUM *d;
514 CURSOR *c;
515 index_t top;
516 int free_save;
517 pgno_t chain;
518 int ignore;
521 * Our strategy is to put half the bytes on each page. We figure
522 * out how many bytes we have total, and then move items until
523 * the last item moved put at least 50% of the data on the left
524 * page.
526 save_key = (char *) NULL;
527 psize = (int) t->bt_psize;
528 where = ((char *) left) + psize;
529 where_h = left;
530 where_i = 0;
531 nbytes = psize - (int) ((char *) &(h->h_linp[0]) - ((char *) h));
532 freespc = nbytes;
534 top = NEXTINDEX(h);
535 if (h->h_flags & F_LEAF)
536 fixedsize = (sizeof(DATUM) - sizeof(char));
537 else
538 fixedsize = (sizeof(IDATUM) - sizeof(char));
540 save_key = (char *) NULL;
542 /* move data */
543 for (i = 0; i < top; i++) {
546 * Internal and leaf pages have different layouts for
547 * data items, but in both cases the first entry in the
548 * data item is a size_t.
550 d = (DATUM *) GETDATUM(h,i);
551 if (h->h_flags & F_LEAF) {
552 dsize = d->d_ksize + d->d_dsize + fixedsize;
553 } else {
554 dsize = d->d_ksize + fixedsize;
558 * If a page contains duplicate keys, we have to be
559 * careful about splits. A sequence of duplicate keys
560 * may not begin in the middle of one page and end in
561 * the middle of another; it must begin on a page boundary,
562 * in order for searches on the internal nodes to work
563 * correctly.
565 if (where_h == left) {
566 if (save_key == (char *) NULL) {
567 if (h->h_flags & F_LEAF) {
568 if (d->d_flags & D_BIGKEY) {
569 free_save = TRUE;
570 bcopy(&(d->d_bytes[0]),
571 (char *) &chain,
572 sizeof(chain));
573 if (_bt_getbig(t, chain,
574 &save_key,
575 &ignore)
576 == RET_ERROR)
577 return (RET_ERROR);
578 } else {
579 free_save = FALSE;
580 save_key = (char *) &(d->d_bytes[0]);
582 } else {
583 IDATUM *id = (IDATUM *) d;
585 if (id->i_flags & D_BIGKEY) {
586 free_save = TRUE;
587 bcopy(&(id->i_bytes[0]),
588 (char *) &chain,
589 sizeof(chain));
590 if (_bt_getbig(t, chain,
591 &save_key,
592 &ignore)
593 == RET_ERROR)
594 return (RET_ERROR);
595 } else {
596 free_save = FALSE;
597 save_key = (char *)
598 &(id->i_bytes[0]);
601 save_i = 0;
602 save_lower = where_h->h_lower;
603 save_upper = where_h->h_upper;
604 } else {
605 if (_bt_cmp(t, save_key, i) != 0) {
606 save_lower = where_h->h_lower;
607 save_upper = where_h->h_upper;
608 save_i = i;
610 if (h->h_flags & F_LEAF) {
611 if (free_save)
612 (void) free(save_key);
613 if (d->d_flags & D_BIGKEY) {
614 free_save = TRUE;
615 bcopy(&(d->d_bytes[0]),
616 (char *) &chain,
617 sizeof(chain));
618 if (_bt_getbig(t, chain,
619 &save_key,
620 &ignore)
621 == RET_ERROR)
622 return (RET_ERROR);
623 } else {
624 free_save = FALSE;
625 save_key = (char *) &(d->d_bytes[0]);
627 } else {
628 IDATUM *id = (IDATUM *) d;
630 if (id->i_flags & D_BIGKEY) {
631 free_save = TRUE;
632 bcopy(&(id->i_bytes[0]),
633 (char *) &chain,
634 sizeof(chain));
635 if (_bt_getbig(t, chain,
636 &save_key,
637 &ignore)
638 == RET_ERROR)
639 return (RET_ERROR);
640 } else {
641 free_save = FALSE;
642 save_key = (char *)
643 &(id->i_bytes[0]);
649 /* copy data and update page state */
650 where -= LONGALIGN(dsize);
651 (void) bcopy((char *) d, (char *) where, dsize);
652 where_h->h_upper = where_h->h_linp[where_i] =
653 (index_t) (where - (int) where_h);
654 where_h->h_lower += sizeof(index_t);
655 where_i++;
657 /* if we've moved half, switch to the right-hand page */
658 nbytes -= LONGALIGN(dsize) + sizeof(index_t);
659 if ((freespc - nbytes) > nbytes) {
660 nbytes = 2 * freespc;
662 /* identical keys go on the same page */
663 if (save_i > 0) {
664 /* i gets incremented at loop bottom... */
665 i = save_i - 1;
666 where_h->h_lower = save_lower;
667 where_h->h_upper = save_upper;
669 where = ((char *) right) + psize;
670 where_h = right;
671 switch_i = where_i;
672 where_i = 0;
677 * If there was an active scan on the database, and we just
678 * split the page that the cursor was on, we may need to
679 * adjust the cursor to point to the same entry as before the
680 * split.
683 c = &(t->bt_cursor);
684 if ((t->bt_flags & BTF_SEQINIT)
685 && (c->c_pgno == h->h_pgno)
686 && (c->c_index >= switch_i)) {
687 c->c_pgno = where_h->h_pgno;
688 c->c_index -= where_i;
690 return (RET_SUCCESS);