2 * Copyright (c) 1990 The Regents of the University of California.
5 * This code is derived from software contributed to Berkeley by
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 * must display the following acknowledgement:
18 * This product includes software developed by the University of
19 * California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 #if defined(LIBC_SCCS) && !defined(lint)
38 static char sccsid
[] = "@(#)btree.c 5.9 (Berkeley) 5/7/91";
39 #endif /* LIBC_SCCS and not lint */
42 * btree.c -- implementation of btree access method for 4.4BSD.
44 * The design here is based on that of the btree access method used
45 * in the Postgres database system at UC Berkeley. The implementation
46 * is wholly independent of the Postgres code.
48 * This implementation supports btrees on disk (supply a filename) or
49 * in memory (don't). Public interfaces defined here are:
51 * btree_open() -- wrapper; returns a filled DB struct for
54 * bt_open() -- open a new or existing btree.
55 * bt_get() -- fetch data from a tree by key.
56 * bt_seq() -- do a sequential scan on a tree.
57 * bt_put() -- add data to a tree by key.
58 * bt_delete() -- remove data from a tree by key.
59 * bt_close() -- close a btree.
60 * bt_sync() -- force btree pages to disk (disk trees only).
63 #include <sys/param.h>
74 BTREEINFO _DefaultBTInfo
= {
83 * BTREE_OPEN -- Wrapper routine to open a btree.
85 * Creates and fills a DB struct, and calls the routine that actually
90 * flags: flag bits passed to open
91 * mode: permission bits, used if O_CREAT specified
92 * b: BTREEINFO pointer
95 * Filled-in DBT on success; NULL on failure, with errno
99 * Allocates memory for the DB struct.
103 btree_open(f
, flags
, mode
, b
)
112 if ((db
= (DB
*) malloc((unsigned) sizeof(DB
))) == (DB
*) NULL
)
113 return ((DB
*) NULL
);
115 if ((t
= bt_open(f
, flags
, mode
, b
)) == (BTREE
) NULL
) {
116 (void) free ((char *) db
);
117 return ((DB
*) NULL
);
120 db
->internal
= (char *) t
;
121 db
->close
= bt_close
;
133 * BT_OPEN -- Open a btree.
135 * This routine creates the correct kind (disk or in-memory) of
136 * btree and initializes it as required.
139 * f -- name of btree (NULL for in-memory btrees)
140 * flags -- flags passed to open()
141 * mode -- mode passed to open ()
142 * b -- BTREEINFO structure, describing btree
145 * (Opaque) pointer to the btree. On failure, returns NULL
146 * with errno set as appropriate.
149 * Allocates memory, opens files.
153 bt_open(f
, flags
, mode
, b
)
167 /* use the default info if none was provided */
168 if (b
== (BTREEINFO
*) NULL
)
171 if ((t
= (BTREE_P
) malloc((unsigned) sizeof *t
)) == (BTREE_P
) NULL
)
172 return ((BTREE
) NULL
);
175 t
->bt_compare
= b
->compare
;
177 t
->bt_compare
= strcmp
;
180 t
->bt_curpage
= (BTHEADER
*) NULL
;
185 c
->c_flags
= (u_char
) NULL
;
186 c
->c_key
= (char *) NULL
;
187 t
->bt_stack
= (BTSTACK
*) NULL
;
191 * If no file name was supplied, this is an in-memory btree.
192 * Otherwise, it's a disk-based btree.
194 if (f
== (char *) NULL
) {
196 if ((t
->bt_psize
= b
->psize
) < MINPSIZE
) {
197 if (t
->bt_psize
!= 0) {
198 (void) free ((char *) t
);
200 return ((BTREE
) NULL
);
202 t
->bt_psize
= getpagesize();
205 nbytes
= HTSIZE
* sizeof(HTBUCKET
*);
206 if ((ht
= (HTABLE
) malloc((unsigned) nbytes
))
208 (void) free((char *) t
);
209 return ((BTREE
) NULL
);
211 (void) bzero((char *) ht
, nbytes
);
214 t
->bt_lorder
= BYTE_ORDER
;
215 if (!(b
->flags
& R_DUP
))
216 t
->bt_flags
|= BTF_NODUPS
;
219 if ((fd
= open(f
, O_RDONLY
, 0)) < 0) {
220 /* doesn't exist yet, be sure page is big enough */
221 if ((t
->bt_psize
= b
->psize
) < sizeof(BTHEADER
)
223 (void) free((char *) t
);
225 return ((BTREE
) NULL
);
228 b
->lorder
= BYTE_ORDER
;
230 if (b
->lorder
!= BIG_ENDIAN
231 && b
->lorder
!= LITTLE_ENDIAN
) {
232 (void) free((char *) t
);
234 return ((BTREE
) NULL
);
236 t
->bt_lorder
= b
->lorder
;
237 if (!(b
->flags
& R_DUP
))
238 t
->bt_flags
|= BTF_NODUPS
;
240 /* exists, get page size from file */
241 if (read(fd
, &m
, sizeof(m
)) < sizeof(m
)) {
243 (void) free((char *) t
);
245 return ((BTREE
) NULL
);
248 /* lorder always stored in host-independent format */
250 if (m
.m_lorder
!= BIG_ENDIAN
251 && m
.m_lorder
!= LITTLE_ENDIAN
) {
252 (void) free((char *) t
);
254 return ((BTREE
) NULL
);
256 t
->bt_lorder
= m
.m_lorder
;
258 if (t
->bt_lorder
!= BYTE_ORDER
) {
266 if (m
.m_magic
!= BTREEMAGIC
267 || m
.m_version
!= BTREEVERSION
268 || m
.m_psize
< MINPSIZE
) {
270 (void) free((char *) t
);
275 return ((BTREE
) NULL
);
277 t
->bt_psize
= m
.m_psize
;
278 t
->bt_free
= m
.m_free
;
279 t
->bt_flags
|= (m
.m_flags
& BTF_NODUPS
) | BTF_METAOK
;
283 /* now open the file the way the user wanted it */
284 if ((t
->bt_s
.bt_d
.d_fd
= open(f
, flags
, mode
)) < 0) {
285 (void) free ((char *) t
);
286 return ((BTREE
) NULL
);
289 /* access method files are always close-on-exec */
290 if (fcntl(t
->bt_s
.bt_d
.d_fd
, F_SETFL
, 1) == -1) {
291 (void) close(t
->bt_s
.bt_d
.d_fd
);
292 (void) free ((char *) t
);
293 return ((BTREE
) NULL
);
296 /* get number of pages, page size if necessary */
297 (void) fstat(t
->bt_s
.bt_d
.d_fd
, &buf
);
298 if (t
->bt_psize
== 0)
299 t
->bt_psize
= buf
.st_blksize
;
300 t
->bt_npages
= (pgno_t
) (buf
.st_size
/ t
->bt_psize
);
302 /* page zero is metadata, doesn't count */
303 if (t
->bt_npages
> 0)
306 if (b
->cachesize
== 0)
307 b
->cachesize
= DEFCACHE
;
309 /* get an lru buffer cache, if the user asked for one */
310 if ((b
->cachesize
/ t
->bt_psize
) > 0) {
311 BTDISK
*d
= &(t
->bt_s
.bt_d
);
313 d
->d_cache
= lruinit(d
->d_fd
,
314 (int) (b
->cachesize
/ t
->bt_psize
),
316 (char *) t
->bt_lorder
,
317 _bt_pgin
, _bt_pgout
);
319 if (d
->d_cache
== (char *) NULL
) {
320 (void) free((char *) t
);
321 return ((BTREE
) NULL
);
326 /* remember if tree was opened for write */
327 if (((flags
& O_WRONLY
) == O_WRONLY
)
328 || ((flags
& O_RDWR
) == O_RDWR
))
329 t
->bt_flags
|= BTF_ISWRITE
;
335 * BT_GET -- Get an entry from a btree.
337 * Does a key lookup in the tree to find the specified key, and returns
338 * the key/data pair found.
341 * tree -- btree in which to do lookup
343 * data -- pointer to DBT in which to return data
347 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the key is not
348 * found. If key is not found, nothing is stored in the
355 * Return data is statically allocated, and will be overwritten
360 bt_get(dbp
, key
, data
, flag
)
365 BTREE_P t
= (BTREE_P
) (dbp
->internal
);
375 item
= _bt_search(t
, key
);
377 /* clear parent pointer stack */
378 while (_bt_pop(t
) != P_NONE
)
381 if (item
== (BTITEM
*) NULL
)
384 h
= (BTHEADER
*) t
->bt_curpage
;
386 data
->data
= (u_char
*) NULL
;
389 if (VALIDITEM(t
, item
)
390 && _bt_cmp(t
, key
->data
, item
->bti_index
) == 0) {
391 d
= (DATUM
*) GETDATUM(h
, item
->bti_index
);
392 return (_bt_buildret(t
, d
, data
, key
));
396 return (RET_SPECIAL
);
400 * BT_PUT -- Add an entry to a btree.
402 * The specified (key, data) pair is added to the tree. If the tree
403 * was created for unique keys only, then duplicates will not be
404 * entered. If the requested key exists in the tree, it will be over-
405 * written unless the flags parameter is R_NOOVERWRITE, in which case
406 * the update will not be done. If duplicate keys are permitted in the
407 * tree, duplicates will be inserted and will not overwrite existing
408 * keys. Nodes are split as required.
411 * tree -- btree in which to put the new entry
412 * key -- key component to add
413 * data -- data corresponding to key
414 * flag -- R_NOOVERWRITE or zero.
417 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the
418 * NOOVERWRITE flag was set and the specified key exists
426 bt_put(dbp
, key
, data
, flag
)
434 t
= (BTREE_P
)dbp
->internal
;
436 /* look for this key in the tree */
437 item
= _bt_search(t
, key
);
440 * If this tree was originally created without R_DUP, then duplicate
441 * keys are not allowed. We need to check this at insertion time.
444 if (VALIDITEM(t
, item
) && _bt_cmp(t
, key
->data
, item
->bti_index
) == 0) {
445 if ((t
->bt_flags
& BTF_NODUPS
) && flag
== R_NOOVERWRITE
) {
446 if (_bt_delone(t
, item
->bti_index
) == RET_ERROR
) {
447 while (_bt_pop(t
) != P_NONE
)
454 return (_bt_insert(t
, item
, key
, data
, flag
));
458 * BT_DELETE -- delete a key from the tree.
460 * Deletes all keys (and their associated data items) matching the
461 * supplied key from the tree. If the flags entry is R_CURSOR, then
462 * the current item in the active scan is deleted.
465 * tree -- btree from which to delete key
466 * key -- key to delete
467 * flags -- R_CURSOR or zero
470 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the specified
471 * key was not in the tree.
478 bt_delete(dbp
, key
, flags
)
488 t
= (BTREE_P
)dbp
->internal
;
490 if (flags
== R_CURSOR
)
491 return (_bt_crsrdel(t
));
493 /* find the first matching key in the tree */
494 item
= _bt_first(t
, key
);
497 /* don't need the descent stack for deletes */
498 while (_bt_pop(t
) != P_NONE
)
501 /* delete all matching keys */
503 while (VALIDITEM(t
, item
)
504 && (_bt_cmp(t
, key
->data
, item
->bti_index
) == 0)) {
505 if (_bt_delone(t
, item
->bti_index
) == RET_ERROR
)
510 if (VALIDITEM(t
, item
) || h
->h_nextpg
== P_NONE
)
513 /* next page, if necessary */
515 if (_bt_getpage(t
, h
->h_nextpg
) == RET_ERROR
)
518 } while (NEXTINDEX(h
) == 0 && h
->h_nextpg
!= P_NONE
);
520 item
->bti_pgno
= h
->h_pgno
;
523 if (!VALIDITEM(t
, item
)
524 || _bt_cmp(t
, key
->data
, item
->bti_index
) != 0)
528 /* flush changes to disk */
530 if (h
->h_flags
& F_DIRTY
) {
531 if (_bt_write(t
, t
->bt_curpage
, NORELEASE
) == RET_ERROR
)
537 return (RET_SPECIAL
);
539 return (RET_SUCCESS
);
543 * BT_SYNC -- sync the btree to disk.
546 * tree -- btree to sync.
549 * RET_SUCCESS, RET_ERROR.
559 t
= (BTREE_P
)dbp
->internal
;
561 /* if this is an in-memory btree, syncing is a no-op */
563 return (RET_SUCCESS
);
565 h
= (BTHEADER
*) t
->bt_curpage
;
566 h
->h_flags
&= ~F_DIRTY
;
569 pgno
= t
->bt_curpage
->h_pgno
;
570 if (_bt_write(t
, h
, RELEASE
) == RET_ERROR
)
572 if (lrusync(t
->bt_s
.bt_d
.d_cache
) < RET_ERROR
)
574 if (_bt_getpage(t
, pgno
) == RET_ERROR
)
577 if (_bt_write(t
, h
, NORELEASE
) == RET_ERROR
)
581 return (fsync(t
->bt_s
.bt_d
.d_fd
));
585 * BT_SEQ -- Sequential scan interface.
587 * This routine supports sequential scans on the btree. If called with
588 * flags set to R_CURSOR, or if no seq scan has been initialized in the
589 * current tree, we initialize the scan. Otherwise, we advance the scan
590 * and return the next item.
592 * Scans can be either keyed or non-keyed. Keyed scans basically have
593 * a starting point somewhere in the middle of the tree. Non-keyed
594 * scans start at an endpoint. Also, scans can be backward (descending
595 * order), forward (ascending order), or no movement (keep returning
598 * Flags is checked every time we enter the routine, so the user can
599 * change directions on an active scan if desired. The key argument
600 * is examined only when we initialize the scan, in order to position
603 * Items are returned via the key and data arguments passed in.
606 * tree -- btree in which to do scan
607 * key -- key, used to position scan on initialization, and
608 * used to return key components to the user.
609 * data -- used to return data components to the user.
610 * flags -- specify R_CURSOR, R_FIRST, R_LAST, R_NEXT, or
614 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if no more data
615 * exists in the tree in the specified direction.
618 * Changes the btree's notion of the current position in the
622 * The key and data items returned are static and will be
623 * overwritten by the next bt_get or bt_seq.
627 bt_seq(dbp
, key
, data
, flags
)
637 t
= (BTREE_P
)dbp
->internal
;
639 /* do we need to initialize the scan? */
640 if (flags
== R_CURSOR
|| flags
== R_LAST
|| flags
== R_FIRST
641 || !(t
->bt_flags
& BTF_SEQINIT
)) {
644 status
= _bt_seqinit(t
, key
, flags
);
646 /* just advance the current scan pointer */
647 status
= _bt_seqadvance(t
, flags
);
650 key
->size
= data
->size
= 0;
651 key
->data
= data
->data
= (u_char
*) NULL
;
655 /* is there a valid item at the current scan location? */
656 if (status
== RET_SPECIAL
) {
657 if (flags
== R_NEXT
) {
658 if (t
->bt_cursor
.c_index
>= NEXTINDEX(h
)) {
659 if (NEXTINDEX(h
) > 0)
660 t
->bt_cursor
.c_index
= NEXTINDEX(h
) - 1;
662 t
->bt_cursor
.c_index
= 0;
665 t
->bt_cursor
.c_index
= 0;
667 return (RET_SPECIAL
);
668 } else if (status
== RET_ERROR
)
671 /* okay, return the data */
672 d
= (DATUM
*) GETDATUM(h
, t
->bt_cursor
.c_index
);
674 return (_bt_buildret(t
, d
, data
, key
));
678 * BT_CLOSE -- Close a btree
681 * tree -- tree to close
684 * RET_SUCCESS, RET_ERROR.
687 * Frees space occupied by the tree.
694 struct HTBUCKET
*b
, *sb
;
701 t
= (BTREE_P
)dbp
->internal
;
703 if (t
->bt_cursor
.c_key
!= (char *) NULL
)
704 (void) free(t
->bt_cursor
.c_key
);
707 /* in-memory tree, release hash table memory */
709 for (i
= 0; i
< HTSIZE
; i
++) {
710 if ((b
= ht
[i
]) == (struct HTBUCKET
*) NULL
)
714 (void) free((char *) b
->ht_page
);
716 (void) free((char *) sb
);
717 } while (b
!= (struct HTBUCKET
*) NULL
);
719 (void) free ((char *) ht
);
720 (void) free ((char *) t
);
721 return (RET_SUCCESS
);
724 if ((t
->bt_flags
& BTF_ISWRITE
) && !(t
->bt_flags
& BTF_METAOK
)) {
725 if (_bt_wrtmeta(t
) == RET_ERROR
)
729 if (t
->bt_curpage
!= (BTHEADER
*) NULL
) {
731 if (h
->h_flags
& F_DIRTY
) {
732 if (_bt_write(t
, h
, RELEASE
) == RET_ERROR
)
735 if (_bt_release(t
, h
) == RET_ERROR
)
739 /* flush and free the cache, if there is one */
741 cache
= t
->bt_s
.bt_d
.d_cache
;
742 if (lrusync(cache
) == RET_ERROR
)
746 (void) free ((char *) h
);
749 fd
= t
->bt_s
.bt_d
.d_fd
;
750 (void) free ((char *) t
);