Sync usage with man page.
[netbsd-mini2440.git] / lib / libc / db / btree / btree.c
blobd47699a78316a2256110a4150fc1f18dc1267d5a
1 /*-
2 * Copyright (c) 1990 The Regents of the University of California.
3 * All rights reserved.
5 * This code is derived from software contributed to Berkeley by
6 * Mike Olson.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 * must display the following acknowledgement:
18 * This product includes software developed by the University of
19 * California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
37 #if defined(LIBC_SCCS) && !defined(lint)
38 static char sccsid[] = "@(#)btree.c 5.9 (Berkeley) 5/7/91";
39 #endif /* LIBC_SCCS and not lint */
42 * btree.c -- implementation of btree access method for 4.4BSD.
44 * The design here is based on that of the btree access method used
45 * in the Postgres database system at UC Berkeley. The implementation
46 * is wholly independent of the Postgres code.
48 * This implementation supports btrees on disk (supply a filename) or
49 * in memory (don't). Public interfaces defined here are:
51 * btree_open() -- wrapper; returns a filled DB struct for
52 * a btree.
54 * bt_open() -- open a new or existing btree.
55 * bt_get() -- fetch data from a tree by key.
56 * bt_seq() -- do a sequential scan on a tree.
57 * bt_put() -- add data to a tree by key.
58 * bt_delete() -- remove data from a tree by key.
59 * bt_close() -- close a btree.
60 * bt_sync() -- force btree pages to disk (disk trees only).
63 #include <sys/param.h>
64 #include <sys/stat.h>
65 #include <signal.h>
66 #include <errno.h>
67 #include <fcntl.h>
68 #include <db.h>
69 #include <stdlib.h>
70 #include <string.h>
71 #include <unistd.h>
72 #include "btree.h"
74 BTREEINFO _DefaultBTInfo = {
75 0, /* flags */
76 0, /* cachesize */
77 0, /* psize */
78 strcmp, /* compare */
83 * BTREE_OPEN -- Wrapper routine to open a btree.
85 * Creates and fills a DB struct, and calls the routine that actually
86 * opens the btree.
88 * Parameters:
89 * f: filename to open
90 * flags: flag bits passed to open
91 * mode: permission bits, used if O_CREAT specified
92 * b: BTREEINFO pointer
94 * Returns:
95 * Filled-in DBT on success; NULL on failure, with errno
96 * set as appropriate.
98 * Side Effects:
99 * Allocates memory for the DB struct.
102 DB *
103 btree_open(f, flags, mode, b)
104 const char *f;
105 int flags;
106 int mode;
107 const BTREEINFO *b;
109 DB *db;
110 BTREE t;
112 if ((db = (DB *) malloc((unsigned) sizeof(DB))) == (DB *) NULL)
113 return ((DB *) NULL);
115 if ((t = bt_open(f, flags, mode, b)) == (BTREE) NULL) {
116 (void) free ((char *) db);
117 return ((DB *) NULL);
120 db->internal = (char *) t;
121 db->close = bt_close;
122 db->del = bt_delete;
123 db->get = bt_get;
124 db->put = bt_put;
125 db->seq = bt_seq;
126 db->sync = bt_sync;
127 db->type = DB_BTREE;
129 return (db);
133 * BT_OPEN -- Open a btree.
135 * This routine creates the correct kind (disk or in-memory) of
136 * btree and initializes it as required.
138 * Parameters:
139 * f -- name of btree (NULL for in-memory btrees)
140 * flags -- flags passed to open()
141 * mode -- mode passed to open ()
142 * b -- BTREEINFO structure, describing btree
144 * Returns:
145 * (Opaque) pointer to the btree. On failure, returns NULL
146 * with errno set as appropriate.
148 * Side Effects:
149 * Allocates memory, opens files.
152 BTREE
153 bt_open(f, flags, mode, b)
154 char *f;
155 int flags;
156 int mode;
157 BTREEINFO *b;
159 BTREE_P t;
160 HTABLE ht;
161 int nbytes;
162 int fd;
163 CURSOR *c;
164 BTMETA m;
165 struct stat buf;
167 /* use the default info if none was provided */
168 if (b == (BTREEINFO *) NULL)
169 b = &_DefaultBTInfo;
171 if ((t = (BTREE_P) malloc((unsigned) sizeof *t)) == (BTREE_P) NULL)
172 return ((BTREE) NULL);
174 if (b->compare)
175 t->bt_compare = b->compare;
176 else
177 t->bt_compare = strcmp;
179 t->bt_fname = f;
180 t->bt_curpage = (BTHEADER *) NULL;
181 t->bt_free = P_NONE;
182 c = &(t->bt_cursor);
183 c->c_pgno = P_NONE;
184 c->c_index = 0;
185 c->c_flags = (u_char) NULL;
186 c->c_key = (char *) NULL;
187 t->bt_stack = (BTSTACK *) NULL;
188 t->bt_flags = 0;
191 * If no file name was supplied, this is an in-memory btree.
192 * Otherwise, it's a disk-based btree.
194 if (f == (char *) NULL) {
195 /* in memory */
196 if ((t->bt_psize = b->psize) < MINPSIZE) {
197 if (t->bt_psize != 0) {
198 (void) free ((char *) t);
199 errno = EINVAL;
200 return ((BTREE) NULL);
202 t->bt_psize = getpagesize();
205 nbytes = HTSIZE * sizeof(HTBUCKET *);
206 if ((ht = (HTABLE) malloc((unsigned) nbytes))
207 == (HTABLE) NULL) {
208 (void) free((char *) t);
209 return ((BTREE) NULL);
211 (void) bzero((char *) ht, nbytes);
212 t->bt_s.bt_ht = ht;
213 t->bt_npages = 0;
214 t->bt_lorder = BYTE_ORDER;
215 if (!(b->flags & R_DUP))
216 t->bt_flags |= BTF_NODUPS;
217 } else {
218 /* on disk */
219 if ((fd = open(f, O_RDONLY, 0)) < 0) {
220 /* doesn't exist yet, be sure page is big enough */
221 if ((t->bt_psize = b->psize) < sizeof(BTHEADER)
222 && b->psize != 0) {
223 (void) free((char *) t);
224 errno = EINVAL;
225 return ((BTREE) NULL);
227 if (b->lorder == 0)
228 b->lorder = BYTE_ORDER;
230 if (b->lorder != BIG_ENDIAN
231 && b->lorder != LITTLE_ENDIAN) {
232 (void) free((char *) t);
233 errno = EINVAL;
234 return ((BTREE) NULL);
236 t->bt_lorder = b->lorder;
237 if (!(b->flags & R_DUP))
238 t->bt_flags |= BTF_NODUPS;
239 } else {
240 /* exists, get page size from file */
241 if (read(fd, &m, sizeof(m)) < sizeof(m)) {
242 (void) close(fd);
243 (void) free((char *) t);
244 errno = EINVAL;
245 return ((BTREE) NULL);
248 /* lorder always stored in host-independent format */
249 NTOHL(m.m_lorder);
250 if (m.m_lorder != BIG_ENDIAN
251 && m.m_lorder != LITTLE_ENDIAN) {
252 (void) free((char *) t);
253 errno = EINVAL;
254 return ((BTREE) NULL);
256 t->bt_lorder = m.m_lorder;
258 if (t->bt_lorder != BYTE_ORDER) {
259 BLSWAP(m.m_magic);
260 BLSWAP(m.m_version);
261 BLSWAP(m.m_psize);
262 BLSWAP(m.m_free);
263 BLSWAP(m.m_flags);
266 if (m.m_magic != BTREEMAGIC
267 || m.m_version != BTREEVERSION
268 || m.m_psize < MINPSIZE) {
269 (void) close(fd);
270 (void) free((char *) t);
271 #ifndef EFTYPE
272 #define EFTYPE -100
273 #endif
274 errno = EFTYPE;
275 return ((BTREE) NULL);
277 t->bt_psize = m.m_psize;
278 t->bt_free = m.m_free;
279 t->bt_flags |= (m.m_flags & BTF_NODUPS) | BTF_METAOK;
280 (void) close(fd);
283 /* now open the file the way the user wanted it */
284 if ((t->bt_s.bt_d.d_fd = open(f, flags, mode)) < 0) {
285 (void) free ((char *) t);
286 return ((BTREE) NULL);
289 /* access method files are always close-on-exec */
290 if (fcntl(t->bt_s.bt_d.d_fd, F_SETFL, 1) == -1) {
291 (void) close(t->bt_s.bt_d.d_fd);
292 (void) free ((char *) t);
293 return ((BTREE) NULL);
296 /* get number of pages, page size if necessary */
297 (void) fstat(t->bt_s.bt_d.d_fd, &buf);
298 if (t->bt_psize == 0)
299 t->bt_psize = buf.st_blksize;
300 t->bt_npages = (pgno_t) (buf.st_size / t->bt_psize);
302 /* page zero is metadata, doesn't count */
303 if (t->bt_npages > 0)
304 --(t->bt_npages);
306 if (b->cachesize == 0)
307 b->cachesize = DEFCACHE;
309 /* get an lru buffer cache, if the user asked for one */
310 if ((b->cachesize / t->bt_psize) > 0) {
311 BTDISK *d = &(t->bt_s.bt_d);
313 d->d_cache = lruinit(d->d_fd,
314 (int) (b->cachesize / t->bt_psize),
315 (int) t->bt_psize,
316 (char *) t->bt_lorder,
317 _bt_pgin, _bt_pgout);
319 if (d->d_cache == (char *) NULL) {
320 (void) free((char *) t);
321 return ((BTREE) NULL);
326 /* remember if tree was opened for write */
327 if (((flags & O_WRONLY) == O_WRONLY)
328 || ((flags & O_RDWR) == O_RDWR))
329 t->bt_flags |= BTF_ISWRITE;
331 return ((BTREE) t);
335 * BT_GET -- Get an entry from a btree.
337 * Does a key lookup in the tree to find the specified key, and returns
338 * the key/data pair found.
340 * Parameters:
341 * tree -- btree in which to do lookup
342 * key -- key to find
343 * data -- pointer to DBT in which to return data
344 * flag -- ignored
346 * Returns:
347 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the key is not
348 * found. If key is not found, nothing is stored in the
349 * return DBT 'data'.
351 * Side Effects:
352 * None.
354 * Warnings:
355 * Return data is statically allocated, and will be overwritten
356 * at the next call.
360 bt_get(dbp, key, data, flag)
361 DB *dbp;
362 DBT *key, *data;
363 u_long flag;
365 BTREE_P t = (BTREE_P) (dbp->internal);
366 BTHEADER *h;
367 DATUM *d;
368 BTITEM *item;
370 #ifdef lint
371 flag = flag;
372 #endif /* lint */
374 /* lookup */
375 item = _bt_search(t, key);
377 /* clear parent pointer stack */
378 while (_bt_pop(t) != P_NONE)
379 continue;
381 if (item == (BTITEM *) NULL)
382 return (RET_ERROR);
384 h = (BTHEADER *) t->bt_curpage;
385 data->size = 0;
386 data->data = (u_char *) NULL;
388 /* match? */
389 if (VALIDITEM(t, item)
390 && _bt_cmp(t, key->data, item->bti_index) == 0) {
391 d = (DATUM *) GETDATUM(h, item->bti_index);
392 return (_bt_buildret(t, d, data, key));
395 /* nope */
396 return (RET_SPECIAL);
400 * BT_PUT -- Add an entry to a btree.
402 * The specified (key, data) pair is added to the tree. If the tree
403 * was created for unique keys only, then duplicates will not be
404 * entered. If the requested key exists in the tree, it will be over-
405 * written unless the flags parameter is R_NOOVERWRITE, in which case
406 * the update will not be done. If duplicate keys are permitted in the
407 * tree, duplicates will be inserted and will not overwrite existing
408 * keys. Nodes are split as required.
410 * Parameters:
411 * tree -- btree in which to put the new entry
412 * key -- key component to add
413 * data -- data corresponding to key
414 * flag -- R_NOOVERWRITE or zero.
416 * Returns:
417 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the
418 * NOOVERWRITE flag was set and the specified key exists
419 * in the database.
421 * Side Effects:
422 * None.
426 bt_put(dbp, key, data, flag)
427 DB *dbp;
428 DBT *key, *data;
429 u_long flag;
431 BTREE_P t;
432 BTITEM *item;
434 t = (BTREE_P)dbp->internal;
436 /* look for this key in the tree */
437 item = _bt_search(t, key);
440 * If this tree was originally created without R_DUP, then duplicate
441 * keys are not allowed. We need to check this at insertion time.
444 if (VALIDITEM(t, item) && _bt_cmp(t, key->data, item->bti_index) == 0) {
445 if ((t->bt_flags & BTF_NODUPS) && flag == R_NOOVERWRITE) {
446 if (_bt_delone(t, item->bti_index) == RET_ERROR) {
447 while (_bt_pop(t) != P_NONE)
448 continue;
449 return (RET_ERROR);
454 return (_bt_insert(t, item, key, data, flag));
458 * BT_DELETE -- delete a key from the tree.
460 * Deletes all keys (and their associated data items) matching the
461 * supplied key from the tree. If the flags entry is R_CURSOR, then
462 * the current item in the active scan is deleted.
464 * Parameters:
465 * tree -- btree from which to delete key
466 * key -- key to delete
467 * flags -- R_CURSOR or zero
469 * Returns:
470 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the specified
471 * key was not in the tree.
473 * Side Effects:
474 * None.
478 bt_delete(dbp, key, flags)
479 DB *dbp;
480 DBT *key;
481 u_long flags;
483 BTREE_P t;
484 BTHEADER *h;
485 BTITEM *item;
486 int ndel = 0;
488 t = (BTREE_P)dbp->internal;
490 if (flags == R_CURSOR)
491 return (_bt_crsrdel(t));
493 /* find the first matching key in the tree */
494 item = _bt_first(t, key);
495 h = t->bt_curpage;
497 /* don't need the descent stack for deletes */
498 while (_bt_pop(t) != P_NONE)
499 continue;
501 /* delete all matching keys */
502 for (;;) {
503 while (VALIDITEM(t, item)
504 && (_bt_cmp(t, key->data, item->bti_index) == 0)) {
505 if (_bt_delone(t, item->bti_index) == RET_ERROR)
506 return (RET_ERROR);
507 ndel++;
510 if (VALIDITEM(t, item) || h->h_nextpg == P_NONE)
511 break;
513 /* next page, if necessary */
514 do {
515 if (_bt_getpage(t, h->h_nextpg) == RET_ERROR)
516 return (RET_ERROR);
517 h = t->bt_curpage;
518 } while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE);
520 item->bti_pgno = h->h_pgno;
521 item->bti_index = 0;
523 if (!VALIDITEM(t, item)
524 || _bt_cmp(t, key->data, item->bti_index) != 0)
525 break;
528 /* flush changes to disk */
529 if (ISDISK(t)) {
530 if (h->h_flags & F_DIRTY) {
531 if (_bt_write(t, t->bt_curpage, NORELEASE) == RET_ERROR)
532 return (RET_ERROR);
536 if (ndel == 0)
537 return (RET_SPECIAL);
539 return (RET_SUCCESS);
543 * BT_SYNC -- sync the btree to disk.
545 * Parameters:
546 * tree -- btree to sync.
548 * Returns:
549 * RET_SUCCESS, RET_ERROR.
552 bt_sync(dbp)
553 DB *dbp;
555 BTREE_P t;
556 BTHEADER *h;
557 pgno_t pgno;
559 t = (BTREE_P)dbp->internal;
561 /* if this is an in-memory btree, syncing is a no-op */
562 if (!ISDISK(t))
563 return (RET_SUCCESS);
565 h = (BTHEADER *) t->bt_curpage;
566 h->h_flags &= ~F_DIRTY;
568 if (ISCACHE(t)) {
569 pgno = t->bt_curpage->h_pgno;
570 if (_bt_write(t, h, RELEASE) == RET_ERROR)
571 return(RET_ERROR);
572 if (lrusync(t->bt_s.bt_d.d_cache) < RET_ERROR)
573 return (RET_ERROR);
574 if (_bt_getpage(t, pgno) == RET_ERROR)
575 return (RET_ERROR);
576 } else {
577 if (_bt_write(t, h, NORELEASE) == RET_ERROR)
578 return (RET_ERROR);
581 return (fsync(t->bt_s.bt_d.d_fd));
585 * BT_SEQ -- Sequential scan interface.
587 * This routine supports sequential scans on the btree. If called with
588 * flags set to R_CURSOR, or if no seq scan has been initialized in the
589 * current tree, we initialize the scan. Otherwise, we advance the scan
590 * and return the next item.
592 * Scans can be either keyed or non-keyed. Keyed scans basically have
593 * a starting point somewhere in the middle of the tree. Non-keyed
594 * scans start at an endpoint. Also, scans can be backward (descending
595 * order), forward (ascending order), or no movement (keep returning
596 * the same item).
598 * Flags is checked every time we enter the routine, so the user can
599 * change directions on an active scan if desired. The key argument
600 * is examined only when we initialize the scan, in order to position
601 * it properly.
603 * Items are returned via the key and data arguments passed in.
605 * Parameters:
606 * tree -- btree in which to do scan
607 * key -- key, used to position scan on initialization, and
608 * used to return key components to the user.
609 * data -- used to return data components to the user.
610 * flags -- specify R_CURSOR, R_FIRST, R_LAST, R_NEXT, or
611 * R_PREV.
613 * Returns:
614 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if no more data
615 * exists in the tree in the specified direction.
617 * Side Effects:
618 * Changes the btree's notion of the current position in the
619 * scan.
621 * Warnings:
622 * The key and data items returned are static and will be
623 * overwritten by the next bt_get or bt_seq.
627 bt_seq(dbp, key, data, flags)
628 DB *dbp;
629 DBT *key, *data;
630 u_long flags;
632 BTREE_P t;
633 BTHEADER *h;
634 DATUM *d;
635 int status;
637 t = (BTREE_P)dbp->internal;
639 /* do we need to initialize the scan? */
640 if (flags == R_CURSOR || flags == R_LAST || flags == R_FIRST
641 || !(t->bt_flags & BTF_SEQINIT)) {
643 /* initialize it */
644 status = _bt_seqinit(t, key, flags);
645 } else {
646 /* just advance the current scan pointer */
647 status = _bt_seqadvance(t, flags);
650 key->size = data->size = 0;
651 key->data = data->data = (u_char *) NULL;
653 h = t->bt_curpage;
655 /* is there a valid item at the current scan location? */
656 if (status == RET_SPECIAL) {
657 if (flags == R_NEXT) {
658 if (t->bt_cursor.c_index >= NEXTINDEX(h)) {
659 if (NEXTINDEX(h) > 0)
660 t->bt_cursor.c_index = NEXTINDEX(h) - 1;
661 else
662 t->bt_cursor.c_index = 0;
664 } else {
665 t->bt_cursor.c_index = 0;
667 return (RET_SPECIAL);
668 } else if (status == RET_ERROR)
669 return (RET_ERROR);
671 /* okay, return the data */
672 d = (DATUM *) GETDATUM(h, t->bt_cursor.c_index);
674 return (_bt_buildret(t, d, data, key));
678 * BT_CLOSE -- Close a btree
680 * Parameters:
681 * tree -- tree to close
683 * Returns:
684 * RET_SUCCESS, RET_ERROR.
686 * Side Effects:
687 * Frees space occupied by the tree.
691 bt_close(dbp)
692 DB *dbp;
694 struct HTBUCKET *b, *sb;
695 BTREE_P t;
696 BTHEADER *h;
697 HTABLE ht;
698 int fd, i;
699 char *cache;
701 t = (BTREE_P)dbp->internal;
703 if (t->bt_cursor.c_key != (char *) NULL)
704 (void) free(t->bt_cursor.c_key);
706 if (!ISDISK(t)) {
707 /* in-memory tree, release hash table memory */
708 ht = t->bt_s.bt_ht;
709 for (i = 0; i < HTSIZE; i++) {
710 if ((b = ht[i]) == (struct HTBUCKET *) NULL)
711 break;
712 do {
713 sb = b;
714 (void) free((char *) b->ht_page);
715 b = b->ht_next;
716 (void) free((char *) sb);
717 } while (b != (struct HTBUCKET *) NULL);
719 (void) free ((char *) ht);
720 (void) free ((char *) t);
721 return (RET_SUCCESS);
724 if ((t->bt_flags & BTF_ISWRITE) && !(t->bt_flags & BTF_METAOK)) {
725 if (_bt_wrtmeta(t) == RET_ERROR)
726 return (RET_ERROR);
729 if (t->bt_curpage != (BTHEADER *) NULL) {
730 h = t->bt_curpage;
731 if (h->h_flags & F_DIRTY) {
732 if (_bt_write(t, h, RELEASE) == RET_ERROR)
733 return (RET_ERROR);
734 } else {
735 if (_bt_release(t, h) == RET_ERROR)
736 return (RET_ERROR);
739 /* flush and free the cache, if there is one */
740 if (ISCACHE(t)) {
741 cache = t->bt_s.bt_d.d_cache;
742 if (lrusync(cache) == RET_ERROR)
743 return (RET_ERROR);
744 lrufree(cache);
746 (void) free ((char *) h);
749 fd = t->bt_s.bt_d.d_fd;
750 (void) free ((char *) t);
751 return (close(fd));