2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 1996, 1997, 1998
5 * Sleepycat Software. All rights reserved.
8 * Copyright (c) 1990, 1993, 1994, 1995, 1996
9 * Keith Bostic. All rights reserved.
12 * Copyright (c) 1990, 1993, 1994, 1995
13 * The Regents of the University of California. All rights reserved.
15 * This code is derived from software contributed to Berkeley by
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
21 * 1. Redistributions of source code must retain the above copyright
22 * notice, this list of conditions and the following disclaimer.
23 * 2. Redistributions in binary form must reproduce the above copyright
24 * notice, this list of conditions and the following disclaimer in the
25 * documentation and/or other materials provided with the distribution.
26 * 3. All advertising materials mentioning features or use of this software
27 * must display the following acknowledgement:
28 * This product includes software developed by the University of
29 * California, Berkeley and its contributors.
30 * 4. Neither the name of the University nor the names of its contributors
31 * may be used to endorse or promote products derived from this software
32 * without specific prior written permission.
34 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
35 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
36 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
37 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
38 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
39 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
40 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
41 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
42 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
43 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50 static const char sccsid
[] = "@(#)bt_search.c 10.25 (Sleepycat) 12/16/98";
53 #ifndef NO_SYSTEM_INCLUDES
54 #include <sys/types.h>
66 * Search a btree for a key.
68 * PUBLIC: int __bam_search __P((DBC *,
69 * PUBLIC: const DBT *, u_int32_t, int, db_recno_t *, int *));
72 __bam_search(dbc
, key
, flags
, stop
, recnop
, exactp
)
84 db_indx_t base
, i
, indx
, lim
;
87 int cmp
, jump
, ret
, stack
;
97 * There are several ways we search a btree tree. The flags argument
98 * specifies if we're acquiring read or write locks, if we position
99 * to the first or last item in a set of duplicates, if we return
100 * deleted items, and if we are locking pairs of pages. In addition,
101 * if we're modifying record numbers, we have to lock the entire tree
102 * regardless. See btree.h for more details.
104 * If write-locking pages, we need to know whether or not to acquire a
105 * write lock on a page before getting it. This depends on how deep it
106 * is in tree, which we don't know until we acquire the root page. So,
107 * if we need to lock the root page we may have to upgrade it later,
108 * because we won't get the correct lock initially.
110 * Retrieve the root page.
113 stack
= F_ISSET(dbp
, DB_BT_RECNUM
) && LF_ISSET(S_STACK
);
114 if ((ret
= __bam_lget(dbc
,
115 0, pg
, stack
? DB_LOCK_WRITE
: DB_LOCK_READ
, &lock
)) != 0)
117 if ((ret
= memp_fget(dbp
->mpf
, &pg
, 0, &h
)) != 0) {
118 (void)__BT_LPUT(dbc
, lock
);
123 * Decide if we need to save this page; if we do, write lock it.
124 * We deliberately don't lock-couple on this call. If the tree
125 * is tiny, i.e., one page, and two threads are busily updating
126 * the root page, we're almost guaranteed deadlocks galore, as
127 * each one gets a read lock and then blocks the other's attempt
131 ((LF_ISSET(S_PARENT
) && (u_int8_t
)(stop
+ 1) >= h
->level
) ||
132 (LF_ISSET(S_WRITE
) && h
->level
== LEAFLEVEL
))) {
133 (void)memp_fput(dbp
->mpf
, h
, 0);
134 (void)__BT_LPUT(dbc
, lock
);
135 if ((ret
= __bam_lget(dbc
, 0, pg
, DB_LOCK_WRITE
, &lock
)) != 0)
137 if ((ret
= memp_fget(dbp
->mpf
, &pg
, 0, &h
)) != 0) {
138 (void)__BT_LPUT(dbc
, lock
);
146 * Do a binary search on the current page. If we're searching
147 * a leaf page, we have to manipulate the indices in groups of
148 * two. If we're searching an internal page, they're an index
149 * per page item. If we find an exact match on a leaf page,
152 jump
= TYPE(h
) == P_LBTREE
? P_INDX
: O_INDX
;
154 lim
= NUM_ENT(h
) / (db_indx_t
)jump
; lim
!= 0; lim
>>= 1) {
155 indx
= base
+ ((lim
>> 1) * jump
);
157 __bam_cmp(dbp
, key
, h
, indx
, t
->bt_compare
)) == 0) {
158 if (TYPE(h
) == P_LBTREE
)
169 * No match found. Base is the smallest index greater than
170 * key and may be zero or a last + O_INDX index.
172 * If it's a leaf page, return base as the "found" value.
173 * Delete only deletes exact matches.
175 if (TYPE(h
) == P_LBTREE
) {
178 if (LF_ISSET(S_EXACT
))
183 * Possibly returning a deleted record -- DB_SET_RANGE,
184 * DB_KEYFIRST and DB_KEYLAST don't require an exact
185 * match, and we don't want to walk multiple pages here
186 * to find an undeleted record. This is handled in the
187 * __bam_c_search() routine.
189 BT_STK_ENTER(cp
, h
, base
, lock
, ret
);
194 * If it's not a leaf page, record the internal page (which is
195 * a parent page for the key). Decrement the base by 1 if it's
196 * non-zero so that if a split later occurs, the inserted page
197 * will be to the right of the saved page.
199 indx
= base
> 0 ? base
- O_INDX
: base
;
202 * If we're trying to calculate the record number, sum up
203 * all the record numbers on this page up to the indx point.
206 for (i
= 0; i
< indx
; ++i
)
207 recno
+= GET_BINTERNAL(h
, i
)->nrecs
;
209 next
: pg
= GET_BINTERNAL(h
, indx
)->pgno
;
211 /* Return if this is the lowest page wanted. */
212 if (LF_ISSET(S_PARENT
) && stop
== h
->level
) {
213 BT_STK_ENTER(cp
, h
, indx
, lock
, ret
);
216 BT_STK_PUSH(cp
, h
, indx
, lock
, ret
);
221 __bam_lget(dbc
, 0, pg
, DB_LOCK_WRITE
, &lock
)) != 0)
225 * Decide if we want to return a reference to the next
226 * page in the return stack. If so, lock it and never
229 if ((LF_ISSET(S_PARENT
) &&
230 (u_int8_t
)(stop
+ 1) >= (u_int8_t
)(h
->level
- 1)) ||
231 (h
->level
- 1) == LEAFLEVEL
)
234 (void)memp_fput(dbp
->mpf
, h
, 0);
237 __bam_lget(dbc
, 1, pg
, stack
&& LF_ISSET(S_WRITE
) ?
238 DB_LOCK_WRITE
: DB_LOCK_READ
, &lock
)) != 0)
241 if ((ret
= memp_fget(dbp
->mpf
, &pg
, 0, &h
)) != 0)
249 * If we're trying to calculate the record number, add in the
250 * offset on this page and correct for the fact that records
251 * in the tree are 0-based.
254 *recnop
= recno
+ (indx
/ P_INDX
) + 1;
257 * If we got here, we know that we have a btree leaf page.
259 * If there are duplicates, go to the first/last one. This is
260 * safe because we know that we're not going to leave the page,
261 * all duplicate sets that are not on overflow pages exist on a
264 if (LF_ISSET(S_DUPLAST
))
265 while (indx
< (db_indx_t
)(NUM_ENT(h
) - P_INDX
) &&
266 h
->inp
[indx
] == h
->inp
[indx
+ P_INDX
])
270 h
->inp
[indx
] == h
->inp
[indx
- P_INDX
])
274 * Now check if we are allowed to return deleted items; if not
275 * find the next (or previous) non-deleted item.
277 if (LF_ISSET(S_DELNO
)) {
278 if (LF_ISSET(S_DUPLAST
))
279 while (B_DISSET(GET_BKEYDATA(h
, indx
+ O_INDX
)->type
) &&
281 h
->inp
[indx
] == h
->inp
[indx
- P_INDX
])
284 while (B_DISSET(GET_BKEYDATA(h
, indx
+ O_INDX
)->type
) &&
285 indx
< (db_indx_t
)(NUM_ENT(h
) - P_INDX
) &&
286 h
->inp
[indx
] == h
->inp
[indx
+ P_INDX
])
289 if (B_DISSET(GET_BKEYDATA(h
, indx
+ O_INDX
)->type
))
293 BT_STK_ENTER(cp
, h
, indx
, lock
, ret
);
297 (void)memp_fput(dbp
->mpf
, h
, 0);
298 (void)__BT_LPUT(dbc
, lock
);
301 err
: if (cp
->csp
> cp
->sp
) {
303 __bam_stkrel(dbc
, 0);
310 * Release all pages currently held in the stack.
312 * PUBLIC: int __bam_stkrel __P((DBC *, int));
315 __bam_stkrel(dbc
, nolocks
)
326 /* Release inner pages first. */
327 for (epg
= cp
->sp
; epg
<= cp
->csp
; ++epg
) {
328 if (epg
->page
!= NULL
)
329 (void)memp_fput(dbp
->mpf
, epg
->page
, 0);
330 if (epg
->lock
!= LOCK_INVALID
)
332 (void)__BT_LPUT(dbc
, epg
->lock
);
334 (void)__BT_TLPUT(dbc
, epg
->lock
);
337 /* Clear the stack, all pages have been released. */
347 * PUBLIC: int __bam_stkgrow __P((CURSOR *));
357 entries
= cp
->esp
- cp
->sp
;
359 if ((ret
= __os_calloc(entries
* 2, sizeof(EPG
), &p
)) != 0)
361 memcpy(p
, cp
->sp
, entries
* sizeof(EPG
));
362 if (cp
->sp
!= cp
->stack
)
363 __os_free(cp
->sp
, entries
* sizeof(EPG
));
365 cp
->csp
= p
+ entries
;
366 cp
->esp
= p
+ entries
* 2;