8322 nl: misleading-indentation
[unleashed/tickless.git] / usr / src / cmd / sendmail / db / hash / hash_page.c
blobf7a9c91f2f9a4f9419b454e1369178587c7a62ae
1 /*-
2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 1996, 1997, 1998
5 * Sleepycat Software. All rights reserved.
6 */
7 /*
8 * Copyright (c) 1990, 1993, 1994
9 * Margo Seltzer. All rights reserved.
12 * Copyright (c) 1990, 1993, 1994
13 * The Regents of the University of California. All rights reserved.
15 * This code is derived from software contributed to Berkeley by
16 * Margo Seltzer.
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
20 * are met:
21 * 1. Redistributions of source code must retain the above copyright
22 * notice, this list of conditions and the following disclaimer.
23 * 2. Redistributions in binary form must reproduce the above copyright
24 * notice, this list of conditions and the following disclaimer in the
25 * documentation and/or other materials provided with the distribution.
26 * 3. All advertising materials mentioning features or use of this software
27 * must display the following acknowledgement:
28 * This product includes software developed by the University of
29 * California, Berkeley and its contributors.
30 * 4. Neither the name of the University nor the names of its contributors
31 * may be used to endorse or promote products derived from this software
32 * without specific prior written permission.
34 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
35 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
36 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
37 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
38 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
39 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
40 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
41 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
42 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
43 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
44 * SUCH DAMAGE.
47 #include "config.h"
49 #ifndef lint
50 static const char sccsid[] = "@(#)hash_page.c 10.55 (Sleepycat) 1/3/99";
51 #endif /* not lint */
54 * PACKAGE: hashing
56 * DESCRIPTION:
57 * Page manipulation for hashing package.
59 * ROUTINES:
61 * External
62 * __get_page
63 * __add_ovflpage
64 * __overflow_page
65 * Internal
66 * open_temp
69 #ifndef NO_SYSTEM_INCLUDES
70 #include <sys/types.h>
72 #include <errno.h>
73 #include <string.h>
74 #endif
76 #include "db_int.h"
77 #include "db_page.h"
78 #include "hash.h"
80 static int __ham_lock_bucket __P((DBC *, db_lockmode_t));
82 #ifdef DEBUG_SLOW
83 static void __account_page(DB *, db_pgno_t, int);
84 #endif
87 * PUBLIC: int __ham_item __P((DBC *, db_lockmode_t));
89 int
90 __ham_item(dbc, mode)
91 DBC *dbc;
92 db_lockmode_t mode;
94 DB *dbp;
95 HASH_CURSOR *hcp;
96 db_pgno_t next_pgno;
97 int ret;
99 dbp = dbc->dbp;
100 hcp = (HASH_CURSOR *)dbc->internal;
102 if (F_ISSET(hcp, H_DELETED))
103 return (EINVAL);
104 F_CLR(hcp, H_OK | H_NOMORE);
106 /* Check if we need to get a page for this cursor. */
107 if ((ret = __ham_get_cpage(dbc, mode)) != 0)
108 return (ret);
110 /* Check if we are looking for space in which to insert an item. */
111 if (hcp->seek_size && hcp->seek_found_page == PGNO_INVALID
112 && hcp->seek_size < P_FREESPACE(hcp->pagep))
113 hcp->seek_found_page = hcp->pgno;
115 /* Check if we need to go on to the next page. */
116 if (F_ISSET(hcp, H_ISDUP) && hcp->dpgno == PGNO_INVALID)
118 * ISDUP is set, and offset is at the beginning of the datum.
119 * We need to grab the length of the datum, then set the datum
120 * pointer to be the beginning of the datum.
122 memcpy(&hcp->dup_len,
123 HKEYDATA_DATA(H_PAIRDATA(hcp->pagep, hcp->bndx)) +
124 hcp->dup_off, sizeof(db_indx_t));
125 else if (F_ISSET(hcp, H_ISDUP)) {
126 /* Make sure we're not about to run off the page. */
127 if (hcp->dpagep == NULL && (ret = __ham_get_page(dbp,
128 hcp->dpgno, &hcp->dpagep)) != 0)
129 return (ret);
131 if (hcp->dndx >= NUM_ENT(hcp->dpagep)) {
132 if (NEXT_PGNO(hcp->dpagep) == PGNO_INVALID) {
133 if (F_ISSET(hcp, H_DUPONLY)) {
134 F_CLR(hcp, H_OK);
135 F_SET(hcp, H_NOMORE);
136 return (0);
138 if ((ret = __ham_put_page(dbp,
139 hcp->dpagep, 0)) != 0)
140 return (ret);
141 F_CLR(hcp, H_ISDUP);
142 hcp->dpagep = NULL;
143 hcp->dpgno = PGNO_INVALID;
144 hcp->dndx = NDX_INVALID;
145 hcp->bndx++;
146 } else if ((ret = __ham_next_cpage(dbc,
147 NEXT_PGNO(hcp->dpagep), 0, H_ISDUP)) != 0)
148 return (ret);
152 if (hcp->bndx >= (db_indx_t)H_NUMPAIRS(hcp->pagep)) {
153 /* Fetch next page. */
154 if (NEXT_PGNO(hcp->pagep) == PGNO_INVALID) {
155 F_SET(hcp, H_NOMORE);
156 if (hcp->dpagep != NULL &&
157 (ret = __ham_put_page(dbp, hcp->dpagep, 0)) != 0)
158 return (ret);
159 hcp->dpgno = PGNO_INVALID;
160 return (DB_NOTFOUND);
162 next_pgno = NEXT_PGNO(hcp->pagep);
163 hcp->bndx = 0;
164 if ((ret = __ham_next_cpage(dbc, next_pgno, 0, 0)) != 0)
165 return (ret);
168 F_SET(hcp, H_OK);
169 return (0);
173 * PUBLIC: int __ham_item_reset __P((DBC *));
176 __ham_item_reset(dbc)
177 DBC *dbc;
179 HASH_CURSOR *hcp;
180 DB *dbp;
181 int ret;
183 ret = 0;
184 dbp = dbc->dbp;
185 hcp = (HASH_CURSOR *)dbc->internal;
186 if (hcp->pagep != NULL)
187 ret = __ham_put_page(dbp, hcp->pagep, 0);
188 if (ret == 0 && hcp->dpagep != NULL)
189 ret = __ham_put_page(dbp, hcp->dpagep, 0);
191 __ham_item_init(hcp);
192 return (ret);
196 * PUBLIC: void __ham_item_init __P((HASH_CURSOR *));
198 void
199 __ham_item_init(hcp)
200 HASH_CURSOR *hcp;
203 * If this cursor still holds any locks, we must
204 * release them if we are not running with transactions.
206 if (hcp->lock && hcp->dbc->txn == NULL)
207 (void)lock_put(hcp->dbc->dbp->dbenv->lk_info, hcp->lock);
210 * The following fields must *not* be initialized here
211 * because they may have meaning across inits.
212 * hlock, hdr, split_buf, stats
214 hcp->bucket = BUCKET_INVALID;
215 hcp->lbucket = BUCKET_INVALID;
216 hcp->lock = 0;
217 hcp->pagep = NULL;
218 hcp->pgno = PGNO_INVALID;
219 hcp->bndx = NDX_INVALID;
220 hcp->dpagep = NULL;
221 hcp->dpgno = PGNO_INVALID;
222 hcp->dndx = NDX_INVALID;
223 hcp->dup_off = 0;
224 hcp->dup_len = 0;
225 hcp->dup_tlen = 0;
226 hcp->seek_size = 0;
227 hcp->seek_found_page = PGNO_INVALID;
228 hcp->flags = 0;
232 * PUBLIC: int __ham_item_done __P((DBC *, int));
235 __ham_item_done(dbc, dirty)
236 DBC *dbc;
237 int dirty;
239 DB *dbp;
240 HASH_CURSOR *hcp;
241 int ret, t_ret;
243 dbp = dbc->dbp;
244 hcp = (HASH_CURSOR *)dbc->internal;
245 t_ret = ret = 0;
247 if (hcp->pagep)
248 ret = __ham_put_page(dbp, hcp->pagep,
249 dirty && hcp->dpagep == NULL);
250 hcp->pagep = NULL;
252 if (hcp->dpagep)
253 t_ret = __ham_put_page(dbp, hcp->dpagep, dirty);
254 hcp->dpagep = NULL;
256 if (ret == 0 && t_ret != 0)
257 ret = t_ret;
260 * We don't throw out the page number since we might want to
261 * continue getting on this page.
263 return (ret != 0 ? ret : t_ret);
267 * Returns the last item in a bucket.
269 * PUBLIC: int __ham_item_last __P((DBC *, db_lockmode_t));
272 __ham_item_last(dbc, mode)
273 DBC *dbc;
274 db_lockmode_t mode;
276 HASH_CURSOR *hcp;
277 int ret;
279 hcp = (HASH_CURSOR *)dbc->internal;
280 if ((ret = __ham_item_reset(dbc)) != 0)
281 return (ret);
283 hcp->bucket = hcp->hdr->max_bucket;
284 F_SET(hcp, H_OK);
285 return (__ham_item_prev(dbc, mode));
289 * PUBLIC: int __ham_item_first __P((DBC *, db_lockmode_t));
292 __ham_item_first(dbc, mode)
293 DBC *dbc;
294 db_lockmode_t mode;
296 HASH_CURSOR *hcp;
297 int ret;
299 hcp = (HASH_CURSOR *)dbc->internal;
300 if ((ret = __ham_item_reset(dbc)) != 0)
301 return (ret);
302 F_SET(hcp, H_OK);
303 hcp->bucket = 0;
304 return (__ham_item_next(dbc, mode));
308 * __ham_item_prev --
309 * Returns a pointer to key/data pair on a page. In the case of
310 * bigkeys, just returns the page number and index of the bigkey
311 * pointer pair.
313 * PUBLIC: int __ham_item_prev __P((DBC *, db_lockmode_t));
316 __ham_item_prev(dbc, mode)
317 DBC *dbc;
318 db_lockmode_t mode;
320 DB *dbp;
321 HASH_CURSOR *hcp;
322 db_pgno_t next_pgno;
323 int ret;
325 dbp = dbc->dbp;
326 hcp = (HASH_CURSOR *)dbc->internal;
328 * There are N cases for backing up in a hash file.
329 * Case 1: In the middle of a page, no duplicates, just dec the index.
330 * Case 2: In the middle of a duplicate set, back up one.
331 * Case 3: At the beginning of a duplicate set, get out of set and
332 * back up to next key.
333 * Case 4: At the beginning of a page; go to previous page.
334 * Case 5: At the beginning of a bucket; go to prev bucket.
336 F_CLR(hcp, H_OK | H_NOMORE | H_DELETED);
339 * First handle the duplicates. Either you'll get the key here
340 * or you'll exit the duplicate set and drop into the code below
341 * to handle backing up through keys.
343 if (F_ISSET(hcp, H_ISDUP)) {
344 if (hcp->dpgno == PGNO_INVALID) {
345 /* Duplicates are on-page. */
346 if (hcp->dup_off != 0)
347 if ((ret = __ham_get_cpage(dbc, mode)) != 0)
348 return (ret);
349 else {
350 HASH_CURSOR *h;
351 h = hcp;
352 memcpy(&h->dup_len, HKEYDATA_DATA(
353 H_PAIRDATA(h->pagep, h->bndx))
354 + h->dup_off - sizeof(db_indx_t),
355 sizeof(db_indx_t));
356 hcp->dup_off -=
357 DUP_SIZE(hcp->dup_len);
358 hcp->dndx--;
359 return (__ham_item(dbc, mode));
361 } else if (hcp->dndx > 0) { /* Duplicates are off-page. */
362 hcp->dndx--;
363 return (__ham_item(dbc, mode));
364 } else if ((ret = __ham_get_cpage(dbc, mode)) != 0)
365 return (ret);
366 else if (PREV_PGNO(hcp->dpagep) == PGNO_INVALID) {
367 if (F_ISSET(hcp, H_DUPONLY)) {
368 F_CLR(hcp, H_OK);
369 F_SET(hcp, H_NOMORE);
370 return (0);
371 } else {
372 F_CLR(hcp, H_ISDUP); /* End of dups */
373 hcp->dpgno = PGNO_INVALID;
374 if (hcp->dpagep != NULL)
375 (void)__ham_put_page(dbp,
376 hcp->dpagep, 0);
377 hcp->dpagep = NULL;
379 } else if ((ret = __ham_next_cpage(dbc,
380 PREV_PGNO(hcp->dpagep), 0, H_ISDUP)) != 0)
381 return (ret);
382 else {
383 hcp->dndx = NUM_ENT(hcp->pagep) - 1;
384 return (__ham_item(dbc, mode));
389 * If we get here, we are not in a duplicate set, and just need
390 * to back up the cursor. There are still three cases:
391 * midpage, beginning of page, beginning of bucket.
394 if (F_ISSET(hcp, H_DUPONLY)) {
395 F_CLR(hcp, H_OK);
396 F_SET(hcp, H_NOMORE);
397 return (0);
400 if (hcp->bndx == 0) { /* Beginning of page. */
401 if ((ret = __ham_get_cpage(dbc, mode)) != 0)
402 return (ret);
403 hcp->pgno = PREV_PGNO(hcp->pagep);
404 if (hcp->pgno == PGNO_INVALID) {
405 /* Beginning of bucket. */
406 F_SET(hcp, H_NOMORE);
407 return (DB_NOTFOUND);
408 } else if ((ret =
409 __ham_next_cpage(dbc, hcp->pgno, 0, 0)) != 0)
410 return (ret);
411 else
412 hcp->bndx = H_NUMPAIRS(hcp->pagep);
416 * Either we've got the cursor set up to be decremented, or we
417 * have to find the end of a bucket.
419 if (hcp->bndx == NDX_INVALID) {
420 if (hcp->pagep == NULL)
421 next_pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
422 else
423 goto got_page;
425 do {
426 if ((ret = __ham_next_cpage(dbc, next_pgno, 0, 0)) != 0)
427 return (ret);
428 got_page: next_pgno = NEXT_PGNO(hcp->pagep);
429 hcp->bndx = H_NUMPAIRS(hcp->pagep);
430 } while (next_pgno != PGNO_INVALID);
432 if (hcp->bndx == 0) {
433 /* Bucket was empty. */
434 F_SET(hcp, H_NOMORE);
435 return (DB_NOTFOUND);
439 hcp->bndx--;
441 return (__ham_item(dbc, mode));
445 * Sets the cursor to the next key/data pair on a page.
447 * PUBLIC: int __ham_item_next __P((DBC *, db_lockmode_t));
450 __ham_item_next(dbc, mode)
451 DBC *dbc;
452 db_lockmode_t mode;
454 HASH_CURSOR *hcp;
456 hcp = (HASH_CURSOR *)dbc->internal;
458 * Deleted on-page duplicates are a weird case. If we delete the last
459 * one, then our cursor is at the very end of a duplicate set and
460 * we actually need to go on to the next key.
462 if (F_ISSET(hcp, H_DELETED)) {
463 if (hcp->bndx != NDX_INVALID &&
464 F_ISSET(hcp, H_ISDUP) &&
465 hcp->dpgno == PGNO_INVALID &&
466 hcp->dup_tlen == hcp->dup_off) {
467 if (F_ISSET(hcp, H_DUPONLY)) {
468 F_CLR(hcp, H_OK);
469 F_SET(hcp, H_NOMORE);
470 return (0);
471 } else {
472 F_CLR(hcp, H_ISDUP);
473 hcp->dpgno = PGNO_INVALID;
474 hcp->bndx++;
476 } else if (!F_ISSET(hcp, H_ISDUP) &&
477 F_ISSET(hcp, H_DUPONLY)) {
478 F_CLR(hcp, H_OK);
479 F_SET(hcp, H_NOMORE);
480 return (0);
482 F_CLR(hcp, H_DELETED);
483 } else if (hcp->bndx == NDX_INVALID) {
484 hcp->bndx = 0;
485 hcp->dpgno = PGNO_INVALID;
486 F_CLR(hcp, H_ISDUP);
487 } else if (F_ISSET(hcp, H_ISDUP) && hcp->dpgno != PGNO_INVALID)
488 hcp->dndx++;
489 else if (F_ISSET(hcp, H_ISDUP)) {
490 if (hcp->dup_off + DUP_SIZE(hcp->dup_len) >=
491 hcp->dup_tlen && F_ISSET(hcp, H_DUPONLY)) {
492 F_CLR(hcp, H_OK);
493 F_SET(hcp, H_NOMORE);
494 return (0);
496 hcp->dndx++;
497 hcp->dup_off += DUP_SIZE(hcp->dup_len);
498 if (hcp->dup_off >= hcp->dup_tlen) {
499 F_CLR(hcp, H_ISDUP);
500 hcp->dpgno = PGNO_INVALID;
501 hcp->bndx++;
503 } else if (F_ISSET(hcp, H_DUPONLY)) {
504 F_CLR(hcp, H_OK);
505 F_SET(hcp, H_NOMORE);
506 return (0);
507 } else
508 hcp->bndx++;
510 return (__ham_item(dbc, mode));
514 * PUBLIC: void __ham_putitem __P((PAGE *p, const DBT *, int));
516 * This is a little bit sleazy in that we're overloading the meaning
517 * of the H_OFFPAGE type here. When we recover deletes, we have the
518 * entire entry instead of having only the DBT, so we'll pass type
519 * H_OFFPAGE to mean, "copy the whole entry" as opposed to constructing
520 * an H_KEYDATA around it.
522 void
523 __ham_putitem(p, dbt, type)
524 PAGE *p;
525 const DBT *dbt;
526 int type;
528 u_int16_t n, off;
530 n = NUM_ENT(p);
532 /* Put the item element on the page. */
533 if (type == H_OFFPAGE) {
534 off = HOFFSET(p) - dbt->size;
535 HOFFSET(p) = p->inp[n] = off;
536 memcpy(P_ENTRY(p, n), dbt->data, dbt->size);
537 } else {
538 off = HOFFSET(p) - HKEYDATA_SIZE(dbt->size);
539 HOFFSET(p) = p->inp[n] = off;
540 PUT_HKEYDATA(P_ENTRY(p, n), dbt->data, dbt->size, type);
543 /* Adjust page info. */
544 NUM_ENT(p) += 1;
548 * PUBLIC: void __ham_reputpair
549 * PUBLIC: __P((PAGE *p, u_int32_t, u_int32_t, const DBT *, const DBT *));
551 * This is a special case to restore a key/data pair to its original
552 * location during recovery. We are guaranteed that the pair fits
553 * on the page and is not the last pair on the page (because if it's
554 * the last pair, the normal insert works).
556 void
557 __ham_reputpair(p, psize, ndx, key, data)
558 PAGE *p;
559 u_int32_t psize, ndx;
560 const DBT *key, *data;
562 db_indx_t i, movebytes, newbytes;
563 u_int8_t *from;
565 /* First shuffle the existing items up on the page. */
566 movebytes =
567 (ndx == 0 ? psize : p->inp[H_DATAINDEX(ndx - 1)]) - HOFFSET(p);
568 newbytes = key->size + data->size;
569 from = (u_int8_t *)p + HOFFSET(p);
570 memmove(from - newbytes, from, movebytes);
573 * Adjust the indices and move them up 2 spaces. Note that we
574 * have to check the exit condition inside the loop just in case
575 * we are dealing with index 0 (db_indx_t's are unsigned).
577 for (i = NUM_ENT(p) - 1; ; i-- ) {
578 p->inp[i + 2] = p->inp[i] - newbytes;
579 if (i == H_KEYINDEX(ndx))
580 break;
583 /* Put the key and data on the page. */
584 p->inp[H_KEYINDEX(ndx)] =
585 (ndx == 0 ? psize : p->inp[H_DATAINDEX(ndx - 1)]) - key->size;
586 p->inp[H_DATAINDEX(ndx)] = p->inp[H_KEYINDEX(ndx)] - data->size;
587 memcpy(P_ENTRY(p, H_KEYINDEX(ndx)), key->data, key->size);
588 memcpy(P_ENTRY(p, H_DATAINDEX(ndx)), data->data, data->size);
590 /* Adjust page info. */
591 HOFFSET(p) -= newbytes;
592 NUM_ENT(p) += 2;
597 * PUBLIC: int __ham_del_pair __P((DBC *, int));
600 __ham_del_pair(dbc, reclaim_page)
601 DBC *dbc;
602 int reclaim_page;
604 DB *dbp;
605 HASH_CURSOR *hcp;
606 DBT data_dbt, key_dbt;
607 DB_ENV *dbenv;
608 DB_LSN new_lsn, *n_lsn, tmp_lsn;
609 PAGE *p;
610 db_indx_t ndx;
611 db_pgno_t chg_pgno, pgno;
612 int ret, tret;
614 dbp = dbc->dbp;
615 hcp = (HASH_CURSOR *)dbc->internal;
617 dbenv = dbp->dbenv;
618 ndx = hcp->bndx;
619 if (hcp->pagep == NULL &&
620 (ret = __ham_get_page(dbp, hcp->pgno, &hcp->pagep)) != 0)
621 return (ret);
623 p = hcp->pagep;
626 * We optimize for the normal case which is when neither the key nor
627 * the data are large. In this case, we write a single log record
628 * and do the delete. If either is large, we'll call __big_delete
629 * to remove the big item and then update the page to remove the
630 * entry referring to the big item.
632 ret = 0;
633 if (HPAGE_PTYPE(H_PAIRKEY(p, ndx)) == H_OFFPAGE) {
634 memcpy(&pgno, HOFFPAGE_PGNO(P_ENTRY(p, H_KEYINDEX(ndx))),
635 sizeof(db_pgno_t));
636 ret = __db_doff(dbc, pgno, __ham_del_page);
639 if (ret == 0)
640 switch (HPAGE_PTYPE(H_PAIRDATA(p, ndx))) {
641 case H_OFFPAGE:
642 memcpy(&pgno,
643 HOFFPAGE_PGNO(P_ENTRY(p, H_DATAINDEX(ndx))),
644 sizeof(db_pgno_t));
645 ret = __db_doff(dbc, pgno, __ham_del_page);
646 break;
647 case H_OFFDUP:
648 memcpy(&pgno,
649 HOFFDUP_PGNO(P_ENTRY(p, H_DATAINDEX(ndx))),
650 sizeof(db_pgno_t));
651 ret = __db_ddup(dbc, pgno, __ham_del_page);
652 F_CLR(hcp, H_ISDUP);
653 break;
654 case H_DUPLICATE:
656 * If we delete a pair that is/was a duplicate, then
657 * we had better clear the flag so that we update the
658 * cursor appropriately.
660 F_CLR(hcp, H_ISDUP);
661 break;
664 if (ret)
665 return (ret);
667 /* Now log the delete off this page. */
668 if (DB_LOGGING(dbc)) {
669 key_dbt.data = P_ENTRY(p, H_KEYINDEX(ndx));
670 key_dbt.size =
671 LEN_HITEM(p, hcp->hdr->pagesize, H_KEYINDEX(ndx));
672 data_dbt.data = P_ENTRY(p, H_DATAINDEX(ndx));
673 data_dbt.size =
674 LEN_HITEM(p, hcp->hdr->pagesize, H_DATAINDEX(ndx));
676 if ((ret = __ham_insdel_log(dbenv->lg_info,
677 dbc->txn, &new_lsn, 0, DELPAIR,
678 dbp->log_fileid, PGNO(p), (u_int32_t)ndx,
679 &LSN(p), &key_dbt, &data_dbt)) != 0)
680 return (ret);
682 /* Move lsn onto page. */
683 LSN(p) = new_lsn;
686 __ham_dpair(dbp, p, ndx);
689 * If we are locking, we will not maintain this, because it is
690 * a hot spot.
691 * XXX perhaps we can retain incremental numbers and apply them
692 * later.
694 if (!F_ISSET(dbp, DB_AM_LOCKING))
695 --hcp->hdr->nelem;
698 * If we need to reclaim the page, then check if the page is empty.
699 * There are two cases. If it's empty and it's not the first page
700 * in the bucket (i.e., the bucket page) then we can simply remove
701 * it. If it is the first chain in the bucket, then we need to copy
702 * the second page into it and remove the second page.
704 if (reclaim_page && NUM_ENT(p) == 0 && PREV_PGNO(p) == PGNO_INVALID &&
705 NEXT_PGNO(p) != PGNO_INVALID) {
706 PAGE *n_pagep, *nn_pagep;
707 db_pgno_t tmp_pgno;
710 * First page in chain is empty and we know that there
711 * are more pages in the chain.
713 if ((ret =
714 __ham_get_page(dbp, NEXT_PGNO(p), &n_pagep)) != 0)
715 return (ret);
717 if (NEXT_PGNO(n_pagep) != PGNO_INVALID) {
718 if ((ret =
719 __ham_get_page(dbp, NEXT_PGNO(n_pagep),
720 &nn_pagep)) != 0) {
721 (void) __ham_put_page(dbp, n_pagep, 0);
722 return (ret);
726 if (DB_LOGGING(dbc)) {
727 key_dbt.data = n_pagep;
728 key_dbt.size = hcp->hdr->pagesize;
729 if ((ret = __ham_copypage_log(dbenv->lg_info,
730 dbc->txn, &new_lsn, 0, dbp->log_fileid, PGNO(p),
731 &LSN(p), PGNO(n_pagep), &LSN(n_pagep),
732 NEXT_PGNO(n_pagep),
733 NEXT_PGNO(n_pagep) == PGNO_INVALID ? NULL :
734 &LSN(nn_pagep), &key_dbt)) != 0)
735 return (ret);
737 /* Move lsn onto page. */
738 LSN(p) = new_lsn; /* Structure assignment. */
739 LSN(n_pagep) = new_lsn;
740 if (NEXT_PGNO(n_pagep) != PGNO_INVALID)
741 LSN(nn_pagep) = new_lsn;
743 if (NEXT_PGNO(n_pagep) != PGNO_INVALID) {
744 PREV_PGNO(nn_pagep) = PGNO(p);
745 (void)__ham_put_page(dbp, nn_pagep, 1);
748 tmp_pgno = PGNO(p);
749 tmp_lsn = LSN(p);
750 memcpy(p, n_pagep, hcp->hdr->pagesize);
751 PGNO(p) = tmp_pgno;
752 LSN(p) = tmp_lsn;
753 PREV_PGNO(p) = PGNO_INVALID;
756 * Cursor is advanced to the beginning of the next page.
758 hcp->bndx = 0;
759 hcp->pgno = PGNO(p);
760 F_SET(hcp, H_DELETED);
761 chg_pgno = PGNO(p);
762 if ((ret = __ham_dirty_page(dbp, p)) != 0 ||
763 (ret = __ham_del_page(dbc, n_pagep)) != 0)
764 return (ret);
765 } else if (reclaim_page &&
766 NUM_ENT(p) == 0 && PREV_PGNO(p) != PGNO_INVALID) {
767 PAGE *n_pagep, *p_pagep;
769 if ((ret =
770 __ham_get_page(dbp, PREV_PGNO(p), &p_pagep)) != 0)
771 return (ret);
773 if (NEXT_PGNO(p) != PGNO_INVALID) {
774 if ((ret = __ham_get_page(dbp,
775 NEXT_PGNO(p), &n_pagep)) != 0) {
776 (void)__ham_put_page(dbp, p_pagep, 0);
777 return (ret);
779 n_lsn = &LSN(n_pagep);
780 } else {
781 n_pagep = NULL;
782 n_lsn = NULL;
785 NEXT_PGNO(p_pagep) = NEXT_PGNO(p);
786 if (n_pagep != NULL)
787 PREV_PGNO(n_pagep) = PGNO(p_pagep);
789 if (DB_LOGGING(dbc)) {
790 if ((ret = __ham_newpage_log(dbenv->lg_info,
791 dbc->txn, &new_lsn, 0, DELOVFL,
792 dbp->log_fileid, PREV_PGNO(p), &LSN(p_pagep),
793 PGNO(p), &LSN(p), NEXT_PGNO(p), n_lsn)) != 0)
794 return (ret);
796 /* Move lsn onto page. */
797 LSN(p_pagep) = new_lsn; /* Structure assignment. */
798 if (n_pagep)
799 LSN(n_pagep) = new_lsn;
800 LSN(p) = new_lsn;
802 hcp->pgno = NEXT_PGNO(p);
803 hcp->bndx = 0;
805 * Since we are about to delete the cursor page and we have
806 * just moved the cursor, we need to make sure that the
807 * old page pointer isn't left hanging around in the cursor.
809 hcp->pagep = NULL;
810 chg_pgno = PGNO(p);
811 ret = __ham_del_page(dbc, p);
812 if ((tret = __ham_put_page(dbp, p_pagep, 1)) != 0 &&
813 ret == 0)
814 ret = tret;
815 if (n_pagep != NULL &&
816 (tret = __ham_put_page(dbp, n_pagep, 1)) != 0 &&
817 ret == 0)
818 ret = tret;
819 if (ret != 0)
820 return (ret);
821 } else {
823 * Mark item deleted so that we don't try to return it, and
824 * so that we update the cursor correctly on the next call
825 * to next.
827 F_SET(hcp, H_DELETED);
828 chg_pgno = hcp->pgno;
829 ret = __ham_dirty_page(dbp, p);
831 __ham_c_update(hcp, chg_pgno, 0, 0, 0);
834 * Since we just deleted a pair from the master page, anything
835 * in hcp->dpgno should be cleared.
837 hcp->dpgno = PGNO_INVALID;
839 F_CLR(hcp, H_OK);
840 return (ret);
844 * __ham_replpair --
845 * Given the key data indicated by the cursor, replace part/all of it
846 * according to the fields in the dbt.
848 * PUBLIC: int __ham_replpair __P((DBC *, DBT *, u_int32_t));
851 __ham_replpair(dbc, dbt, make_dup)
852 DBC *dbc;
853 DBT *dbt;
854 u_int32_t make_dup;
856 DB *dbp;
857 HASH_CURSOR *hcp;
858 DBT old_dbt, tdata, tmp;
859 DB_LSN new_lsn;
860 int32_t change; /* XXX: Possible overflow. */
861 u_int32_t len;
862 int is_big, ret, type;
863 u_int8_t *beg, *dest, *end, *hk, *src;
866 * Big item replacements are handled in generic code.
867 * Items that fit on the current page fall into 4 classes.
868 * 1. On-page element, same size
869 * 2. On-page element, new is bigger (fits)
870 * 3. On-page element, new is bigger (does not fit)
871 * 4. On-page element, old is bigger
872 * Numbers 1, 2, and 4 are essentially the same (and should
873 * be the common case). We handle case 3 as a delete and
874 * add.
876 dbp = dbc->dbp;
877 hcp = (HASH_CURSOR *)dbc->internal;
880 * We need to compute the number of bytes that we are adding or
881 * removing from the entry. Normally, we can simply substract
882 * the number of bytes we are replacing (dbt->dlen) from the
883 * number of bytes we are inserting (dbt->size). However, if
884 * we are doing a partial put off the end of a record, then this
885 * formula doesn't work, because we are essentially adding
886 * new bytes.
888 change = dbt->size - dbt->dlen;
890 hk = H_PAIRDATA(hcp->pagep, hcp->bndx);
891 is_big = HPAGE_PTYPE(hk) == H_OFFPAGE;
893 if (is_big)
894 memcpy(&len, HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
895 else
896 len = LEN_HKEYDATA(hcp->pagep,
897 dbp->pgsize, H_DATAINDEX(hcp->bndx));
899 if (dbt->doff + dbt->dlen > len)
900 change += dbt->doff + dbt->dlen - len;
903 if (change > (int32_t)P_FREESPACE(hcp->pagep) || is_big) {
905 * Case 3 -- two subcases.
906 * A. This is not really a partial operation, but an overwrite.
907 * Simple del and add works.
908 * B. This is a partial and we need to construct the data that
909 * we are really inserting (yuck).
910 * In both cases, we need to grab the key off the page (in
911 * some cases we could do this outside of this routine; for
912 * cleanliness we do it here. If you happen to be on a big
913 * key, this could be a performance hit).
915 tmp.flags = 0;
916 F_SET(&tmp, DB_DBT_MALLOC | DB_DBT_INTERNAL);
917 if ((ret =
918 __db_ret(dbp, hcp->pagep, H_KEYINDEX(hcp->bndx),
919 &tmp, &dbc->rkey.data, &dbc->rkey.size)) != 0)
920 return (ret);
922 if (dbt->doff == 0 && dbt->dlen == len) {
923 ret = __ham_del_pair(dbc, 0);
924 if (ret == 0)
925 ret = __ham_add_el(dbc, &tmp, dbt, H_KEYDATA);
926 } else { /* Case B */
927 type = HPAGE_PTYPE(hk) != H_OFFPAGE ?
928 HPAGE_PTYPE(hk) : H_KEYDATA;
929 tdata.flags = 0;
930 F_SET(&tdata, DB_DBT_MALLOC | DB_DBT_INTERNAL);
932 if ((ret = __db_ret(dbp, hcp->pagep,
933 H_DATAINDEX(hcp->bndx), &tdata, &dbc->rdata.data,
934 &dbc->rdata.size)) != 0)
935 goto err;
937 /* Now we can delete the item. */
938 if ((ret = __ham_del_pair(dbc, 0)) != 0) {
939 __os_free(tdata.data, tdata.size);
940 goto err;
943 /* Now shift old data around to make room for new. */
944 if (change > 0) {
945 if ((ret = __os_realloc(&tdata.data,
946 tdata.size + change)) != 0)
947 return (ret);
948 memset((u_int8_t *)tdata.data + tdata.size,
949 0, change);
951 end = (u_int8_t *)tdata.data + tdata.size;
953 src = (u_int8_t *)tdata.data + dbt->doff + dbt->dlen;
954 if (src < end && tdata.size > dbt->doff + dbt->dlen) {
955 len = tdata.size - dbt->doff - dbt->dlen;
956 dest = src + change;
957 memmove(dest, src, len);
959 memcpy((u_int8_t *)tdata.data + dbt->doff,
960 dbt->data, dbt->size);
961 tdata.size += change;
963 /* Now add the pair. */
964 ret = __ham_add_el(dbc, &tmp, &tdata, type);
965 __os_free(tdata.data, tdata.size);
967 err: __os_free(tmp.data, tmp.size);
968 return (ret);
972 * Set up pointer into existing data. Do it before the log
973 * message so we can use it inside of the log setup.
975 beg = HKEYDATA_DATA(H_PAIRDATA(hcp->pagep, hcp->bndx));
976 beg += dbt->doff;
979 * If we are going to have to move bytes at all, figure out
980 * all the parameters here. Then log the call before moving
981 * anything around.
983 if (DB_LOGGING(dbc)) {
984 old_dbt.data = beg;
985 old_dbt.size = dbt->dlen;
986 if ((ret = __ham_replace_log(dbp->dbenv->lg_info,
987 dbc->txn, &new_lsn, 0, dbp->log_fileid, PGNO(hcp->pagep),
988 (u_int32_t)H_DATAINDEX(hcp->bndx), &LSN(hcp->pagep),
989 (u_int32_t)dbt->doff, &old_dbt, dbt, make_dup)) != 0)
990 return (ret);
992 LSN(hcp->pagep) = new_lsn; /* Structure assignment. */
995 __ham_onpage_replace(hcp->pagep, dbp->pgsize,
996 (u_int32_t)H_DATAINDEX(hcp->bndx), (int32_t)dbt->doff, change, dbt);
998 return (0);
1002 * Replace data on a page with new data, possibly growing or shrinking what's
1003 * there. This is called on two different occasions. On one (from replpair)
1004 * we are interested in changing only the data. On the other (from recovery)
1005 * we are replacing the entire data (header and all) with a new element. In
1006 * the latter case, the off argument is negative.
1007 * pagep: the page that we're changing
1008 * ndx: page index of the element that is growing/shrinking.
1009 * off: Offset at which we are beginning the replacement.
1010 * change: the number of bytes (+ or -) that the element is growing/shrinking.
1011 * dbt: the new data that gets written at beg.
1012 * PUBLIC: void __ham_onpage_replace __P((PAGE *, size_t, u_int32_t, int32_t,
1013 * PUBLIC: int32_t, DBT *));
1015 void
1016 __ham_onpage_replace(pagep, pgsize, ndx, off, change, dbt)
1017 PAGE *pagep;
1018 size_t pgsize;
1019 u_int32_t ndx;
1020 int32_t off;
1021 int32_t change;
1022 DBT *dbt;
1024 db_indx_t i;
1025 int32_t len;
1026 u_int8_t *src, *dest;
1027 int zero_me;
1029 if (change != 0) {
1030 zero_me = 0;
1031 src = (u_int8_t *)(pagep) + HOFFSET(pagep);
1032 if (off < 0)
1033 len = pagep->inp[ndx] - HOFFSET(pagep);
1034 else if ((u_int32_t)off >= LEN_HKEYDATA(pagep, pgsize, ndx)) {
1035 len = HKEYDATA_DATA(P_ENTRY(pagep, ndx)) +
1036 LEN_HKEYDATA(pagep, pgsize, ndx) - src;
1037 zero_me = 1;
1038 } else
1039 len = (HKEYDATA_DATA(P_ENTRY(pagep, ndx)) + off) - src;
1040 dest = src - change;
1041 memmove(dest, src, len);
1042 if (zero_me)
1043 memset(dest + len, 0, change);
1045 /* Now update the indices. */
1046 for (i = ndx; i < NUM_ENT(pagep); i++)
1047 pagep->inp[i] -= change;
1048 HOFFSET(pagep) -= change;
1050 if (off >= 0)
1051 memcpy(HKEYDATA_DATA(P_ENTRY(pagep, ndx)) + off,
1052 dbt->data, dbt->size);
1053 else
1054 memcpy(P_ENTRY(pagep, ndx), dbt->data, dbt->size);
1058 * PUBLIC: int __ham_split_page __P((DBC *, u_int32_t, u_int32_t));
1061 __ham_split_page(dbc, obucket, nbucket)
1062 DBC *dbc;
1063 u_int32_t obucket, nbucket;
1065 DB *dbp;
1066 HASH_CURSOR *hcp;
1067 DBT key, page_dbt;
1068 DB_ENV *dbenv;
1069 DB_LSN new_lsn;
1070 PAGE **pp, *old_pagep, *temp_pagep, *new_pagep;
1071 db_indx_t n;
1072 db_pgno_t bucket_pgno, next_pgno;
1073 u_int32_t big_len, len;
1074 int ret, tret;
1075 void *big_buf;
1077 dbp = dbc->dbp;
1078 hcp = (HASH_CURSOR *)dbc->internal;
1079 dbenv = dbp->dbenv;
1080 temp_pagep = old_pagep = new_pagep = NULL;
1082 bucket_pgno = BUCKET_TO_PAGE(hcp, obucket);
1083 if ((ret = __ham_get_page(dbp, bucket_pgno, &old_pagep)) != 0)
1084 return (ret);
1085 if ((ret = __ham_new_page(dbp, BUCKET_TO_PAGE(hcp, nbucket), P_HASH,
1086 &new_pagep)) != 0)
1087 goto err;
1089 temp_pagep = hcp->split_buf;
1090 memcpy(temp_pagep, old_pagep, hcp->hdr->pagesize);
1092 if (DB_LOGGING(dbc)) {
1093 page_dbt.size = hcp->hdr->pagesize;
1094 page_dbt.data = old_pagep;
1095 if ((ret = __ham_splitdata_log(dbenv->lg_info,
1096 dbc->txn, &new_lsn, 0, dbp->log_fileid, SPLITOLD,
1097 PGNO(old_pagep), &page_dbt, &LSN(old_pagep))) != 0)
1098 goto err;
1101 P_INIT(old_pagep, hcp->hdr->pagesize, PGNO(old_pagep), PGNO_INVALID,
1102 PGNO_INVALID, 0, P_HASH);
1104 if (DB_LOGGING(dbc))
1105 LSN(old_pagep) = new_lsn; /* Structure assignment. */
1107 big_len = 0;
1108 big_buf = NULL;
1109 key.flags = 0;
1110 while (temp_pagep != NULL) {
1111 for (n = 0; n < (db_indx_t)H_NUMPAIRS(temp_pagep); n++) {
1112 if ((ret =
1113 __db_ret(dbp, temp_pagep, H_KEYINDEX(n),
1114 &key, &big_buf, &big_len)) != 0)
1115 goto err;
1117 if (__ham_call_hash(hcp, key.data, key.size)
1118 == obucket)
1119 pp = &old_pagep;
1120 else
1121 pp = &new_pagep;
1124 * Figure out how many bytes we need on the new
1125 * page to store the key/data pair.
1128 len = LEN_HITEM(temp_pagep, hcp->hdr->pagesize,
1129 H_DATAINDEX(n)) +
1130 LEN_HITEM(temp_pagep, hcp->hdr->pagesize,
1131 H_KEYINDEX(n)) +
1132 2 * sizeof(db_indx_t);
1134 if (P_FREESPACE(*pp) < len) {
1135 if (DB_LOGGING(dbc)) {
1136 page_dbt.size = hcp->hdr->pagesize;
1137 page_dbt.data = *pp;
1138 if ((ret = __ham_splitdata_log(
1139 dbenv->lg_info, dbc->txn,
1140 &new_lsn, 0, dbp->log_fileid,
1141 SPLITNEW, PGNO(*pp), &page_dbt,
1142 &LSN(*pp))) != 0)
1143 goto err;
1144 LSN(*pp) = new_lsn;
1146 if ((ret =
1147 __ham_add_ovflpage(dbc, *pp, 1, pp)) != 0)
1148 goto err;
1150 __ham_copy_item(dbp->pgsize,
1151 temp_pagep, H_KEYINDEX(n), *pp);
1152 __ham_copy_item(dbp->pgsize,
1153 temp_pagep, H_DATAINDEX(n), *pp);
1155 next_pgno = NEXT_PGNO(temp_pagep);
1157 /* Clear temp_page; if it's a link overflow page, free it. */
1158 if (PGNO(temp_pagep) != bucket_pgno && (ret =
1159 __ham_del_page(dbc, temp_pagep)) != 0)
1160 goto err;
1162 if (next_pgno == PGNO_INVALID)
1163 temp_pagep = NULL;
1164 else if ((ret =
1165 __ham_get_page(dbp, next_pgno, &temp_pagep)) != 0)
1166 goto err;
1168 if (temp_pagep != NULL && DB_LOGGING(dbc)) {
1169 page_dbt.size = hcp->hdr->pagesize;
1170 page_dbt.data = temp_pagep;
1171 if ((ret = __ham_splitdata_log(dbenv->lg_info,
1172 dbc->txn, &new_lsn, 0, dbp->log_fileid,
1173 SPLITOLD, PGNO(temp_pagep),
1174 &page_dbt, &LSN(temp_pagep))) != 0)
1175 goto err;
1176 LSN(temp_pagep) = new_lsn;
1179 if (big_buf != NULL)
1180 __os_free(big_buf, big_len);
1183 * If the original bucket spanned multiple pages, then we've got
1184 * a pointer to a page that used to be on the bucket chain. It
1185 * should be deleted.
1187 if (temp_pagep != NULL && PGNO(temp_pagep) != bucket_pgno &&
1188 (ret = __ham_del_page(dbc, temp_pagep)) != 0)
1189 goto err;
1192 * Write new buckets out.
1194 if (DB_LOGGING(dbc)) {
1195 page_dbt.size = hcp->hdr->pagesize;
1196 page_dbt.data = old_pagep;
1197 if ((ret = __ham_splitdata_log(dbenv->lg_info,
1198 dbc->txn, &new_lsn, 0, dbp->log_fileid,
1199 SPLITNEW, PGNO(old_pagep),
1200 &page_dbt, &LSN(old_pagep))) != 0)
1201 goto err;
1202 LSN(old_pagep) = new_lsn;
1204 page_dbt.data = new_pagep;
1205 if ((ret = __ham_splitdata_log(dbenv->lg_info,
1206 dbc->txn, &new_lsn, 0, dbp->log_fileid,
1207 SPLITNEW, PGNO(new_pagep), &page_dbt, &LSN(new_pagep))) != 0)
1208 goto err;
1209 LSN(new_pagep) = new_lsn;
1211 ret = __ham_put_page(dbp, old_pagep, 1);
1212 if ((tret = __ham_put_page(dbp, new_pagep, 1)) != 0 &&
1213 ret == 0)
1214 ret = tret;
1216 if (0) {
1217 err: if (old_pagep != NULL)
1218 (void)__ham_put_page(dbp, old_pagep, 1);
1219 if (new_pagep != NULL)
1220 (void)__ham_put_page(dbp, new_pagep, 1);
1221 if (temp_pagep != NULL && PGNO(temp_pagep) != bucket_pgno)
1222 (void)__ham_put_page(dbp, temp_pagep, 1);
1224 return (ret);
1228 * Add the given pair to the page. The page in question may already be
1229 * held (i.e. it was already gotten). If it is, then the page is passed
1230 * in via the pagep parameter. On return, pagep will contain the page
1231 * to which we just added something. This allows us to link overflow
1232 * pages and return the new page having correctly put the last page.
1234 * PUBLIC: int __ham_add_el __P((DBC *, const DBT *, const DBT *, int));
1237 __ham_add_el(dbc, key, val, type)
1238 DBC *dbc;
1239 const DBT *key, *val;
1240 int type;
1242 DB *dbp;
1243 HASH_CURSOR *hcp;
1244 const DBT *pkey, *pdata;
1245 DBT key_dbt, data_dbt;
1246 DB_LSN new_lsn;
1247 HOFFPAGE doff, koff;
1248 db_pgno_t next_pgno;
1249 u_int32_t data_size, key_size, pairsize, rectype;
1250 int do_expand, is_keybig, is_databig, ret;
1251 int key_type, data_type;
1253 dbp = dbc->dbp;
1254 hcp = (HASH_CURSOR *)dbc->internal;
1255 do_expand = 0;
1257 if (hcp->pagep == NULL && (ret = __ham_get_page(dbp,
1258 hcp->seek_found_page != PGNO_INVALID ? hcp->seek_found_page :
1259 hcp->pgno, &hcp->pagep)) != 0)
1260 return (ret);
1262 key_size = HKEYDATA_PSIZE(key->size);
1263 data_size = HKEYDATA_PSIZE(val->size);
1264 is_keybig = ISBIG(hcp, key->size);
1265 is_databig = ISBIG(hcp, val->size);
1266 if (is_keybig)
1267 key_size = HOFFPAGE_PSIZE;
1268 if (is_databig)
1269 data_size = HOFFPAGE_PSIZE;
1271 pairsize = key_size + data_size;
1273 /* Advance to first page in chain with room for item. */
1274 while (H_NUMPAIRS(hcp->pagep) && NEXT_PGNO(hcp->pagep) !=
1275 PGNO_INVALID) {
1277 * This may not be the end of the chain, but the pair may fit
1278 * anyway. Check if it's a bigpair that fits or a regular
1279 * pair that fits.
1281 if (P_FREESPACE(hcp->pagep) >= pairsize)
1282 break;
1283 next_pgno = NEXT_PGNO(hcp->pagep);
1284 if ((ret =
1285 __ham_next_cpage(dbc, next_pgno, 0, 0)) != 0)
1286 return (ret);
1290 * Check if we need to allocate a new page.
1292 if (P_FREESPACE(hcp->pagep) < pairsize) {
1293 do_expand = 1;
1294 if ((ret = __ham_add_ovflpage(dbc,
1295 hcp->pagep, 1, &hcp->pagep)) != 0)
1296 return (ret);
1297 hcp->pgno = PGNO(hcp->pagep);
1301 * Update cursor.
1303 hcp->bndx = H_NUMPAIRS(hcp->pagep);
1304 F_CLR(hcp, H_DELETED);
1305 if (is_keybig) {
1306 koff.type = H_OFFPAGE;
1307 UMRW(koff.unused[0]);
1308 UMRW(koff.unused[1]);
1309 UMRW(koff.unused[2]);
1310 if ((ret = __db_poff(dbc,
1311 key, &koff.pgno, __ham_overflow_page)) != 0)
1312 return (ret);
1313 koff.tlen = key->size;
1314 key_dbt.data = &koff;
1315 key_dbt.size = sizeof(koff);
1316 pkey = &key_dbt;
1317 key_type = H_OFFPAGE;
1318 } else {
1319 pkey = key;
1320 key_type = H_KEYDATA;
1323 if (is_databig) {
1324 doff.type = H_OFFPAGE;
1325 UMRW(doff.unused[0]);
1326 UMRW(doff.unused[1]);
1327 UMRW(doff.unused[2]);
1328 if ((ret = __db_poff(dbc,
1329 val, &doff.pgno, __ham_overflow_page)) != 0)
1330 return (ret);
1331 doff.tlen = val->size;
1332 data_dbt.data = &doff;
1333 data_dbt.size = sizeof(doff);
1334 pdata = &data_dbt;
1335 data_type = H_OFFPAGE;
1336 } else {
1337 pdata = val;
1338 data_type = type;
1341 if (DB_LOGGING(dbc)) {
1342 rectype = PUTPAIR;
1343 if (is_databig)
1344 rectype |= PAIR_DATAMASK;
1345 if (is_keybig)
1346 rectype |= PAIR_KEYMASK;
1348 if ((ret = __ham_insdel_log(dbp->dbenv->lg_info,
1349 dbc->txn, &new_lsn, 0, rectype,
1350 dbp->log_fileid, PGNO(hcp->pagep),
1351 (u_int32_t)H_NUMPAIRS(hcp->pagep),
1352 &LSN(hcp->pagep), pkey, pdata)) != 0)
1353 return (ret);
1355 /* Move lsn onto page. */
1356 LSN(hcp->pagep) = new_lsn; /* Structure assignment. */
1359 __ham_putitem(hcp->pagep, pkey, key_type);
1360 __ham_putitem(hcp->pagep, pdata, data_type);
1363 * For splits, we are going to update item_info's page number
1364 * field, so that we can easily return to the same page the
1365 * next time we come in here. For other operations, this shouldn't
1366 * matter, since odds are this is the last thing that happens before
1367 * we return to the user program.
1369 hcp->pgno = PGNO(hcp->pagep);
1372 * XXX Maybe keep incremental numbers here
1374 if (!F_ISSET(dbp, DB_AM_LOCKING))
1375 hcp->hdr->nelem++;
1377 if (do_expand || (hcp->hdr->ffactor != 0 &&
1378 (u_int32_t)H_NUMPAIRS(hcp->pagep) > hcp->hdr->ffactor))
1379 F_SET(hcp, H_EXPAND);
1380 return (0);
1385 * Special __putitem call used in splitting -- copies one entry to
1386 * another. Works for all types of hash entries (H_OFFPAGE, H_KEYDATA,
1387 * H_DUPLICATE, H_OFFDUP). Since we log splits at a high level, we
1388 * do not need to do any logging here.
1390 * PUBLIC: void __ham_copy_item __P((size_t, PAGE *, u_int32_t, PAGE *));
1392 void
1393 __ham_copy_item(pgsize, src_page, src_ndx, dest_page)
1394 size_t pgsize;
1395 PAGE *src_page;
1396 u_int32_t src_ndx;
1397 PAGE *dest_page;
1399 u_int32_t len;
1400 void *src, *dest;
1403 * Copy the key and data entries onto this new page.
1405 src = P_ENTRY(src_page, src_ndx);
1407 /* Set up space on dest. */
1408 len = LEN_HITEM(src_page, pgsize, src_ndx);
1409 HOFFSET(dest_page) -= len;
1410 dest_page->inp[NUM_ENT(dest_page)] = HOFFSET(dest_page);
1411 dest = P_ENTRY(dest_page, NUM_ENT(dest_page));
1412 NUM_ENT(dest_page)++;
1414 memcpy(dest, src, len);
1419 * Returns:
1420 * pointer on success
1421 * NULL on error
1423 * PUBLIC: int __ham_add_ovflpage __P((DBC *, PAGE *, int, PAGE **));
1426 __ham_add_ovflpage(dbc, pagep, release, pp)
1427 DBC *dbc;
1428 PAGE *pagep;
1429 int release;
1430 PAGE **pp;
1432 DB *dbp;
1433 HASH_CURSOR *hcp;
1434 DB_LSN new_lsn;
1435 PAGE *new_pagep;
1436 int ret;
1438 dbp = dbc->dbp;
1439 hcp = (HASH_CURSOR *)dbc->internal;
1441 if ((ret = __ham_overflow_page(dbc, P_HASH, &new_pagep)) != 0)
1442 return (ret);
1444 if (DB_LOGGING(dbc)) {
1445 if ((ret = __ham_newpage_log(dbp->dbenv->lg_info,
1446 dbc->txn, &new_lsn, 0, PUTOVFL,
1447 dbp->log_fileid, PGNO(pagep), &LSN(pagep),
1448 PGNO(new_pagep), &LSN(new_pagep), PGNO_INVALID, NULL)) != 0)
1449 return (ret);
1451 /* Move lsn onto page. */
1452 LSN(pagep) = LSN(new_pagep) = new_lsn;
1454 NEXT_PGNO(pagep) = PGNO(new_pagep);
1455 PREV_PGNO(new_pagep) = PGNO(pagep);
1457 if (release)
1458 ret = __ham_put_page(dbp, pagep, 1);
1460 hcp->stats.hash_overflows++;
1461 *pp = new_pagep;
1462 return (ret);
1467 * PUBLIC: int __ham_new_page __P((DB *, u_int32_t, u_int32_t, PAGE **));
1470 __ham_new_page(dbp, addr, type, pp)
1471 DB *dbp;
1472 u_int32_t addr, type;
1473 PAGE **pp;
1475 PAGE *pagep;
1476 int ret;
1478 if ((ret = memp_fget(dbp->mpf,
1479 &addr, DB_MPOOL_CREATE, &pagep)) != 0)
1480 return (ret);
1482 /* This should not be necessary because page-in should do it. */
1483 P_INIT(pagep, dbp->pgsize, addr, PGNO_INVALID, PGNO_INVALID, 0, type);
1485 *pp = pagep;
1486 return (0);
1490 * PUBLIC: int __ham_del_page __P((DBC *, PAGE *));
1493 __ham_del_page(dbc, pagep)
1494 DBC *dbc;
1495 PAGE *pagep;
1497 DB *dbp;
1498 HASH_CURSOR *hcp;
1499 DB_LSN new_lsn;
1500 int ret;
1502 dbp = dbc->dbp;
1503 hcp = (HASH_CURSOR *)dbc->internal;
1504 ret = 0;
1505 DIRTY_META(dbp, hcp, ret);
1506 if (ret != 0) {
1507 if (ret != EAGAIN)
1508 __db_err(dbp->dbenv,
1509 "free_ovflpage: unable to lock meta data page %s\n",
1510 strerror(ret));
1512 * If we are going to return an error, then we should free
1513 * the page, so it doesn't stay pinned forever.
1515 (void)__ham_put_page(dbp, pagep, 0);
1516 return (ret);
1519 if (DB_LOGGING(dbc)) {
1520 if ((ret = __ham_newpgno_log(dbp->dbenv->lg_info,
1521 dbc->txn, &new_lsn, 0, DELPGNO,
1522 dbp->log_fileid, PGNO(pagep), hcp->hdr->last_freed,
1523 (u_int32_t)TYPE(pagep), NEXT_PGNO(pagep), P_INVALID,
1524 &LSN(pagep), &hcp->hdr->lsn)) != 0)
1525 return (ret);
1527 hcp->hdr->lsn = new_lsn;
1528 LSN(pagep) = new_lsn;
1531 #ifdef DIAGNOSTIC
1533 db_pgno_t __pgno;
1534 DB_LSN __lsn;
1535 __pgno = pagep->pgno;
1536 __lsn = pagep->lsn;
1537 memset(pagep, 0xdb, dbp->pgsize);
1538 pagep->pgno = __pgno;
1539 pagep->lsn = __lsn;
1541 #endif
1542 TYPE(pagep) = P_INVALID;
1543 NEXT_PGNO(pagep) = hcp->hdr->last_freed;
1544 hcp->hdr->last_freed = PGNO(pagep);
1546 return (__ham_put_page(dbp, pagep, 1));
1551 * PUBLIC: int __ham_put_page __P((DB *, PAGE *, int32_t));
1554 __ham_put_page(dbp, pagep, is_dirty)
1555 DB *dbp;
1556 PAGE *pagep;
1557 int32_t is_dirty;
1559 #ifdef DEBUG_SLOW
1560 __account_page(dbp, ((BKT *)((char *)pagep - sizeof(BKT)))->pgno, -1);
1561 #endif
1562 return (memp_fput(dbp->mpf, pagep, (is_dirty ? DB_MPOOL_DIRTY : 0)));
1566 * __ham_dirty_page --
1567 * Mark a page dirty.
1569 * PUBLIC: int __ham_dirty_page __P((DB *, PAGE *));
1572 __ham_dirty_page(dbp, pagep)
1573 DB *dbp;
1574 PAGE *pagep;
1576 return (memp_fset(dbp->mpf, pagep, DB_MPOOL_DIRTY));
1580 * PUBLIC: int __ham_get_page __P((DB *, db_pgno_t, PAGE **));
1583 __ham_get_page(dbp, addr, pagep)
1584 DB *dbp;
1585 db_pgno_t addr;
1586 PAGE **pagep;
1588 int ret;
1590 ret = memp_fget(dbp->mpf, &addr, DB_MPOOL_CREATE, pagep);
1591 #ifdef DEBUG_SLOW
1592 if (*pagep != NULL)
1593 __account_page(dbp, addr, 1);
1594 #endif
1595 return (ret);
1599 * PUBLIC: int __ham_overflow_page
1600 * PUBLIC: __P((DBC *, u_int32_t, PAGE **));
1603 __ham_overflow_page(dbc, type, pp)
1604 DBC *dbc;
1605 u_int32_t type;
1606 PAGE **pp;
1608 DB *dbp;
1609 HASH_CURSOR *hcp;
1610 DB_LSN *lsnp, new_lsn;
1611 PAGE *p;
1612 db_pgno_t new_addr, next_free, newalloc_flag;
1613 u_int32_t offset, splitnum;
1614 int ret;
1616 ret = 0;
1617 dbp = dbc->dbp;
1618 hcp = (HASH_CURSOR *)dbc->internal;
1619 DIRTY_META(dbp, hcp, ret);
1620 if (ret != 0)
1621 return (ret);
1624 * This routine is split up into two parts. First we have
1625 * to figure out the address of the new page that we are
1626 * allocating. Then we have to log the allocation. Only
1627 * after the log do we get to complete allocation of the
1628 * new page.
1630 new_addr = hcp->hdr->last_freed;
1631 if (new_addr != PGNO_INVALID) {
1632 if ((ret = __ham_get_page(dbp, new_addr, &p)) != 0)
1633 return (ret);
1634 next_free = NEXT_PGNO(p);
1635 lsnp = &LSN(p);
1636 newalloc_flag = 0;
1637 } else {
1638 splitnum = hcp->hdr->ovfl_point;
1639 hcp->hdr->spares[splitnum]++;
1640 offset = hcp->hdr->spares[splitnum] -
1641 (splitnum ? hcp->hdr->spares[splitnum - 1] : 0);
1642 new_addr = PGNO_OF(hcp, hcp->hdr->ovfl_point, offset);
1643 if (new_addr > MAX_PAGES(hcp)) {
1644 __db_err(dbp->dbenv, "hash: out of file pages");
1645 hcp->hdr->spares[splitnum]--;
1646 return (ENOMEM);
1648 next_free = PGNO_INVALID;
1649 p = NULL;
1650 lsnp = NULL;
1651 newalloc_flag = 1;
1654 if (DB_LOGGING(dbc)) {
1655 if ((ret = __ham_newpgno_log(dbp->dbenv->lg_info,
1656 dbc->txn, &new_lsn, 0, ALLOCPGNO,
1657 dbp->log_fileid, new_addr, next_free,
1658 0, newalloc_flag, type, lsnp, &hcp->hdr->lsn)) != 0)
1659 return (ret);
1661 hcp->hdr->lsn = new_lsn;
1662 if (lsnp != NULL)
1663 *lsnp = new_lsn;
1666 if (p != NULL) {
1667 /* We just took something off the free list, initialize it. */
1668 hcp->hdr->last_freed = next_free;
1669 P_INIT(p, hcp->hdr->pagesize, PGNO(p), PGNO_INVALID,
1670 PGNO_INVALID, 0, (u_int8_t)type);
1671 } else {
1672 /* Get the new page. */
1673 if ((ret = __ham_new_page(dbp, new_addr, type, &p)) != 0)
1674 return (ret);
1676 if (DB_LOGGING(dbc))
1677 LSN(p) = new_lsn;
1679 *pp = p;
1680 return (0);
1683 #ifdef DEBUG
1685 * PUBLIC: #ifdef DEBUG
1686 * PUBLIC: db_pgno_t __bucket_to_page __P((HASH_CURSOR *, db_pgno_t));
1687 * PUBLIC: #endif
1689 db_pgno_t
1690 __bucket_to_page(hcp, n)
1691 HASH_CURSOR *hcp;
1692 db_pgno_t n;
1694 int ret_val;
1696 ret_val = n + 1;
1697 if (n != 0)
1698 ret_val += hcp->hdr->spares[__db_log2(n + 1) - 1];
1699 return (ret_val);
1701 #endif
1704 * Create a bunch of overflow pages at the current split point.
1705 * PUBLIC: void __ham_init_ovflpages __P((DBC *));
1707 void
1708 __ham_init_ovflpages(dbc)
1709 DBC *dbc;
1711 DB *dbp;
1712 HASH_CURSOR *hcp;
1713 DB_LSN new_lsn;
1714 PAGE *p;
1715 db_pgno_t last_pgno, new_pgno;
1716 u_int32_t i, curpages, numpages;
1718 dbp = dbc->dbp;
1719 hcp = (HASH_CURSOR *)dbc->internal;
1721 curpages = hcp->hdr->spares[hcp->hdr->ovfl_point] -
1722 hcp->hdr->spares[hcp->hdr->ovfl_point - 1];
1723 numpages = hcp->hdr->ovfl_point + 1 - curpages;
1725 last_pgno = hcp->hdr->last_freed;
1726 new_pgno = PGNO_OF(hcp, hcp->hdr->ovfl_point, curpages + 1);
1727 if (DB_LOGGING(dbc)) {
1728 (void)__ham_ovfl_log(dbp->dbenv->lg_info,
1729 dbc->txn, &new_lsn, 0, dbp->log_fileid, new_pgno,
1730 numpages, last_pgno, hcp->hdr->ovfl_point, &hcp->hdr->lsn);
1731 hcp->hdr->lsn = new_lsn;
1732 } else
1733 ZERO_LSN(new_lsn);
1735 hcp->hdr->spares[hcp->hdr->ovfl_point] += numpages;
1736 for (i = numpages; i > 0; i--) {
1737 if (__ham_new_page(dbp,
1738 PGNO_OF(hcp, hcp->hdr->ovfl_point, curpages + i),
1739 P_INVALID, &p) != 0)
1740 break;
1741 LSN(p) = new_lsn;
1742 NEXT_PGNO(p) = last_pgno;
1743 last_pgno = PGNO(p);
1744 (void)__ham_put_page(dbp, p, 1);
1746 hcp->hdr->last_freed = last_pgno;
1750 * PUBLIC: int __ham_get_cpage __P((DBC *, db_lockmode_t));
1753 __ham_get_cpage(dbc, mode)
1754 DBC *dbc;
1755 db_lockmode_t mode;
1757 DB *dbp;
1758 HASH_CURSOR *hcp;
1759 int ret;
1761 dbp = dbc->dbp;
1762 hcp = (HASH_CURSOR *)dbc->internal;
1765 * There are three cases with respect to buckets and locks. If there
1766 * is no lock held, then if we are locking, we should get the lock.
1767 * If there is a lock held and it's for the current bucket, we don't
1768 * need to do anything. If there is a lock, but it's for a different
1769 * bucket, then we need to release and get.
1771 if (F_ISSET(dbp, DB_AM_LOCKING)) {
1772 if (hcp->lock != 0 && hcp->lbucket != hcp->bucket) {
1774 * If this is the original lock, don't release it,
1775 * because we may need to restore it upon exit.
1777 if (dbc->txn == NULL &&
1778 !F_ISSET(hcp, H_ORIGINAL) && (ret =
1779 lock_put(dbp->dbenv->lk_info, hcp->lock)) != 0)
1780 return (ret);
1781 F_CLR(hcp, H_ORIGINAL);
1782 hcp->lock = 0;
1784 if (hcp->lock == 0 && (ret = __ham_lock_bucket(dbc, mode)) != 0)
1785 return (ret);
1786 hcp->lbucket = hcp->bucket;
1789 if (hcp->pagep == NULL) {
1790 if (hcp->pgno == PGNO_INVALID) {
1791 hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
1792 hcp->bndx = 0;
1795 if ((ret =
1796 __ham_get_page(dbp, hcp->pgno, &hcp->pagep)) != 0)
1797 return (ret);
1800 if (hcp->dpgno != PGNO_INVALID && hcp->dpagep == NULL)
1801 if ((ret =
1802 __ham_get_page(dbp, hcp->dpgno, &hcp->dpagep)) != 0)
1803 return (ret);
1804 return (0);
1808 * Get a new page at the cursor, putting the last page if necessary.
1809 * If the flag is set to H_ISDUP, then we are talking about the
1810 * duplicate page, not the main page.
1812 * PUBLIC: int __ham_next_cpage __P((DBC *, db_pgno_t, int, u_int32_t));
1815 __ham_next_cpage(dbc, pgno, dirty, flags)
1816 DBC *dbc;
1817 db_pgno_t pgno;
1818 int dirty;
1819 u_int32_t flags;
1821 DB *dbp;
1822 HASH_CURSOR *hcp;
1823 PAGE *p;
1824 int ret;
1826 dbp = dbc->dbp;
1827 hcp = (HASH_CURSOR *)dbc->internal;
1828 if (LF_ISSET(H_ISDUP) && hcp->dpagep != NULL &&
1829 (ret = __ham_put_page(dbp, hcp->dpagep, dirty)) != 0)
1830 return (ret);
1831 else if (!LF_ISSET(H_ISDUP) && hcp->pagep != NULL &&
1832 (ret = __ham_put_page(dbp, hcp->pagep, dirty)) != 0)
1833 return (ret);
1835 if ((ret = __ham_get_page(dbp, pgno, &p)) != 0)
1836 return (ret);
1838 if (LF_ISSET(H_ISDUP)) {
1839 hcp->dpagep = p;
1840 hcp->dpgno = pgno;
1841 hcp->dndx = 0;
1842 } else {
1843 hcp->pagep = p;
1844 hcp->pgno = pgno;
1845 hcp->bndx = 0;
1848 return (0);
1852 * __ham_lock_bucket --
1853 * Get the lock on a particular bucket.
1855 static int
1856 __ham_lock_bucket(dbc, mode)
1857 DBC *dbc;
1858 db_lockmode_t mode;
1860 HASH_CURSOR *hcp;
1861 int ret;
1863 hcp = (HASH_CURSOR *)dbc->internal;
1864 dbc->lock.pgno = (db_pgno_t)(hcp->bucket);
1865 if (dbc->txn == NULL)
1866 ret = lock_get(dbc->dbp->dbenv->lk_info, dbc->locker, 0,
1867 &dbc->lock_dbt, mode, &hcp->lock);
1868 else
1869 ret = lock_tget(dbc->dbp->dbenv->lk_info, dbc->txn, 0,
1870 &dbc->lock_dbt, mode, &hcp->lock);
1872 return (ret < 0 ? EAGAIN : ret);
1876 * __ham_dpair --
1877 * Delete a pair on a page, paying no attention to what the pair
1878 * represents. The caller is responsible for freeing up duplicates
1879 * or offpage entries that might be referenced by this pair.
1881 * PUBLIC: void __ham_dpair __P((DB *, PAGE *, u_int32_t));
1883 void
1884 __ham_dpair(dbp, p, pndx)
1885 DB *dbp;
1886 PAGE *p;
1887 u_int32_t pndx;
1889 db_indx_t delta, n;
1890 u_int8_t *dest, *src;
1893 * Compute "delta", the amount we have to shift all of the
1894 * offsets. To find the delta, we just need to calculate
1895 * the size of the pair of elements we are removing.
1897 delta = H_PAIRSIZE(p, dbp->pgsize, pndx);
1900 * The hard case: we want to remove something other than
1901 * the last item on the page. We need to shift data and
1902 * offsets down.
1904 if ((db_indx_t)pndx != H_NUMPAIRS(p) - 1) {
1906 * Move the data: src is the first occupied byte on
1907 * the page. (Length is delta.)
1909 src = (u_int8_t *)p + HOFFSET(p);
1912 * Destination is delta bytes beyond src. This might
1913 * be an overlapping copy, so we have to use memmove.
1915 dest = src + delta;
1916 memmove(dest, src, p->inp[H_DATAINDEX(pndx)] - HOFFSET(p));
1919 /* Adjust the offsets. */
1920 for (n = (db_indx_t)pndx; n < (db_indx_t)(H_NUMPAIRS(p) - 1); n++) {
1921 p->inp[H_KEYINDEX(n)] = p->inp[H_KEYINDEX(n+1)] + delta;
1922 p->inp[H_DATAINDEX(n)] = p->inp[H_DATAINDEX(n+1)] + delta;
1925 /* Adjust page metadata. */
1926 HOFFSET(p) = HOFFSET(p) + delta;
1927 NUM_ENT(p) = NUM_ENT(p) - 2;