2 * Copyright (c) 1990 The Regents of the University of California.
5 * This code is derived from software contributed to Berkeley by
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 * must display the following acknowledgement:
18 * This product includes software developed by the University of
19 * California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 #if defined(LIBC_SCCS) && !defined(lint)
38 static char sccsid
[] = "@(#)storage.c 5.2 (Berkeley) 2/22/91";
39 #endif /* LIBC_SCCS and not lint */
41 #include <sys/param.h>
50 * BT_GETPAGE -- Make pgno the current page of the btree.
52 * This routine is just a wrapper that decides whether to call the
53 * memory or disk-based routine to do the work.
56 * t -- btree in which to get page
57 * pgno -- page number to get
60 * RET_SUCCESS or RET_ERROR.
73 /* see if we can get away without doing any work */
74 if (t
->bt_curpage
!= (BTHEADER
*) NULL
) {
75 if (t
->bt_curpage
->h_pgno
== pgno
)
79 if (t
->bt_fname
== (char *) NULL
)
80 return (_bt_getmpage(t
, pgno
));
82 return (_bt_getdpage(t
, pgno
));
86 * _BT_GETMPAGE -- Make pgno the current page of the btree.
88 * This routine gets pages for in-memory btrees.
91 * t -- btree in which to get page
92 * pgno -- page number to get
95 * RET_SUCCESS or RET_ERROR.
107 if (t
->bt_curpage
== (BTHEADER
*) NULL
) {
108 if (pgno
!= P_ROOT
) {
114 h
= (BTHEADER
*) malloc((unsigned) t
->bt_psize
);
115 if (h
== (BTHEADER
*) NULL
)
120 h
->h_lower
= (index_t
)
121 (((char *) &(h
->h_linp
[0])) - ((char *) h
));
122 h
->h_upper
= t
->bt_psize
;
123 h
->h_prevpg
= h
->h_nextpg
= P_NONE
;
127 /* get the root page into the hash table */
128 if (_bt_write(t
, h
, RELEASE
) == RET_ERROR
)
132 htindex
= HASHKEY(pgno
);
134 for (b
= t
->bt_s
.bt_ht
[htindex
];
135 b
!= (HTBUCKET
*) NULL
;
137 if (b
->ht_pgno
== pgno
) {
138 t
->bt_curpage
= b
->ht_page
;
139 return (RET_SUCCESS
);
146 * _BT_GETDPAGE -- Make pgno the current page of the btree.
148 * This routine gets pages for disk btrees.
150 * Because disk btree pages must be readable across machine architectures,
151 * the btree code writes integers out in network format. This routine
152 * converts them back to host format before returning the page.
155 * t -- btree in which to get page
156 * pgno -- page number to get
159 * RET_SUCCESS, RET_ERROR.
163 _bt_getdpage(t
, pgno
)
172 /* if we have an lru cache, let the cache code do the work */
174 cache
= t
->bt_s
.bt_d
.d_cache
;
176 /* release the old page */
177 if (t
->bt_curpage
!= (BTHEADER
*) NULL
) {
178 pgno_t opgno
= t
->bt_curpage
->h_pgno
;
179 t
->bt_curpage
->h_flags
&= ~F_DIRTY
;
181 if (lruwrite(cache
, (int) opgno
) < 0)
184 if (lrurelease(cache
, (int) opgno
) < 0)
188 if (pgno
> t
->bt_npages
) {
189 if ((h
= (BTHEADER
*) lrugetnew(cache
, (int)pgno
, &nbytes
))
190 == (BTHEADER
*) NULL
)
194 if ((h
= (BTHEADER
*) lruget(cache
, (int)pgno
, &nbytes
))
195 == (BTHEADER
*) NULL
)
199 /* init this page, if necessary */
203 h
->h_lower
= (index_t
)
204 (((char *) &(h
->h_linp
[0])) - ((char *) h
));
205 h
->h_upper
= t
->bt_psize
;
206 h
->h_prevpg
= h
->h_nextpg
= P_NONE
;
211 return (RET_SUCCESS
);
214 /* sync the current page, if necessary */
215 if (t
->bt_curpage
!= (BTHEADER
*) NULL
) {
216 if (t
->bt_curpage
->h_flags
& F_DIRTY
)
217 if (_bt_write(t
, t
->bt_curpage
, RELEASE
) == RET_ERROR
)
220 if (t
->bt_npages
== 0)
223 /* if no current page, get space for one */
224 if ((t
->bt_curpage
= (BTHEADER
*) malloc((unsigned) t
->bt_psize
))
225 == (BTHEADER
*) NULL
) {
231 pos
= (long) (pgno
* n
);
233 /* seek to correct location in file */
234 if (lseek(t
->bt_s
.bt_d
.d_fd
, pos
, L_SET
) != pos
) {
239 if ((nbytes
= read(t
->bt_s
.bt_d
.d_fd
, t
->bt_curpage
, n
)) < n
) {
242 * If we didn't get a full page, we must have gotten no
243 * data at all -- in which case we're trying to read a
244 * root page that doesn't exist yet. This is the only
245 * case in which this is okay. If this happens, construct
246 * an empty root page by hand.
248 if (nbytes
!= 0 || pgno
!= P_ROOT
) {
253 h
= (BTHEADER
*) t
->bt_curpage
;
256 h
->h_lower
= (index_t
)
257 (((char *) &(h
->h_linp
[0])) - ((char *) h
));
258 h
->h_upper
= t
->bt_psize
;
259 h
->h_prevpg
= h
->h_nextpg
= P_NONE
;
261 (void) _bt_pgin(t
->bt_curpage
, (char *) t
->bt_lorder
);
263 return (RET_SUCCESS
);
267 * _BT_PGOUT, _BT_PGIN -- Convert host-specific number layout to/from
268 * the host-independent format stored on disk.
271 * h -- page to convert
272 * _lorder -- byte order for pages (stored as a char * in the
273 * cache, and passed around as a magic cookie).
276 * RET_SUCCESS (lru code requires a return value).
279 * Layout of tree metadata on the page is changed in place.
282 * Everywhere else in the code, the types pgno_t and index_t
283 * are opaque. These two routines know what they really
288 _bt_pgout(h
, _lorder
)
299 lorder
= (int) _lorder
;
300 if (lorder
== BYTE_ORDER
)
301 return (RET_SUCCESS
);
303 if (h
->h_flags
& F_LEAF
) {
304 if (h
->h_flags
& F_CONT
) {
305 if (h
->h_prevpg
== P_NONE
) {
308 (void) bcopy((char *) &(h
->h_linp
[0]),
312 (void) bcopy((char *) &longsz
,
313 (char *) &(h
->h_linp
[0]),
318 for (i
= 0; i
< top
; i
++) {
319 d
= (DATUM
*) GETDATUM(h
, i
);
320 if (d
->d_flags
& D_BIGKEY
) {
321 (void) bcopy((char *) &(d
->d_bytes
[0]),
325 (void) bcopy((char *) &chain
,
326 (char *) &(d
->d_bytes
[0]),
329 if (d
->d_flags
& D_BIGDATA
) {
330 (void) bcopy((char *) &(d
->d_bytes
[d
->d_ksize
]),
334 (void) bcopy((char *) &chain
,
335 (char *) &(d
->d_bytes
[d
->d_ksize
]),
341 BLSWAP(h
->h_linp
[i
]);
346 for (i
= 0; i
< top
; i
++) {
347 id
= (IDATUM
*) GETDATUM(h
, i
);
351 if (id
->i_flags
& D_BIGKEY
) {
352 (void) bcopy((char *) &(id
->i_bytes
[0]),
356 (void) bcopy((char *) &chain
,
357 (char *) &(id
->i_bytes
[0]),
360 BLSWAP(h
->h_linp
[i
]);
370 return (RET_SUCCESS
);
386 * If btree pages are to be stored in the host byte order, don't
389 lorder
= (int) _lorder
;
390 if (lorder
== BYTE_ORDER
)
391 return (RET_SUCCESS
);
400 if (h
->h_flags
& F_LEAF
) {
401 if (h
->h_flags
& F_CONT
) {
402 if (h
->h_prevpg
== P_NONE
) {
405 (void) bcopy((char *) &(h
->h_linp
[0]),
409 (void) bcopy((char *) &longsz
,
410 (char *) &(h
->h_linp
[0]),
415 for (i
= 0; i
< top
; i
++) {
416 BLSWAP(h
->h_linp
[i
]);
417 d
= (DATUM
*) GETDATUM(h
, i
);
421 if (d
->d_flags
& D_BIGKEY
) {
422 (void) bcopy((char *) &(d
->d_bytes
[0]),
426 (void) bcopy((char *) &chain
,
427 (char *) &(d
->d_bytes
[0]),
430 if (d
->d_flags
& D_BIGDATA
) {
431 (void) bcopy((char *) &(d
->d_bytes
[d
->d_ksize
]),
435 (void) bcopy((char *) &chain
,
436 (char *) &(d
->d_bytes
[d
->d_ksize
]),
443 for (i
= 0; i
< top
; i
++) {
444 BLSWAP(h
->h_linp
[i
]);
445 id
= (IDATUM
*) GETDATUM(h
, i
);
449 if (id
->i_flags
& D_BIGKEY
) {
450 (void) bcopy((char *) &(id
->i_bytes
[0]),
454 (void) bcopy((char *) &chain
,
455 (char *) &(id
->i_bytes
[0]),
460 return (RET_SUCCESS
);
464 * _BT_ALLOCPG -- allocate a new page in the btree.
466 * This is called when we split a page, to get space to do the split.
467 * For disk btrees, these pages are released when the split is done.
468 * For memory btrees, they are not.
471 * t -- tree in which to do split
474 * Pointer to the newly-allocated page
481 BTHEADER
*h
= t
->bt_curpage
;
485 /* if we have a cache, let the cache code do the work */
486 if (ISDISK(t
) && ISCACHE(t
)) {
487 nh
= (BTHEADER
*) lrugetnew(t
->bt_s
.bt_d
.d_cache
,
488 (int) (t
->bt_npages
+ 1),
491 nh
= (BTHEADER
*) malloc((unsigned) t
->bt_psize
);
494 if (nh
!= (BTHEADER
*) NULL
) {
495 nh
->h_pgno
= nh
->h_prevpg
= nh
->h_nextpg
= P_NONE
;
496 nh
->h_flags
= h
->h_flags
;
497 nh
->h_lower
= (index_t
)
498 (((char *) &(nh
->h_linp
[0])) - ((char *) nh
));
499 nh
->h_upper
= t
->bt_psize
;
506 * _BT_WRITE -- Write a specific page to a btree file.
509 * t -- btree to get the page
511 * relflag -- (int) this page may/may not be released
514 * RET_SUCCESS, RET_ERROR.
517 * Writes a metadata page if none has been written yet.
521 _bt_write(t
, h
, relflag
)
532 h
->h_flags
&= ~F_DIRTY
;
535 /* if we haven't done so yet, write the metadata */
536 if (!(t
->bt_flags
& BTF_METAOK
)) {
537 if (_bt_wrtmeta(t
) == RET_ERROR
)
544 /* if we have a cache, let the cache code do the work */
545 if ((cache
= t
->bt_s
.bt_d
.d_cache
) != (char *) NULL
) {
546 if (lruwrite(cache
, (int) pgno
) < 0)
548 if (relflag
&& lrurelease(cache
, (int) pgno
) < 0)
552 (void) _bt_pgout(h
, (char *) t
->bt_lorder
);
553 /* now write the current page */
554 pos
= (long) (pgno
* t
->bt_psize
);
555 if (lseek(t
->bt_s
.bt_d
.d_fd
, pos
, L_SET
) != pos
)
557 if (write(t
->bt_s
.bt_d
.d_fd
, (char *) h
, (int) t
->bt_psize
)
561 (void) _bt_pgin(h
, (char *) t
->bt_lorder
);
564 /* in-memory btree */
565 htindex
= HASHKEY(h
->h_pgno
);
567 /* see if we need to overwrite existing entry */
568 for (b
= t
->bt_s
.bt_ht
[htindex
];
569 b
!= (HTBUCKET
*) NULL
;
571 if (b
->ht_pgno
== h
->h_pgno
) {
573 return (RET_SUCCESS
);
577 /* new entry, write it */
578 b
= (HTBUCKET
*) malloc((unsigned) sizeof (HTBUCKET
));
579 if (b
== (HTBUCKET
*) NULL
)
582 b
->ht_pgno
= h
->h_pgno
;
584 b
->ht_next
= t
->bt_s
.bt_ht
[htindex
];
585 t
->bt_s
.bt_ht
[htindex
] = b
;
587 return (RET_SUCCESS
);
591 * _BT_RELEASE -- Release a locked-down cache page
593 * During page splits, we want to force pages out to the cache
594 * before we actually release them, in some cases. This routine
595 * releases such a page when it is no longer needed.
598 * t -- btree in which to release page
599 * h -- page to release
602 * RET_SUCCESS, RET_ERROR.
613 if (ISDISK(t
) && ISCACHE(t
)) {
614 if (lrurelease(t
->bt_s
.bt_d
.d_cache
, (int) (h
->h_pgno
)) < 0)
617 return (RET_SUCCESS
);
621 * _BT_WRTMETA -- Write metadata to the btree.
624 * t -- tree to which to write
627 * RET_SUCCESS, RET_ERROR.
636 if (lseek(t
->bt_s
.bt_d
.d_fd
, 0l, L_SET
) != 0l)
639 /* lorder has to be in host-independent format */
640 m
.m_lorder
= (u_long
) htonl((long) t
->bt_lorder
);
642 m
.m_magic
= BTREEMAGIC
;
643 m
.m_version
= BTREEVERSION
;
644 m
.m_psize
= t
->bt_psize
;
645 m
.m_free
= t
->bt_free
;
646 m
.m_flags
= t
->bt_flags
& BTF_NODUPS
;
648 if (t
->bt_lorder
!= BYTE_ORDER
) {
656 if (write(t
->bt_s
.bt_d
.d_fd
, (char *) &m
, sizeof(m
))
661 t
->bt_flags
|= BTF_METAOK
;
663 return (RET_SUCCESS
);