2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 1996, 1997, 1998
5 * Sleepycat Software. All rights reserved.
10 static const char sccsid
[] = "@(#)mp_fget.c 10.53 (Sleepycat) 11/16/98";
13 #ifndef NO_SYSTEM_INCLUDES
14 #include <sys/types.h>
24 #include "common_ext.h"
28 * Get a page from the file.
31 memp_fget(dbmfp
, pgnoaddr
, flags
, addrp
)
41 size_t bucket
, mf_offset
;
43 int b_incr
, first
, ret
;
55 * Don't test for DB_MPOOL_CREATE and DB_MPOOL_NEW flags for readonly
56 * files here, and create non-existent pages in readonly files if the
57 * flags are set, later. The reason is that the hash access method
58 * wants to get empty pages that don't really exist in readonly files.
59 * The only alternative is for hash to write the last "bucket" all the
60 * time, which we don't want to do because one of our big goals in life
61 * is to keep database files small. It's sleazy as hell, but we catch
62 * any attempt to actually write the file in memp_fput().
64 #define OKFLAGS (DB_MPOOL_CREATE | DB_MPOOL_LAST | DB_MPOOL_NEW)
67 __db_fchk(dbmp
->dbenv
, "memp_fget", flags
, OKFLAGS
)) != 0)
77 return (__db_ferr(dbmp
->dbenv
, "memp_fget", 1));
84 * We want to switch threads as often as possible. Yield every time
85 * we get a new page to ensure contention.
87 if (DB_GLOBAL(db_pageyield
))
91 /* Initialize remaining local variables. */
92 mf_offset
= R_OFFSET(dbmp
, mfp
);
97 /* Determine the hash bucket where this page will live. */
98 bucket
= BUCKET(mp
, mf_offset
, *pgnoaddr
);
103 * Check for the last or last + 1 page requests.
105 * Examine and update the file's last_pgno value. We don't care if
106 * the last_pgno value immediately changes due to another thread --
107 * at this instant in time, the value is correct. We do increment the
108 * current last_pgno value if the thread is asking for a new page,
109 * however, to ensure that two threads creating pages don't get the same page.
112 if (LF_ISSET(DB_MPOOL_LAST
| DB_MPOOL_NEW
)) {
113 if (LF_ISSET(DB_MPOOL_NEW
))
115 *pgnoaddr
= mfp
->last_pgno
;
116 bucket
= BUCKET(mp
, mf_offset
, mfp
->last_pgno
);
118 if (LF_ISSET(DB_MPOOL_NEW
))
123 * If mmap'ing the file and the page is not past the end of the file,
124 * just return a pointer.
126 * The page may be past the end of the file, so check the page number
127 * argument against the original length of the file. If we previously
128 * returned pages past the original end of the file, last_pgno will
129 * have been updated to match the "new" end of the file, and checking
130 * against it would return pointers past the end of the mmap'd region.
132 * If another process has opened the file for writing since we mmap'd
133 * it, we will start playing the game by their rules, i.e. everything
134 * goes through the cache. All pages previously returned will be safe,
135 * as long as the correct locking protocol was observed.
138 * We don't discard the map because we don't know when all of the
139 * pages will have been discarded from the process' address space.
140 * It would be possible to do so by reference counting the open
141 * pages from the mmap, but it's unclear to me that it's worth it.
143 if (dbmfp
->addr
!= NULL
&& F_ISSET(mfp
, MP_CAN_MMAP
))
144 if (*pgnoaddr
> mfp
->orig_last_pgno
) {
147 * See the comment above about non-existent pages and
148 * the hash access method.
150 if (!LF_ISSET(DB_MPOOL_CREATE
)) {
151 __db_err(dbmp
->dbenv
,
152 "%s: page %lu doesn't exist",
153 __memp_fn(dbmfp
), (u_long
)*pgnoaddr
);
159 R_ADDR(dbmfp
, *pgnoaddr
* mfp
->stat
.st_pagesize
);
165 /* Search the hash chain for the page. */
166 for (bhp
= SH_TAILQ_FIRST(&dbmp
->htab
[bucket
], __bh
);
167 bhp
!= NULL
; bhp
= SH_TAILQ_NEXT(bhp
, hq
, __bh
)) {
169 if (bhp
->pgno
!= *pgnoaddr
|| bhp
->mf_offset
!= mf_offset
)
172 /* Increment the reference count. */
173 if (bhp
->ref
== UINT16_T_MAX
) {
174 __db_err(dbmp
->dbenv
,
175 "%s: page %lu: reference count overflow",
176 __memp_fn(dbmfp
), (u_long
)bhp
->pgno
);
182 * Increment the reference count. We may discard the region
183 * lock as we evaluate and/or read the buffer, so we need to
184 * ensure that it doesn't move and that its contents remain unchanged.
191 * Any buffer we find might be trouble.
194 * I/O is in progress. Because we've incremented the buffer
195 * reference count, we know the buffer can't move. Unlock
196 * the region lock, wait for the I/O to complete, and reacquire it.
199 for (first
= 1; F_ISSET(bhp
, BH_LOCKED
); first
= 0) {
203 * Explicitly yield the processor if it's not the first
204 * pass through this loop -- if we don't, we might end
205 * up running to the end of our CPU quantum as we will
206 * simply be swapping between the two locks.
211 LOCKBUFFER(dbmp
, bhp
);
212 /* Wait for I/O to finish... */
213 UNLOCKBUFFER(dbmp
, bhp
);
219 * The contents of the buffer are garbage. Shouldn't happen,
220 * and this read is likely to fail, but might as well try.
222 if (F_ISSET(bhp
, BH_TRASH
))
227 * The buffer was converted so it could be written, and the
228 * contents need to be converted again.
230 if (F_ISSET(bhp
, BH_CALLPGIN
)) {
231 if ((ret
= __memp_pg(dbmfp
, bhp
, 1)) != 0)
233 F_CLR(bhp
, BH_CALLPGIN
);
236 ++mp
->stat
.st_cache_hit
;
237 ++mfp
->stat
.st_cache_hit
;
238 *(void **)addrp
= bhp
->buf
;
242 alloc
: /* Allocate new buffer header and data space. */
243 if ((ret
= __memp_alloc(dbmp
, sizeof(BH
) -
244 sizeof(u_int8_t
) + mfp
->stat
.st_pagesize
, NULL
, &bhp
)) != 0)
248 if ((ALIGNTYPE
)bhp
->buf
& (sizeof(size_t) - 1)) {
249 __db_err(dbmp
->dbenv
,
250 "Internal error: BH data NOT size_t aligned.");
255 /* Initialize the BH fields. */
256 memset(bhp
, 0, sizeof(BH
));
257 LOCKINIT(dbmp
, &bhp
->mutex
);
259 bhp
->pgno
= *pgnoaddr
;
260 bhp
->mf_offset
= mf_offset
;
263 * Prepend the buffer header to the head of the appropriate MPOOL
264 * bucket hash list. Append the buffer header to the tail of the MPOOL buffer queue (mp->bhq).
267 SH_TAILQ_INSERT_HEAD(&dbmp
->htab
[bucket
], bhp
, hq
, __bh
);
268 SH_TAILQ_INSERT_TAIL(&mp
->bhq
, bhp
, q
);
271 * If we created the page, zero it out and continue.
274 * Note: DB_MPOOL_NEW specifically doesn't call the pgin function.
275 * If DB_MPOOL_CREATE is used, then the application's pgin function
276 * has to be able to handle pages of 0's -- if it uses DB_MPOOL_NEW,
277 * it can detect all of its page creates, and not bother.
279 * Otherwise, read the page into memory, optionally creating it if
280 * DB_MPOOL_CREATE is set.
282 if (LF_ISSET(DB_MPOOL_NEW
)) {
283 if (mfp
->clear_len
== 0)
284 memset(bhp
->buf
, 0, mfp
->stat
.st_pagesize
);
286 memset(bhp
->buf
, 0, mfp
->clear_len
);
288 memset(bhp
->buf
+ mfp
->clear_len
, 0xdb,
289 mfp
->stat
.st_pagesize
- mfp
->clear_len
);
293 ++mp
->stat
.st_page_create
;
294 ++mfp
->stat
.st_page_create
;
297 * It's possible for the read function to fail, which means
298 * that we fail as well. Note, the __memp_pgread() function
299 * discards the region lock, so the buffer must be pinned
300 * down so that it cannot move and its contents are unchanged.
302 reread
: if ((ret
= __memp_pgread(dbmfp
,
303 bhp
, LF_ISSET(DB_MPOOL_CREATE
))) != 0) {
306 * Discard the buffer unless another thread is waiting
307 * on our I/O to complete. Regardless, the header has
308 * the BH_TRASH flag set.
311 __memp_bhfree(dbmp
, mfp
, bhp
, 1);
315 ++mp
->stat
.st_cache_miss
;
316 ++mfp
->stat
.st_cache_miss
;
320 * If we're returning a page after our current notion of the last-page,
321 * update our information. Note, there's no way to un-instantiate this
322 * page, it's going to exist whether it's returned to us dirty or not.
324 if (bhp
->pgno
> mfp
->last_pgno
)
325 mfp
->last_pgno
= bhp
->pgno
;
327 ++mp
->stat
.st_page_clean
;
328 *(void **)addrp
= bhp
->buf
;
330 done
: /* Update the chain search statistics. */
332 ++mp
->stat
.st_hash_searches
;
333 if (st_hsearch
> mp
->stat
.st_hash_longest
)
334 mp
->stat
.st_hash_longest
= st_hsearch
;
335 mp
->stat
.st_hash_examined
+= st_hsearch
;
344 err
: /* Discard our reference. */
349 *(void **)addrp
= NULL
;