2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 1996, 1997, 1998
5 * Sleepycat Software. All rights reserved.
7 * @(#)mp.h 10.37 (Sleepycat) 1/1/99
/*
 * Forward declarations.  Each typedef introduces the structure tag and its
 * upper-case alias in one step, so the alias is usable wherever only a
 * pointer to the (still incomplete) type is required.
 */
typedef struct __bh		BH;
typedef struct __db_mpreg	DB_MPREG;
typedef struct __mpool		MPOOL;
typedef struct __mpoolfile	MPOOLFILE;
/* Default mpool name. */
#define	DB_DEFAULT_MPOOL_FILE	"__db_mpool.share"

/*
 * We default to 256K (32 8K pages) if the user doesn't specify, and
 * require a minimum of 20K.
 *
 * DB_CACHESIZE_DEF may be pre-defined (e.g. on the compiler command line);
 * the guard below only supplies the value when it is not.  The guard must
 * be closed before DB_CACHESIZE_MIN so that the minimum is always defined.
 */
#ifndef	DB_CACHESIZE_DEF
#define	DB_CACHESIZE_DEF	(256 * 1024)
#endif
#define	DB_CACHESIZE_MIN	( 20 * 1024)

#define	INVALID			0	/* Invalid shared memory offset. */
/*
 * There are three ways we do locking in the mpool code:
 *
 * Locking a handle mutex to provide concurrency for DB_THREAD operations.
 * Locking the region mutex to provide mutual exclusion while reading and
 * writing structures in the shared region.
 * Locking buffer header mutexes during I/O.
 *
 * The first will not be further described here.  We use the shared mpool
 * region lock to provide mutual exclusion while reading/modifying all of
 * the data structures, including the buffer headers.  We use a per-buffer
 * header lock to wait on buffer I/O.  The order of locking is as follows:
 *
 * Searching for a buffer:
 *	Acquire the region lock.
 *	Find the buffer header.
 *	Increment the reference count (guarantee the buffer stays).
 *	While the BH_LOCKED flag is set (I/O is going on) {
 *	    Release the region lock.
 *		Explicitly yield the processor if it's not the first pass
 *		through this loop; otherwise we can simply spin, because
 *		we'll just be switching between the two locks.
 *	    Request the buffer lock.
 *	    The I/O will complete...
 *	    Acquire the buffer lock.
 *	    Release the buffer lock.
 *	    Acquire the region lock.
 *	}
 *
 * Reading/writing a buffer:
 *	Acquire the region lock.
 *	Find/create the buffer header.
 *	If reading, increment the reference count (guarantee the buffer stays).
 *	Set the BH_LOCKED flag.
 *	Acquire the buffer lock (guaranteed not to block).
 *	Release the region lock.
 *	Do the I/O and/or initialize the buffer contents.
 *	Release the buffer lock.
 *	    At this point, the buffer lock is available, but the logical
 *	    operation (flagged by BH_LOCKED) is not yet completed.  For
 *	    this reason, among others, threads checking the BH_LOCKED flag
 *	    must loop around their test.
 *	Acquire the region lock.
 *	Clear the BH_LOCKED flag.
 *	Release the region lock.
 *	Return/discard the buffer.
 *
 * Pointers to DB_MPOOL, MPOOL, DB_MPOOLFILE and MPOOLFILE structures are not
 * reacquired when a region lock is reacquired because they couldn't have been
 * closed/discarded and because they never move in memory.
 */
/*
 * LOCKINIT --
 *	Initialize the mutex `mutexp', but only when MP_LOCKHANDLE or
 *	MP_LOCKREGION is set on `dbmp'; otherwise do nothing.  The return
 *	value of __db_mutex_init() is deliberately discarded.
 *
 *	dbmp	pool handle; its flags and reginfo.addr are read.
 *	mutexp	mutex to initialize.
 *
 * The body is wrapped in do { } while (0) so the expansion is exactly one
 * statement: a bare `if' here would capture the `else' of a caller's
 * surrounding if/else.
 */
#define	LOCKINIT(dbmp, mutexp)						\
	do {								\
		if (F_ISSET(dbmp, MP_LOCKHANDLE | MP_LOCKREGION))	\
			(void)__db_mutex_init(mutexp,			\
			    MUTEX_LOCK_OFFSET((dbmp)->reginfo.addr, mutexp)); \
	} while (0)
/*
 * LOCKHANDLE --
 *	Acquire the handle mutex `mutexp' when MP_LOCKHANDLE is set on
 *	`dbmp'; otherwise do nothing.  The result of __db_mutex_lock() is
 *	deliberately discarded.
 *
 *	dbmp	pool handle; its flags and reginfo.fd are read.
 *	mutexp	mutex to lock.
 *
 * Wrapped in do { } while (0) so the macro is a single statement and cannot
 * swallow a caller's `else'.
 */
#define	LOCKHANDLE(dbmp, mutexp)					\
	do {								\
		if (F_ISSET(dbmp, MP_LOCKHANDLE))			\
			(void)__db_mutex_lock(mutexp, (dbmp)->reginfo.fd); \
	} while (0)
/*
 * UNLOCKHANDLE --
 *	Release the handle mutex `mutexp' when MP_LOCKHANDLE is set on
 *	`dbmp'; otherwise do nothing.  The result of __db_mutex_unlock() is
 *	deliberately discarded.
 *
 *	dbmp	pool handle; its flags and reginfo.fd are read.
 *	mutexp	mutex to unlock.
 *
 * Wrapped in do { } while (0) so the macro is a single statement and cannot
 * swallow a caller's `else'.
 */
#define	UNLOCKHANDLE(dbmp, mutexp)					\
	do {								\
		if (F_ISSET(dbmp, MP_LOCKHANDLE))			\
			(void)__db_mutex_unlock(mutexp, (dbmp)->reginfo.fd); \
	} while (0)
/*
 * LOCKREGION --
 *	Acquire the shared region lock when MP_LOCKREGION is set on `dbmp';
 *	otherwise do nothing.  The lock is the `lock' member reached by
 *	viewing dbmp->mp as an RLAYOUT.  The result of __db_mutex_lock() is
 *	deliberately discarded.
 *
 *	dbmp	pool handle; its flags, mp and reginfo.fd are read.
 *
 * The second argument to __db_mutex_lock() is (dbmp)->reginfo.fd, matching
 * the handle and buffer locking macros in this file.  Wrapped in
 * do { } while (0) so the macro is a single statement and cannot swallow a
 * caller's `else'.
 */
#define	LOCKREGION(dbmp)						\
	do {								\
		if (F_ISSET(dbmp, MP_LOCKREGION))			\
			(void)__db_mutex_lock(				\
			    &((RLAYOUT *)(dbmp)->mp)->lock,		\
			    (dbmp)->reginfo.fd);			\
	} while (0)
/*
 * UNLOCKREGION --
 *	Release the shared region lock when MP_LOCKREGION is set on `dbmp';
 *	otherwise do nothing.  The lock is the `lock' member reached by
 *	viewing dbmp->mp as an RLAYOUT.  The result of __db_mutex_unlock()
 *	is deliberately discarded.
 *
 *	dbmp	pool handle; its flags, mp and reginfo.fd are read.
 *
 * The second argument to __db_mutex_unlock() is (dbmp)->reginfo.fd,
 * matching the handle and buffer locking macros in this file.  Wrapped in
 * do { } while (0) so the macro is a single statement and cannot swallow a
 * caller's `else'.
 */
#define	UNLOCKREGION(dbmp)						\
	do {								\
		if (F_ISSET(dbmp, MP_LOCKREGION))			\
			(void)__db_mutex_unlock(			\
			    &((RLAYOUT *)(dbmp)->mp)->lock,		\
			    (dbmp)->reginfo.fd);			\
	} while (0)
/*
 * LOCKBUFFER --
 *	Acquire the mutex embedded in buffer header `bhp' when MP_LOCKREGION
 *	is set on `dbmp'; otherwise do nothing.  Note the test is on
 *	MP_LOCKREGION, the same flag that gates the region lock.  The result
 *	of __db_mutex_lock() is deliberately discarded.
 *
 *	dbmp	pool handle; its flags and reginfo.fd are read.
 *	bhp	buffer header whose `mutex' member is locked.
 *
 * Wrapped in do { } while (0) so the macro is a single statement and cannot
 * swallow a caller's `else'.
 */
#define	LOCKBUFFER(dbmp, bhp)						\
	do {								\
		if (F_ISSET(dbmp, MP_LOCKREGION))			\
			(void)__db_mutex_lock(&(bhp)->mutex,		\
			    (dbmp)->reginfo.fd);			\
	} while (0)
/*
 * UNLOCKBUFFER --
 *	Release the mutex embedded in buffer header `bhp' when MP_LOCKREGION
 *	is set on `dbmp'; otherwise do nothing.  Note the test is on
 *	MP_LOCKREGION, the same flag that gates the region lock.  The result
 *	of __db_mutex_unlock() is deliberately discarded.
 *
 *	dbmp	pool handle; its flags and reginfo.fd are read.
 *	bhp	buffer header whose `mutex' member is unlocked.
 *
 * Wrapped in do { } while (0) so the macro is a single statement and cannot
 * swallow a caller's `else'.
 */
#define	UNLOCKBUFFER(dbmp, bhp)						\
	do {								\
		if (F_ISSET(dbmp, MP_LOCKREGION))			\
			(void)__db_mutex_unlock(&(bhp)->mutex,		\
			    (dbmp)->reginfo.fd);			\
	} while (0)
/*
 * MP_PANIC_CHECK --
 *	Check for region catastrophic shutdown: if the panic indicator in
 *	the region layout (dbmp->mp->rlayout.panic) is non-zero, return
 *	DB_RUNRECOVERY from the *enclosing* function.
 *
 *	dbmp	pool handle; dbmp->mp->rlayout.panic is read.
 *
 * Because the macro contains a `return', it may only be used inside a
 * function whose return type can carry DB_RUNRECOVERY.  It is written as
 * do { } while (0) so that `MP_PANIC_CHECK(dbmp);' is one statement and is
 * safe as the body of an if/else.
 */
#define	MP_PANIC_CHECK(dbmp)						\
	do {								\
		if ((dbmp)->mp->rlayout.panic)				\
			return (DB_RUNRECOVERY);			\
	} while (0)
117 * Per-process memory pool structure.
120 /* These fields need to be protected for multi-threaded support. */
121 db_mutex_t
*mutexp
; /* Structure lock. */
123 /* List of pgin/pgout routines. */
124 LIST_HEAD(__db_mpregh
, __db_mpreg
) dbregq
;
126 /* List of DB_MPOOLFILE's. */
127 TAILQ_HEAD(__db_mpoolfileh
, __db_mpoolfile
) dbmfq
;
129 /* These fields are not protected. */
130 DB_ENV
*dbenv
; /* Reference to error information. */
131 REGINFO reginfo
; /* Region information. */
133 MPOOL
*mp
; /* Address of the shared MPOOL. */
135 void *addr
; /* Address of shalloc() region. */
137 DB_HASHTAB
*htab
; /* Hash table of bucket headers. */
139 #define MP_LOCKHANDLE 0x01 /* Threaded, lock handles and region. */
140 #define MP_LOCKREGION 0x02 /* Concurrent access, lock region. */
146 * DB_MPOOL registry of pgin/pgout functions.
149 LIST_ENTRY(__db_mpreg
) q
; /* Linked list. */
151 int ftype
; /* File type. */
152 /* Pgin, pgout routines. */
153 int (DB_CALLBACK
*pgin
) __P((db_pgno_t
, void *, DBT
*));
154 int (DB_CALLBACK
*pgout
) __P((db_pgno_t
, void *, DBT
*));
159 * Per-process DB_MPOOLFILE information.
161 struct __db_mpoolfile
{
162 /* These fields need to be protected for multi-threaded support. */
163 db_mutex_t
*mutexp
; /* Structure lock. */
165 int fd
; /* Underlying file descriptor. */
167 u_int32_t ref
; /* Reference count. */
	/*
	 * This field is a special case -- it's protected by the region lock,
	 * NOT the thread lock.  The reason for this is that we always have
	 * the region lock immediately before or after we modify the field,
	 * and we don't want to use the structure lock to protect it because
	 * then I/O (which is done with the structure lock held because of
	 * the race between the seek and write of the file descriptor) will
	 * block any other put/get calls using this DB_MPOOLFILE structure.
	 */
179 u_int32_t pinref
; /* Pinned block reference count. */
181 /* These fields are not protected. */
182 TAILQ_ENTRY(__db_mpoolfile
) q
; /* Linked list of DB_MPOOLFILE's. */
184 DB_MPOOL
*dbmp
; /* Overlying DB_MPOOL. */
185 MPOOLFILE
*mfp
; /* Underlying MPOOLFILE. */
187 void *addr
; /* Address of mmap'd region. */
188 size_t len
; /* Length of mmap'd region. */
190 /* These fields need to be protected for multi-threaded support. */
191 #define MP_READONLY 0x01 /* File is readonly. */
192 #define MP_UPGRADE 0x02 /* File descriptor is readwrite. */
193 #define MP_UPGRADE_FAIL 0x04 /* Upgrade wasn't possible. */
199 * Shared memory pool region. One of these is allocated in shared
200 * memory, and describes the pool.
203 RLAYOUT rlayout
; /* General region information. */
205 SH_TAILQ_HEAD(__bhq
) bhq
; /* LRU list of buckets. */
206 SH_TAILQ_HEAD(__bhfq
) bhfq
; /* Free buckets. */
207 SH_TAILQ_HEAD(__mpfq
) mpfq
; /* List of MPOOLFILEs. */
210 * We make the assumption that the early pages of the file are far
211 * more likely to be retrieved than the later pages, which means
212 * that the top bits are more interesting for hashing since they're
213 * less likely to collide. On the other hand, since 512 4K pages
214 * represents a 2MB file, only the bottom 9 bits of the page number
215 * are likely to be set. We XOR in the offset in the MPOOL of the
216 * MPOOLFILE that backs this particular page, since that should also
217 * be unique for the page.
219 #define BUCKET(mp, mf_offset, pgno) \
220 (((pgno) ^ ((mf_offset) << 9)) % (mp)->htab_buckets)
222 size_t htab
; /* Hash table offset. */
223 size_t htab_buckets
; /* Number of hash table entries. */
225 DB_LSN lsn
; /* Maximum checkpoint LSN. */
226 u_int32_t lsn_cnt
; /* Checkpoint buffers left to write. */
228 DB_MPOOL_STAT stat
; /* Global mpool statistics. */
230 #define MP_LSN_RETRY 0x01 /* Retry all BH_WRITE buffers. */
236 * Shared DB_MPOOLFILE information.
239 SH_TAILQ_ENTRY q
; /* List of MPOOLFILEs */
241 u_int32_t ref
; /* Reference count. */
243 int ftype
; /* File type. */
245 int32_t lsn_off
; /* Page's LSN offset. */
246 u_int32_t clear_len
; /* Bytes to clear on page create. */
248 size_t path_off
; /* File name location. */
249 size_t fileid_off
; /* File identification location. */
251 size_t pgcookie_len
; /* Pgin/pgout cookie length. */
252 size_t pgcookie_off
; /* Pgin/pgout cookie location. */
254 u_int32_t lsn_cnt
; /* Checkpoint buffers left to write. */
256 db_pgno_t last_pgno
; /* Last page in the file. */
257 db_pgno_t orig_last_pgno
; /* Original last page in the file. */
259 #define MP_CAN_MMAP 0x01 /* If the file can be mmap'd. */
260 #define MP_TEMP 0x02 /* Backing file is a temporary. */
263 DB_MPOOL_FSTAT stat
; /* Per-file mpool statistics. */
271 db_mutex_t mutex
; /* Structure lock. */
273 u_int16_t ref
; /* Reference count. */
275 #define BH_CALLPGIN 0x001 /* Page needs to be reworked... */
276 #define BH_DIRTY 0x002 /* Page was modified. */
277 #define BH_DISCARD 0x004 /* Page is useless. */
278 #define BH_LOCKED 0x008 /* Page is locked (I/O in progress). */
279 #define BH_TRASH 0x010 /* Page is garbage. */
280 #define BH_WRITE 0x020 /* Page scheduled for writing. */
283 SH_TAILQ_ENTRY q
; /* LRU queue. */
284 SH_TAILQ_ENTRY hq
; /* MPOOL hash bucket queue. */
286 db_pgno_t pgno
; /* Underlying MPOOLFILE page number. */
287 size_t mf_offset
; /* Associated MPOOLFILE offset. */
	/*
	 * This array must be size_t aligned -- the DB access methods put PAGE
	 * and other structures into it, and expect to be able to access them
	 * directly.  (We guarantee size_t alignment in the db_mpool(3) manual.)
	 */
296 u_int8_t buf
[1]; /* Variable length data. */