/*-------------------------------------------------------------------------
 *
 * buf_internals.h
 *	  Internal definitions for buffer manager and the buffer replacement
 *	  strategy.
 *
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/storage/buf_internals.h
 *
 *-------------------------------------------------------------------------
 */
15 #ifndef BUFMGR_INTERNALS_H
16 #define BUFMGR_INTERNALS_H
18 #include "pgstat.h"
19 #include "port/atomics.h"
20 #include "storage/buf.h"
21 #include "storage/bufmgr.h"
22 #include "storage/condition_variable.h"
23 #include "storage/lwlock.h"
24 #include "storage/shmem.h"
25 #include "storage/smgr.h"
26 #include "storage/spin.h"
27 #include "utils/relcache.h"
28 #include "utils/resowner.h"
/*
 * Buffer state is a single 32-bit variable where following data is combined.
 *
 * - 18 bits refcount
 * - 4 bits usage count
 * - 10 bits of flags
 *
 * Combining these values allows to perform some operations without locking
 * the buffer header, by modifying them together with a CAS loop.
 *
 * The definition of buffer state components is below.
 */
42 #define BUF_REFCOUNT_ONE 1
43 #define BUF_REFCOUNT_MASK ((1U << 18) - 1)
44 #define BUF_USAGECOUNT_MASK 0x003C0000U
45 #define BUF_USAGECOUNT_ONE (1U << 18)
46 #define BUF_USAGECOUNT_SHIFT 18
47 #define BUF_FLAG_MASK 0xFFC00000U
49 /* Get refcount and usagecount from buffer state */
50 #define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
51 #define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
/*
 * Flags for buffer descriptors
 *
 * Note: BM_TAG_VALID essentially means that there is a buffer hashtable
 * entry associated with the buffer's tag.
 */
59 #define BM_LOCKED (1U << 22) /* buffer header is locked */
60 #define BM_DIRTY (1U << 23) /* data needs writing */
61 #define BM_VALID (1U << 24) /* data is valid */
62 #define BM_TAG_VALID (1U << 25) /* tag is assigned */
63 #define BM_IO_IN_PROGRESS (1U << 26) /* read or write in progress */
64 #define BM_IO_ERROR (1U << 27) /* previous I/O failed */
65 #define BM_JUST_DIRTIED (1U << 28) /* dirtied since write started */
66 #define BM_PIN_COUNT_WAITER (1U << 29) /* have waiter for sole pin */
67 #define BM_CHECKPOINT_NEEDED (1U << 30) /* must write for checkpoint */
68 #define BM_PERMANENT (1U << 31) /* permanent buffer (not unlogged,
69 * or init fork) */
/*
 * The maximum allowed value of usage_count represents a tradeoff between
 * accuracy and speed of the clock-sweep buffer management algorithm.  A
 * large value (comparable to NBuffers) would approximate LRU semantics.
 * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of
 * clock sweeps to find a free buffer, so in practice we don't want the
 * value to be very large.
 */
78 #define BM_MAX_USAGE_COUNT 5
/*
 * Buffer tag identifies which disk block the buffer contains.
 *
 * Note: the BufferTag data must be sufficient to determine where to write the
 * block, without reference to pg_class or pg_tablespace entries.  It's
 * possible that the backend flushing the buffer doesn't even believe the
 * relation is visible yet (its xact may have started before the xact that
 * created the rel).  The storage manager must be able to cope anyway.
 *
 * Note: if there's any pad bytes in the struct, InitBufferTag will have
 * to be fixed to zero them, since this struct is used as a hash key.
 */
92 typedef struct buftag
94 Oid spcOid; /* tablespace oid */
95 Oid dbOid; /* database oid */
96 RelFileNumber relNumber; /* relation file number */
97 ForkNumber forkNum; /* fork number */
98 BlockNumber blockNum; /* blknum relative to begin of reln */
99 } BufferTag;
101 static inline RelFileNumber
102 BufTagGetRelNumber(const BufferTag *tag)
104 return tag->relNumber;
107 static inline ForkNumber
108 BufTagGetForkNum(const BufferTag *tag)
110 return tag->forkNum;
113 static inline void
114 BufTagSetRelForkDetails(BufferTag *tag, RelFileNumber relnumber,
115 ForkNumber forknum)
117 tag->relNumber = relnumber;
118 tag->forkNum = forknum;
121 static inline RelFileLocator
122 BufTagGetRelFileLocator(const BufferTag *tag)
124 RelFileLocator rlocator;
126 rlocator.spcOid = tag->spcOid;
127 rlocator.dbOid = tag->dbOid;
128 rlocator.relNumber = BufTagGetRelNumber(tag);
130 return rlocator;
133 static inline void
134 ClearBufferTag(BufferTag *tag)
136 tag->spcOid = InvalidOid;
137 tag->dbOid = InvalidOid;
138 BufTagSetRelForkDetails(tag, InvalidRelFileNumber, InvalidForkNumber);
139 tag->blockNum = InvalidBlockNumber;
142 static inline void
143 InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator,
144 ForkNumber forkNum, BlockNumber blockNum)
146 tag->spcOid = rlocator->spcOid;
147 tag->dbOid = rlocator->dbOid;
148 BufTagSetRelForkDetails(tag, rlocator->relNumber, forkNum);
149 tag->blockNum = blockNum;
152 static inline bool
153 BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
155 return (tag1->spcOid == tag2->spcOid) &&
156 (tag1->dbOid == tag2->dbOid) &&
157 (tag1->relNumber == tag2->relNumber) &&
158 (tag1->blockNum == tag2->blockNum) &&
159 (tag1->forkNum == tag2->forkNum);
162 static inline bool
163 BufTagMatchesRelFileLocator(const BufferTag *tag,
164 const RelFileLocator *rlocator)
166 return (tag->spcOid == rlocator->spcOid) &&
167 (tag->dbOid == rlocator->dbOid) &&
168 (BufTagGetRelNumber(tag) == rlocator->relNumber);
/*
 * The shared buffer mapping table is partitioned to reduce contention.
 * To determine which partition lock a given tag requires, compute the tag's
 * hash code with BufTableHashCode(), then apply BufMappingPartitionLock().
 * NB: NUM_BUFFER_PARTITIONS must be a power of 2!
 */
178 static inline uint32
179 BufTableHashPartition(uint32 hashcode)
181 return hashcode % NUM_BUFFER_PARTITIONS;
184 static inline LWLock *
185 BufMappingPartitionLock(uint32 hashcode)
187 return &MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET +
188 BufTableHashPartition(hashcode)].lock;
191 static inline LWLock *
192 BufMappingPartitionLockByIndex(uint32 index)
194 return &MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + index].lock;
/*
 *	BufferDesc -- shared descriptor/state data for a single shared buffer.
 *
 * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
 * tag, state or wait_backend_pgprocno fields.  In general, buffer header lock
 * is a spinlock which is combined with flags, refcount and usagecount into
 * single atomic variable.  This layout allow us to do some operations in a
 * single atomic operation, without actually acquiring and releasing spinlock;
 * for instance, increase or decrease refcount.  buf_id field never changes
 * after initialization, so does not need locking.  freeNext is protected by
 * the buffer_strategy_lock not buffer header lock.  The LWLock can take care
 * of itself.  The buffer header lock is *not* used to control access to the
 * data in the buffer!
 *
 * It's assumed that nobody changes the state field while buffer header lock
 * is held.  Thus buffer header lock holder can do complex updates of the
 * state variable in single write, simultaneously with lock release (cleaning
 * BM_LOCKED flag).  On the other hand, updating of state without holding
 * buffer header lock is restricted to CAS, which ensures that BM_LOCKED flag
 * is not set.  Atomic increment/decrement, OR/AND etc. are not allowed.
 *
 * An exception is that if we have the buffer pinned, its tag can't change
 * underneath us, so we can examine the tag without locking the buffer header.
 * Also, in places we do one-time reads of the flags without bothering to
 * lock the buffer header; this is generally for situations where we don't
 * expect the flag bit being tested to be changing.
 *
 * We can't physically remove items from a disk page if another backend has
 * the buffer pinned.  Hence, a backend may need to wait for all other pins
 * to go away.  This is signaled by storing its own pgprocno into
 * wait_backend_pgprocno and setting flag bit BM_PIN_COUNT_WAITER.  At present,
 * there can be only one such waiter per buffer.
 *
 * We use this same struct for local buffer headers, but the locks are not
 * used and not all of the flag bits are useful either.  To avoid unnecessary
 * overhead, manipulations of the state field should be done without actual
 * atomic operations (i.e. only pg_atomic_read_u32() and
 * pg_atomic_unlocked_write_u32()).
 *
 * Be careful to avoid increasing the size of the struct when adding or
 * reordering members.  Keeping it below 64 bytes (the most common CPU
 * cache line size) is fairly important for performance.
 *
 * Per-buffer I/O condition variables are currently kept outside this struct in
 * a separate array.  They could be moved in here and still fit within that
 * limit on common systems, but for now that is not done.
 */
244 typedef struct BufferDesc
246 BufferTag tag; /* ID of page contained in buffer */
247 int buf_id; /* buffer's index number (from 0) */
249 /* state of the tag, containing flags, refcount and usagecount */
250 pg_atomic_uint32 state;
252 int wait_backend_pgprocno; /* backend of pin-count waiter */
253 int freeNext; /* link in freelist chain */
254 LWLock content_lock; /* to lock access to buffer contents */
255 } BufferDesc;
/*
 * Concurrent access to buffer headers has proven to be more efficient if
 * they're cache line aligned.  So we force the start of the BufferDescriptors
 * array to be on a cache line boundary and force the elements to be cache
 * line sized.
 *
 * XXX: As this is primarily matters in highly concurrent workloads which
 * probably all are 64bit these days, and the space wastage would be a bit
 * more noticeable on 32bit systems, we don't force the stride to be cache
 * line sized on those.  If somebody does actual performance testing, we can
 * reevaluate.
 *
 * Note that local buffer descriptors aren't forced to be aligned - as there's
 * no concurrent access to those it's unlikely to be beneficial.
 *
 * We use a 64-byte cache line size here, because that's the most common
 * size.  Making it bigger would be a waste of memory.  Even if running on a
 * platform with either 32 or 128 byte line sizes, it's good to align to
 * boundaries and avoid false sharing.
 */
277 #define BUFFERDESC_PAD_TO_SIZE (SIZEOF_VOID_P == 8 ? 64 : 1)
279 typedef union BufferDescPadded
281 BufferDesc bufferdesc;
282 char pad[BUFFERDESC_PAD_TO_SIZE];
283 } BufferDescPadded;
/*
 * The PendingWriteback & WritebackContext structure are used to keep
 * information about pending flush requests to be issued to the OS.
 */
289 typedef struct PendingWriteback
291 /* could store different types of pending flushes here */
292 BufferTag tag;
293 } PendingWriteback;
295 /* struct forward declared in bufmgr.h */
296 typedef struct WritebackContext
298 /* pointer to the max number of writeback requests to coalesce */
299 int *max_pending;
301 /* current number of pending writeback requests */
302 int nr_pending;
304 /* pending requests */
305 PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES];
306 } WritebackContext;
308 /* in buf_init.c */
309 extern PGDLLIMPORT BufferDescPadded *BufferDescriptors;
310 extern PGDLLIMPORT ConditionVariableMinimallyPadded *BufferIOCVArray;
311 extern PGDLLIMPORT WritebackContext BackendWritebackContext;
313 /* in localbuf.c */
314 extern PGDLLIMPORT BufferDesc *LocalBufferDescriptors;
317 static inline BufferDesc *
318 GetBufferDescriptor(uint32 id)
320 return &(BufferDescriptors[id]).bufferdesc;
323 static inline BufferDesc *
324 GetLocalBufferDescriptor(uint32 id)
326 return &LocalBufferDescriptors[id];
329 static inline Buffer
330 BufferDescriptorGetBuffer(const BufferDesc *bdesc)
332 return (Buffer) (bdesc->buf_id + 1);
335 static inline ConditionVariable *
336 BufferDescriptorGetIOCV(const BufferDesc *bdesc)
338 return &(BufferIOCVArray[bdesc->buf_id]).cv;
341 static inline LWLock *
342 BufferDescriptorGetContentLock(const BufferDesc *bdesc)
344 return (LWLock *) (&bdesc->content_lock);
/*
 * The freeNext field is either the index of the next freelist entry,
 * or one of these special values:
 */
351 #define FREENEXT_END_OF_LIST (-1)
352 #define FREENEXT_NOT_IN_LIST (-2)
/*
 * Functions for acquiring/releasing a shared buffer header's spinlock.  Do
 * not apply these to local buffers!
 */
358 extern uint32 LockBufHdr(BufferDesc *desc);
360 static inline void
361 UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
363 pg_write_barrier();
364 pg_atomic_write_u32(&desc->state, buf_state & (~BM_LOCKED));
367 /* in bufmgr.c */
/*
 * Structure to sort buffers per file on checkpoints.
 *
 * This structure is allocated per buffer in shared memory, so it should be
 * kept as small as possible.
 */
375 typedef struct CkptSortItem
377 Oid tsId;
378 RelFileNumber relNumber;
379 ForkNumber forkNum;
380 BlockNumber blockNum;
381 int buf_id;
382 } CkptSortItem;
384 extern PGDLLIMPORT CkptSortItem *CkptBufferIds;
386 /* ResourceOwner callbacks to hold buffer I/Os and pins */
387 extern PGDLLIMPORT const ResourceOwnerDesc buffer_io_resowner_desc;
388 extern PGDLLIMPORT const ResourceOwnerDesc buffer_pin_resowner_desc;
390 /* Convenience wrappers over ResourceOwnerRemember/Forget */
391 static inline void
392 ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
394 ResourceOwnerRemember(owner, Int32GetDatum(buffer), &buffer_pin_resowner_desc);
396 static inline void
397 ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
399 ResourceOwnerForget(owner, Int32GetDatum(buffer), &buffer_pin_resowner_desc);
401 static inline void
402 ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)
404 ResourceOwnerRemember(owner, Int32GetDatum(buffer), &buffer_io_resowner_desc);
406 static inline void
407 ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
409 ResourceOwnerForget(owner, Int32GetDatum(buffer), &buffer_io_resowner_desc);
/*
 * Internal buffer management routines
 */
415 /* bufmgr.c */
416 extern void WritebackContextInit(WritebackContext *context, int *max_pending);
417 extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context);
418 extern void ScheduleBufferTagForWriteback(WritebackContext *wb_context,
419 IOContext io_context, BufferTag *tag);
421 /* freelist.c */
422 extern IOContext IOContextForStrategy(BufferAccessStrategy strategy);
423 extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
424 uint32 *buf_state, bool *from_ring);
425 extern void StrategyFreeBuffer(BufferDesc *buf);
426 extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
427 BufferDesc *buf, bool from_ring);
429 extern int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
430 extern void StrategyNotifyBgWriter(int bgwprocno);
432 extern Size StrategyShmemSize(void);
433 extern void StrategyInitialize(bool init);
434 extern bool have_free_buffer(void);
436 /* buf_table.c */
437 extern Size BufTableShmemSize(int size);
438 extern void InitBufTable(int size);
439 extern uint32 BufTableHashCode(BufferTag *tagPtr);
440 extern int BufTableLookup(BufferTag *tagPtr, uint32 hashcode);
441 extern int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
442 extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);
444 /* localbuf.c */
445 extern bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount);
446 extern void UnpinLocalBuffer(Buffer buffer);
447 extern void UnpinLocalBufferNoOwner(Buffer buffer);
448 extern PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr,
449 ForkNumber forkNum,
450 BlockNumber blockNum);
451 extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
452 BlockNumber blockNum, bool *foundPtr);
453 extern BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr,
454 ForkNumber fork,
455 uint32 flags,
456 uint32 extend_by,
457 BlockNumber extend_upto,
458 Buffer *buffers,
459 uint32 *extended_by);
460 extern void MarkLocalBufferDirty(Buffer buffer);
461 extern void DropRelationLocalBuffers(RelFileLocator rlocator,
462 ForkNumber forkNum,
463 BlockNumber firstDelBlock);
464 extern void DropRelationAllLocalBuffers(RelFileLocator rlocator);
465 extern void AtEOXact_LocalBuffers(bool isCommit);
467 #endif /* BUFMGR_INTERNALS_H */