1 /*-------------------------------------------------------------------------
3 * predicate_internals.h
4 * POSTGRES internal predicate locking definitions.
7 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
10 * src/include/storage/predicate_internals.h
12 *-------------------------------------------------------------------------
14 #ifndef PREDICATE_INTERNALS_H
15 #define PREDICATE_INTERNALS_H
17 #include "storage/lock.h"
18 #include "storage/lwlock.h"
23 typedef uint64 SerCommitSeqNo
;
26 * Reserved commit sequence numbers:
27 * - 0 is reserved to indicate a non-existent SLRU entry; it cannot be
28 * used as a SerCommitSeqNo, even an invalid one
29 * - InvalidSerCommitSeqNo is used to indicate a transaction that
30 * hasn't committed yet, so use a number greater than all valid
31 * ones to make comparison do the expected thing
32 * - RecoverySerCommitSeqNo is used to refer to transactions that
33 * happened before a crash/recovery, since we restart the sequence
34 * at that point. It's earlier than all normal sequence numbers,
35 * and is only used by recovered prepared transactions
37 #define InvalidSerCommitSeqNo ((SerCommitSeqNo) PG_UINT64_MAX)
38 #define RecoverySerCommitSeqNo ((SerCommitSeqNo) 1)
39 #define FirstNormalSerCommitSeqNo ((SerCommitSeqNo) 2)
42 * The SERIALIZABLEXACT struct contains information needed for each
43 * serializable database transaction to support SSI techniques.
45 * A home-grown list is maintained in shared memory to manage these.
46 * An entry is used when the serializable transaction acquires a snapshot.
47 * Unless the transaction is rolled back, this entry must generally remain
48 * until all concurrent transactions have completed. (There are special
49 * optimizations for READ ONLY transactions which often allow them to be
50 * cleaned up earlier.) A transaction which is rolled back is cleaned up
51 * as soon as possible.
53 * Eligibility for cleanup of committed transactions is generally determined
54 * by comparing the transaction's finishedBefore field to
57 typedef struct SERIALIZABLEXACT
59 VirtualTransactionId vxid
; /* The executing process always has one of
63 * We use two numbers to track the order that transactions commit. Before
64 * commit, a transaction is marked as prepared, and prepareSeqNo is set.
65 * Shortly after commit, it's marked as committed, and commitSeqNo is set.
66 * This doesn't give a strict commit order, but these two values together
67 * are good enough for us, as we can always err on the safe side and
68 * assume that there's a conflict, if we can't be sure of the exact
69 * ordering of two commits.
71 * Note that a transaction is marked as prepared for a short period during
72 * commit processing, even if two-phase commit is not used. But with
73 * two-phase commit, a transaction can stay in prepared state for some
76 SerCommitSeqNo prepareSeqNo
;
77 SerCommitSeqNo commitSeqNo
;
79 /* these values are not both interesting at the same time */
82 SerCommitSeqNo earliestOutConflictCommit
; /* when committed with
84 SerCommitSeqNo lastCommitBeforeSnapshot
; /* when not committed or
87 SHM_QUEUE outConflicts
; /* list of write transactions whose data we
89 SHM_QUEUE inConflicts
; /* list of read transactions which couldn't
91 SHM_QUEUE predicateLocks
; /* list of associated PREDICATELOCK objects */
92 SHM_QUEUE finishedLink
; /* list link in
93 * FinishedSerializableTransactions */
96 * perXactPredicateListLock is only used in parallel queries: it protects
97 * this SERIALIZABLEXACT's predicate lock list against other workers of
100 LWLock perXactPredicateListLock
;
103 * for r/o transactions: list of concurrent r/w transactions that we could
104 * potentially have conflicts with, and vice versa for r/w transactions
106 SHM_QUEUE possibleUnsafeConflicts
;
108 TransactionId topXid
; /* top level xid for the transaction, if one
109 * exists; else invalid */
110 TransactionId finishedBefore
; /* invalid means still running; else the
111 * struct expires when no serializable
112 * xids are before this. */
113 TransactionId xmin
; /* the transaction's snapshot xmin */
114 uint32 flags
; /* OR'd combination of values defined below */
115 int pid
; /* pid of associated process */
118 #define SXACT_FLAG_COMMITTED 0x00000001 /* already committed */
119 #define SXACT_FLAG_PREPARED 0x00000002 /* about to commit */
120 #define SXACT_FLAG_ROLLED_BACK 0x00000004 /* already rolled back */
121 #define SXACT_FLAG_DOOMED 0x00000008 /* will roll back */
123 * The following flag actually means that the flagged transaction has a
124 * conflict out *to a transaction which committed ahead of it*. It's hard
125 * to get that into a name of a reasonable length.
127 #define SXACT_FLAG_CONFLICT_OUT 0x00000010
128 #define SXACT_FLAG_READ_ONLY 0x00000020
129 #define SXACT_FLAG_DEFERRABLE_WAITING 0x00000040
130 #define SXACT_FLAG_RO_SAFE 0x00000080
131 #define SXACT_FLAG_RO_UNSAFE 0x00000100
132 #define SXACT_FLAG_SUMMARY_CONFLICT_IN 0x00000200
133 #define SXACT_FLAG_SUMMARY_CONFLICT_OUT 0x00000400
135 * The following flag means the transaction has been partially released
136 * already, but is being preserved because parallel workers might have a
137 * reference to it. It'll be recycled by the leader at end-of-transaction.
139 #define SXACT_FLAG_PARTIALLY_RELEASED 0x00000800
142 * The following types are used to provide an ad hoc list for holding
143 * SERIALIZABLEXACT objects. An HTAB is overkill, since there is no need to
144 * access these by key -- there are direct pointers to these objects where
145 * needed. If a shared memory list is created, these types can probably be
146 * eliminated in favor of using the general solution.
148 typedef struct PredXactListElementData
151 SERIALIZABLEXACT sxact
;
152 } PredXactListElementData
;
154 typedef struct PredXactListElementData
*PredXactListElement
;
156 #define PredXactListElementDataSize \
157 ((Size)MAXALIGN(sizeof(PredXactListElementData)))
159 typedef struct PredXactListData
161 SHM_QUEUE availableList
;
162 SHM_QUEUE activeList
;
165 * These global variables are maintained when registering and cleaning up
166 * serializable transactions. They must be global across all backends,
167 * but are not needed outside the predicate.c source file. Protected by
168 * SerializableXactHashLock.
170 TransactionId SxactGlobalXmin
; /* global xmin for active serializable
172 int SxactGlobalXminCount
; /* how many active serializable
173 * transactions have this xmin */
174 int WritableSxactCount
; /* how many non-read-only serializable
175 * transactions are active */
176 SerCommitSeqNo LastSxactCommitSeqNo
; /* a strictly monotonically
177 * increasing number for commits
178 * of serializable transactions */
179 /* Protected by SerializableXactHashLock. */
180 SerCommitSeqNo CanPartialClearThrough
; /* can clear predicate locks and
181 * inConflicts for committed
182 * transactions through this seq
184 /* Protected by SerializableFinishedListLock. */
185 SerCommitSeqNo HavePartialClearedThrough
; /* have cleared through this
187 SERIALIZABLEXACT
*OldCommittedSxact
; /* shared copy of dummy sxact */
189 PredXactListElement element
;
192 typedef struct PredXactListData
*PredXactList
;
194 #define PredXactListDataSize \
195 ((Size)MAXALIGN(sizeof(PredXactListData)))
199 * The following types are used to provide lists of rw-conflicts between
200 * pairs of transactions. Since exactly the same information is needed,
201 * they are also used to record possible unsafe transaction relationships
202 * for purposes of identifying safe snapshots for read-only transactions.
204 * When a RWConflictData is not in use to record either type of relationship
205 * between a pair of transactions, it is kept on an "available" list. The
206 * outLink field is used for maintaining that list.
208 typedef struct RWConflictData
210 SHM_QUEUE outLink
; /* link for list of conflicts out from a sxact */
211 SHM_QUEUE inLink
; /* link for list of conflicts in to a sxact */
212 SERIALIZABLEXACT
*sxactOut
;
213 SERIALIZABLEXACT
*sxactIn
;
216 typedef struct RWConflictData
*RWConflict
;
218 #define RWConflictDataSize \
219 ((Size)MAXALIGN(sizeof(RWConflictData)))
221 typedef struct RWConflictPoolHeaderData
223 SHM_QUEUE availableList
;
225 } RWConflictPoolHeaderData
;
227 typedef struct RWConflictPoolHeaderData
*RWConflictPoolHeader
;
229 #define RWConflictPoolHeaderDataSize \
230 ((Size)MAXALIGN(sizeof(RWConflictPoolHeaderData)))
234 * The SERIALIZABLEXIDTAG struct identifies an xid assigned to a serializable
235 * transaction or any of its subtransactions.
237 typedef struct SERIALIZABLEXIDTAG
240 } SERIALIZABLEXIDTAG
;
243 * The SERIALIZABLEXID struct provides a link from a TransactionId for a
244 * serializable transaction to the related SERIALIZABLEXACT record, even if
245 * the transaction has completed and its connection has been closed.
247 * These are created as new top level transaction IDs are first assigned to
248 * transactions which are participating in predicate locking. This may
249 * never happen for a particular transaction if it doesn't write anything.
250 * They are removed with their related serializable transaction objects.
252 * The SubTransGetTopmostTransaction method is used where necessary to get
253 * from an XID which might be from a subtransaction to the top level XID.
255 typedef struct SERIALIZABLEXID
258 SERIALIZABLEXIDTAG tag
;
261 SERIALIZABLEXACT
*myXact
; /* pointer to the top level transaction data */
266 * The PREDICATELOCKTARGETTAG struct identifies a database object which can
267 * be the target of predicate locks.
269 * Note that the hash function being used doesn't properly respect tag
270 * length -- if the length of the structure isn't a multiple of four bytes it
271 * will go to a four byte boundary past the end of the tag. If you change
272 * this struct, make sure any slack space is initialized, so that any random
273 * bytes in the middle or at the end are not included in the hash.
275 * TODO SSI: If we always use the same fields for the same type of value, we
276 * should rename these. Holding off until it's clear there are no exceptions.
277 * Since indexes are relations with blocks and tuples, it's looking likely that
278 * the rename will be possible. If not, we may need to divide the last field
279 * and use part of it for a target type, so that we know how to interpret the
282 typedef struct PREDICATELOCKTARGETTAG
284 uint32 locktag_field1
; /* a 32-bit ID field */
285 uint32 locktag_field2
; /* a 32-bit ID field */
286 uint32 locktag_field3
; /* a 32-bit ID field */
287 uint32 locktag_field4
; /* a 32-bit ID field */
288 } PREDICATELOCKTARGETTAG
;
291 * The PREDICATELOCKTARGET struct represents a database object on which there
292 * are predicate locks.
294 * A hash list of these objects is maintained in shared memory. An entry is
295 * added when a predicate lock is requested on an object which doesn't
296 * already have one. An entry is removed when the last lock is removed from
299 typedef struct PREDICATELOCKTARGET
302 PREDICATELOCKTARGETTAG tag
; /* unique identifier of lockable object */
305 SHM_QUEUE predicateLocks
; /* list of PREDICATELOCK objects assoc. with
306 * predicate lock target */
307 } PREDICATELOCKTARGET
;
311 * The PREDICATELOCKTAG struct identifies an individual predicate lock.
313 * It is the combination of predicate lock target (which is a lockable
314 * object) and a serializable transaction which has acquired a lock on that
317 typedef struct PREDICATELOCKTAG
319 PREDICATELOCKTARGET
*myTarget
;
320 SERIALIZABLEXACT
*myXact
;
324 * The PREDICATELOCK struct represents an individual lock.
326 * An entry can be created here when the related database object is read, or
327 * by promotion of multiple finer-grained targets. All entries related to a
328 * serializable transaction are removed when that serializable transaction is
329 * cleaned up. Entries can also be removed when they are combined into a
330 * single coarser-grained lock entry.
332 typedef struct PREDICATELOCK
335 PREDICATELOCKTAG tag
; /* unique identifier of lock */
338 SHM_QUEUE targetLink
; /* list link in PREDICATELOCKTARGET's list of
340 SHM_QUEUE xactLink
; /* list link in SERIALIZABLEXACT's list of
342 SerCommitSeqNo commitSeqNo
; /* only used for summarized predicate locks */
347 * The LOCALPREDICATELOCK struct represents a local copy of data which is
348 * also present in the PREDICATELOCK table, organized for fast access without
349 * needing to acquire a LWLock. It is strictly for optimization.
351 * Each serializable transaction creates its own local hash table to hold a
352 * collection of these. This information is used to determine when a number
353 * of fine-grained locks should be promoted to a single coarser-grained lock.
354 * The information is maintained more-or-less in parallel to the
355 * PREDICATELOCK data, but because this data is not protected by locks and is
356 * only used in an optimization heuristic, it is allowed to drift in a few
357 * corner cases where maintaining exact data would be expensive.
359 * The hash table is created when the serializable transaction acquires its
360 * snapshot, and its memory is released upon completion of the transaction.
362 typedef struct LOCALPREDICATELOCK
365 PREDICATELOCKTARGETTAG tag
; /* unique identifier of lockable object */
368 bool held
; /* is lock held, or just its children? */
369 int childLocks
; /* number of child locks currently held */
370 } LOCALPREDICATELOCK
;
374 * The types of predicate locks which can be acquired.
376 typedef enum PredicateLockTargetType
378 PREDLOCKTAG_RELATION
,
381 /* TODO SSI: Other types may be needed for index locking */
382 } PredicateLockTargetType
;
386 * This structure is used to quickly capture a copy of all predicate
387 * locks. This is currently used only by the pg_lock_status function,
388 * which in turn is used by the pg_locks view.
390 typedef struct PredicateLockData
393 PREDICATELOCKTARGETTAG
*locktags
;
394 SERIALIZABLEXACT
*xacts
;
399 * These macros define how we map logical IDs of lockable objects into the
400 * physical fields of PREDICATELOCKTARGETTAG. Use these to set up values,
401 * rather than accessing the fields directly. Note multiple eval of target!
403 #define SET_PREDICATELOCKTARGETTAG_RELATION(locktag,dboid,reloid) \
404 ((locktag).locktag_field1 = (dboid), \
405 (locktag).locktag_field2 = (reloid), \
406 (locktag).locktag_field3 = InvalidBlockNumber, \
407 (locktag).locktag_field4 = InvalidOffsetNumber)
409 #define SET_PREDICATELOCKTARGETTAG_PAGE(locktag,dboid,reloid,blocknum) \
410 ((locktag).locktag_field1 = (dboid), \
411 (locktag).locktag_field2 = (reloid), \
412 (locktag).locktag_field3 = (blocknum), \
413 (locktag).locktag_field4 = InvalidOffsetNumber)
415 #define SET_PREDICATELOCKTARGETTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum) \
416 ((locktag).locktag_field1 = (dboid), \
417 (locktag).locktag_field2 = (reloid), \
418 (locktag).locktag_field3 = (blocknum), \
419 (locktag).locktag_field4 = (offnum))
421 #define GET_PREDICATELOCKTARGETTAG_DB(locktag) \
422 ((Oid) (locktag).locktag_field1)
423 #define GET_PREDICATELOCKTARGETTAG_RELATION(locktag) \
424 ((Oid) (locktag).locktag_field2)
425 #define GET_PREDICATELOCKTARGETTAG_PAGE(locktag) \
426 ((BlockNumber) (locktag).locktag_field3)
427 #define GET_PREDICATELOCKTARGETTAG_OFFSET(locktag) \
428 ((OffsetNumber) (locktag).locktag_field4)
429 #define GET_PREDICATELOCKTARGETTAG_TYPE(locktag) \
430 (((locktag).locktag_field4 != InvalidOffsetNumber) ? PREDLOCKTAG_TUPLE : \
431 (((locktag).locktag_field3 != InvalidBlockNumber) ? PREDLOCKTAG_PAGE : \
432 PREDLOCKTAG_RELATION))
435 * Two-phase commit statefile records. There are two types: for each
436 * transaction, we generate one per-transaction record and a variable
437 * number of per-predicate-lock records.
439 typedef enum TwoPhasePredicateRecordType
441 TWOPHASEPREDICATERECORD_XACT
,
442 TWOPHASEPREDICATERECORD_LOCK
443 } TwoPhasePredicateRecordType
;
446 * Per-transaction information to reconstruct a SERIALIZABLEXACT. Not
447 * much is needed because most of it not meaningful for a recovered
448 * prepared transaction.
450 * In particular, we do not record the in and out conflict lists for a
451 * prepared transaction because the associated SERIALIZABLEXACTs will
452 * not be available after recovery. Instead, we simply record the
453 * existence of each type of conflict by setting the transaction's
454 * summary conflict in/out flag.
456 typedef struct TwoPhasePredicateXactRecord
460 } TwoPhasePredicateXactRecord
;
463 typedef struct TwoPhasePredicateLockRecord
465 PREDICATELOCKTARGETTAG target
;
466 uint32 filler
; /* to avoid length change in back-patched fix */
467 } TwoPhasePredicateLockRecord
;
469 typedef struct TwoPhasePredicateRecord
471 TwoPhasePredicateRecordType type
;
474 TwoPhasePredicateXactRecord xactRecord
;
475 TwoPhasePredicateLockRecord lockRecord
;
477 } TwoPhasePredicateRecord
;
480 * Define a macro to use for an "empty" SERIALIZABLEXACT reference.
482 #define InvalidSerializableXact ((SERIALIZABLEXACT *) NULL)
486 * Function definitions for functions needing awareness of predicate
489 extern PredicateLockData
*GetPredicateLockStatusData(void);
490 extern int GetSafeSnapshotBlockingPids(int blocked_pid
,
491 int *output
, int output_size
);
493 #endif /* PREDICATE_INTERNALS_H */