src/include/storage/predicate_internals.h

   1 /*-------------------------------------------------------------------------
   2  *
   3  * predicate_internals.h
   4  *        POSTGRES internal predicate locking definitions.
   5  *
   6  *
   7  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  * src/include/storage/predicate_internals.h
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14 #ifndef PREDICATE_INTERNALS_H
  15 #define PREDICATE_INTERNALS_H
  16
  17 #include "storage/lock.h"
  18 #include "storage/lwlock.h"
  19
  20 /*
  21  * Commit number.
  22  */
  23 typedef uint64 SerCommitSeqNo;
  24
  25 /*
  26  * Reserved commit sequence numbers:
  27  *      - 0 is reserved to indicate a non-existent SLRU entry; it cannot be
  28  *        used as a SerCommitSeqNo, even an invalid one
  29  *      - InvalidSerCommitSeqNo is used to indicate a transaction that
  30  *        hasn't committed yet, so use a number greater than all valid
  31  *        ones to make comparison do the expected thing
  32  *      - RecoverySerCommitSeqNo is used to refer to transactions that
  33  *        happened before a crash/recovery, since we restart the sequence
  34  *        at that point.  It's earlier than all normal sequence numbers,
  35  *        and is only used by recovered prepared transactions
  36  */
  37 #define InvalidSerCommitSeqNo           ((SerCommitSeqNo) PG_UINT64_MAX)
  38 #define RecoverySerCommitSeqNo          ((SerCommitSeqNo) 1)
  39 #define FirstNormalSerCommitSeqNo       ((SerCommitSeqNo) 2)
  40
  41 /*
  42  * The SERIALIZABLEXACT struct contains information needed for each
  43  * serializable database transaction to support SSI techniques.
  44  *
  45  * A home-grown list is maintained in shared memory to manage these.
  46  * An entry is used when the serializable transaction acquires a snapshot.
  47  * Unless the transaction is rolled back, this entry must generally remain
  48  * until all concurrent transactions have completed.  (There are special
  49  * optimizations for READ ONLY transactions which often allow them to be
  50  * cleaned up earlier.)  A transaction which is rolled back is cleaned up
  51  * as soon as possible.
  52  *
  53  * Eligibility for cleanup of committed transactions is generally determined
  54  * by comparing the transaction's finishedBefore field to
  55  * SxactGlobalXmin.
  56  */
  57 typedef struct SERIALIZABLEXACT
  58 {
  59         VirtualTransactionId vxid;      /* The executing process always has one of
  60                                                                  * these. */
  61
  62         /*
  63          * We use two numbers to track the order that transactions commit. Before
  64          * commit, a transaction is marked as prepared, and prepareSeqNo is set.
  65          * Shortly after commit, it's marked as committed, and commitSeqNo is set.
  66          * This doesn't give a strict commit order, but these two values together
  67          * are good enough for us, as we can always err on the safe side and
  68          * assume that there's a conflict, if we can't be sure of the exact
  69          * ordering of two commits.
  70          *
  71          * Note that a transaction is marked as prepared for a short period during
  72          * commit processing, even if two-phase commit is not used. But with
  73          * two-phase commit, a transaction can stay in prepared state for some
  74          * time.
  75          */
  76         SerCommitSeqNo prepareSeqNo;
  77         SerCommitSeqNo commitSeqNo;
  78
  79         /* these values are not both interesting at the same time */
  80         union
  81         {
  82                 SerCommitSeqNo earliestOutConflictCommit;       /* when committed with
  83                                                                                                          * conflict out */
  84                 SerCommitSeqNo lastCommitBeforeSnapshot;        /* when not committed or
  85                                                                                                          * no conflict out */
  86         }                       SeqNo;
  87         SHM_QUEUE       outConflicts;   /* list of write transactions whose data we
  88                                                                  * couldn't read. */
  89         SHM_QUEUE       inConflicts;    /* list of read transactions which couldn't
  90                                                                  * see our write. */
  91         SHM_QUEUE       predicateLocks; /* list of associated PREDICATELOCK objects */
  92         SHM_QUEUE       finishedLink;   /* list link in
  93                                                                  * FinishedSerializableTransactions */
  94
  95         /*
  96          * perXactPredicateListLock is only used in parallel queries: it protects
  97          * this SERIALIZABLEXACT's predicate lock list against other workers of
  98          * the same session.
  99          */
 100         LWLock          perXactPredicateListLock;
 101
 102         /*
 103          * for r/o transactions: list of concurrent r/w transactions that we could
 104          * potentially have conflicts with, and vice versa for r/w transactions
 105          */
 106         SHM_QUEUE       possibleUnsafeConflicts;
 107
 108         TransactionId topXid;           /* top level xid for the transaction, if one
 109                                                                  * exists; else invalid */
 110         TransactionId finishedBefore;   /* invalid means still running; else the
 111                                                                          * struct expires when no serializable
 112                                                                          * xids are before this. */
 113         TransactionId xmin;                     /* the transaction's snapshot xmin */
 114         uint32          flags;                  /* OR'd combination of values defined below */
 115         int                     pid;                    /* pid of associated process */
 116 } SERIALIZABLEXACT;
 117
 118 #define SXACT_FLAG_COMMITTED                    0x00000001      /* already committed */
 119 #define SXACT_FLAG_PREPARED                             0x00000002      /* about to commit */
 120 #define SXACT_FLAG_ROLLED_BACK                  0x00000004      /* already rolled back */
 121 #define SXACT_FLAG_DOOMED                               0x00000008      /* will roll back */
 122 /*
 123  * The following flag actually means that the flagged transaction has a
 124  * conflict out *to a transaction which committed ahead of it*.  It's hard
 125  * to get that into a name of a reasonable length.
 126  */
 127 #define SXACT_FLAG_CONFLICT_OUT                 0x00000010
 128 #define SXACT_FLAG_READ_ONLY                    0x00000020
 129 #define SXACT_FLAG_DEFERRABLE_WAITING   0x00000040
 130 #define SXACT_FLAG_RO_SAFE                              0x00000080
 131 #define SXACT_FLAG_RO_UNSAFE                    0x00000100
 132 #define SXACT_FLAG_SUMMARY_CONFLICT_IN  0x00000200
 133 #define SXACT_FLAG_SUMMARY_CONFLICT_OUT 0x00000400
 134 /*
 135  * The following flag means the transaction has been partially released
 136  * already, but is being preserved because parallel workers might have a
 137  * reference to it.  It'll be recycled by the leader at end-of-transaction.
 138  */
 139 #define SXACT_FLAG_PARTIALLY_RELEASED   0x00000800
 140
 141 /*
 142  * The following types are used to provide an ad hoc list for holding
 143  * SERIALIZABLEXACT objects.  An HTAB is overkill, since there is no need to
 144  * access these by key -- there are direct pointers to these objects where
 145  * needed.  If a shared memory list is created, these types can probably be
 146  * eliminated in favor of using the general solution.
 147  */
 148 typedef struct PredXactListElementData
 149 {
 150         SHM_QUEUE       link;
 151         SERIALIZABLEXACT sxact;
 152 }                       PredXactListElementData;
 153
 154 typedef struct PredXactListElementData *PredXactListElement;
 155
 156 #define PredXactListElementDataSize \
 157                 ((Size)MAXALIGN(sizeof(PredXactListElementData)))
 158
 159 typedef struct PredXactListData
 160 {
 161         SHM_QUEUE       availableList;
 162         SHM_QUEUE       activeList;
 163
 164         /*
 165          * These global variables are maintained when registering and cleaning up
 166          * serializable transactions.  They must be global across all backends,
 167          * but are not needed outside the predicate.c source file. Protected by
 168          * SerializableXactHashLock.
 169          */
 170         TransactionId SxactGlobalXmin;  /* global xmin for active serializable
 171                                                                          * transactions */
 172         int                     SxactGlobalXminCount;   /* how many active serializable
 173                                                                                  * transactions have this xmin */
 174         int                     WritableSxactCount; /* how many non-read-only serializable
 175                                                                          * transactions are active */
 176         SerCommitSeqNo LastSxactCommitSeqNo;    /* a strictly monotonically
 177                                                                                          * increasing number for commits
 178                                                                                          * of serializable transactions */
 179         /* Protected by SerializableXactHashLock. */
 180         SerCommitSeqNo CanPartialClearThrough;  /* can clear predicate locks and
 181                                                                                          * inConflicts for committed
 182                                                                                          * transactions through this seq
 183                                                                                          * no */
 184         /* Protected by SerializableFinishedListLock. */
 185         SerCommitSeqNo HavePartialClearedThrough;       /* have cleared through this
 186                                                                                                  * seq no */
 187         SERIALIZABLEXACT *OldCommittedSxact;    /* shared copy of dummy sxact */
 188
 189         PredXactListElement element;
 190 }                       PredXactListData;
 191
 192 typedef struct PredXactListData *PredXactList;
 193
 194 #define PredXactListDataSize \
 195                 ((Size)MAXALIGN(sizeof(PredXactListData)))
 196
 197
 198 /*
 199  * The following types are used to provide lists of rw-conflicts between
 200  * pairs of transactions.  Since exactly the same information is needed,
 201  * they are also used to record possible unsafe transaction relationships
 202  * for purposes of identifying safe snapshots for read-only transactions.
 203  *
 204  * When a RWConflictData is not in use to record either type of relationship
 205  * between a pair of transactions, it is kept on an "available" list.  The
 206  * outLink field is used for maintaining that list.
 207  */
 208 typedef struct RWConflictData
 209 {
 210         SHM_QUEUE       outLink;                /* link for list of conflicts out from a sxact */
 211         SHM_QUEUE       inLink;                 /* link for list of conflicts in to a sxact */
 212         SERIALIZABLEXACT *sxactOut;
 213         SERIALIZABLEXACT *sxactIn;
 214 }                       RWConflictData;
 215
 216 typedef struct RWConflictData *RWConflict;
 217
 218 #define RWConflictDataSize \
 219                 ((Size)MAXALIGN(sizeof(RWConflictData)))
 220
 221 typedef struct RWConflictPoolHeaderData
 222 {
 223         SHM_QUEUE       availableList;
 224         RWConflict      element;
 225 }                       RWConflictPoolHeaderData;
 226
 227 typedef struct RWConflictPoolHeaderData *RWConflictPoolHeader;
 228
 229 #define RWConflictPoolHeaderDataSize \
 230                 ((Size)MAXALIGN(sizeof(RWConflictPoolHeaderData)))
 231
 232
 233 /*
 234  * The SERIALIZABLEXIDTAG struct identifies an xid assigned to a serializable
 235  * transaction or any of its subtransactions.
 236  */
 237 typedef struct SERIALIZABLEXIDTAG
 238 {
 239         TransactionId xid;
 240 } SERIALIZABLEXIDTAG;
 241
 242 /*
 243  * The SERIALIZABLEXID struct provides a link from a TransactionId for a
 244  * serializable transaction to the related SERIALIZABLEXACT record, even if
 245  * the transaction has completed and its connection has been closed.
 246  *
 247  * These are created as new top level transaction IDs are first assigned to
 248  * transactions which are participating in predicate locking.  This may
 249  * never happen for a particular transaction if it doesn't write anything.
 250  * They are removed with their related serializable transaction objects.
 251  *
 252  * The SubTransGetTopmostTransaction method is used where necessary to get
 253  * from an XID which might be from a subtransaction to the top level XID.
 254  */
 255 typedef struct SERIALIZABLEXID
 256 {
 257         /* hash key */
 258         SERIALIZABLEXIDTAG tag;
 259
 260         /* data */
 261         SERIALIZABLEXACT *myXact;       /* pointer to the top level transaction data */
 262 } SERIALIZABLEXID;
 263
 264
 265 /*
 266  * The PREDICATELOCKTARGETTAG struct identifies a database object which can
 267  * be the target of predicate locks.
 268  *
 269  * Note that the hash function being used doesn't properly respect tag
 270  * length -- if the length of the structure isn't a multiple of four bytes it
 271  * will go to a four byte boundary past the end of the tag.  If you change
 272  * this struct, make sure any slack space is initialized, so that any random
 273  * bytes in the middle or at the end are not included in the hash.
 274  *
 275  * TODO SSI: If we always use the same fields for the same type of value, we
 276  * should rename these.  Holding off until it's clear there are no exceptions.
 277  * Since indexes are relations with blocks and tuples, it's looking likely that
 278  * the rename will be possible.  If not, we may need to divide the last field
 279  * and use part of it for a target type, so that we know how to interpret the
 280  * data..
 281  */
 282 typedef struct PREDICATELOCKTARGETTAG
 283 {
 284         uint32          locktag_field1; /* a 32-bit ID field */
 285         uint32          locktag_field2; /* a 32-bit ID field */
 286         uint32          locktag_field3; /* a 32-bit ID field */
 287         uint32          locktag_field4; /* a 32-bit ID field */
 288 } PREDICATELOCKTARGETTAG;
 289
 290 /*
 291  * The PREDICATELOCKTARGET struct represents a database object on which there
 292  * are predicate locks.
 293  *
 294  * A hash list of these objects is maintained in shared memory.  An entry is
 295  * added when a predicate lock is requested on an object which doesn't
 296  * already have one.  An entry is removed when the last lock is removed from
 297  * its list.
 298  */
 299 typedef struct PREDICATELOCKTARGET
 300 {
 301         /* hash key */
 302         PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */
 303
 304         /* data */
 305         SHM_QUEUE       predicateLocks; /* list of PREDICATELOCK objects assoc. with
 306                                                                  * predicate lock target */
 307 } PREDICATELOCKTARGET;
 308
 309
 310 /*
 311  * The PREDICATELOCKTAG struct identifies an individual predicate lock.
 312  *
 313  * It is the combination of predicate lock target (which is a lockable
 314  * object) and a serializable transaction which has acquired a lock on that
 315  * target.
 316  */
 317 typedef struct PREDICATELOCKTAG
 318 {
 319         PREDICATELOCKTARGET *myTarget;
 320         SERIALIZABLEXACT *myXact;
 321 } PREDICATELOCKTAG;
 322
 323 /*
 324  * The PREDICATELOCK struct represents an individual lock.
 325  *
 326  * An entry can be created here when the related database object is read, or
 327  * by promotion of multiple finer-grained targets.  All entries related to a
 328  * serializable transaction are removed when that serializable transaction is
 329  * cleaned up.  Entries can also be removed when they are combined into a
 330  * single coarser-grained lock entry.
 331  */
 332 typedef struct PREDICATELOCK
 333 {
 334         /* hash key */
 335         PREDICATELOCKTAG tag;           /* unique identifier of lock */
 336
 337         /* data */
 338         SHM_QUEUE       targetLink;             /* list link in PREDICATELOCKTARGET's list of
 339                                                                  * predicate locks */
 340         SHM_QUEUE       xactLink;               /* list link in SERIALIZABLEXACT's list of
 341                                                                  * predicate locks */
 342         SerCommitSeqNo commitSeqNo; /* only used for summarized predicate locks */
 343 } PREDICATELOCK;
 344
 345
 346 /*
 347  * The LOCALPREDICATELOCK struct represents a local copy of data which is
 348  * also present in the PREDICATELOCK table, organized for fast access without
 349  * needing to acquire a LWLock.  It is strictly for optimization.
 350  *
 351  * Each serializable transaction creates its own local hash table to hold a
 352  * collection of these.  This information is used to determine when a number
 353  * of fine-grained locks should be promoted to a single coarser-grained lock.
 354  * The information is maintained more-or-less in parallel to the
 355  * PREDICATELOCK data, but because this data is not protected by locks and is
 356  * only used in an optimization heuristic, it is allowed to drift in a few
 357  * corner cases where maintaining exact data would be expensive.
 358  *
 359  * The hash table is created when the serializable transaction acquires its
 360  * snapshot, and its memory is released upon completion of the transaction.
 361  */
 362 typedef struct LOCALPREDICATELOCK
 363 {
 364         /* hash key */
 365         PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */
 366
 367         /* data */
 368         bool            held;                   /* is lock held, or just its children?  */
 369         int                     childLocks;             /* number of child locks currently held */
 370 } LOCALPREDICATELOCK;
 371
 372
 373 /*
 374  * The types of predicate locks which can be acquired.
 375  */
 376 typedef enum PredicateLockTargetType
 377 {
 378         PREDLOCKTAG_RELATION,
 379         PREDLOCKTAG_PAGE,
 380         PREDLOCKTAG_TUPLE
 381         /* TODO SSI: Other types may be needed for index locking */
 382 } PredicateLockTargetType;
 383
 384
 385 /*
 386  * This structure is used to quickly capture a copy of all predicate
 387  * locks.  This is currently used only by the pg_lock_status function,
 388  * which in turn is used by the pg_locks view.
 389  */
 390 typedef struct PredicateLockData
 391 {
 392         int                     nelements;
 393         PREDICATELOCKTARGETTAG *locktags;
 394         SERIALIZABLEXACT *xacts;
 395 } PredicateLockData;
 396
 397
 398 /*
 399  * These macros define how we map logical IDs of lockable objects into the
 400  * physical fields of PREDICATELOCKTARGETTAG.   Use these to set up values,
 401  * rather than accessing the fields directly.  Note multiple eval of target!
 402  */
 403 #define SET_PREDICATELOCKTARGETTAG_RELATION(locktag,dboid,reloid) \
 404         ((locktag).locktag_field1 = (dboid), \
 405          (locktag).locktag_field2 = (reloid), \
 406          (locktag).locktag_field3 = InvalidBlockNumber, \
 407          (locktag).locktag_field4 = InvalidOffsetNumber)
 408
 409 #define SET_PREDICATELOCKTARGETTAG_PAGE(locktag,dboid,reloid,blocknum) \
 410         ((locktag).locktag_field1 = (dboid), \
 411          (locktag).locktag_field2 = (reloid), \
 412          (locktag).locktag_field3 = (blocknum), \
 413          (locktag).locktag_field4 = InvalidOffsetNumber)
 414
 415 #define SET_PREDICATELOCKTARGETTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum) \
 416         ((locktag).locktag_field1 = (dboid), \
 417          (locktag).locktag_field2 = (reloid), \
 418          (locktag).locktag_field3 = (blocknum), \
 419          (locktag).locktag_field4 = (offnum))
 420
 421 #define GET_PREDICATELOCKTARGETTAG_DB(locktag) \
 422         ((Oid) (locktag).locktag_field1)
 423 #define GET_PREDICATELOCKTARGETTAG_RELATION(locktag) \
 424         ((Oid) (locktag).locktag_field2)
 425 #define GET_PREDICATELOCKTARGETTAG_PAGE(locktag) \
 426         ((BlockNumber) (locktag).locktag_field3)
 427 #define GET_PREDICATELOCKTARGETTAG_OFFSET(locktag) \
 428         ((OffsetNumber) (locktag).locktag_field4)
 429 #define GET_PREDICATELOCKTARGETTAG_TYPE(locktag)                                                         \
 430         (((locktag).locktag_field4 != InvalidOffsetNumber) ? PREDLOCKTAG_TUPLE : \
 431          (((locktag).locktag_field3 != InvalidBlockNumber) ? PREDLOCKTAG_PAGE :   \
 432           PREDLOCKTAG_RELATION))
 433
 434 /*
 435  * Two-phase commit statefile records. There are two types: for each
 436  * transaction, we generate one per-transaction record and a variable
 437  * number of per-predicate-lock records.
 438  */
 439 typedef enum TwoPhasePredicateRecordType
 440 {
 441         TWOPHASEPREDICATERECORD_XACT,
 442         TWOPHASEPREDICATERECORD_LOCK
 443 } TwoPhasePredicateRecordType;
 444
 445 /*
 446  * Per-transaction information to reconstruct a SERIALIZABLEXACT. Not
 447  * much is needed because most of it not meaningful for a recovered
 448  * prepared transaction.
 449  *
 450  * In particular, we do not record the in and out conflict lists for a
 451  * prepared transaction because the associated SERIALIZABLEXACTs will
 452  * not be available after recovery. Instead, we simply record the
 453  * existence of each type of conflict by setting the transaction's
 454  * summary conflict in/out flag.
 455  */
 456 typedef struct TwoPhasePredicateXactRecord
 457 {
 458         TransactionId xmin;
 459         uint32          flags;
 460 } TwoPhasePredicateXactRecord;
 461
 462 /* Per-lock state */
 463 typedef struct TwoPhasePredicateLockRecord
 464 {
 465         PREDICATELOCKTARGETTAG target;
 466         uint32          filler;                 /* to avoid length change in back-patched fix */
 467 } TwoPhasePredicateLockRecord;
 468
 469 typedef struct TwoPhasePredicateRecord
 470 {
 471         TwoPhasePredicateRecordType type;
 472         union
 473         {
 474                 TwoPhasePredicateXactRecord xactRecord;
 475                 TwoPhasePredicateLockRecord lockRecord;
 476         }                       data;
 477 } TwoPhasePredicateRecord;
 478
 479 /*
 480  * Define a macro to use for an "empty" SERIALIZABLEXACT reference.
 481  */
 482 #define InvalidSerializableXact ((SERIALIZABLEXACT *) NULL)
 483
 484
 485 /*
 486  * Function definitions for functions needing awareness of predicate
 487  * locking internals.
 488  */
 489 extern PredicateLockData *GetPredicateLockStatusData(void);
 490 extern int      GetSafeSnapshotBlockingPids(int blocked_pid,
 491                                                                                 int *output, int output_size);
 492
 493 #endif                                                  /* PREDICATE_INTERNALS_H */